Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -437,6 +437,9 @@ /// \brief Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; + /// \brief Enable inline expansion of memcmp + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const; + /// \brief Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; @@ -772,6 +775,7 @@ virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; + virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -978,6 +982,9 @@ bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) override { + return Impl.expandMemCmp(I, MaxLoadSize); + } bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -264,6 +264,8 @@ bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { return false; } + bool enableInterleavedAccessVectorization() { return false; } bool isFPVectorizationPotentiallyUnsafe() { return false; } Index: include/llvm/IR/Instruction.h 
=================================================================== --- include/llvm/IR/Instruction.h +++ include/llvm/IR/Instruction.h @@ -479,6 +479,9 @@ /// instruction's result is undefined. bool isIdenticalToWhenDefined(const Instruction *I) const; + /// Return true if it only matters that the value is equal or not-equal to zero. + bool isOnlyUsedInZeroEqualityComparison() const; + /// When checking for operation equivalence (using isSameOperationAs) it is /// sometimes useful to ignore certain attributes. enum OperationEquivalenceFlags { Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -1011,6 +1011,16 @@ return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy; } + /// \brief Get maximum size in bytes to load for memcmp + /// + /// This function returns the maximum size in bytes to load when + /// expanding memcmp. The value is set by the target at the + /// performance threshold for such a replacement. If OptSize is true, + /// return the limit for functions that have OptSize attribute. + unsigned getMaxExpandSizeMemcmp(bool OptSize) const { + return OptSize ? MaxExpandSizeMemcmpOptSize : MaxExpandSizeMemcmp; + } + /// \brief Get maximum # of store operations permitted for llvm.memmove /// /// This function returns the maximum number of store operations permitted @@ -2180,6 +2190,8 @@ /// Maximum number of store operations that may be substituted for a call to /// memcpy, used for functions with OptSize attribute. unsigned MaxStoresPerMemcpyOptSize; + unsigned MaxExpandSizeMemcmp; + unsigned MaxExpandSizeMemcmpOptSize; /// \brief Specify maximum bytes of store instructions per memmove call.
/// Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -201,6 +201,10 @@ return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } +bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const { + return TTIImpl->expandMemCmp(I, MaxLoadSize); +} + bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -55,6 +55,7 @@ #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" + using namespace llvm; using namespace llvm::PatternMatch; @@ -79,6 +80,11 @@ STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); +STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); +STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); +STATISTIC(NumMemCmpGreaterThanMax, "Number of memcmp calls with size greater than max size"); +STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); + static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -1944,6 +1950,507 @@ return true; } +// This class provides helper functions to expand a memcmp library call into an +// inline expansion. 
+class MemCmpExpansion { + struct ResultBlock { + private: + BasicBlock *BB; + PHINode *PhiSrc1; + PHINode *PhiSrc2; + PHINode *PhiDiff; + + public: + ResultBlock(); + void setBB(BasicBlock *BB); + BasicBlock *getBB() const; + PHINode *getPhiSrc1() const; + PHINode *getPhiSrc2() const; + PHINode *getPhiDiff() const; + void setPHINodes(PHINode *PhiSrc1, PHINode *PhiSrc2, PHINode *PhiDiff); + }; + + CallInst *CI; + ResultBlock ResBlock; + unsigned MaxLoadSize; + unsigned NumBlocks; + unsigned NumBlocksNonOneByte; + std::vector LoadCmpBlocks; + BasicBlock *EndBlock; + PHINode *PhiRes; + int calculateNumBlocks(unsigned Size); + void createLoadCmpBlocks(int Count, Function *Parent, BasicBlock *BeforeBB); + void createResultBlock(Function *Parent, BasicBlock *BeforeBB); + void setupResultBlockPHINodes(); + void setupEndBlockPHINodes(); + void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex); + void emitLoadCompareByteBlock(unsigned Index, int GEPIndex); + void emitMemCmpResultBlock(bool IsLittleEndian); + BasicBlock *getFirstBlock() const; + BasicBlock *getLastLoadCmpBlock() const; + void addPhiResEdge(Value *Res, BasicBlock *BB); + +public: + MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize, unsigned Size); + Value *getMemCmpExpansion(unsigned Size, bool IsLittleEndian); +}; + +MemCmpExpansion::ResultBlock::ResultBlock() { + BB = nullptr; + PhiSrc1 = nullptr; + PhiSrc2 = nullptr; + PhiDiff = nullptr; +} + +void MemCmpExpansion::ResultBlock::setBB(BasicBlock *BB) { this->BB = BB; } + +BasicBlock *MemCmpExpansion::ResultBlock::getBB() const { return BB; } +PHINode *MemCmpExpansion::ResultBlock::getPhiSrc1() const { return PhiSrc1; } +PHINode *MemCmpExpansion::ResultBlock::getPhiSrc2() const { return PhiSrc2; } +PHINode *MemCmpExpansion::ResultBlock::getPhiDiff() const { return PhiDiff; } + +void MemCmpExpansion::ResultBlock::setPHINodes(PHINode *PhiSrc1, + PHINode *PhiSrc2, + PHINode *PhiDiff) { + this->PhiSrc1 = PhiSrc1; + this->PhiSrc2 = PhiSrc2; + 
this->PhiDiff = PhiDiff; +} + +// Initialize the basic block structure required for expansion of memcmp call +// with given maximum load size and memcmp size parameter. +// This structure includes: +// 1. A list of load compare blocks - LoadCmpBlocks. +// 2. An EndBlock, split from original instruction point, which is the block to +// return from. +// 3. ResultBlock, block to branch to for early exit when a +// LoadCmpBlock finds a difference. +MemCmpExpansion::MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize, + unsigned Size) { + this->CI = CI; + this->MaxLoadSize = MaxLoadSize; + + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + + // Calculate how many load compare blocks are required for an expansion of + // given Size. + NumBlocks = calculateNumBlocks(Size); + NumBlocksNonOneByte = 0; + + createResultBlock(StartBlock->getParent(), EndBlock); + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation sequence requires the + // two loaded source values, and the xor result of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!CI->isOnlyUsedInZeroEqualityComparison()) + setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(NumBlocks, StartBlock->getParent(), EndBlock); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + StartBlock->getTerminator()->eraseFromParent(); + + // Create a branch to the first load compare block from the entry block. 
+ BranchInst *NewBr = BranchInst::Create(getFirstBlock()); + Builder.SetInsertPoint(StartBlock, StartBlock->end()); + Builder.Insert(NewBr); +} + +BasicBlock *MemCmpExpansion::getFirstBlock() const { return LoadCmpBlocks[0]; } + +BasicBlock *MemCmpExpansion::getLastLoadCmpBlock() const { + return LoadCmpBlocks.back(); +} + +void MemCmpExpansion::createLoadCmpBlocks(int Count, Function *Parent, + BasicBlock *BeforeBB) { + LLVMContext &Context = CI->getContext(); + for (int i = 0; i < Count; i++) { + BasicBlock *BB = BasicBlock::Create(Context, "loadbb", Parent, BeforeBB); + LoadCmpBlocks.push_back(BB); + } +} + +void MemCmpExpansion::createResultBlock(Function *Parent, + BasicBlock *BeforeBB) { + LLVMContext &Context = CI->getContext(); + BasicBlock *BB = BasicBlock::Create(Context, "res_block", Parent, BeforeBB); + ResBlock.setBB(BB); +} + +void MemCmpExpansion::addPhiResEdge(Value *Res, BasicBlock *BB) { + PhiRes->addIncoming(Res, BB); +} + +// This function creates the IR instructions for loading and comparing 1 byte. +// It loads 1 byte from each source of the memcmp parameters with the given +// GEPIndex. It then subtracts the two loaded values and adds this result to the +// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) { + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + Value *LoadSrc1, *LoadSrc2, *Diff; + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Type *LoadSizeType = Type::getInt8Ty(Context); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, Type::getInt32Ty(Context)); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, Type::getInt32Ty(Context)); + Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]); + BranchInst *CmpBr = BranchInst::Create(EndBlock); + Builder.Insert(CmpBr); +} + +// This function creates the IR instructions for loading and comparing using the +// given LoadSize. It loads the number of bytes specified by LoadSize from each +// source of the memcmp parameters. It then does an xor to see if there was a +// difference in any of the loaded bits. If a difference is found, it branches +// with an early exit to the ResultBlock for calculating which source was larger +// at the differing bit position. Otherwise, it falls through to either the +// next LoadCmpBlock or the EndBlock if this is the last LoadCmpBlock.
Loading +// 1 byte is handled with a special case through emitLoadCompareByteBlock. +// The special handling can simply subtract the loaded values and does not +// require branching to ResultBlock for finding which byte is larger at the +// differing bit position as there is only 1 byte. +void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize, + int GEPIndex) { + if (LoadSize == 1) { + MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex); + return; + } + + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + Value *LoadSrc1, *LoadSrc2, *Diff; + + Type *LoadSizeType = IntegerType::get(Context, LoadSize * 8); + Type *MaxLoadType = IntegerType::get(Context, MaxLoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Load LoadSizeType from the base address + LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + } + + // Add the loaded values to the phi nodes for calculating memcmp result only + // if result is not used in a zero equality.
+ if (!CI->isOnlyUsedInZeroEqualityComparison()) { + ResBlock.getPhiSrc1()->addIncoming(LoadSrc1, LoadCmpBlocks[Index]); + ResBlock.getPhiSrc2()->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); + } + + Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); + Diff = Builder.CreateSExtOrTrunc(Diff, MaxLoadType); + + if (!CI->isOnlyUsedInZeroEqualityComparison()) { + ResBlock.getPhiDiff()->addIncoming(Diff, LoadCmpBlocks[Index]); + } + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch if difference found to ResultBlock, otherwise continue to + // next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.getBB(), NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes) + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0); + addPhiResEdge(Zero, LoadCmpBlocks[Index]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It uses the PhiDiff node to find the first differing byte, +// shifts this byte from PhiSrc1 and PhiSrc2 into the lowest byte position in +// the register, masks all other bytes, and does a subtraction. +void MemCmpExpansion::emitMemCmpResultBlock(bool IsLittleEndian) { + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. 
+ if (CI->isOnlyUsedInZeroEqualityComparison()) { + BasicBlock::iterator InsertPt = ResBlock.getBB()->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.getBB(), InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(Context), 1); + addPhiResEdge(Res, ResBlock.getBB()); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + + Type *LoadSizeType = IntegerType::get(Context, MaxLoadSize * 8); + + BasicBlock::iterator InsertPt = ResBlock.getBB()->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.getBB(), InsertPt); + Function *F = ResBlock.getBB()->getParent(); + + // Find the first differing bit + Function *CountZeros = Intrinsic::getDeclaration( + F->getParent(), IsLittleEndian ? Intrinsic::cttz : Intrinsic::ctlz, + LoadSizeType); + Value *CntZerosMasked = Builder.CreateCall( + CountZeros, {ResBlock.getPhiDiff(), Builder.getFalse()}); + + if (!IsLittleEndian) + CntZerosMasked = Builder.CreateSub( + ConstantInt::get(LoadSizeType, ((MaxLoadSize * 8) - 1)), + CntZerosMasked); + + // Find which byte this bit belongs in + CntZerosMasked = + Builder.CreateAnd(CntZerosMasked, ConstantInt::get(LoadSizeType, ~7)); + + // Shift this first differing byte to the least significant position of the + // register + Value *Shift1 = + Builder.CreateAShr(ResBlock.getPhiSrc1(), CntZerosMasked, "src1_sh"); + Value *Shift2 = + Builder.CreateAShr(ResBlock.getPhiSrc2(), CntZerosMasked, "src2_sh"); + + // Mask out all other bits of the register + Value *And1 = Builder.CreateAnd(Shift1, ConstantInt::get(LoadSizeType, 0xFF)); + Value *And2 = Builder.CreateAnd(Shift2, ConstantInt::get(LoadSizeType, 0xFF)); + Value *Subtract = Builder.CreateSub(And1, And2); + Value *Res = Builder.CreateSExtOrTrunc(Subtract, Builder.getInt32Ty()); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + + addPhiResEdge(Res, ResBlock.getBB()); +} + +int MemCmpExpansion::calculateNumBlocks(unsigned Size) { + int NumBlocks = 0; + bool 
haveOneByteLoad = false; + unsigned RemainingSize = Size; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + if (LoadSize == 1) + haveOneByteLoad = true; + NumBlocks += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + NumBlocksNonOneByte = haveOneByteLoad ? (NumBlocks - 1) : NumBlocks; + return NumBlocks; +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + Type *MaxLoadType = IntegerType::get(Context, MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.getBB()); + PHINode *PhiDiff = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.diff"); + PHINode *PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1"); + PHINode *PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2"); + ResBlock.setPHINodes(PhiSrc1, PhiSrc2, PhiDiff); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(Context), 2, "phi.res"); +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion(unsigned Size, bool IsLittleEndian) { + int LoadSize = MaxLoadSize; + int NumBytesToBeProcessed = Size; + + unsigned Index = 0; + // This loop calls emitLoadCompareBlock for comparing SizeVal bytes of the two + // memcmp source. It starts with loading using the maximum load size set by + // the target. It processes any remaining bytes using a load size which is the + // next smallest power of 2. 
+ while (NumBytesToBeProcessed) { + // Calculate how many blocks we can create with the current load size + int NumBlocks = NumBytesToBeProcessed / LoadSize; + int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; + NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize; + + // For each NumBlocks, populate the instruction sequence for loading and + // comparing LoadSize bytes + while (NumBlocks--) { + emitLoadCompareBlock(Index, LoadSize, GEPIndex); + Index++; + GEPIndex++; + } + // Get the next LoadSize to use + LoadSize = LoadSize / 2; + } + + emitMemCmpResultBlock(IsLittleEndian); + return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. +// It checks for constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave as a library call. +// Otherwise, the library call is replaced with new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 13) +/// To: +/// loadbb: +/// %0 = bitcast i32* %buffer2 to i8* +/// %1 = bitcast i32* %buffer1 to i8* +/// %2 = bitcast i8* %1 to i64* +/// %3 = bitcast i8* %0 to i64* +/// %4 = load i64, i64* %2 +/// %5 = load i64, i64* %3 +/// %6 = xor i64 %4, %5 +/// %7 = icmp ne i64 %6, 0 +/// br i1 %7, label %res_block, label %loadbb4 +/// res_block: ; preds = %loadbb4, +/// %loadbb +/// %res.phi1 = phi i64 [ %6, %loadbb ], [ %24, %loadbb4 ] +/// %res.phi2 = phi i64 [ %4, %loadbb ], [ %22, %loadbb4 ] +/// %res.phi3 = phi i64 [ %5, %loadbb ], [ %23, %loadbb4 ] +/// %8 = call i64 @llvm.cttz.i64(i64 %res.phi1, i1 false) +/// %9 = and i64 %8, -8 +/// %src1_sh = ashr i64 %res.phi2, %9 +/// %src2_sh = ashr i64 %res.phi3, %9 +/// %10 = and i64 %src1_sh, 255 +/// %11 = and i64 %src2_sh, 255 +/// %12 = sub i64 %10, %11 +/// %13 = trunc i64 %12 to i32 +/// br label %endblock +/// loadbb4: ; preds = %loadbb +/// %14 = bitcast i32* %buffer2 to i8* +/// %15 = bitcast i32* %buffer1 to i8* +/// 
%16 = bitcast i8* %15 to i32* +/// %17 = bitcast i8* %14 to i32* +/// %18 = getelementptr i32, i32* %16, i32 2 +/// %19 = getelementptr i32, i32* %17, i32 2 +/// %20 = load i32, i32* %18 +/// %21 = load i32, i32* %19 +/// %22 = zext i32 %20 to i64 +/// %23 = zext i32 %21 to i64 +/// %24 = xor i64 %22, %23 +/// %25 = icmp ne i64 %24, 0 +/// br i1 %25, label %res_block, label %loadbb5 +/// loadbb5: ; preds = %loadbb4 +/// %26 = bitcast i32* %buffer2 to i8* +/// %27 = bitcast i32* %buffer1 to i8* +/// %28 = getelementptr i8, i8* %27, i8 12 +/// %29 = getelementptr i8, i8* %26, i8 12 +/// %30 = load i8, i8* %28 +/// %31 = load i8, i8* %29 +/// %32 = zext i8 %30 to i32 +/// %33 = zext i8 %31 to i32 +/// %34 = sub i32 %32, %33 +/// br label %endblock +/// endblock: ; preds = %res_block, +/// %loadbb5 +/// %res.phi = phi i32 [ %34, %loadbb5 ], [ %13, %res_block ] +/// ret i32 %res.phi +static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, + const TargetLowering *TLI, const DataLayout *DL) { + NumMemCmpCalls++; + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + + // TTI call to check if target would like to expand memcmp and get the + // MaxLoadSize + unsigned MaxLoadSize; + if (!TTI->expandMemCmp(CI, MaxLoadSize)) + return false; + + // Early exit from expansion if -Oz + if (CI->getCalledFunction()->optForMinSize()) { + return false; + } + + // Early exit from expansion if size is not a constant + ConstantInt *SizeCast = dyn_cast(CI->getArgOperand(2)); + if (!SizeCast) { + NumMemCmpNotConstant++; + return false; + } + + // Early exit from expansion if size greater than max bytes to load + uint64_t SizeVal = SizeCast->getZExtValue(); + if ((SizeVal > + TLI->getMaxExpandSizeMemcmp(CI->getCalledFunction()->optForSize()))) { + NumMemCmpGreaterThanMax++; + return false; + } + + NumMemCmpInlined++; + + // MemCmpHelper object, creates and sets up basic blocks required for + // expanding memcmp with size SizeVal + MemCmpExpansion 
MemCmpHelper(CI, MaxLoadSize, SizeVal); + + Value *Res = MemCmpHelper.getMemCmpExpansion(SizeVal, DL->isLittleEndian()); + + // Replace call with result of expansion and erase call. + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + + return true; +} + bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -2127,6 +2634,15 @@ CI->eraseFromParent(); return true; } + + LibFunc Func; + if (TLInfo->getLibFunc(CI->getCalledFunction()->getName(), Func) && + Func == LibFunc_memcmp) { + if (expandMemCmp(CI, TTI, TLI, DL)) { + ModifiedDT = true; + return true; + } + } return false; } @@ -5074,6 +5590,7 @@ return true; } + namespace { /// \brief Helper class to promote a scalar operation to a vector one. /// This class is used to move downward extractelement transition. Index: lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- lib/CodeGen/TargetLoweringBase.cpp +++ lib/CodeGen/TargetLoweringBase.cpp @@ -829,9 +829,10 @@ initActions(); // Perform these initializations only once. - MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8; - MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize - = MaxStoresPerMemmoveOptSize = 4; + MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = + MaxExpandSizeMemcmp = 8; + MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize = + MaxStoresPerMemmoveOptSize = MaxExpandSizeMemcmpOptSize = 4; UseUnderscoreSetJmp = false; UseUnderscoreLongJmp = false; HasMultipleConditionRegisters = false; Index: lib/IR/Instruction.cpp =================================================================== --- lib/IR/Instruction.cpp +++ lib/IR/Instruction.cpp @@ -412,6 +412,20 @@ return haveSameSpecialState(this, I); } +/// Return true if it only matters that the value is equal or not-equal to zero.
+bool Instruction::isOnlyUsedInZeroEqualityComparison() const { + for (const User *U : users()) { + if (const ICmpInst *IC = dyn_cast(U)) + if (IC->isEquality()) + if (Constant *C = dyn_cast(IC->getOperand(1))) + if (C->isNullValue()) + continue; + // Unknown instruction. + return false; + } + return true; +} + // Keep this in sync with FunctionComparator::cmpOperations in // lib/Transforms/IPO/MergeFunctions.cpp. bool Instruction::isSameOperationAs(const Instruction *I, Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -1020,6 +1020,10 @@ MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; + MaxExpandSizeMemcmp = 128; + } else { + MaxExpandSizeMemcmp = 64; + MaxExpandSizeMemcmpOptSize = 8; } } Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -60,6 +60,7 @@ /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -215,6 +215,11 @@ return LoopHasReductions; } +bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + MaxLoadSize = 8; + return true; +} + bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } Index: lib/Transforms/Utils/SimplifyLibCalls.cpp =================================================================== --- lib/Transforms/Utils/SimplifyLibCalls.cpp +++ 
lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -88,20 +88,6 @@ return false; } -/// Return true if it only matters that the value is equal or not-equal to zero. -static bool isOnlyUsedInZeroEqualityComparison(Value *V) { - for (User *U : V->users()) { - if (ICmpInst *IC = dyn_cast(U)) - if (IC->isEquality()) - if (Constant *C = dyn_cast(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - /// Return true if it is only used in equality comparisons with With. static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { for (User *U : V->users()) { @@ -504,7 +490,7 @@ // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 - if (isOnlyUsedInZeroEqualityComparison(CI)) + if (CI->isOnlyUsedInZeroEqualityComparison()) return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); return nullptr; @@ -675,7 +661,7 @@ // // memchr("\r\n", C, 2) != nullptr -> (C & ((1 << '\r') | (1 << '\n'))) != 0 // after bounds check. 
- if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) { + if (!CharC && !Str.empty() && CI->isOnlyUsedInZeroEqualityComparison()) { unsigned char Max = *std::max_element(reinterpret_cast(Str.begin()), reinterpret_cast(Str.end())); @@ -750,7 +736,7 @@ } // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0 - if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) { + if (DL.isLegalInteger(Len * 8) && CI->isOnlyUsedInZeroEqualityComparison()) { IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8); unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType); Index: test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -0,0 +1,117 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4 +@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4 +@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4 +@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4 +@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4 +@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 
12, i32 13, i32 13], align 4 + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +; Validate with if(memcmp()) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16) + %not.tobool = icmp ne i32 %call, 0 + %. = zext i1 %not.tobool to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest01 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with if(memcmp() == 0) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp ne i32 %call, 0 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest02 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with > 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp slt i32 %call, 1 + %. = zext i1 %not.cmp to i32 + ret i32 %. 
+ + ; CHECK-LABEL: @zeroEqualityTest03 + ; CHECK-LABEL: %res_block + ; CHECK: popcntd + ; CHECK-NOT: li 3, 1 +} + +; Validate with < 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16) + %call.lobit = lshr i32 %call, 31 + %call.lobit.not = xor i32 %call.lobit, 1 + ret i32 %call.lobit.not + + ; CHECK-LABEL: @zeroEqualityTest04 + ; CHECK-LABEL: %res_block + ; CHECK: popcntd + ; CHECK-NOT: li 3, 1 +} + +; Validate with memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.tobool = icmp eq i32 %call, 0 + %cond = zext i1 %not.tobool to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest05 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK: li 3, 0 +} + +; Validate with !memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.lnot = icmp ne i32 %call, 0 + %cond = zext i1 %not.lnot to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest06 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} Index: test/CodeGen/PowerPC/memcmp.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/memcmp.ll @@ -0,0 +1,104 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s 
| FileCheck %s -check-prefix=CHECK + +; Check size 8 +; Function Attrs: nounwind readonly +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2 + ret i32 %call + +; CHECK-LABEL: @test1 +; CHECK: ld [[LOAD1:[0-9]+]] +; CHECK-NEXT: ld [[LOAD2:[0-9]+]] +; CHECK-NEXT: xor. [[XOR:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: beq +; CHECK: popcntd [[POPCNTD:[0-9]+]] +; CHECK-NEXT: andi. [[ANDI:[0-9]+]], [[POPCNTD]], 120 +; CHECK-NEXT: srad [[SRAD1:[0-9]+]], [[LOAD1]], [[ANDI]] +; CHECK-NEXT: srad [[SRAD2:[0-9]+]], [[LOAD2]], [[ANDI]] +; CHECK-NEXT: clrldi [[CLR1:[0-9]+]], [[SRAD1]], 56 +; CHECK-NEXT: clrldi [[CLR2:[0-9]+]], [[SRAD2]], 56 +; CHECK-NEXT: sub [[SUB:[0-9]+]], [[CLR1]], [[CLR2]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +; CHECK: li [[LI:[0-9]+]], 0 +; CHECK-NEXT: extsw 3, [[LI]] +} + +; Check size 4 +; Function Attrs: nounwind readonly +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2 + ret i32 %call + +; CHECK-LABEL: @test2 +; CHECK: lwz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lwz [[LOAD2:[0-9]+]] +; CHECK-NEXT: xor [[XOR:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: cmplwi [[CMPLWI:[0-9]+]], 0 +; CHECK-NEXT: beq +; CHECK: popcntd [[POPCNTD:[0-9]+]] +; CHECK-NEXT: andi. 
[[ANDI:[0-9]+]], [[POPCNTD]], 120 +; CHECK-NEXT: srd [[SRD1:[0-9]+]], [[LOAD1]], [[ANDI]] +; CHECK-NEXT: srd [[SRAD2:[0-9]+]], [[LOAD2]], [[ANDI]] +; CHECK-NEXT: clrldi [[CLR1:[0-9]+]], [[SRD1]], 56 +; CHECK-NEXT: clrldi [[CLR2:[0-9]+]], [[SRAD2]], 56 +; CHECK-NEXT: sub [[SUB:[0-9]+]], [[CLR1]], [[CLR2]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +; CHECK: li [[LI:[0-9]+]], 0 +; CHECK-NEXT: extsw 3, [[LI]] +} + +; Check size 2 +; Function Attrs: nounwind readonly +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2 + ret i32 %call + +; CHECK-LABEL: @test3 +; CHECK: lhz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lhz [[LOAD2:[0-9]+]] +; CHECK-NEXT: xor [[XOR:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: rlwinm. [[RLWINM:[0-9]+]], [[XOR]], 0, 16, 31 +; CHECK-NEXT: beq +; CHECK: popcntd [[POPCNTD:[0-9]+]] +; CHECK-NEXT: andi. 
[[ANDI:[0-9]+]], [[POPCNTD]], 120 +; CHECK-NEXT: srd [[SRD1:[0-9]+]], [[LOAD1]], [[ANDI]] +; CHECK-NEXT: srd [[SRAD2:[0-9]+]], [[LOAD2]], [[ANDI]] +; CHECK-NEXT: clrldi [[CLR1:[0-9]+]], [[SRD1]], 56 +; CHECK-NEXT: clrldi [[CLR2:[0-9]+]], [[SRAD2]], 56 +; CHECK-NEXT: sub [[SUB:[0-9]+]], [[CLR1]], [[CLR2]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +; CHECK: li [[LI:[0-9]+]], 0 +; CHECK-NEXT: extsw 3, [[LI]] +} + +; Check size 1 +; Function Attrs: nounwind readonly +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2 + ret i32 %call + +; CHECK-LABEL: @test4 +; CHECK: lbz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lbz [[LOAD2:[0-9]+]] +; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +} + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8*, i8*, i64) #1 Index: test/CodeGen/PowerPC/memcmpIR.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/memcmpIR.ll @@ -0,0 +1,213 @@ +; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s +; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE + +; Check multiples of 8 +; Function Attrs: nounwind readonly +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16) #2 + ret i32 %call + + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne 
i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[CTTZ:%[0-9]+]] = call i64 @llvm.cttz.i64 + ; CHECK: [[AND:%[0-9]+]] = and i64 [[CTTZ]], -8 + ; CHECK: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[CTLZ:%[0-9]+]] = call i64 @llvm.ctlz.i64 + ; CHECK-BE: [[SUB:%[0-9]+]] = sub i64 63, [[CTLZ]] + ; CHECK-BE: [[AND:%[0-9]+]] = and i64 [[SUB]], -8 + ; CHECK-BE: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-BE-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] 
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label +} + +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2 + ret i32 %call + + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[CTTZ:%[0-9]+]] = call i64 @llvm.cttz.i64 + ; CHECK: [[AND:%[0-9]+]] = and i64 [[CTTZ]], -8 + ; CHECK: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[CTLZ:%[0-9]+]] = call i64 @llvm.ctlz.i64 + ; CHECK-BE: [[SUB:%[0-9]+]] = sub i64 63, [[CTLZ]] + ; CHECK-BE: 
[[AND:%[0-9]+]] = and i64 [[SUB]], -8 + ; CHECK-BE: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-BE-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] +} + +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15) #2 + ret i32 %call + + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[CTTZ:%[0-9]+]] = call i64 @llvm.cttz.i64 + ; CHECK: [[AND:%[0-9]+]] = and i64 [[CTTZ]], -8 + ; CHECK: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64 + ; CHECK-NEXT: 
[[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64 + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[CTLZ:%[0-9]+]] = call i64 @llvm.ctlz.i64 + ; CHECK-BE: [[SUB:%[0-9]+]] = sub i64 63, [[CTLZ]] + ; CHECK-BE: [[AND:%[0-9]+]] = and i64 [[SUB]], -8 + ; CHECK-BE: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-BE-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] 
to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: br label %endblock +} + +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65) #2 + ret i32 %call + + ; CHECK: %call = tail call signext i32 @memcmp + ; CHECK-BE: %call = tail call signext i32 @memcmp +} + +define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %conv = sext i32 %SIZE to i64 + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv) #2 + ret i32 %call + + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +} + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8*, i8*, i64) #1