Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -419,6 +419,10 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
+  /// \brief Enable inline expansion of memcmp.
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize, bool &ByteSwapLoads,
+                    bool &AllowUnalignedLoads) const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -753,6 +757,8 @@
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                     unsigned VF) = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
+  virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize,
+                            bool &ByteSwapLoads, bool &AllowUnalignedLoads) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -954,6 +960,11 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize, bool &ByteSwapLoads,
+                    bool &AllowUnalignedLoads) override {
+    return Impl.expandMemCmp(I, MaxLoadSize, ByteSwapLoads,
+                             AllowUnalignedLoads);
+  }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -260,6 +260,11 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize, bool &ByteSwapLoads,
+                    bool &AllowUnalignedLoads) {
+    return false;
+  }
+
   bool enableInterleavedAccessVectorization() { return false; }
 
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1006,6 +1006,16 @@
   unsigned getMaxStoresPerMemcpy(bool OptSize) const {
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
+
+  /// \brief Get the maximum size in bytes to load for memcmp.
+  ///
+  /// This function returns the maximum size in bytes to load when
+  /// expanding memcmp. The value is set by the target at the
+  /// performance threshold for such a replacement. If OptSize is true,
+  /// return the limit for functions that have the OptSize attribute.
+  unsigned getMaxLoadSizeMemcmp(bool OptSize) const {
+    return OptSize ? MaxLoadSizeMemcmpOptSize : MaxLoadSizeMemcmp;
+  }
 
   /// \brief Get maximum # of store operations permitted for llvm.memmove
   ///
@@ -2177,6 +2187,8 @@
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
+  unsigned MaxLoadSizeMemcmp;
+  unsigned MaxLoadSizeMemcmpOptSize;
 
   /// \brief Specify maximum bytes of store instructions per memmove call.
   ///
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -197,6 +197,13 @@
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
+bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize,
+                                       bool &ByteSwapLoads,
+                                       bool &AllowUnalignedLoads) const {
+  return TTIImpl->expandMemCmp(I, MaxLoadSize, ByteSwapLoads,
+                               AllowUnalignedLoads);
+}
+
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -80,6 +80,11 @@
 STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax, "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -1869,6 +1874,337 @@
   ModifiedDT = true;
   return true;
 }
+
+// Populates the EndBlock with a sequence to calculate the memcmp result.
+Value *getResult(LLVMContext &C, PHINode *PhiXor, PHINode *PhiSrc1,
+                 PHINode *PhiSrc2, Type *LoadType, BasicBlock *EndBlock) {
+  IRBuilder<> Builder(C);
+  BasicBlock::iterator InsertPt = EndBlock->getFirstInsertionPt();
+  Builder.SetInsertPoint(EndBlock, InsertPt);
+  Function *F = EndBlock->getParent();
+  Function *CTTZ =
+      Intrinsic::getDeclaration(F->getParent(), Intrinsic::cttz, LoadType);
+  Value *CntZerosMasked =
+      Builder.CreateCall(CTTZ, {PhiXor, Builder.getFalse()});
+
+  // Round the trailing-zero count down to a multiple of 8 so the shifts
+  // below isolate the first differing byte.
+  uint64_t Mask =
+      (LoadType == Type::getInt64Ty(C)) ? UINT64_MAX << 3 : UINT_MAX << 3;
+  CntZerosMasked = Builder.CreateAnd(
+      CntZerosMasked,
+      ConstantInt::get((LoadType == Type::getInt64Ty(C)) ? LoadType
+                                                         : Type::getInt32Ty(C),
+                       Mask));
+  Value *Shift1 = Builder.CreateAShr(PhiSrc1, CntZerosMasked, "src1_sh");
+  Value *Shift2 = Builder.CreateAShr(PhiSrc2, CntZerosMasked, "src2_sh");
+  Value *And1 = Builder.CreateAnd(Shift1, ConstantInt::get(LoadType, 0xFF));
+  Value *And2 = Builder.CreateAnd(Shift2, ConstantInt::get(LoadType, 0xFF));
+  Value *Subtract = Builder.CreateSub(And1, And2);
+  Value *Res = Builder.CreateSExtOrTrunc(Subtract, Builder.getInt32Ty());
+  return Res;
+}
+
+// Populates the load compare block for the given LoadType.
+// If LoadType is i8, we can just subtract and return.
+// If LoadType is greater than i8, we need to populate the EndBlock
+// with a sequence that calculates the memcmp result.
+void EmitLoadCompareBlock(LLVMContext &C, BasicBlock *LoadBlockCurr,
+                          BasicBlock *LoadBlockNext, Value *Source1,
+                          Value *Source2, Type *LoadType, Type *LoadPtrTy,
+                          Type *MaxLoadType, BasicBlock *EndBlock,
+                          PHINode *PhiXor, PHINode *PhiSrc1, PHINode *PhiSrc2,
+                          unsigned GEPIndex) {
+  IRBuilder<> Builder(C);
+  Value *Source1Cast, *Source2Cast, *LoadSrc1, *LoadSrc2, *Diff;
+
+  Builder.SetInsertPoint(LoadBlockCurr);
+
+  Source1Cast = Builder.CreateBitCast(Source1, LoadPtrTy);
+  Source2Cast = Builder.CreateBitCast(Source2, LoadPtrTy);
+
+  if (GEPIndex != 0) {
+    Source1Cast = Builder.CreateGEP(LoadType, Source1Cast,
+                                    ConstantInt::get(LoadType, GEPIndex));
+    Source2Cast = Builder.CreateGEP(LoadType, Source2Cast,
+                                    ConstantInt::get(LoadType, GEPIndex));
+  }
+
+  LoadSrc1 = Builder.CreateLoad(LoadType, Source1Cast);
+  LoadSrc2 = Builder.CreateLoad(LoadType, Source2Cast);
+
+  if (LoadType == Type::getInt8Ty(C)) {
+    // Extend i8 to i32 for the memcmp result.
+    LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, Type::getInt32Ty(C));
+    LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, Type::getInt32Ty(C));
+  } else if (LoadType != MaxLoadType) {
+    // Extend all types other than i8 to MaxLoadType.
+    LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+  }
+
+  // If LoadType is i8, we can just subtract and return.
+  if (LoadType == Type::getInt8Ty(C)) {
+    Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+  } else {
+    PhiSrc1->addIncoming(LoadSrc1, LoadBlockCurr);
+    PhiSrc2->addIncoming(LoadSrc2, LoadBlockCurr);
+    Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+    Diff = Builder.CreateSExtOrTrunc(Diff, MaxLoadType);
+  }
+
+  PhiXor->addIncoming(Diff, LoadBlockCurr);
+
+  if (LoadBlockNext == nullptr) {
+    // This is the last block; fall through to the end block unconditionally.
+    BranchInst *NewBr = BranchInst::Create(EndBlock);
+    Builder.Insert(NewBr);
+  } else {
+    // Early exit to the result block if this chunk already differs.
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    Builder.CreateCondBr(Cmp, EndBlock, LoadBlockNext);
+  }
+}
+
+Type *getPtrTypeFromSize(LLVMContext &C, unsigned Size) {
+  switch (Size) {
+  case 8:
+    return Type::getInt64PtrTy(C);
+  case 4:
+    return Type::getInt32PtrTy(C);
+  case 2:
+    return Type::getInt16PtrTy(C);
+  case 1:
+    return Type::getInt8PtrTy(C);
+  default:
+    llvm_unreachable("Unexpected load size");
+  }
+}
+
+Type *getTypeFromSize(LLVMContext &C, unsigned Size) {
+  switch (Size) {
+  case 8:
+    return Type::getInt64Ty(C);
+  case 4:
+    return Type::getInt32Ty(C);
+  case 2:
+    return Type::getInt16Ty(C);
+  case 1:
+    return Type::getInt8Ty(C);
+  default:
+    llvm_unreachable("Unexpected load size");
+  }
+}
+
+// This function checks whether an expansion of memcmp can be generated.
+// It requires a constant compare size that is no larger than the max inline
+// size. If an expansion cannot occur, it returns false to leave memcmp as a
+// library call. Otherwise, the library call is replaced with a new IR
+// instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 13)
+///
+/// loadb: Loads 8 bytes and branches to the result block on early exit
+/// %0 = bitcast i32* %buffer2 to i8*
+/// %1 = bitcast i32* %buffer1 to i8*
+/// %2 = bitcast i8* %1 to i64*
+/// %3 = bitcast i8* %0 to i64*
+/// %4 = load i64, i64* %2
+/// %5 = load i64, i64* %3
+/// %6 = xor i64 %4, %5
+/// %7 = icmp ne i64 %6, 0
+/// br i1 %7, label %endblockx, label %loadb4
+/// loadb4: Loads the next 4 bytes            ; preds = %loadb
+/// %14 = bitcast i32* %buffer2 to i8*
+/// %15 = bitcast i32* %buffer1 to i8*
+/// %16 = bitcast i8* %15 to i32*
+/// %17 = bitcast i8* %14 to i32*
+/// %18 = getelementptr i32, i32* %16, i32 2
+/// %19 = getelementptr i32, i32* %17, i32 2
+/// %20 = load i32, i32* %18
+/// %21 = load i32, i32* %19
+/// %22 = zext i32 %20 to i64
+/// %23 = zext i32 %21 to i64
+/// %24 = xor i64 %22, %23
+/// %25 = icmp ne i64 %24, 0
+/// br i1 %25, label %endblockx, label %loadb5
+/// loadb5: Loads the last remaining byte     ; preds = %loadb4
+/// %26 = bitcast i32* %buffer2 to i8*
+/// %27 = bitcast i32* %buffer1 to i8*
+/// %28 = getelementptr i8, i8* %27, i8 12
+/// %29 = getelementptr i8, i8* %26, i8 12
+/// %30 = load i8, i8* %28
+/// %31 = load i8, i8* %29
+/// %32 = zext i8 %30 to i32
+/// %33 = zext i8 %31 to i32
+/// %34 = sub i32 %32, %33
+/// br label %endblock
+/// endblockx: Calculates the result          ; preds = %loadb4, %loadb
+/// %res.phi = phi i64 [ %6, %loadb ], [ %24, %loadb4 ]
+/// %res.phi1 = phi i64 [ %4, %loadb ], [ %22, %loadb4 ]
+/// %res.phi2 = phi i64 [ %5, %loadb ], [ %23, %loadb4 ]
+/// %8 = call i64 @llvm.cttz.i64(i64 %res.phi, i1 false)
+/// %9 = and i64 %8, -8
+/// %src1_sh = ashr i64 %res.phi1, %9
+/// %src2_sh = ashr i64 %res.phi2, %9
+/// %10 = and i64 %src1_sh, 255
+/// %11 = and i64 %src2_sh, 255
+/// %12 = sub i64 %10, %11
+/// %13 = trunc i64 %12 to i32
+/// br label %endblock
+/// endblock: Returns the result for whichever block we came from
+///                                           ; preds = %endblockx, %loadb5
+/// %res.phi3 = phi i32 [ %34, %loadb5 ], [ %13, %endblockx ]
+/// ret i32 %res.phi3
+static bool memcmpExpansion(CallInst *CI, const TargetLowering *TLI,
+                            const DataLayout *DL, unsigned MaxLoadSize,
+                            bool ByteSwapLoads, bool AllowUnalignedLoads) {
+  NumMemCmpCalls++;
+  LLVMContext &C = CI->getContext();
+  IRBuilder<> Builder(C);
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+  Value *Size = CI->getArgOperand(2);
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(Size);
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+
+  uint64_t SizeVal = SizeCast->getZExtValue();
+  if (SizeVal > TLI->getMaxLoadSizeMemcmp(false)) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  BasicBlock *StartBlock = CI->getParent();
+  // Block to return from.
+  BasicBlock *EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+
+  // Block that calculates the memcmp result.
+  BasicBlock *EndBlockXor =
+      BasicBlock::Create(C, "endblockx", StartBlock->getParent(), EndBlock);
+  Builder.SetInsertPoint(EndBlockXor);
+
+  Type *LoadPtrTy;
+  Type *LoadType;
+  Type *MaxLoadType = getTypeFromSize(C, MaxLoadSize);
+
+  PHINode *PhiXor = Builder.CreatePHI(MaxLoadType, 0, "res.phi");
+  PHINode *PhiSrc1 = Builder.CreatePHI(MaxLoadType, 0, "res.phi");
+  PHINode *PhiSrc2 = Builder.CreatePHI(MaxLoadType, 0, "res.phi");
+
+  // EndBlock phi node to return the final result.
+  Builder.SetInsertPoint(&EndBlock->front());
+  PHINode *PhiRes = Builder.CreatePHI(Type::getInt32Ty(C), 2, "res.phi");
+
+  // An odd size means the expansion ends with a one-byte block, which
+  // computes its result directly and bypasses EndBlockXor.
+  bool HaveOneByteLoads = SizeVal % 2 != 0;
+
+  // Working with two basic blocks at a time.
+  BasicBlock *LoadBlockCurr, *LoadBlockNext;
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  StartBlock->getTerminator()->eraseFromParent();
+
+  Value *Res;
+
+  int CurrLoadSize = MaxLoadSize;
+  int BlockCount;
+  int Remainder = SizeVal;
+  int GEPIndex = 0;
+
+  // Create the first load compare block and make the entry block branch to
+  // it.
+  LoadBlockCurr =
+      BasicBlock::Create(C, "loadb", StartBlock->getParent(), EndBlock);
+  LoadBlockNext = nullptr;
+  BranchInst *NewBr = BranchInst::Create(LoadBlockCurr);
+  Builder.SetInsertPoint(StartBlock, StartBlock->end());
+  Builder.Insert(NewBr);
+
+  // Find the first load size that is no larger than the remainder.
+  while (CurrLoadSize > Remainder)
+    CurrLoadSize = CurrLoadSize >> 1;
+
+  // Continue to create load compare blocks until no remainder is left.
+  while (Remainder) {
+    // Calculate how many blocks we can create with the current load size.
+    BlockCount = Remainder / CurrLoadSize;
+    Remainder = Remainder % CurrLoadSize;
+    bool LastBlock = Remainder == 0 && BlockCount == 1;
+
+    // If this is not the last block, create the next block for the current
+    // block to branch to.
+    if (!LastBlock) {
+      LoadBlockNext =
+          BasicBlock::Create(C, "loadb", StartBlock->getParent(), EndBlock);
+    }
+
+    // Call EmitLoadCompareBlock for the number of blocks needed at
+    // CurrLoadSize.
+    for (int i = 0; i < BlockCount; i++) {
+      LoadType = getTypeFromSize(C, CurrLoadSize);
+      LoadPtrTy = getPtrTypeFromSize(C, CurrLoadSize);
+      if (i == BlockCount - 1 && Remainder == 0)
+        LastBlock = true;
+      EmitLoadCompareBlock(C, LoadBlockCurr, LoadBlockNext, Source1, Source2,
+                           LoadType, LoadPtrTy, MaxLoadType,
+                           CurrLoadSize > 1 ? EndBlockXor : EndBlock,
+                           CurrLoadSize > 1 ? PhiXor : PhiRes, PhiSrc1,
+                           PhiSrc2, GEPIndex);
+      LoadBlockCurr = LoadBlockNext;
+
+      // Two blocks have already been created; check whether a third is
+      // needed.
+      if ((i < BlockCount - 2) || (i == BlockCount - 2 && Remainder != 0)) {
+        LoadBlockNext =
+            BasicBlock::Create(C, "loadb", StartBlock->getParent(), EndBlock);
+      } else {
+        LoadBlockNext = nullptr;
+      }
+      GEPIndex++;
+    }
+
+    // Calculate the next load size to use.
+    if (Remainder > 0) {
+      while (CurrLoadSize > Remainder)
+        CurrLoadSize = CurrLoadSize >> 1;
+    }
+    // New index into the sources, in units of CurrLoadSize.
+    GEPIndex = (SizeVal - Remainder) / CurrLoadSize;
+  }
+
+  // Calculate the memcmp result in EndBlockXor and branch to the final
+  // EndBlock.
+  Res = getResult(C, PhiXor, PhiSrc1, PhiSrc2, MaxLoadType, EndBlockXor);
+  NewBr = BranchInst::Create(EndBlock);
+  Builder.SetInsertPoint(EndBlockXor, EndBlockXor->end());
+  Builder.Insert(NewBr);
+
+  if (HaveOneByteLoads) {
+    PhiRes->addIncoming(Res, EndBlockXor);
+    CI->replaceAllUsesWith(PhiRes);
+  } else {
+    CI->replaceAllUsesWith(Res);
+  }
+
+  CI->eraseFromParent();
+  return true;
+}
 
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
@@ -2054,6 +2390,21 @@
       CI->eraseFromParent();
       return true;
     }
+
+    Function *F = CI->getCalledFunction();
+    LibFunc Func;
+    bool Expanded = false;
+    if (F && TLInfo->getLibFunc(F->getName(), Func) && Func == LibFunc_memcmp) {
+      bool ByteSwapLoads, AllowUnalignedLoads;
+      unsigned MaxLoadSize;
+      if (TTI->expandMemCmp(CI, MaxLoadSize, ByteSwapLoads, AllowUnalignedLoads)) {
+        Expanded = memcmpExpansion(CI, TLI, DL, MaxLoadSize, ByteSwapLoads, AllowUnalignedLoads);
+        if (Expanded) {
+          ModifiedDT = true;
+          return true;
+        }
+      }
+    }
 
     return false;
   }
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -829,9 +829,10 @@
   initActions();
 
   // Perform these initializations only once.
-  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
-  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
-    = MaxStoresPerMemmoveOptSize = 4;
+  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
+      MaxLoadSizeMemcmp = 8;
+  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
+      MaxStoresPerMemmoveOptSize = MaxLoadSizeMemcmpOptSize = 4;
   UseUnderscoreSetJmp = false;
   UseUnderscoreLongJmp = false;
   HasMultipleConditionRegisters = false;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1020,6 +1020,10 @@
     MaxStoresPerMemset = 128;
     MaxStoresPerMemcpy = 128;
     MaxStoresPerMemmove = 128;
+    MaxLoadSizeMemcmp = 128;
+  } else {
+    MaxLoadSizeMemcmp = 64;
+    MaxLoadSizeMemcmpOptSize = 8;
   }
 }
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -60,6 +60,8 @@
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize, bool &ByteSwapLoads,
+                    bool &AllowUnalignedLoads);
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -215,6 +215,14 @@
   return LoopHasReductions;
 }
 
+bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize,
+                              bool &ByteSwapLoads, bool &AllowUnalignedLoads) {
+  MaxLoadSize = 8;
+  ByteSwapLoads = true;
+  AllowUnalignedLoads = true;
+  return true;
+}
+
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
   return true;
 }
Index: test/CodeGen/PowerPC/memcmp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/memcmp.ll
@@ -0,0 +1,122 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Check multiples of 8
+; Function Attrs: nounwind readonly
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test1
+; CHECK: ld
+; CHECK-NEXT: ld
+; CHECK-NEXT: xor.
+; CHECK-NEXT: bne
+; CHECK: popcntd
+; CHECK-NEXT: andi.
+; CHECK-NEXT: srad
+; CHECK-NEXT: srad
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: sub
+; CHECK-NEXT: blr
+}
+
+; Check less than 8
+; Function Attrs: nounwind readonly
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test2
+; CHECK: lwz
+; CHECK-NEXT: lwz
+; CHECK-NEXT: xor
+; CHECK: popcntd
+; CHECK-NEXT: andi.
+; CHECK-NEXT: srd
+; CHECK-NEXT: srd
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: sub
+; CHECK-NEXT: blr
+}
+
+; Check greater than 8 with remainder that uses each load type. Ex: 15 bytes = 8 + 4 + 2 + 1
+; Function Attrs: nounwind readonly
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test3
+
+; CHECK: ld
+; CHECK-NEXT: ld
+; CHECK-NEXT: xor.
+; CHECK-NEXT: bne
+
+; CHECK: lwz
+; CHECK-NEXT: lwz
+; CHECK-NEXT: xor
+
+; CHECK: lhz
+; CHECK-NEXT: lhz
+; CHECK-NEXT: xor
+
+; CHECK: popcntd
+; CHECK-NEXT: andi.
+; CHECK-NEXT: srad
+; CHECK-NEXT: srad
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: sub
+; CHECK-NEXT: extsw
+; CHECK-NEXT: blr
+
+; CHECK: lbz
+; CHECK-NEXT: lbz
+; CHECK-NEXT: subf
+; CHECK-NEXT: extsw
+; CHECK-NEXT: blr
+}
+
+; Check greater than max allowed size = 64
+; Function Attrs: nounwind readonly
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65) #2
+  ret i32 %call
+
+; CHECK: bl memcmp
+}
+
+; Check not a constant size
+; Function Attrs: nounwind readonly
+define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %conv = sext i32 %SIZE to i64
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv) #2
+  ret i32 %call
+
+; CHECK: bl memcmp
+}
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8*, i8*, i64) #1
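
Note on the block layout: memcmpExpansion decomposes the constant compare size greedily into power-of-two load sizes, largest first, which is exactly what test3 above exercises with 15 bytes = 8 + 4 + 2 + 1. Below is a minimal standalone C++ sketch of just that decomposition, assuming (as in the PPC hook) that MaxLoadSize is a power of two; the helper name loadSizes is hypothetical and not part of the patch.

#include <cstdio>
#include <vector>

// Greedy decomposition mirroring the loop in memcmpExpansion: emit as many
// loads of the largest size as fit, then fall to the next power of two,
// down to a single byte.
static std::vector<unsigned> loadSizes(unsigned SizeVal, unsigned MaxLoadSize) {
  std::vector<unsigned> Sizes;
  unsigned CurrLoadSize = MaxLoadSize;
  unsigned Remainder = SizeVal;
  while (Remainder) {
    // Find the first load size that is no larger than the remainder.
    while (CurrLoadSize > Remainder)
      CurrLoadSize >>= 1;
    for (unsigned i = 0; i != Remainder / CurrLoadSize; ++i)
      Sizes.push_back(CurrLoadSize);
    Remainder %= CurrLoadSize;
  }
  return Sizes;
}

int main() {
  for (unsigned S : loadSizes(15, 8)) // prints "8 4 2 1", as in test3
    std::printf("%u ", S);
  std::printf("\n");
  return 0;
}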
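Note on the result calculation: the IR that getResult emits (cttz of the xor, rounded down to a byte boundary, then shift, mask, and subtract) can be modeled in plain C++ as below. This is an illustrative sketch only, not part of the patch; modelGetResult is a hypothetical name, __builtin_ctzll stands in for llvm.cttz, and little-endian loads are assumed, matching the powerpc64le test triple.

#include <cassert>
#include <cstdint>
#include <cstring>

// Given the xor of two 8-byte little-endian loads plus the two loaded
// values, recover a memcmp-style result for the first differing byte.
static int modelGetResult(uint64_t Xor, uint64_t Src1, uint64_t Src2) {
  assert(Xor != 0 && "only reached when the chunks differ");
  // cttz finds the first differing bit; masking with -8 (UINT64_MAX << 3 in
  // the patch) rounds down to the bit offset of the first differing byte.
  unsigned CntZerosMasked = __builtin_ctzll(Xor) & ~7u;
  // Shift that byte to the bottom and compare it as an unsigned byte value.
  int Byte1 = (Src1 >> CntZerosMasked) & 0xFF;
  int Byte2 = (Src2 >> CntZerosMasked) & 0xFF;
  return Byte1 - Byte2;
}

int main() {
  const unsigned char A[8] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
  const unsigned char B[8] = {'a', 'b', 'c', 'x', 'e', 'f', 'g', 'h'};
  uint64_t S1, S2;
  std::memcpy(&S1, A, 8); // little-endian load, as on powerpc64le
  std::memcpy(&S2, B, 8);
  int Res = modelGetResult(S1 ^ S2, S1, S2);
  // memcmp compares unsigned chars; the first difference is 'd' vs 'x'.
  assert((Res < 0) == (std::memcmp(A, B, 8) < 0));
  return 0;
}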