Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -407,6 +407,9 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
+  /// \brief Enable inline expansion of memcmp
+  bool expandMemCmp() const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -732,6 +735,7 @@
   virtual bool shouldBuildLookupTables() = 0;
   virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
+  virtual bool expandMemCmp() = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -922,6 +926,9 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
+  bool expandMemCmp() override {
+    return Impl.expandMemCmp();
+  }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -252,6 +252,8 @@
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
+  bool expandMemCmp() { return false; }
+
   bool enableInterleavedAccessVectorization() { return false; }
 
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -992,6 +992,16 @@
   unsigned getMaxStoresPerMemcpy(bool OptSize) const {
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
+
+  /// \brief Get maximum # of load operations permitted for memcmp
+  ///
+  /// This function returns the maximum number of load operations permitted
+  /// to replace a call to memcmp. The value is set by the target at the
+  /// performance threshold for such a replacement. If OptSize is true,
+  /// return the limit for functions that have OptSize attribute.
+  unsigned getMaxLoadsPerMemcmp(bool OptSize) const {
+    return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
+  }
 
   /// \brief Get maximum # of store operations permitted for llvm.memmove
   ///
@@ -2164,6 +2174,8 @@
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
+  unsigned MaxLoadsPerMemcmp;
+  unsigned MaxLoadsPerMemcmpOptSize;
 
   /// \brief Specify maximum bytes of store instructions per memmove call.
 ///
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -186,6 +186,10 @@
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
+bool TargetTransformInfo::expandMemCmp() const {
+  return TTIImpl->expandMemCmp();
+}
+
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -80,6 +80,12 @@
 STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpNot8ByteMultiples, "Number of memcmp calls with size not a multiple of 8");
+STATISTIC(NumMemCmpGreaterThanMax, "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -1870,9 +1876,172 @@
   return true;
 }
 
+/// This function checks whether an inline expansion of memcmp can be
+/// generated. It requires a constant compare size that is a non-zero
+/// multiple of 8 bytes and no larger than the target's limit. If the
+/// expansion cannot be done, it returns false and the call is left as a
+/// library call. Otherwise, the library call is replaced with a new
+/// sequence of IR instructions.
+///
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 16)
+///
+/// into:
+/// loadb:                                        ; preds = %entry
+///   %2 = bitcast i8* %0 to i64*
+///   %3 = bitcast i8* %1 to i64*
+///   %4 = load i64, i64* %2
+///   %5 = load i64, i64* %3
+///   %6 = xor i64 %4, %5
+///   %7 = icmp ne i64 %6, 0
+///   br i1 %7, label %endblock, label %loadb1
+/// loadb1:                                       ; preds = %loadb
+///   %8 = getelementptr i64, i64* %2, i64 1
+///   %9 = getelementptr i64, i64* %3, i64 1
+///   %10 = load i64, i64* %8
+///   %11 = load i64, i64* %9
+///   %12 = xor i64 %10, %11
+///   br label %endblock
+/// endblock:                                     ; preds = %loadb1, %loadb
+///   %res.phi = phi i64 [ %6, %loadb ], [ %12, %loadb1 ]
+///   %res.phi2 = phi i64 [ %4, %loadb ], [ %10, %loadb1 ]
+///   %res.phi3 = phi i64 [ %5, %loadb ], [ %11, %loadb1 ]
+///   %13 = call i64 @llvm.cttz.i64(i64 %res.phi, i1 false)
+///   %byte_idx = and i64 %13, -8
+///   %src1_sh = ashr i64 %res.phi2, %byte_idx
+///   %src2_sh = ashr i64 %res.phi3, %byte_idx
+///   %14 = and i64 %src1_sh, 255
+///   %15 = and i64 %src2_sh, 255
+///   %16 = sub i64 %14, %15
+///   %17 = trunc i64 %16 to i32
+///   ret i32 %17
+static bool memcmpExpansion(CallInst *CI, const TargetLowering *TLI,
+                            const DataLayout *DL) {
+  // The expansion below only handles 64-bit little-endian targets; the
+  // result calculation relies on the first differing byte being the least
+  // significant differing byte of an 8 byte load.
+  if (!DL->isLittleEndian() || DL->getPointerSizeInBits() != 64) {
+    return false;
+  }
+
+  NumMemCmpCalls++;
+  LLVMContext &C = CI->getContext();
+  IRBuilder<> Builder(C);
+  Type *Int64Ty = Type::getInt64Ty(C);
+  Type *Int64PtrTy = Type::getInt64PtrTy(C);
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+  Value *Size = CI->getArgOperand(2);
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(Size);
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+
+  uint64_t SizeVal = SizeCast->getZExtValue();
+  if (SizeVal == 0 || SizeVal % 8 != 0) {
+    NumMemCmpNot8ByteMultiples++;
+    return false;
+  }
+
+  if (SizeVal > TLI->getMaxLoadsPerMemcmp(0)) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+  unsigned Count = SizeVal / 8;
+
+  std::vector<BasicBlock *> BBList;
+  std::vector<Value *> XorList;
+  std::vector<Value *> Src1List;
+  std::vector<Value *> Src2List;
+
+  BasicBlock *StartBlock = CI->getParent();
+  BasicBlock *EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+  BasicBlock *LoadBlockTemp;
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Create the number of load/compare basic blocks we need for this size.
+  unsigned i;
+  for (i = 0; i < Count; i++) {
+    LoadBlockTemp =
+        BasicBlock::Create(C, "loadb", StartBlock->getParent(), EndBlock);
+    BBList.push_back(LoadBlockTemp);
+  }
+
+  // Remove the previous terminator and add a branch to the first
+  // load/compare block.
+  StartBlock->getTerminator()->eraseFromParent();
+  BranchInst *NewBr = BranchInst::Create(BBList[0]);
+  Builder.SetInsertPoint(StartBlock, StartBlock->end());
+  Builder.Insert(NewBr);
+
+  Value *Source1Cast, *Source2Cast, *LoadSrc1, *LoadSrc2, *Xor;
+  // Generate load, xor, compare and branch instructions for each block.
+  for (i = 0; i < Count; i++) {
+    Builder.SetInsertPoint(BBList[i], BBList[i]->getFirstInsertionPt());
+    // Cast the source pointers to i64* for generating 8 byte loads.
+    if (i == 0) {
+      Source1Cast = Builder.CreateBitCast(Source1, Int64PtrTy);
+      Source2Cast = Builder.CreateBitCast(Source2, Int64PtrTy);
+    } else {
+      // Increment the base pointers for each source.
+      Source1Cast =
+          Builder.CreateGEP(Int64Ty, Source1Cast, ConstantInt::get(Int64Ty, 1));
+      Source2Cast =
+          Builder.CreateGEP(Int64Ty, Source2Cast, ConstantInt::get(Int64Ty, 1));
+    }
+    LoadSrc1 = Builder.CreateLoad(Int64Ty, Source1Cast);
+    LoadSrc2 = Builder.CreateLoad(Int64Ty, Source2Cast);
+    Src1List.push_back(LoadSrc1);
+    Src2List.push_back(LoadSrc2);
+    Xor = Builder.CreateXor(LoadSrc1, LoadSrc2);
+    XorList.push_back(Xor);
+    // All but the last block take an early exit to endblock when a
+    // difference is found; the last block falls through unconditionally.
+    if (i == Count - 1) {
+      BranchInst *NewBr = BranchInst::Create(EndBlock);
+      Builder.Insert(NewBr);
+    } else {
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Xor,
+                                      ConstantInt::get(Xor->getType(), 0));
+      Builder.CreateCondBr(Cmp, EndBlock, BBList[i + 1]);
+    }
+  }
+
+  // PHI nodes collect, from whichever block branched to endblock, the xor
+  // result and the two loaded words that produced it.
+  Builder.SetInsertPoint(&EndBlock->front());
+  PHINode *PhiXor = Builder.CreatePHI(Int64Ty, Count, "res.phi");
+  for (i = 0; i < Count; i++) {
+    PhiXor->addIncoming(XorList[i], BBList[i]);
+  }
+
+  PHINode *PhiSrc1 = Builder.CreatePHI(Int64Ty, Count, "res.phi");
+  for (i = 0; i < Count; i++) {
+    PhiSrc1->addIncoming(Src1List[i], BBList[i]);
+  }
+
+  PHINode *PhiSrc2 = Builder.CreatePHI(Int64Ty, Count, "res.phi");
+  for (i = 0; i < Count; i++) {
+    PhiSrc2->addIncoming(Src2List[i], BBList[i]);
+  }
+
+  // Add instructions to EndBlock for calculating the memcmp return value.
+  Function *F = EndBlock->getParent();
+  Function *CTTZ =
+      Intrinsic::getDeclaration(F->getParent(), Intrinsic::cttz, Int64Ty);
+  Value *CntZeros = Builder.CreateCall(CTTZ, {PhiXor, Builder.getFalse()});
+  // cttz gives the position of the lowest differing bit; round it down to a
+  // byte boundary so the shifts below isolate the first differing byte of
+  // each source rather than an unaligned 8-bit window.
+  Value *ByteIdx =
+      Builder.CreateAnd(CntZeros, ConstantInt::getSigned(Int64Ty, -8),
+                        "byte_idx");
+  Value *Shift1 = Builder.CreateAShr(PhiSrc1, ByteIdx, "src1_sh");
+  Value *Shift2 = Builder.CreateAShr(PhiSrc2, ByteIdx, "src2_sh");
+  Value *And1 = Builder.CreateAnd(Shift1, ConstantInt::get(Int64Ty, 0xFF));
+  Value *And2 = Builder.CreateAnd(Shift2, ConstantInt::get(Int64Ty, 0xFF));
+  Value *Subtract = Builder.CreateSub(And1, And2);
+  Value *Res = Builder.CreateSExtOrTrunc(Subtract, Builder.getInt32Ty());
+
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
-
   // Lower inline assembly if we can.
   // If we found an inline asm expession, and if the target knows how to
   // lower it to normal LLVM code, do so now.
@@ -2054,6 +2223,27 @@
     CI->eraseFromParent();
     return true;
   }
+
+  Function *F = CI->getCalledFunction();
+  LibFunc::Func Func;
+  bool Expanded = false;
+  if (F && !CI->isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() &&
+      TLInfo->getLibFunc(F->getName(), Func) &&
+      TLInfo->hasOptimizedCodeGen(Func)) {
+    switch (Func) {
+    default:
+      break;
+    case LibFunc::memcmp:
+      if (TTI->expandMemCmp()) {
+        Expanded = memcmpExpansion(CI, TLI, DL);
+        if (Expanded) {
+          ModifiedDT = true;
+          return true;
+        }
+      }
+      break;
+    }
+  }
 
   return false;
 }
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -829,9 +829,9 @@
   initActions();
 
   // Perform these initializations only once.
-  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
+  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = MaxLoadsPerMemcmp = 8;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
-    = MaxStoresPerMemmoveOptSize = 4;
+    = MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
   UseUnderscoreSetJmp = false;
   UseUnderscoreLongJmp = false;
   HasMultipleConditionRegisters = false;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -971,6 +971,10 @@
     MaxStoresPerMemset = 128;
     MaxStoresPerMemcpy = 128;
     MaxStoresPerMemmove = 128;
+    MaxLoadsPerMemcmp = 128;
+  } else {
+    MaxLoadsPerMemcmp = 64;
+    MaxLoadsPerMemcmpOptSize = 8;
   }
 }
 
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -60,6 +60,7 @@
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool expandMemCmp();
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -215,6 +215,10 @@
   return LoopHasReductions;
 }
 
+bool PPCTTIImpl::expandMemCmp() {
+  return true;
+}
+
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
   return true;
 }
Index: test/CodeGen/PowerPC/memcmp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/memcmp.ll
@@ -0,0 +1,27 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define signext i32 @foo(double* %x, double* %y) #0 {
+entry:
+  %0 = bitcast double* %x to i8*
+  %1 = bitcast double* %y to i8*
+  %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 16) #2
+  ret i32 %call
+
+; CHECK-LABEL: @foo
+; CHECK: ld
+; CHECK-NEXT: ld
+; CHECK-NEXT: xor.
+; CHECK-NEXT: bne
+; CHECK: popcntd
+; CHECK: srad
+; CHECK-NEXT: srad
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: sub
+}
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8*, i8*, i64) #1
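
Note (not part of the patch): the endblock arithmetic is easiest to check against a small host-side model. The sketch below mirrors what the emitted IR computes for one 8-byte chunk, assuming a 64-bit little-endian host and compare sizes that are multiples of 8; the names cmpWord and cmpModel are illustrative only, and the GCC/Clang builtin __builtin_ctzll stands in for @llvm.cttz.i64.

  #include <cstdint>
  #include <cstring>

  // Model of one load/compare block plus the endblock result calculation.
  static int cmpWord(uint64_t A, uint64_t B) {
    uint64_t Xor = A ^ B;                     // %6 = xor i64 %4, %5
    if (Xor == 0)
      return 0;
    unsigned BitIdx = __builtin_ctzll(Xor);   // @llvm.cttz.i64
    unsigned ByteIdx = BitIdx & ~7u;          // round down to a byte boundary
    int Byte1 = (int)((A >> ByteIdx) & 0xFF); // first differing byte of source 1
    int Byte2 = (int)((B >> ByteIdx) & 0xFF); // first differing byte of source 2
    return Byte1 - Byte2;                     // same sign as memcmp on this chunk
  }

  // Mirrors the chain of loadb blocks: compare 8 bytes at a time and take the
  // early exit to endblock on the first difference.
  static int cmpModel(const unsigned char *S1, const unsigned char *S2,
                      size_t Size) {
    for (size_t I = 0; I + 8 <= Size; I += 8) {
      uint64_t A, B;
      std::memcpy(&A, S1 + I, 8);
      std::memcpy(&B, S2 + I, 8);
      if (int Res = cmpWord(A, B))
        return Res;
    }
    return 0;
  }

Rounding the cttz result down to a byte boundary is what keeps the sign of the final subtraction in line with memcmp when the two words differ in more than one byte; on a little-endian target the lowest set bit of the xor always falls inside the lowest-addressed differing byte.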