diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
--- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def
@@ -366,6 +366,9 @@
 ///                     size_t dstsize)
 TLI_DEFINE_ENUM_INTERNAL(memccpy_chk)
 TLI_DEFINE_STRING_INTERNAL("__memccpy_chk")
+/// int __memcmpeq(const void *s1, const void *s2, size_t n);
+TLI_DEFINE_ENUM_INTERNAL(memcmpeq)
+TLI_DEFINE_STRING_INTERNAL("__memcmpeq")
 /// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size);
 TLI_DEFINE_ENUM_INTERNAL(memcpy_chk)
 TLI_DEFINE_STRING_INTERNAL("__memcpy_chk")
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -85,10 +85,10 @@
     return SDValue();
   }

-  /// Emit target-specific code that performs a memcmp/bcmp, in cases where that is
-  /// faster than a libcall. The first returned SDValue is the result of the
-  /// memcmp and the second is the chain. Both SDValues can be null if a normal
-  /// libcall should be used.
+  /// Emit target-specific code that performs a memcmp/bcmp/__memcmpeq, in
+  /// cases where that is faster than a libcall. The first returned SDValue is
+  /// the result of the memcmp and the second is the chain. Both SDValues can
+  /// be null if a normal libcall should be used.
   virtual std::pair<SDValue, SDValue>
   EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
                           SDValue Op1, SDValue Op2, SDValue Op3,
diff --git a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
--- a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -147,6 +147,10 @@
   Value *emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
                   const DataLayout &DL, const TargetLibraryInfo *TLI);

+  /// Emit a call to the __memcmpeq function.
+  Value *emitMemCmpEq(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
+                      const DataLayout &DL, const TargetLibraryInfo *TLI);
+
   /// Emit a call to the memccpy function.
   Value *emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len,
                      IRBuilderBase &B, const TargetLibraryInfo *TLI);
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
--- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -173,6 +173,7 @@
   Value *optimizeMemRChr(CallInst *CI, IRBuilderBase &B);
   Value *optimizeMemCmp(CallInst *CI, IRBuilderBase &B);
   Value *optimizeBCmp(CallInst *CI, IRBuilderBase &B);
+  Value *optimizeMemCmpEq(CallInst *CI, IRBuilderBase &B);
   Value *optimizeMemCmpBCmpCommon(CallInst *CI, IRBuilderBase &B);
   Value *optimizeMemCCpy(CallInst *CI, IRBuilderBase &B);
   Value *optimizeMemPCpy(CallInst *CI, IRBuilderBase &B);
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -33,6 +33,11 @@
                clEnumValN(TargetLibraryInfoImpl::SVML, "SVML",
                           "Intel SVML library")));

+static cl::opt<bool> WithBuiltinMemcmpeq(
+    "with-builtin-memcmpeq", cl::Hidden, cl::init(false),
+    cl::desc("Enable emitting __memcmpeq (as a replacement for "
+             "boolean uses of memcmp/bcmp)"));
+
 StringLiteral const TargetLibraryInfoImpl::StandardNames[LibFunc::NumLibFuncs] = {
 #define TLI_DEFINE_STRING
@@ -67,6 +72,13 @@
   return TT.isOSFreeBSD() || TT.isOSSolaris();
 }

+// No target is known to provide __memcmpeq unconditionally yet, so it is
+// only reported as available when explicitly requested via the
+// -with-builtin-memcmpeq flag.
+static bool hasMemcmpeq(const Triple &TT) {
+  return WithBuiltinMemcmpeq;
+}
+
 static bool isCallingConvCCompatible(CallingConv::ID CC, StringRef TT,
                                      FunctionType *FuncTy) {
   switch (CC) {
@@ -204,6 +216,12 @@
   if (!hasBcmp(T))
     TLI.setUnavailable(LibFunc_bcmp);

+  // __memcmpeq is only marked available when it has been explicitly
+  // requested via -with-builtin-memcmpeq (see hasMemcmpeq above); otherwise
+  // treat it as unavailable on every target.
+  if (!hasMemcmpeq(T))
+    TLI.setUnavailable(LibFunc_memcmpeq);
+
   if (T.isMacOSX() && T.getArch() == Triple::x86 &&
       !T.isMacOSXVersionLT(10, 7)) {
     // x86-32 OSX has a scheme where fwrite and fputs (and some other functions
@@ -574,6 +592,7 @@
     TLI.setUnavailable(LibFunc_dunder_isoc99_sscanf);
     TLI.setUnavailable(LibFunc_under_IO_getc);
     TLI.setUnavailable(LibFunc_under_IO_putc);
+    TLI.setUnavailable(LibFunc_memcmpeq);
     // But, Android and musl have memalign.
     if (!T.isAndroid() && !T.isMusl())
       TLI.setUnavailable(LibFunc_memalign);
@@ -1191,6 +1210,7 @@
   case LibFunc_aligned_alloc:
     return (NumParams == 2 && FTy.getReturnType()->isPointerTy());
   case LibFunc_bcopy:
+  case LibFunc_memcmpeq:
   case LibFunc_bcmp:
     return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
             FTy.getParamType(1)->isPointerTy());
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1316,6 +1316,7 @@
   // on any target: A size_t argument (which may be an i32 on some targets)
   // should not trigger the assert below.
   case LibFunc_bcmp:
+  case LibFunc_memcmpeq:
   case LibFunc_calloc:
   case LibFunc_fwrite:
   case LibFunc_malloc:
@@ -1556,6 +1557,16 @@
                      {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
 }

+Value *llvm::emitMemCmpEq(Value *Ptr1, Value *Ptr2, Value *Len,
+                          IRBuilderBase &B, const DataLayout &DL,
+                          const TargetLibraryInfo *TLI) {
+  LLVMContext &Context = B.GetInsertBlock()->getContext();
+  return emitLibCall(
+      LibFunc_memcmpeq, B.getInt32Ty(),
+      {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
+      {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
+}
+
 Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len,
                          IRBuilderBase &B, const TargetLibraryInfo *TLI) {
   return emitLibCall(
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1181,7 +1181,9 @@
   return nullptr;
 }

-// Most simplifications for memcmp also apply to bcmp.
+// Most simplifications for memcmp also apply to its equality-only variants,
+// such as bcmp and __memcmpeq, which only report whether the memory regions
+// are equal rather than how they are ordered.
 Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI,
                                                    IRBuilderBase &B) {
   Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
@@ -1212,20 +1214,46 @@
     return V;

   // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0
-  // bcmp can be more efficient than memcmp because it only has to know that
-  // there is a difference, not how different one is to the other.
-  if (isLibFuncEmittable(M, TLI, LibFunc_bcmp) &&
+  // bcmp and __memcmpeq can be more efficient than memcmp because they only
+  // need to know whether the buffers differ, not how they are ordered.
+
+  // Note: We check both __memcmpeq and bcmp here (rather than having one
+  // forward to the other) because only one of them may be available.
+  if (((isLibFuncEmittable(M, TLI, LibFunc_bcmp) && TLI->has(LibFunc_bcmp)) ||
+       (isLibFuncEmittable(M, TLI, LibFunc_memcmpeq) &&
+        TLI->has(LibFunc_memcmpeq))) &&
       isOnlyUsedInZeroEqualityComparison(CI)) {
     Value *LHS = CI->getArgOperand(0);
     Value *RHS = CI->getArgOperand(1);
     Value *Size = CI->getArgOperand(2);
-    return copyFlags(*CI, emitBCmp(LHS, RHS, Size, B, DL, TLI));
+    // Prefer __memcmpeq if it is available.
+    if (isLibFuncEmittable(M, TLI, LibFunc_memcmpeq) &&
+        TLI->has(LibFunc_memcmpeq)) {
+      return copyFlags(*CI, emitMemCmpEq(LHS, RHS, Size, B, DL, TLI));
+    } else {
+      return copyFlags(*CI, emitBCmp(LHS, RHS, Size, B, DL, TLI));
+    }
   }

   return nullptr;
 }

 Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilderBase &B) {
+  Module *M = CI->getModule();
+  if (Value *V = optimizeMemCmpBCmpCommon(CI, B))
+    return V;
+
+  // Replace calls to bcmp with __memcmpeq when __memcmpeq is available.
+  if (isLibFuncEmittable(M, TLI, LibFunc_memcmpeq) && TLI->has(LibFunc_memcmpeq)) {
+    Value *LHS = CI->getArgOperand(0);
+    Value *RHS = CI->getArgOperand(1);
+    Value *Size = CI->getArgOperand(2);
+    return copyFlags(*CI, emitMemCmpEq(LHS, RHS, Size, B, DL, TLI));
+  }
+  return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemCmpEq(CallInst *CI, IRBuilderBase &B) {
   return optimizeMemCmpBCmpCommon(CI, B);
 }

@@ -3053,6 +3081,8 @@
     return optimizeMemRChr(CI, Builder);
   case LibFunc_bcmp:
     return optimizeBCmp(CI, Builder);
+  case LibFunc_memcmpeq:
+    return optimizeMemCmpEq(CI, Builder);
   case LibFunc_memcmp:
     return optimizeMemCmp(CI, Builder);
   case LibFunc_memcpy:
diff --git a/llvm/test/Transforms/InstCombine/memcmp-1.ll b/llvm/test/Transforms/InstCombine/memcmp-1.ll
--- a/llvm/test/Transforms/InstCombine/memcmp-1.ll
+++ b/llvm/test/Transforms/InstCombine/memcmp-1.ll
@@ -2,6 +2,7 @@
 ;
 ; RUN: opt < %s -passes=instcombine -S | FileCheck --check-prefix=CHECK --check-prefix=NOBCMP %s
 ; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=CHECK --check-prefix=BCMP %s
+; RUN: opt < %s -passes=instcombine --with-builtin-memcmpeq -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=CHECK --check-prefix=MEMCMPEQ %s

 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32:64"

@@ -144,6 +145,11 @@
 ; BCMP-NEXT:    [[CALL:%.*]] = call i32 @bcmp(i8* %mem1, i8* %mem2, i32 %size)
 ; BCMP-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; BCMP-NEXT:    ret i1 [[CMP]]
+;
+; MEMCMPEQ-LABEL: @test_simplify10(
+; MEMCMPEQ-NEXT:    [[CALL:%.*]] = call i32 @__memcmpeq(i8* %mem1, i8* %mem2, i32 %size)
+; MEMCMPEQ-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; MEMCMPEQ-NEXT:    ret i1 [[CMP]]
 ;
   %call = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 %size)
   %cmp = icmp eq i32 %call, 0
diff --git a/llvm/test/Transforms/InstCombine/strcmp-1.ll b/llvm/test/Transforms/InstCombine/strcmp-1.ll
--- a/llvm/test/Transforms/InstCombine/strcmp-1.ll
+++ b/llvm/test/Transforms/InstCombine/strcmp-1.ll
@@ -2,6 +2,7 @@
 ; Test that the strcmp library call simplifier works correctly.
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s --check-prefix=NOBCMP
 ; RUN: opt < %s -passes=instcombine -mtriple=unknown-unknown-linux-gnu -S | FileCheck %s --check-prefix=BCMP
+; RUN: opt < %s -passes=instcombine --with-builtin-memcmpeq -mtriple=unknown-unknown-linux-gnu -S | FileCheck %s --check-prefix=MEMCMPEQ

 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"

@@ -140,6 +141,12 @@
 ; BCMP-NEXT:    [[BCMP:%.*]] = call i32 @bcmp(i8* noundef nonnull dereferenceable(5) getelementptr inbounds ([6 x i8], [6 x i8]* @hello, i32 0, i32 0), i8* noundef nonnull dereferenceable(5) [[STR2]], i32 5)
 ; BCMP-NEXT:    [[RES:%.*]] = icmp eq i32 [[BCMP]], 0
 ; BCMP-NEXT:    ret i1 [[RES]]
+;
+; MEMCMPEQ-LABEL: @test7(
+; MEMCMPEQ-NEXT:    [[STR2:%.*]] = select i1 [[B:%.*]], i8* getelementptr inbounds ([5 x i8], [5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @bell, i32 0, i32 0)
+; MEMCMPEQ-NEXT:    [[MEMCMPEQ:%.*]] = call i32 @__memcmpeq(i8* noundef nonnull dereferenceable(5) getelementptr inbounds ([6 x i8], [6 x i8]* @hello, i32 0, i32 0), i8* noundef nonnull dereferenceable(5) [[STR2]], i32 5)
+; MEMCMPEQ-NEXT:    [[RES:%.*]] = icmp eq i32 [[MEMCMPEQ]], 0
+; MEMCMPEQ-NEXT:    ret i1 [[RES]]
 ;
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
--- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -234,6 +234,7 @@
       "declare x86_fp80 @logl(x86_fp80)\n"
       "declare i8* @malloc(i64)\n"
       "declare i8* @memccpy(i8*, i8*, i32, i64)\n"
+      "declare i32 @__memcmpeq(i8*, i8*, i64)\n"
       "declare i8* @memchr(i8*, i32, i64)\n"
       "declare i32 @memcmp(i8*, i8*, i64)\n"
       "declare i8* @memcpy(i8*, i8*, i64)\n"
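
For reviewers, a minimal end-to-end sketch of the rewrite this patch enables. It mirrors the test_simplify10 pattern from memcmp-1.ll; the function name @equal_only and the input file name are illustrative only, and the transform fires only when the hidden -with-builtin-memcmpeq flag (added by this patch) marks __memcmpeq as available:

; Run with (input file name is arbitrary):
;   opt -passes=instcombine --with-builtin-memcmpeq -mtriple=x86_64-unknown-linux-gnu -S example.ll
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32:64"

declare i32 @memcmp(i8*, i8*, i32)

; The memcmp result is only compared against zero, so the full three-way
; comparison is not needed.
define i1 @equal_only(i8* %mem1, i8* %mem2, i32 %size) {
  %call = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 %size)
  %cmp = icmp eq i32 %call, 0
  ret i1 %cmp
}

; Expected output (matching the MEMCMPEQ check lines added above): the call is
; replaced by the cheaper equality-only routine, while the icmp stays in place:
;   %call = call i32 @__memcmpeq(i8* %mem1, i8* %mem2, i32 %size)
;   %cmp = icmp eq i32 %call, 0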