Index: include/llvm/IR/RuntimeLibcalls.def
===================================================================
--- include/llvm/IR/RuntimeLibcalls.def
+++ include/llvm/IR/RuntimeLibcalls.def
@@ -377,6 +377,7 @@
 HANDLE_LIBCALL(MEMMOVE, "memmove")
 HANDLE_LIBCALL(MEMSET, "memset")
 HANDLE_LIBCALL(BZERO, nullptr)
+HANDLE_LIBCALL(BCMP, nullptr)
 // Element-wise unordered-atomic memory of different sizes
 HANDLE_LIBCALL(MEMCPY_ELEMENT_UNORDERED_ATOMIC_1, "__llvm_memcpy_element_unordered_atomic_1")
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6715,12 +6715,13 @@
 /// The caller already checked that \p I calls the appropriate LibFunc with a
 /// correct prototype.
 bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
+  const auto &TLI = DAG.getTargetLoweringInfo();
+  const auto &DL = DAG.getDataLayout();
   const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
   const Value *Size = I.getArgOperand(2);
   const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
   if (CSize && CSize->getZExtValue() == 0) {
-    EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
-                                                          I.getType(), true);
+    EVT CallVT = TLI.getValueType(DL, I.getType(), true);
     setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
     return true;
   }
@@ -6735,70 +6736,110 @@
     return true;
   }

-  // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
-  // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
-  if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I))
+  if (!isOnlyUsedInZeroEqualityComparison(&I))
     return false;

-  // If the target has a fast compare for the given size, it will return a
-  // preferred load type for that size. Require that the load VT is legal and
-  // that the target supports unaligned loads of that type. Otherwise, return
-  // INVALID.
-  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
-    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
-      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-      // TODO: Check alignment of src and dest ptrs.
-      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-      if (!TLI.isTypeLegal(LVT) ||
-          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
-          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
-        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
-    }
-
-    return LVT;
-  };
+  // We're only interested in the boolean comparison value (equal/not equal).
+
+  // If the size is a compile-time constant, we first try to lower to a single
+  // comparison between two loads:
+  //   memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
+  //   memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
+  if (CSize) {
+    // If the target has a fast compare for the given size, it will return a
+    // preferred load type for that size. Require that the load VT is legal and
+    // that the target supports unaligned loads of that type. Otherwise, return
+    // INVALID.
+    auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+      if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+        // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+        // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+        // TODO: Check alignment of src and dest ptrs.
+        unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+        unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+        if (!TLI.isTypeLegal(LVT) ||
+            !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+            !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+          LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+      }

-  // This turns into unaligned loads. We only do this if the target natively
-  // supports the MVT we'll be loading or if it is small enough (<= 4) that
-  // we'll only produce a small number of byte loads.
-  MVT LoadVT;
-  unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
-  switch (NumBitsToCompare) {
-  default:
-    return false;
-  case 16:
-    LoadVT = MVT::i16;
-    break;
-  case 32:
-    LoadVT = MVT::i32;
-    break;
-  case 64:
-  case 128:
-  case 256:
-    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
-    break;
-  }
+      return LVT;
+    };

-  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
-    return false;
+    // This turns into unaligned loads. We only do this if the target natively
+    // supports the MVT we'll be loading or if it is small enough (<= 4) that
+    // we'll only produce a small number of byte loads.
+    MVT LoadVT;
+    unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+    switch (NumBitsToCompare) {
+    default:
+      LoadVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+      break;
+    case 16:
+      LoadVT = MVT::i16;
+      break;
+    case 32:
+      LoadVT = MVT::i32;
+      break;
+    case 64:
+    case 128:
+    case 256:
+      LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
+      break;
+    }
+
+    if (LoadVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+      SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+      SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);

-  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
-  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+      // Bitcast to a wide integer type if the loads are vectors.
+      if (LoadVT.isVector()) {
+        EVT CmpVT =
+            EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+        LoadL = DAG.getBitcast(CmpVT, LoadL);
+        LoadR = DAG.getBitcast(CmpVT, LoadR);
+      }

-  // Bitcast to a wide integer type if the loads are vectors.
-  if (LoadVT.isVector()) {
-    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
-    LoadL = DAG.getBitcast(CmpVT, LoadL);
-    LoadR = DAG.getBitcast(CmpVT, LoadR);
+      SDValue Cmp =
+          DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+      processIntegerCallValue(I, Cmp, false);
+      return true;
+    }
   }

-  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
-  processIntegerCallValue(I, Cmp, false);
-  return true;
+  // The size is not constant or it's not efficient to use the strategy above.
+  // If the target has a `bcmp()` function, call it.
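+  // bcmp() takes the same arguments as memcmp(); it returns zero iff the two
+  // regions compare equal, and the sign/magnitude of a non-zero result is
+  // unspecified, which is all we need for an equality-only test.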
+  if (const char* const BcmpFuncName = TLI.getLibcallName(RTLIB::BCMP)) {
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    // signature: bool(const char*, const char*, size_t)
+    Entry.Ty = DL.getIntPtrType(*DAG.getContext());
+    Entry.Node = getValue(LHS);
+    Args.push_back(Entry);
+    Entry.Node = getValue(RHS);
+    Args.push_back(Entry);
+    Entry.Node = getValue(Size);
+    Args.push_back(Entry);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(DAG.getRoot())
+        .setLibCallee(
+            TLI.getLibcallCallingConv(RTLIB::BCMP),
+            Type::getInt1Ty(*DAG.getContext()),
+            DAG.getExternalSymbol(BcmpFuncName, TLI.getPointerTy(DL)),
+            std::move(Args))
+        .setDiscardResult(false);
+
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    processIntegerCallValue(I, CallResult.first, false);
+    PendingLoads.push_back(CallResult.second);
+    return true;
+  }
+
+  // Nothing better, just call memcmp().
+  return false;
 }

 /// See if we can lower a memchr call into an optimized form. If so, return
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -104,6 +104,16 @@
   return true;
 }

+static bool targetHasBcmp(const Triple &TT) {
+  // POSIX removed support for bcmp() in 2001, but the glibc and several other
+  // libc implementations still have it.
+  if (TT.isOSLinux())
+    return TT.isGNUEnvironment() || TT.isMusl();
+  // Both NetBSD and OpenBSD are planning to remove the function. Windows does
+  // not have it.
+  return TT.isOSFreeBSD() || TT.isOSSolaris() || TT.isOSDarwin();
+}
+
 // Although this default value is arbitrary, it is not random. It is assumed
 // that a condition that evaluates the same way by a higher percentage than this
 // is best represented as control flow. Therefore, the default value N should be
@@ -173,6 +183,11 @@
   if (TT.isOSOpenBSD()) {
     setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);
   }
+
+  // Some environments have bcmp(), which can be used for more efficient
+  // equality comparisons.
+  if (targetHasBcmp(TT))
+    setLibcallName(RTLIB::BCMP, "bcmp");
 }

 /// getFPEXT - Return the FPEXT_*_* value for the given types, or
Index: test/CodeGen/X86/memcmp.ll
===================================================================
--- test/CodeGen/X86/memcmp.ll
+++ test/CodeGen/X86/memcmp.ll
@@ -5,6 +5,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 --check-prefix=X64-BCMP

 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
@@ -1344,3 +1345,108 @@
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
   ret i32 %m
 }
+
+define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: huge_length_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $2147483647 # imm = 0x7FFFFFFF
+; X86-NEXT:    pushl $-1
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: huge_length_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX1-LABEL: huge_length_eq:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rax
+; X64-AVX1-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-AVX1-NEXT:    callq memcmp
+; X64-AVX1-NEXT:    testl %eax, %eax
+; X64-AVX1-NEXT:    sete %al
+; X64-AVX1-NEXT:    popq %rcx
+; X64-AVX1-NEXT:    retq
+;
+; X64-BCMP-LABEL: huge_length_eq:
+; X64-BCMP:       # %bb.0:
+; X64-BCMP-NEXT:    pushq %rax
+; X64-BCMP-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-BCMP-NEXT:    callq bcmp
+; X64-BCMP-NEXT:    notb %al
+; X64-BCMP-NEXT:    andb $1, %al
+; X64-BCMP-NEXT:    popq %rcx
+; X64-BCMP-NEXT:    retq
+
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
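+; With a non-constant size the load-and-compare expansion above does not
+; apply: the plain memcmp() call is kept, and the equality-only form may be
+; lowered to a bcmp() call on targets that provide it (x86_64-linux-gnu here).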
+define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind {
+; X86-LABEL: nonconst_length:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp memcmp # TAILCALL
+;
+; X64-LABEL: nonconst_length:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(i8* %X, i8* %Y, i64 %size) nounwind {
+; X86-LABEL: nonconst_length_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: nonconst_length_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX1-LABEL: nonconst_length_eq:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rax
+; X64-AVX1-NEXT:    callq memcmp
+; X64-AVX1-NEXT:    testl %eax, %eax
+; X64-AVX1-NEXT:    sete %al
+; X64-AVX1-NEXT:    popq %rcx
+; X64-AVX1-NEXT:    retq
+;
+; X64-BCMP-LABEL: nonconst_length_eq:
+; X64-BCMP:       # %bb.0:
+; X64-BCMP-NEXT:    pushq %rax
+; X64-BCMP-NEXT:    callq bcmp
+; X64-BCMP-NEXT:    notb %al
+; X64-BCMP-NEXT:    andb $1, %al
+; X64-BCMP-NEXT:    popq %rcx
+; X64-BCMP-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}