Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -143,6 +143,13 @@
              "switch statement. A value greater than 100 will void this "
              "optimization"));
 
+static cl::opt<std::string> MemeqLibraryFunction(
+    "memeq-lib-function", cl::Hidden, cl::init(""),
+    cl::desc("If provided, emit a call to this library function to check for "
+             "string equality instead of `memcmp() == 0`. The calling "
+             "convention is the same as that of `memcpy` and the signature is "
+             "`bool(const char*, const char*, size_t)`"));
+
 // Limit the width of DAG chains. This is important in general to prevent
 // DAG-based analysis from blowing up. For example, alias analysis and
 // load clustering may not complete in reasonable time. It is difficult to
@@ -6715,12 +6722,13 @@
 /// The caller already checked that \p I calls the appropriate LibFunc with a
 /// correct prototype.
 bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
+  const auto &TLI = DAG.getTargetLoweringInfo();
+  const auto &DL = DAG.getDataLayout();
   const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
   const Value *Size = I.getArgOperand(2);
   const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
   if (CSize && CSize->getZExtValue() == 0) {
-    EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
-                                                          I.getType(), true);
+    EVT CallVT = TLI.getValueType(DL, I.getType(), true);
     setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
     return true;
   }
@@ -6735,70 +6743,114 @@
     return true;
   }
 
-  // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
-  // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
-  if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I))
+  if (!isOnlyUsedInZeroEqualityComparison(&I))
     return false;
 
-  // If the target has a fast compare for the given size, it will return a
-  // preferred load type for that size. Require that the load VT is legal and
-  // that the target supports unaligned loads of that type. Otherwise, return
-  // INVALID.
-  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
-    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
-      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-      // TODO: Check alignment of src and dest ptrs.
-      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-      if (!TLI.isTypeLegal(LVT) ||
-          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
-          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
-        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
-    }
-
-    return LVT;
-  };
+  // We're only interested in the boolean comparison value (equal/not equal).
+
+  // If the size is a compile-time constant, we first try to lower to a single
+  // comparison between two loads:
+  //   memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
+  //   memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
+  if (CSize) {
+    // If the target has a fast compare for the given size, it will return a
+    // preferred load type for that size. Require that the load VT is legal and
+    // that the target supports unaligned loads of that type. Otherwise, return
+    // INVALID.
+    auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+      if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+        // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+        // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+        // TODO: Check alignment of src and dest ptrs.
+        unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+        unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+        if (!TLI.isTypeLegal(LVT) ||
+            !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+            !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+          LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+      }
-
-  // This turns into unaligned loads. We only do this if the target natively
-  // supports the MVT we'll be loading or if it is small enough (<= 4) that
-  // we'll only produce a small number of byte loads.
-  MVT LoadVT;
-  unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
-  switch (NumBitsToCompare) {
-  default:
-    return false;
-  case 16:
-    LoadVT = MVT::i16;
-    break;
-  case 32:
-    LoadVT = MVT::i32;
-    break;
-  case 64:
-  case 128:
-  case 256:
-    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
-    break;
-  }
 
+      return LVT;
+    };
-  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
-    return false;
 
+    // This turns into unaligned loads. We only do this if the target natively
+    // supports the MVT we'll be loading or if it is small enough (<= 4) that
+    // we'll only produce a small number of byte loads.
+    MVT LoadVT;
+    unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+    switch (NumBitsToCompare) {
+    default:
+      LoadVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+      break;
+    case 16:
+      LoadVT = MVT::i16;
+      break;
+    case 32:
+      LoadVT = MVT::i32;
+      break;
+    case 64:
+    case 128:
+    case 256:
+      LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
+      break;
+    }
-  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
-  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
 
+    if (LoadVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+      SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+      SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
-  // Bitcast to a wide integer type if the loads are vectors.
-  if (LoadVT.isVector()) {
-    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
-    LoadL = DAG.getBitcast(CmpVT, LoadL);
-    LoadR = DAG.getBitcast(CmpVT, LoadR);
+      // Bitcast to a wide integer type if the loads are vectors.
+      if (LoadVT.isVector()) {
+        EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+        LoadL = DAG.getBitcast(CmpVT, LoadL);
+        LoadR = DAG.getBitcast(CmpVT, LoadR);
+      }
+
+      SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+      processIntegerCallValue(I, Cmp, false);
+      return true;
+    }
   }
 
-  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
-  processIntegerCallValue(I, Cmp, false);
-  return true;
+  // The size is not constant or it's not efficient to use the strategy above.
+  // If the user provided a `memeq` library function, call it.
+  if (!MemeqLibraryFunction.empty()) {
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    // signature: bool(const char*, const char*, size_t)
+    Entry.Ty = DL.getIntPtrType(*DAG.getContext());
+    Entry.Node = getValue(LHS);
+    Args.push_back(Entry);
+    Entry.Node = getValue(RHS);
+    Args.push_back(Entry);
+    Entry.Node = getValue(Size);
+    Args.push_back(Entry);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(DAG.getRoot())
+        .setLibCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                      Type::getInt1Ty(*DAG.getContext()),
+                      DAG.getExternalSymbol(MemeqLibraryFunction.c_str(),
+                                            TLI.getPointerTy(DL)),
+                      std::move(Args))
+        .setDiscardResult(false);
+
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    processIntegerCallValue(
+        I,
+        // memeq() returns true on equality while memcmp() returns 0, so add a
+        // logical not: `(!memeq()) != 0` is equivalent to `memcmp() != 0`.
+        DAG.getLogicalNOT(getCurSDLoc(), CallResult.first,
+                          EVT::getIntegerVT(*DAG.getContext(), 1)),
+        false);
+    PendingLoads.push_back(CallResult.second);
+    return true;
+  }
+
+  // Nothing better, just call memcmp().
+  return false;
 }
 
 /// See if we can lower a memchr call into an optimized form. If so, return
Index: test/CodeGen/X86/memcmp.ll
===================================================================
--- test/CodeGen/X86/memcmp.ll
+++ test/CodeGen/X86/memcmp.ll
@@ -5,6 +5,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 -memeq-lib-function=user_memeq | FileCheck %s --check-prefix=X64 --check-prefix=MEMEQ
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
@@ -801,6 +802,15 @@
 ; X64-AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-AVX-NEXT:    setne %al
 ; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: length16_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %xmm0
+; MEMEQ-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; MEMEQ-NEXT:    vpmovmskb %xmm0, %eax
+; MEMEQ-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; MEMEQ-NEXT:    setne %al
+; MEMEQ-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -858,6 +868,15 @@
 ; X64-AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-AVX-NEXT:    sete %al
 ; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: length16_eq_const:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %xmm0
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; MEMEQ-NEXT:    vpmovmskb %xmm0, %eax
+; MEMEQ-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; MEMEQ-NEXT:    sete %al
+; MEMEQ-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -951,6 +970,19 @@
 ; X64-AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-AVX-NEXT:    sete %al
 ; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: length24_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %xmm0
+; MEMEQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; MEMEQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; MEMEQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; MEMEQ-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; MEMEQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; MEMEQ-NEXT:    vpmovmskb %xmm0, %eax
+; MEMEQ-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; MEMEQ-NEXT:    sete %al
+; MEMEQ-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -1021,6 +1053,20 @@
 ; X64-AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-AVX-NEXT:    setne %al
 ; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: length24_eq_const:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %xmm0
+; MEMEQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; MEMEQ-NEXT:    movabsq $3689065127958034230, %rax # imm = 0x3332313039383736
+; MEMEQ-NEXT:    vmovq %rax, %xmm2
+; MEMEQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; MEMEQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; MEMEQ-NEXT:    vpmovmskb %xmm0, %eax
+; MEMEQ-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; MEMEQ-NEXT:    setne %al
+; MEMEQ-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -1123,6 +1169,16 @@
 ; X64-AVX2-NEXT:    sete %al
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
+;
+; MEMEQ-LABEL: length32_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %ymm0
+; MEMEQ-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; MEMEQ-NEXT:    vpmovmskb %ymm0, %eax
+; MEMEQ-NEXT:    cmpl $-1, %eax
+; MEMEQ-NEXT:    sete %al
+; MEMEQ-NEXT:    vzeroupper
+; MEMEQ-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -1199,6 +1255,16 @@
 ; X64-AVX2-NEXT:    setne %al
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
+;
+; MEMEQ-LABEL: length32_eq_const:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %ymm0
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; MEMEQ-NEXT:    vpmovmskb %ymm0, %eax
+; MEMEQ-NEXT:    cmpl $-1, %eax
+; MEMEQ-NEXT:    setne %al
+; MEMEQ-NEXT:    vzeroupper
+; MEMEQ-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -1268,6 +1334,19 @@
 ; X64-AVX2-NEXT:    setne %al
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
+;
+; MEMEQ-LABEL: length64_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %ymm0
+; MEMEQ-NEXT:    vmovdqu 32(%rdi), %ymm1
+; MEMEQ-NEXT:    vpcmpeqb 32(%rsi), %ymm1, %ymm1
+; MEMEQ-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; MEMEQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; MEMEQ-NEXT:    vpmovmskb %ymm0, %eax
+; MEMEQ-NEXT:    cmpl $-1, %eax
+; MEMEQ-NEXT:    setne %al
+; MEMEQ-NEXT:    vzeroupper
+; MEMEQ-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -1320,6 +1399,19 @@
 ; X64-AVX2-NEXT:    sete %al
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
+;
+; MEMEQ-LABEL: length64_eq_const:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %ymm0
+; MEMEQ-NEXT:    vmovdqu 32(%rdi), %ymm1
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; MEMEQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; MEMEQ-NEXT:    vpmovmskb %ymm0, %eax
+; MEMEQ-NEXT:    cmpl $-1, %eax
+; MEMEQ-NEXT:    sete %al
+; MEMEQ-NEXT:    vzeroupper
+; MEMEQ-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -1344,3 +1436,106 @@
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
   ret i32 %m
 }
+
+define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: huge_length_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $2147483647 # imm = 0x7FFFFFFF
+; X86-NEXT:    pushl $-1
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: huge_length_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: huge_length_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rax
+; X64-AVX-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-AVX-NEXT:    callq memcmp
+; X64-AVX-NEXT:    testl %eax, %eax
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    popq %rcx
+; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: huge_length_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    pushq %rax
+; MEMEQ-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; MEMEQ-NEXT:    callq user_memeq
+; MEMEQ-NEXT:    andb $1, %al
+; MEMEQ-NEXT:    popq %rcx
+; MEMEQ-NEXT:    retq
+
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
+define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind {
+; X86-LABEL: nonconst_length:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp memcmp # TAILCALL
+;
+; X64-LABEL: nonconst_length:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(i8* %X, i8* %Y, i64 %size) nounwind {
+; X86-LABEL: nonconst_length_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: nonconst_length_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: nonconst_length_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rax
+; X64-AVX-NEXT:    callq memcmp
+; X64-AVX-NEXT:    testl %eax, %eax
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    popq %rcx
+; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: nonconst_length_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    pushq %rax
+; MEMEQ-NEXT:    callq user_memeq
+; MEMEQ-NEXT:    andb $1, %al
+; MEMEQ-NEXT:    popq %rcx
+; MEMEQ-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
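
Note (not part of the patch): `user_memeq` is just the symbol name chosen for the test RUN line; the option accepts any function. Per the option help text, the callee only needs the `memcpy` (plain C) calling convention and the `bool(const char*, const char*, size_t)` signature. A minimal sketch of such a callee, with delegation to memcmp() chosen only to keep the illustration obviously correct (a real library routine would typically use wide or vectorized loads):

    #include <cstddef>
    #include <cstring>

    // Hypothetical user-provided equality check matching the documented
    // signature: returns true iff the two buffers are byte-for-byte equal.
    extern "C" bool user_memeq(const char *a, const char *b, size_t size) {
      return std::memcmp(a, b, size) == 0;
    }

Linking such a definition into the final binary is then enough for the calls emitted under `llc ... -memeq-lib-function=user_memeq` to resolve.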