Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -143,6 +143,13 @@
              "switch statement. A value greater than 100 will void this "
              "optimization"));
 
+static cl::opt<std::string> MemeqLibraryFunction(
+    "memeq-lib-function", cl::Hidden, cl::init(""),
+    cl::desc("If provided, emit a call to this library function to check for "
+             "string equality instead of `memcmp() == 0`. The calling "
+             "convention is the same as that of `memcpy` and the signature is "
+             "`bool(const char*, const char*, size_t)`"));
+
 // Limit the width of DAG chains. This is important in general to prevent
 // DAG-based analysis from blowing up. For example, alias analysis and
 // load clustering may not complete in reasonable time. It is difficult to
@@ -6715,12 +6722,13 @@
 /// The caller already checked that \p I calls the appropriate LibFunc with a
 /// correct prototype.
 bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
+  const auto &TLI = DAG.getTargetLoweringInfo();
+  const auto &DL = DAG.getDataLayout();
   const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
   const Value *Size = I.getArgOperand(2);
   const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
   if (CSize && CSize->getZExtValue() == 0) {
-    EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
-                                                          I.getType(), true);
+    EVT CallVT = TLI.getValueType(DL, I.getType(), true);
     setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
     return true;
   }
@@ -6735,70 +6743,114 @@
     return true;
   }
 
-  // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
-  // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
-  if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I))
+  if (!isOnlyUsedInZeroEqualityComparison(&I))
     return false;
 
-  // If the target has a fast compare for the given size, it will return a
-  // preferred load type for that size. Require that the load VT is legal and
-  // that the target supports unaligned loads of that type. Otherwise, return
-  // INVALID.
-  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
-    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
-      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-      // TODO: Check alignment of src and dest ptrs.
-      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-      if (!TLI.isTypeLegal(LVT) ||
-          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
-          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
-        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
-    }
-
-    return LVT;
-  };
+  // We're only interested in the boolean comparison value (equal/not equal).
+
+  // If the size is a compile-time constant, we first try to lower to a single
+  // comparison between two loads:
+  //   memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
+  //   memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
+  if (CSize) {
+    // If the target has a fast compare for the given size, it will return a
+    // preferred load type for that size. Require that the load VT is legal and
+    // that the target supports unaligned loads of that type. Otherwise, return
+    // INVALID.
+    auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+      if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+        // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+        // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+        // TODO: Check alignment of src and dest ptrs.
+        unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+        unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+        if (!TLI.isTypeLegal(LVT) ||
+            !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+            !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+          LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+      }
-
-  // This turns into unaligned loads. We only do this if the target natively
-  // supports the MVT we'll be loading or if it is small enough (<= 4) that
-  // we'll only produce a small number of byte loads.
-  MVT LoadVT;
-  unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
-  switch (NumBitsToCompare) {
-  default:
-    return false;
-  case 16:
-    LoadVT = MVT::i16;
-    break;
-  case 32:
-    LoadVT = MVT::i32;
-    break;
-  case 64:
-  case 128:
-  case 256:
-    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
-    break;
-  }
 
+      return LVT;
+    };
-  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
-    return false;
 
+    // This turns into unaligned loads. We only do this if the target natively
+    // supports the MVT we'll be loading or if it is small enough (<= 4) that
+    // we'll only produce a small number of byte loads.
+    MVT LoadVT;
+    unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+    switch (NumBitsToCompare) {
+    default:
+      LoadVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+      break;
+    case 16:
+      LoadVT = MVT::i16;
+      break;
+    case 32:
+      LoadVT = MVT::i32;
+      break;
+    case 64:
+    case 128:
+    case 256:
+      LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
+      break;
+    }
-  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
-  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
 
+    if (LoadVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+      SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+      SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
-  // Bitcast to a wide integer type if the loads are vectors.
-  if (LoadVT.isVector()) {
-    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
-    LoadL = DAG.getBitcast(CmpVT, LoadL);
-    LoadR = DAG.getBitcast(CmpVT, LoadR);
+      // Bitcast to a wide integer type if the loads are vectors.
+      if (LoadVT.isVector()) {
+        EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+        LoadL = DAG.getBitcast(CmpVT, LoadL);
+        LoadR = DAG.getBitcast(CmpVT, LoadR);
+      }
+
+      SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+      processIntegerCallValue(I, Cmp, false);
+      return true;
+    }
   }
 
-  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
-  processIntegerCallValue(I, Cmp, false);
-  return true;
+  // The size is not constant or it's not efficient to use the strategy above.
+  // If the user provided a `memeq` library function, call it.
+  if (!MemeqLibraryFunction.empty()) {
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    // signature: bool(const char*, const char*, size_t)
+    Entry.Ty = DL.getIntPtrType(*DAG.getContext());
+    Entry.Node = getValue(LHS);
+    Args.push_back(Entry);
+    Entry.Node = getValue(RHS);
+    Args.push_back(Entry);
+    Entry.Node = getValue(Size);
+    Args.push_back(Entry);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(DAG.getRoot())
+        .setLibCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                      Type::getInt1Ty(*DAG.getContext()),
+                      DAG.getExternalSymbol(MemeqLibraryFunction.c_str(),
+                                            TLI.getPointerTy(DL)),
+                      std::move(Args))
+        .setDiscardResult(false);
+
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    processIntegerCallValue(
+        I,
+        // memeq() returns true on equality while memcmp() returns 0, so add a
+        // logical not: `(!memeq()) != 0` is equivalent to `memcmp() != 0`.
+        DAG.getLogicalNOT(getCurSDLoc(), CallResult.first,
+                          EVT::getIntegerVT(*DAG.getContext(), 1)),
+        false);
+    PendingLoads.push_back(CallResult.second);
+    return true;
+  }
+
+  // Nothing better, just call memcmp().
+  return false;
 }
 
 /// See if we can lower a memchr call into an optimized form. If so, return
Index: test/CodeGen/X86/memcmp.ll
===================================================================
--- test/CodeGen/X86/memcmp.ll
+++ test/CodeGen/X86/memcmp.ll
@@ -5,6 +5,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 -memeq-lib-function=user_memeq | FileCheck %s --check-prefix=X64 --check-prefix=MEMEQ
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
@@ -801,6 +802,15 @@
 ; X64-AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-AVX-NEXT:    setne %al
 ; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: length16_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %xmm0
+; MEMEQ-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; MEMEQ-NEXT:    vpmovmskb %xmm0, %eax
+; MEMEQ-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; MEMEQ-NEXT:    setne %al
+; MEMEQ-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -858,6 +868,15 @@
 ; X64-AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-AVX-NEXT:    sete %al
 ; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: length16_eq_const:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %xmm0
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; MEMEQ-NEXT:    vpmovmskb %xmm0, %eax
+; MEMEQ-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; MEMEQ-NEXT:    sete %al
+; MEMEQ-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -951,6 +970,19 @@
 ; X64-AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-AVX-NEXT:    sete %al
 ; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: length24_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %xmm0
+; MEMEQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; MEMEQ-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; MEMEQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; MEMEQ-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; MEMEQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; MEMEQ-NEXT:    vpmovmskb %xmm0, %eax
+; MEMEQ-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; MEMEQ-NEXT:    sete %al
+; MEMEQ-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -1021,6 +1053,20 @@
 ; X64-AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; X64-AVX-NEXT:    setne %al
 ; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: length24_eq_const:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %xmm0
+; MEMEQ-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; MEMEQ-NEXT:    movabsq $3689065127958034230, %rax # imm = 0x3332313039383736
+; MEMEQ-NEXT:    vmovq %rax, %xmm2
+; MEMEQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; MEMEQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; MEMEQ-NEXT:    vpmovmskb %xmm0, %eax
+; MEMEQ-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; MEMEQ-NEXT:    setne %al
+; MEMEQ-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -1123,6 +1169,16 @@
 ; X64-AVX2-NEXT:    sete %al
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
+;
+; MEMEQ-LABEL: length32_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %ymm0
+; MEMEQ-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; MEMEQ-NEXT:    vpmovmskb %ymm0, %eax
+; MEMEQ-NEXT:    cmpl $-1, %eax
+; MEMEQ-NEXT:    sete %al
+; MEMEQ-NEXT:    vzeroupper
+; MEMEQ-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -1199,6 +1255,16 @@
 ; X64-AVX2-NEXT:    setne %al
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
+;
+; MEMEQ-LABEL: length32_eq_const:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %ymm0
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; MEMEQ-NEXT:    vpmovmskb %ymm0, %eax
+; MEMEQ-NEXT:    cmpl $-1, %eax
+; MEMEQ-NEXT:    setne %al
+; MEMEQ-NEXT:    vzeroupper
+; MEMEQ-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -1268,6 +1334,19 @@
 ; X64-AVX2-NEXT:    setne %al
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
+;
+; MEMEQ-LABEL: length64_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %ymm0
+; MEMEQ-NEXT:    vmovdqu 32(%rdi), %ymm1
+; MEMEQ-NEXT:    vpcmpeqb 32(%rsi), %ymm1, %ymm1
+; MEMEQ-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; MEMEQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; MEMEQ-NEXT:    vpmovmskb %ymm0, %eax
+; MEMEQ-NEXT:    cmpl $-1, %eax
+; MEMEQ-NEXT:    setne %al
+; MEMEQ-NEXT:    vzeroupper
+; MEMEQ-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -1320,6 +1399,19 @@
 ; X64-AVX2-NEXT:    sete %al
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
+;
+; MEMEQ-LABEL: length64_eq_const:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    vmovdqu (%rdi), %ymm0
+; MEMEQ-NEXT:    vmovdqu 32(%rdi), %ymm1
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm1, %ymm1
+; MEMEQ-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; MEMEQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; MEMEQ-NEXT:    vpmovmskb %ymm0, %eax
+; MEMEQ-NEXT:    cmpl $-1, %eax
+; MEMEQ-NEXT:    sete %al
+; MEMEQ-NEXT:    vzeroupper
+; MEMEQ-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -1344,3 +1436,106 @@
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
   ret i32 %m
 }
+
+define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: huge_length_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $2147483647 # imm = 0x7FFFFFFF
+; X86-NEXT:    pushl $-1
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: huge_length_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: huge_length_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rax
+; X64-AVX-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-AVX-NEXT:    callq memcmp
+; X64-AVX-NEXT:    testl %eax, %eax
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    popq %rcx
+; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: huge_length_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    pushq %rax
+; MEMEQ-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; MEMEQ-NEXT:    callq user_memeq
+; MEMEQ-NEXT:    andb $1, %al
+; MEMEQ-NEXT:    popq %rcx
+; MEMEQ-NEXT:    retq
+
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
+define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind {
+; X86-LABEL: nonconst_length:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp memcmp # TAILCALL
+;
+; X64-LABEL: nonconst_length:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(i8* %X, i8* %Y, i64 %size) nounwind {
+; X86-LABEL: nonconst_length_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: nonconst_length_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: nonconst_length_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    pushq %rax
+; X64-AVX-NEXT:    callq memcmp
+; X64-AVX-NEXT:    testl %eax, %eax
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    popq %rcx
+; X64-AVX-NEXT:    retq
+;
+; MEMEQ-LABEL: nonconst_length_eq:
+; MEMEQ:       # %bb.0:
+; MEMEQ-NEXT:    pushq %rax
+; MEMEQ-NEXT:    callq user_memeq
+; MEMEQ-NEXT:    andb $1, %al
+; MEMEQ-NEXT:    popq %rcx
+; MEMEQ-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
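
Note (not part of the patch): `user_memeq` is just the symbol name chosen for the test RUN line; the option accepts any function. Per the option help text, the callee only needs the `memcpy` (plain C) calling convention and the `bool(const char*, const char*, size_t)` signature. A minimal sketch of such a callee, with delegation to memcmp() chosen only to keep the illustration obviously correct (a real library routine would typically use wide or vectorized loads):

    #include <cstddef>
    #include <cstring>

    // Hypothetical user-provided equality check matching the documented
    // signature: returns true iff the two buffers are byte-for-byte equal.
    extern "C" bool user_memeq(const char *a, const char *b, size_t size) {
      return std::memcmp(a, b, size) == 0;
    }

Linking such a definition into the final binary is then enough for the calls emitted under `llc ... -memeq-lib-function=user_memeq` to resolve.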