Index: include/llvm/IR/RuntimeLibcalls.def
===================================================================
--- include/llvm/IR/RuntimeLibcalls.def
+++ include/llvm/IR/RuntimeLibcalls.def
@@ -377,6 +377,7 @@
 HANDLE_LIBCALL(MEMMOVE, "memmove")
 HANDLE_LIBCALL(MEMSET, "memset")
 HANDLE_LIBCALL(BZERO, nullptr)
+HANDLE_LIBCALL(BCMP, nullptr)
 // Element-wise unordered-atomic memory of different sizes
 HANDLE_LIBCALL(MEMCPY_ELEMENT_UNORDERED_ATOMIC_1, "__llvm_memcpy_element_unordered_atomic_1")
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6715,12 +6715,13 @@
 /// The caller already checked that \p I calls the appropriate LibFunc with a
 /// correct prototype.
 bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
+  const auto &TLI = DAG.getTargetLoweringInfo();
+  const auto &DL = DAG.getDataLayout();
   const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
   const Value *Size = I.getArgOperand(2);
   const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
   if (CSize && CSize->getZExtValue() == 0) {
-    EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
-                                                          I.getType(), true);
+    EVT CallVT = TLI.getValueType(DL, I.getType(), true);
     setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
     return true;
   }
@@ -6735,70 +6736,110 @@
     return true;
   }

-  // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
-  // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
-  if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I))
+  if (!isOnlyUsedInZeroEqualityComparison(&I))
     return false;

-  // If the target has a fast compare for the given size, it will return a
-  // preferred load type for that size. Require that the load VT is legal and
-  // that the target supports unaligned loads of that type. Otherwise, return
-  // INVALID.
-  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
-    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
-      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-      // TODO: Check alignment of src and dest ptrs.
-      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-      if (!TLI.isTypeLegal(LVT) ||
-          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
-          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
-        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
-    }
-
-    return LVT;
-  };
+  // We're only interested in the boolean comparison value (equal/not equal).
+
+  // If the size is a compile-time constant, we first try to lower to a single
+  // comparison between two loads:
+  //   memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
+  //   memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
+  if (CSize) {
+    // If the target has a fast compare for the given size, it will return a
+    // preferred load type for that size. Require that the load VT is legal and
+    // that the target supports unaligned loads of that type. Otherwise, return
+    // INVALID.
+    auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+      if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+        // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+        // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+        // TODO: Check alignment of src and dest ptrs.
+        unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+        unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+        if (!TLI.isTypeLegal(LVT) ||
+            !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+            !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+          LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+      }

-  // This turns into unaligned loads. We only do this if the target natively
-  // supports the MVT we'll be loading or if it is small enough (<= 4) that
-  // we'll only produce a small number of byte loads.
-  MVT LoadVT;
-  unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
-  switch (NumBitsToCompare) {
-  default:
-    return false;
-  case 16:
-    LoadVT = MVT::i16;
-    break;
-  case 32:
-    LoadVT = MVT::i32;
-    break;
-  case 64:
-  case 128:
-  case 256:
-    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
-    break;
-  }
+      return LVT;
+    };

-  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
-    return false;
+    // This turns into unaligned loads. We only do this if the target natively
+    // supports the MVT we'll be loading or if it is small enough (<= 4) that
+    // we'll only produce a small number of byte loads.
+    MVT LoadVT;
+    unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+    switch (NumBitsToCompare) {
+    default:
+      LoadVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+      break;
+    case 16:
+      LoadVT = MVT::i16;
+      break;
+    case 32:
+      LoadVT = MVT::i32;
+      break;
+    case 64:
+    case 128:
+    case 256:
+      LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
+      break;
+    }
+
+    if (LoadVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+      SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+      SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);

-  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
-  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+      // Bitcast to a wide integer type if the loads are vectors.
+      if (LoadVT.isVector()) {
+        EVT CmpVT =
+            EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+        LoadL = DAG.getBitcast(CmpVT, LoadL);
+        LoadR = DAG.getBitcast(CmpVT, LoadR);
+      }

-  // Bitcast to a wide integer type if the loads are vectors.
-  if (LoadVT.isVector()) {
-    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
-    LoadL = DAG.getBitcast(CmpVT, LoadL);
-    LoadR = DAG.getBitcast(CmpVT, LoadR);
+      SDValue Cmp =
+          DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+      processIntegerCallValue(I, Cmp, false);
+      return true;
+    }
   }

-  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
-  processIntegerCallValue(I, Cmp, false);
-  return true;
+  // The size is not constant or it's not efficient to use the strategy above.
+  // If the target has a `bcmp()` function, call it.
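+  // bcmp() takes the same arguments as memcmp(); it returns zero iff the two
+  // regions compare equal, and the sign/magnitude of a non-zero result is
+  // unspecified, which is all we need for an equality-only test.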
+  if (const char* const BcmpFuncName = TLI.getLibcallName(RTLIB::BCMP)) {
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    // signature: bool(const char*, const char*, size_t)
+    Entry.Ty = DL.getIntPtrType(*DAG.getContext());
+    Entry.Node = getValue(LHS);
+    Args.push_back(Entry);
+    Entry.Node = getValue(RHS);
+    Args.push_back(Entry);
+    Entry.Node = getValue(Size);
+    Args.push_back(Entry);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(DAG.getRoot())
+        .setLibCallee(
+            TLI.getLibcallCallingConv(RTLIB::BCMP),
+            Type::getInt1Ty(*DAG.getContext()),
+            DAG.getExternalSymbol(BcmpFuncName, TLI.getPointerTy(DL)),
+            std::move(Args))
+        .setDiscardResult(false);
+
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    processIntegerCallValue(I, CallResult.first, false);
+    PendingLoads.push_back(CallResult.second);
+    return true;
+  }
+
+  // Nothing better, just call memcmp().
+  return false;
 }

 /// See if we can lower a memchr call into an optimized form. If so, return
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -104,6 +104,16 @@
   return true;
 }

+static bool targetHasBcmp(const Triple &TT) {
+  // POSIX removed support for bcmp() in 2001, but the glibc and several other
+  // libc implementations still have it.
+  if (TT.isOSLinux())
+    return TT.isGNUEnvironment() || TT.isMusl();
+  // Both NetBSD and OpenBSD are planning to remove the function. Windows does
+  // not have it.
+  return TT.isOSFreeBSD() || TT.isOSSolaris() || TT.isOSDarwin();
+}
+
 // Although this default value is arbitrary, it is not random. It is assumed
 // that a condition that evaluates the same way by a higher percentage than this
 // is best represented as control flow. Therefore, the default value N should be
@@ -173,6 +183,11 @@
   if (TT.isOSOpenBSD()) {
     setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);
   }
+
+  // Some environments have bcmp(), which can be used for more efficient
+  // equality comparisons.
+  if (targetHasBcmp(TT))
+    setLibcallName(RTLIB::BCMP, "bcmp");
 }

 /// getFPEXT - Return the FPEXT_*_* value for the given types, or
Index: test/CodeGen/X86/memcmp.ll
===================================================================
--- test/CodeGen/X86/memcmp.ll
+++ test/CodeGen/X86/memcmp.ll
@@ -5,6 +5,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 --check-prefix=X64-BCMP

 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
@@ -1344,3 +1345,108 @@
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
   ret i32 %m
 }
+
+define i1 @huge_length_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: huge_length_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $2147483647 # imm = 0x7FFFFFFF
+; X86-NEXT:    pushl $-1
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: huge_length_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX1-LABEL: huge_length_eq:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rax
+; X64-AVX1-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-AVX1-NEXT:    callq memcmp
+; X64-AVX1-NEXT:    testl %eax, %eax
+; X64-AVX1-NEXT:    sete %al
+; X64-AVX1-NEXT:    popq %rcx
+; X64-AVX1-NEXT:    retq
+;
+; X64-BCMP-LABEL: huge_length_eq:
+; X64-BCMP:       # %bb.0:
+; X64-BCMP-NEXT:    pushq %rax
+; X64-BCMP-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-BCMP-NEXT:    callq bcmp
+; X64-BCMP-NEXT:    notb %al
+; X64-BCMP-NEXT:    andb $1, %al
+; X64-BCMP-NEXT:    popq %rcx
+; X64-BCMP-NEXT:    retq
+
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
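+; With a non-constant size the load-and-compare expansion above does not
+; apply: the plain memcmp() call is kept, and the equality-only form may be
+; lowered to a bcmp() call on targets that provide it (x86_64-linux-gnu here).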
+define i32 @nonconst_length(i8* %X, i8* %Y, i64 %size) nounwind {
+; X86-LABEL: nonconst_length:
+; X86:       # %bb.0:
+; X86-NEXT:    jmp memcmp # TAILCALL
+;
+; X64-LABEL: nonconst_length:
+; X64:       # %bb.0:
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(i8* %X, i8* %Y, i64 %size) nounwind {
+; X86-LABEL: nonconst_length_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: nonconst_length_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX1-LABEL: nonconst_length_eq:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rax
+; X64-AVX1-NEXT:    callq memcmp
+; X64-AVX1-NEXT:    testl %eax, %eax
+; X64-AVX1-NEXT:    sete %al
+; X64-AVX1-NEXT:    popq %rcx
+; X64-AVX1-NEXT:    retq
+;
+; X64-BCMP-LABEL: nonconst_length_eq:
+; X64-BCMP:       # %bb.0:
+; X64-BCMP-NEXT:    pushq %rax
+; X64-BCMP-NEXT:    callq bcmp
+; X64-BCMP-NEXT:    notb %al
+; X64-BCMP-NEXT:    andb $1, %al
+; X64-BCMP-NEXT:    popq %rcx
+; X64-BCMP-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}