Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -437,6 +437,12 @@
     return false;
   }
 
+  /// Return true if the target has a quick way to compare values of the given
+  /// type. By default, assume that any legal type can be compared efficiently.
+  virtual bool hasFastEqualityCompare(EVT VT) const {
+    return isTypeLegal(VT);
+  }
+
   /// Return true if the target should transform:
   /// (X & Y) == Y ---> (~X & Y) == 0
   /// (X & Y) != Y ---> (~X & Y) != 0
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5955,13 +5955,17 @@
 }
 
 static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
-                             Type *LoadTy,
                              SelectionDAGBuilder &Builder) {
 
   // Check to see if this load can be trivially constant folded, e.g. if the
   // input is from a string literal.
   if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
     // Cast pointer to the type we really want to load.
+    Type *LoadTy =
+        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+    if (LoadVT.isVector())
+      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
     LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
                                          PointerType::getUnqual(LoadTy));
@@ -6039,56 +6043,75 @@
   if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I))
     return false;
 
-  MVT LoadVT;
-  Type *LoadTy;
+  // Require that the load VT is legal and that the target supports unaligned
+  // loads of that type. If the load VT is good, check that a scalar compare of
+  // the load size is fast and return that type. Otherwise, return INVALID.
+  auto hasFastLoadsAndCompare = [&](MVT LoadVT) {
+    // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+    // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+    // TODO: Check alignment of src and dest ptrs.
+
+    unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+    unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (!TLI.isTypeLegal(LoadVT) ||
+        !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
+        !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
+      return MVT::INVALID_SIMPLE_VALUE_TYPE;
+
+    // For a vector type, we need to do a scalar comparison of the whole vector.
+    MVT CmpVT = LoadVT.isVector() ? LoadVT.getIntegerVT(LoadVT.getSizeInBits())
+                                  : LoadVT;
+    if (!TLI.hasFastEqualityCompare(CmpVT))
+      return MVT::INVALID_SIMPLE_VALUE_TYPE;
+
+    return CmpVT.SimpleTy;
+  };
+
+  // This turns into unaligned loads. We only do this if the target natively
+  // supports the MVT we'll be loading or if it is small enough (<= 4) that
+  // we'll only produce a small number of byte loads.
+  MVT LoadVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+  MVT CmpVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
   switch (CSize->getZExtValue()) {
   default:
-    return false;
+    break;
   case 2:
-    LoadVT = MVT::i16;
-    LoadTy = Type::getInt16Ty(CSize->getContext());
+    LoadVT = CmpVT = MVT::i16;
     break;
   case 4:
-    LoadVT = MVT::i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
+    LoadVT = CmpVT = MVT::i32;
     break;
   case 8:
     LoadVT = MVT::i64;
-    LoadTy = Type::getInt64Ty(CSize->getContext());
+    CmpVT = hasFastLoadsAndCompare(LoadVT);
     break;
-  /*
   case 16:
-    LoadVT = MVT::v4i32;
-    LoadTy = Type::getInt32Ty(CSize->getContext());
-    LoadTy = VectorType::get(LoadTy, 4);
+    // Find a 16-byte load type that is fast for this target.
+    for (MVT VT16Bytes :
+         {MVT::i128, MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+      CmpVT = hasFastLoadsAndCompare(VT16Bytes);
+      if (CmpVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+        LoadVT = VT16Bytes;
+        break;
+      }
+    }
     break;
-  */
   }
 
-  // This turns into unaligned loads. We only do this if the target natively
-  // supports the MVT we'll be loading or if it is small enough (<= 4) that
-  // we'll only produce a small number of byte loads.
+  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE ||
+      CmpVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
 
-  // Require that we can find a legal MVT, and only do this if the target
-  // supports unaligned loads of that type. Expanding into byte loads would
-  // bloat the code.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (CSize->getZExtValue() > 4) {
-    unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-    unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
-    // TODO: Handle 5 byte compare as 4-byte + 1 byte.
-    // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-    // TODO: Check alignment of src and dest ptrs.
-    if (!TLI.isTypeLegal(LoadVT) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
-        !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
-      return false;
-  }
+  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+  // Bitcast to integer type if the loads are vectors.
+  LoadL = DAG.getBitcast(CmpVT, LoadL);
+  LoadR = DAG.getBitcast(CmpVT, LoadR);
 
-  SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
-  SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
   SDValue SetCC =
-      DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal, ISD::SETNE);
+      DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
   processIntegerCallValue(I, SetCC, false);
   return true;
 }
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -815,6 +815,9 @@
 
     bool hasAndNotCompare(SDValue Y) const override;
 
+    /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+    bool hasFastEqualityCompare(EVT VT) const override;
+
    /// Return the value type to use for ISD::SETCC.
    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                           EVT VT) const override;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4637,6 +4637,12 @@
   return true;
 }
 
+bool X86TargetLowering::hasFastEqualityCompare(EVT VT) const {
+  // TODO: 256- and 512-bit types should be allowed, but make sure that those
+  // cases are handled in combineVectorSizedSetCCEquality().
+  return isTypeLegal(VT) || (Subtarget.hasSSE2() && VT == MVT::i128);
+}
+
 /// Val is the undef sentinel value or equal to the specified value.
 static bool isUndefOrEqual(int Val, int CmpVal) {
   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
Index: test/CodeGen/X86/memcmp.ll
===================================================================
--- test/CodeGen/X86/memcmp.ll
+++ test/CodeGen/X86/memcmp.ll
@@ -97,12 +97,12 @@
 define i1 @length16(i8* %x, i8* %y) nounwind {
 ; CHECK-LABEL: length16:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $16, %edx
-; CHECK-NEXT:    callq memcmp
-; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    movdqu (%rsi), %xmm0
+; CHECK-NEXT:    movdqu (%rdi), %xmm1
+; CHECK-NEXT:    pcmpeqb %xmm0, %xmm1
+; CHECK-NEXT:    pmovmskb %xmm1, %eax
+; CHECK-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; CHECK-NEXT:    setne %al
-; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
@@ -112,13 +112,11 @@
 define i1 @length16_const(i8* %X, i32* nocapture %P) nounwind {
 ; CHECK-LABEL: length16_const:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $.L.str, %esi
-; CHECK-NEXT:    movl $16, %edx
-; CHECK-NEXT:    callq memcmp
-; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    movdqu (%rdi), %xmm0
+; CHECK-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pmovmskb %xmm0, %eax
+; CHECK-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
 ; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0