Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -547,8 +547,13 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
-  /// \brief Enable inline expansion of memcmp
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) const;
+  struct MemCmpExpansionOptions {
+    // The list of available load sizes (in bytes), sorted in decreasing order.
+    SmallVector<unsigned, 8> LoadSizes;
+  };
+  /// \brief Returns options for the inline expansion of memcmp, or nullptr if
+  /// expansion is disabled. IsThreeWay is false for memcmp(p1, p2, s) == 0.
+  const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const;
 
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
@@ -985,7 +990,7 @@
                                   unsigned VF) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
-  virtual bool enableMemCmpExpansion(unsigned &MaxLoadSize) = 0;
+  virtual const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -1235,8 +1240,8 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) override {
-    return Impl.enableMemCmpExpansion(MaxLoadSize);
+  const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const override {
+    return Impl.enableMemCmpExpansion(IsThreeWay);
   }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -292,7 +292,9 @@
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize) { return false; }
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const {
+    return nullptr;
+  }
 
   bool enableInterleavedAccessVectorization() { return false; }
 
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -245,8 +245,9 @@
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
-bool TargetTransformInfo::enableMemCmpExpansion(unsigned &MaxLoadSize) const {
-  return TTIImpl->enableMemCmpExpansion(MaxLoadSize);
+const TargetTransformInfo::MemCmpExpansionOptions *
+TargetTransformInfo::enableMemCmpExpansion(bool IsThreeWay) const {
+  return TTIImpl->enableMemCmpExpansion(IsThreeWay);
 }
 
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
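
A usage sketch of the new interface (hypothetical helper, not part of this patch): a null result means the target does not want memcmp expanded; otherwise LoadSizes is sorted in decreasing order, so front() is the widest available load, which recovers the old MaxLoadSize out-parameter.

#include "llvm/Analysis/TargetTransformInfo.h"
// Hypothetical, for illustration only: how a consumer of the old
// `bool enableMemCmpExpansion(unsigned &MaxLoadSize)` maps onto the new API.
static unsigned getMaxLoadSizeOrZero(const llvm::TargetTransformInfo &TTI,
                                     bool IsThreeWay) {
  const llvm::TargetTransformInfo::MemCmpExpansionOptions *Options =
      TTI.enableMemCmpExpansion(IsThreeWay);
  return Options ? Options->LoadSizes.front() : 0; // 0 means "disabled".
}
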
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -1758,7 +1758,9 @@
   Value *getMemCmpOneBlock();
 
 public:
-  MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize,
+  MemCmpExpansion(CallInst *CI, uint64_t Size,
+                  const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                  bool IsUsedForZeroCmp,
                   unsigned NumLoadsPerBlock, const DataLayout &DL);
 
   unsigned getNumBlocks();
@@ -1778,26 +1780,30 @@
 // 3. ResultBlock, block to branch to for early exit when a
 //    LoadCmpBlock finds a difference.
 MemCmpExpansion::MemCmpExpansion(CallInst *const CI, uint64_t Size,
-                                 const unsigned MaxLoadSize,
+                                 const TargetTransformInfo::MemCmpExpansionOptions &Options,
+                                 const bool IsUsedForZeroCmp,
                                  const unsigned LoadsPerBlock,
                                  const DataLayout &TheDataLayout)
     : CI(CI),
       Size(Size),
-      MaxLoadSize(MaxLoadSize),
       NumLoadsNonOneByte(0),
       NumLoadsPerBlock(LoadsPerBlock),
-      IsUsedForZeroCmp(isOnlyUsedInZeroEqualityComparison(CI)),
+      IsUsedForZeroCmp(IsUsedForZeroCmp),
       DL(TheDataLayout),
       Builder(CI) {
   // Scale the max size down if the target can load more bytes than we need.
-  while (this->MaxLoadSize > Size) {
-    this->MaxLoadSize /= 2;
+  size_t LoadSizeIndex = 0;
+  while (LoadSizeIndex < Options.LoadSizes.size() &&
+         Options.LoadSizes[LoadSizeIndex] > Size) {
+    ++LoadSizeIndex;
   }
+  assert(LoadSizeIndex < Options.LoadSizes.size());
+  MaxLoadSize = Options.LoadSizes[LoadSizeIndex];
   // Compute the decomposition.
-  unsigned LoadSize = this->MaxLoadSize;
   assert(Size > 0 && "zero blocks");
   uint64_t Offset = 0;
-  while (Size) {
+  while (Size && LoadSizeIndex < Options.LoadSizes.size()) {
+    const unsigned LoadSize = Options.LoadSizes[LoadSizeIndex];
     assert(LoadSize > 0 && "zero load size");
     const uint64_t NumLoadsForThisSize = Size / LoadSize;
     if (NumLoadsForThisSize > 0) {
@@ -1810,11 +1816,7 @@
       }
       Size = Size % LoadSize;
     }
-    // FIXME: This can result in a non-native load size (e.g. X86-32+SSE can
-    // load 16 and 4 but not 8), which throws the load count off (e.g. in the
-    // aforementioned case, 16 bytes will count for 2 loads but will generate
-    // 4).
-    LoadSize /= 2;
+    ++LoadSizeIndex;
   }
 }
@@ -2346,12 +2348,13 @@
   const uint64_t SizeVal = SizeCast->getZExtValue();
 
   // TTI call to check if target would like to expand memcmp. Also, get the
-  // max LoadSize.
-  unsigned MaxLoadSize;
-  if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return false;
+  // available load sizes.
+  const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+  const auto *const Options = TTI->enableMemCmpExpansion(!IsUsedForZeroCmp);
+  if (!Options) return false;
 
-  MemCmpExpansion Expansion(CI, SizeVal, MaxLoadSize, MemCmpNumLoadsPerBlock,
-                            *DL);
+  MemCmpExpansion Expansion(CI, SizeVal, *Options, IsUsedForZeroCmp,
+                            MemCmpNumLoadsPerBlock, *DL);
 
   // Don't expand if this will require more loads than desired by the target.
   if (Expansion.getNumLoads() >
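
The constructor above now performs a greedy decomposition over the target-provided, sorted list instead of repeatedly halving a power of two. A standalone sketch of the computation (illustration only, assuming just the sorted-decreasing invariant documented on MemCmpExpansionOptions):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// {LoadSize, Offset} pairs, widest loads first, mirroring LoadSequence.
using LoadSeq = std::vector<std::pair<unsigned, uint64_t>>;

// Greedy decomposition of Size bytes into the available load sizes.
// LoadSizes must be sorted in decreasing order and end with 1 so that
// any remainder can be covered.
LoadSeq decompose(uint64_t Size, const std::vector<unsigned> &LoadSizes) {
  LoadSeq Sequence;
  uint64_t Offset = 0;
  for (unsigned LoadSize : LoadSizes) {
    for (uint64_t I = 0, E = Size / LoadSize; I < E; ++I) {
      Sequence.push_back({LoadSize, Offset});
      Offset += LoadSize;
    }
    Size %= LoadSize;
  }
  assert(Size == 0 && "LoadSizes must end with 1");
  return Sequence;
}

For example, decompose(24, {16, 8, 4, 2, 1}) yields {16,0},{8,16}. The removed FIXME existed because halving from 16 visits 8, which X86-32+SSE cannot load natively; iterating over an explicit per-target list makes that case impossible by construction.
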
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -63,7 +63,7 @@
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize);
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const;
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector) const;
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -226,9 +226,17 @@
   return LoopHasReductions;
 }
 
-bool PPCTTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
-  MaxLoadSize = 8;
-  return true;
+const PPCTTIImpl::TTI::MemCmpExpansionOptions *
+PPCTTIImpl::enableMemCmpExpansion(bool IsThreeWay) const {
+  static const auto Options = []() {
+    TTI::MemCmpExpansionOptions Options;
+    Options.LoadSizes.push_back(8);
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  return &Options;
 }
 
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -127,7 +127,7 @@
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
-  bool enableMemCmpExpansion(unsigned &MaxLoadSize);
+  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(bool IsThreeWay) const;
   bool enableInterleavedAccessVectorization();
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2536,10 +2536,37 @@
   return (CallerBits & CalleeBits) == CalleeBits;
 }
 
-bool X86TTIImpl::enableMemCmpExpansion(unsigned &MaxLoadSize) {
-  // TODO: We can increase these based on available vector ops.
-  MaxLoadSize = ST->is64Bit() ? 8 : 4;
-  return true;
+const X86TTIImpl::TTI::MemCmpExpansionOptions *
+X86TTIImpl::enableMemCmpExpansion(bool IsThreeWay) const {
+  // Only enable vector loads for equality comparison, as we don't have a
+  // vector bswap.
+  static const auto ThreeWayOptions = [this]() {
+    TTI::MemCmpExpansionOptions Options;
+    if (ST->is64Bit()) {
+      Options.LoadSizes.push_back(8);
+    }
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  static const auto EqZeroOptions = [this]() {
+    TTI::MemCmpExpansionOptions Options;
+    if (ST->hasAVX512())
+      Options.LoadSizes.push_back(64);
+    if (ST->hasAVX())
+      Options.LoadSizes.push_back(32);
+    if (ST->hasSSE1())
+      Options.LoadSizes.push_back(16);
+    if (ST->is64Bit()) {
+      Options.LoadSizes.push_back(8);
+    }
+    Options.LoadSizes.push_back(4);
+    Options.LoadSizes.push_back(2);
+    Options.LoadSizes.push_back(1);
+    return Options;
+  }();
+  return IsThreeWay ? &ThreeWayOptions : &EqZeroOptions;
 }
 
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
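
To make the two X86 option sets concrete (illustration only, reusing the decompose() sketch after the CodeGenPrepare hunk above; the expected sequences match the FileCheck tests below):

#include <cstdio>
// Reuses decompose()/LoadSeq from the sketch above.
int main() {
  // EqZero options with SSE on x86-64: one vector + one scalar load for 24B.
  for (auto &P : decompose(24, {16, 8, 4, 2, 1}))   // -> {16,0} {8,16}
    std::printf("{%u,%llu} ", P.first, (unsigned long long)P.second);
  std::printf("\n");
  // Three-way options stop at 8 bytes: an ordered result needs a byte swap
  // of each loaded word, and there is no vector bswap.
  for (auto &P : decompose(16, {8, 4, 2, 1}))       // -> {8,0} {8,8}
    std::printf("{%u,%llu} ", P.first, (unsigned long long)P.second);
  std::printf("\n");
  return 0;
}
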
Index: lib/Transforms/Scalar/MergeICmps.cpp
===================================================================
--- lib/Transforms/Scalar/MergeICmps.cpp
+++ lib/Transforms/Scalar/MergeICmps.cpp
@@ -625,8 +625,7 @@
 
   // We only try merging comparisons if the target wants to expand memcmp later.
   // The rationale is to avoid turning small chains into memcmp calls.
-  unsigned MaxLoadSize;
-  if (!TTI->enableMemCmpExpansion(MaxLoadSize)) return PreservedAnalyses::all();
+  if (!TTI->enableMemCmpExpansion(/*IsThreeWay=*/false)) return PreservedAnalyses::all();
 
   bool MadeChange = false;
Index: test/CodeGen/X86/memcmp-optsize.ll
===================================================================
--- test/CodeGen/X86/memcmp-optsize.ll
+++ test/CodeGen/X86/memcmp-optsize.ll
@@ -598,22 +598,24 @@
 ; X86-SSE2-NEXT: setne %al
 ; X86-SSE2-NEXT: retl
 ;
-; X64-LABEL: length16_eq:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpq 8(%rsi), %rcx
-; X64-NEXT: je .LBB17_3
-; X64-NEXT: .LBB17_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB17_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length16_eq:
+; X64-AVX2: # BB#0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -642,22 +644,23 @@
 ; X86-SSE2-NEXT: sete %al
 ; X86-SSE2-NEXT: retl
 ;
-; X64-LABEL: length16_eq_const:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT: cmpq %rax, (%rdi)
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938
-; X64-NEXT: cmpq %rcx, 8(%rdi)
-; X64-NEXT: je .LBB18_3
-; X64-NEXT: .LBB18_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB18_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length16_eq_const:
+; X64-AVX2: # BB#0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -697,15 +700,44 @@
 ; X86-NEXT: sete %al
 ; X86-NEXT: retl
 ;
-; X64-LABEL: length24_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB20_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movq 16(%rdi), %rcx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
+; X64-SSE2-NEXT: je .LBB20_3
+; X64-SSE2-NEXT: .LBB20_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB20_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length24_eq:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB20_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: movq 16(%rdi), %rcx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
+; X64-AVX2-NEXT: je .LBB20_3
+; X64-AVX2-NEXT: .LBB20_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB20_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -724,16 +756,43 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
 ; X86-NEXT: retl
 ;
-; X64-LABEL: length24_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB21_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-SSE2-NEXT: je .LBB21_3
+; X64-SSE2-NEXT: .LBB21_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB21_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length24_eq_const:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB21_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-AVX2-NEXT: je .LBB21_3
+; X64-AVX2-NEXT: .LBB21_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB21_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -761,26 +820,65 @@
 ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
 define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
-; X86-LABEL: length32_eq:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE: # BB#0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: sete %al
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: length32_eq:
+; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
+; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB23_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB23_3
+; X86-SSE2-NEXT: .LBB23_1: # %res_block
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: incl %eax
+; X86-SSE2-NEXT: .LBB23_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: sete %al
+; X86-SSE2-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB23_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB23_3
+; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB23_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
 ; X64-SSE2-NEXT: sete %al
-; X64-SSE2-NEXT: popq %rcx
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length32_eq:
@@ -798,27 +896,60 @@
 }
 
 define i1 @length32_eq_const(i8* %X) nounwind optsize {
-; X86-LABEL: length32_eq_const:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl $.L.str
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE: # BB#0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl $.L.str
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: setne %al
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: length32_eq_const:
+; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB24_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB24_3
+; X86-SSE2-NEXT: .LBB24_1: # %res_block
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: incl %eax
+; X86-SSE2-NEXT: .LBB24_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: setne %al
+; X86-SSE2-NEXT: retl
 ;
 ; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $.L.str, %esi
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB24_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB24_3
+; X64-SSE2-NEXT: .LBB24_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB24_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
 ; X64-SSE2-NEXT: setne %al
-; X64-SSE2-NEXT: popq %rcx
 ; X64-SSE2-NEXT: retq
 ;
 ; X64-AVX2-LABEL: length32_eq_const:
@@ -867,15 +998,37 @@
 ; X86-NEXT: setne %al
 ; X86-NEXT: retl
 ;
-; X64-LABEL: length64_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length64_eq:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB26_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB26_3
+; X64-AVX2-NEXT: .LBB26_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB26_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -894,16 +1047,38 @@
 ; X86-NEXT: sete %al
 ; X86-NEXT: retl
 ;
-; X64-LABEL: length64_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $.L.str, %esi
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length64_eq_const:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB27_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB27_3
+; X64-AVX2-NEXT: .LBB27_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB27_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
Index: test/CodeGen/X86/memcmp.ll
===================================================================
--- test/CodeGen/X86/memcmp.ll
+++ test/CodeGen/X86/memcmp.ll
@@ -23,7 +23,7 @@
 ; X86-NEXT: movzwl %cx, %eax
 ; X86-NEXT: movzwl %dx, %ecx
 ; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length2:
 ; X64: # BB#0:
@@ -34,7 +34,7 @@
 ; X64-NEXT: movzwl %ax, %eax
 ; X64-NEXT: movzwl %cx, %ecx
 ; X64-NEXT: subl %ecx, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
   ret i32 %m
 }
@@ -47,14 +47,14 @@
 ; X86-NEXT: movzwl (%ecx), %ecx
 ; X86-NEXT: cmpw (%eax), %cx
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length2_eq:
 ; X64: # BB#0:
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: cmpw (%rsi), %ax
 ; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -67,14 +67,14 @@
 ; X86-NEXT: movzwl (%eax), %eax
 ; X86-NEXT: cmpl $12849, %eax # imm = 0x3231
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length2_eq_const:
 ; X64: # BB#0:
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: cmpl $12849, %eax # imm = 0x3231
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -91,7 +91,7 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length2_eq_nobuiltin_attr:
 ; X64: # BB#0:
@@ -101,7 +101,7 @@
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: sete %al
 ; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -124,13 +124,13 @@
 ; X86-NEXT: movzbl 2(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ; X86-NEXT: .LBB4_1: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length3:
 ; X64: # BB#0: # %loadbb
@@ -144,12 +144,12 @@
 ; X64-NEXT: movzbl 2(%rdi), %eax
 ; X64-NEXT: movzbl 2(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
 ; X64-NEXT: .LBB4_1: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
   ret i32 %m
 }
@@ -172,7 +172,7 @@
 ; X86-NEXT: .LBB5_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length3_eq:
 ; X64: # BB#0: # %loadbb
@@ -189,7 +189,7 @@
 ; X64-NEXT: .LBB5_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -208,7 +208,7 @@
 ; X86-NEXT: cmpl %edx, %ecx
 ; X86-NEXT: seta %al
 ; X86-NEXT: sbbl $0, %eax
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length4:
 ; X64: # BB#0:
@@ -220,7 +220,7 @@
 ; X64-NEXT: cmpl %edx, %ecx
 ; X64-NEXT: seta %al
 ; X64-NEXT: sbbl $0, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   ret i32 %m
 }
@@ -233,14 +233,14 @@
 ; X86-NEXT: movl (%ecx), %ecx
 ; X86-NEXT: cmpl (%eax), %ecx
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length4_eq:
 ; X64: # BB#0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: cmpl (%rsi), %eax
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -252,13 +252,13 @@
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length4_eq_const:
 ; X64: # BB#0:
 ; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231
 ; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -281,13 +281,13 @@
 ; X86-NEXT: movzbl 4(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ; X86-NEXT: .LBB9_1: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length5:
 ; X64: # BB#0: # %loadbb
@@ -301,12 +301,12 @@
 ; X64-NEXT: movzbl 4(%rdi), %eax
 ; X64-NEXT: movzbl 4(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
 ; X64-NEXT: .LBB9_1: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
   ret i32 %m
 }
@@ -329,7 +329,7 @@
 ; X86-NEXT: .LBB10_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length5_eq:
 ; X64: # BB#0: # %loadbb
@@ -346,7 +346,7 @@
 ; X64-NEXT: .LBB10_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -374,14 +374,14 @@
 ; X86-NEXT: jne .LBB11_1
 ; X86-NEXT: # BB#3: # %endblock
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ; X86-NEXT: .LBB11_1: # %res_block
 ; X86-NEXT: xorl %eax, %eax
 ; X86-NEXT: cmpl %edx, %ecx
 ; X86-NEXT: setae %al
 ; X86-NEXT: leal -1(%eax,%eax), %eax
 ; X86-NEXT: popl %esi
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length8:
 ; X64: # BB#0:
@@ -393,7 +393,7 @@
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: seta %al
 ; X64-NEXT: sbbl $0, %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
   ret i32 %m
 }
@@ -416,14 +416,14 @@
 ; X86-NEXT: .LBB12_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length8_eq:
 ; X64: # BB#0:
 ; X64-NEXT: movq (%rdi), %rax
 ; X64-NEXT: cmpq (%rsi), %rax
 ; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -444,14 +444,14 @@
 ; X86-NEXT: .LBB13_3: # %endblock
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length8_eq_const:
 ; X64: # BB#0:
 ; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
 ; X64-NEXT: cmpq %rax, (%rdi)
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -468,7 +468,7 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length12_eq:
 ; X64: # BB#0: # %loadbb
@@ -485,7 +485,7 @@
 ; X64-NEXT: .LBB14_3: # %endblock
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -500,7 +500,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length12:
 ; X64: # BB#0: # %loadbb
@@ -519,13 +519,13 @@
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: jne .LBB15_1
 ; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
 ; X64-NEXT: .LBB15_1: # %res_block
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: setae %al
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
   ret i32 %m
 }
@@ -541,7 +541,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length16:
 ; X64: # BB#0: # %loadbb
@@ -560,13 +560,13 @@
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: jne .LBB16_1
 ; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
 ; X64-NEXT: .LBB16_1: # %res_block
 ; X64-NEXT: xorl %eax, %eax
 ; X64-NEXT: cmpq %rdx, %rcx
 ; X64-NEXT: setae %al
 ; X64-NEXT: leal -1(%rax,%rax), %eax
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
   ret i32 %m
 }
@@ -582,7 +582,7 @@
 ; X86-NOSSE-NEXT: addl $16, %esp
 ; X86-NOSSE-NEXT: testl %eax, %eax
 ; X86-NOSSE-NEXT: setne %al
-; X86-NOSSE-NEXT: retl
+; X86-NOSSE-NEXT: ret{{[l|q]}}
 ;
 ; X86-SSE2-LABEL: length16_eq:
 ; X86-SSE2: # BB#0:
@@ -594,24 +594,26 @@
 ; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT: setne %al
-; X86-SSE2-NEXT: retl
+; X86-SSE2-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length16_eq:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpq 8(%rsi), %rcx
-; X64-NEXT: je .LBB17_3
-; X64-NEXT: .LBB17_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB17_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length16_eq:
+; X64-AVX2: # BB#0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -628,7 +630,7 @@
 ; X86-NOSSE-NEXT: addl $16, %esp
 ; X86-NOSSE-NEXT: testl %eax, %eax
 ; X86-NOSSE-NEXT: sete %al
-; X86-NOSSE-NEXT: retl
+; X86-NOSSE-NEXT: ret{{[l|q]}}
 ;
 ; X86-SSE2-LABEL: length16_eq_const:
 ; X86-SSE2: # BB#0:
@@ -638,24 +640,25 @@
 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
 ; X86-SSE2-NEXT: sete %al
-; X86-SSE2-NEXT: retl
+; X86-SSE2-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length16_eq_const:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT: cmpq %rax, (%rdi)
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938
-; X64-NEXT: cmpq %rcx, 8(%rdi)
-; X64-NEXT: je .LBB18_3
-; X64-NEXT: .LBB18_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB18_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length16_eq_const:
+; X64-AVX2: # BB#0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
@@ -672,7 +675,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length24:
 ; X64: # BB#0:
@@ -693,17 +696,46 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length24_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB20_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movq 16(%rdi), %rcx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
+; X64-SSE2-NEXT: je .LBB20_3
+; X64-SSE2-NEXT: .LBB20_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB20_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length24_eq:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB20_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: movq 16(%rdi), %rcx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
+; X64-AVX2-NEXT: je .LBB20_3
+; X64-AVX2-NEXT: .LBB20_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB20_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
@@ -720,18 +752,45 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length24_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB21_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-SSE2-NEXT: je .LBB21_3
+; X64-SSE2-NEXT: .LBB21_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB21_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length24_eq_const:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB21_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-AVX2-NEXT: je .LBB21_3
+; X64-AVX2-NEXT: .LBB21_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB21_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -746,7 +805,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length32:
 ; X64: # BB#0:
@@ -759,27 +818,65 @@
 ; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
 define i1 @length32_eq(i8* %x, i8* %y) nounwind {
-; X86-LABEL: length32_eq:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE: # BB#0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: sete %al
+; X86-NOSSE-NEXT: ret{{[l|q]}}
+;
+; X86-SSE2-LABEL: length32_eq:
+; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
+; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB23_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB23_3
+; X86-SSE2-NEXT: .LBB23_1: # %res_block
+; X86-SSE2-NEXT: movl $1, %eax
+; X86-SSE2-NEXT: .LBB23_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: sete %al
+; X86-SSE2-NEXT: ret{{[l|q]}}
 ;
 ; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB23_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB23_3
+; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB23_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
 ; X64-SSE2-NEXT: sete %al
-; X64-SSE2-NEXT: popq %rcx
-; X64-SSE2-NEXT: retq
+; X64-SSE2-NEXT: ret{{[l|q]}}
 ;
 ; X64-AVX2-LABEL: length32_eq:
@@ -789,35 +886,67 @@
 ; X64-AVX2-NEXT: cmpl $-1, %eax
 ; X64-AVX2-NEXT: sete %al
 ; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
 }
 
 define i1 @length32_eq_const(i8* %X) nounwind {
-; X86-LABEL: length32_eq_const:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl $.L.str
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE: # BB#0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl $.L.str
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: setne %al
+; X86-NOSSE-NEXT: ret{{[l|q]}}
+;
+; X86-SSE2-LABEL: length32_eq_const:
+; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB24_1
+; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB24_3
+; X86-SSE2-NEXT: .LBB24_1: # %res_block
+; X86-SSE2-NEXT: movl $1, %eax
+; X86-SSE2-NEXT: .LBB24_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: setne %al
+; X86-SSE2-NEXT: ret{{[l|q]}}
 ;
 ; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $.L.str, %esi
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB24_1
+; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB24_3
+; X64-SSE2-NEXT: .LBB24_1: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB24_3: # %endblock
 ; X64-SSE2-NEXT: testl %eax, %eax
 ; X64-SSE2-NEXT: setne %al
-; X64-SSE2-NEXT: popq %rcx
-; X64-SSE2-NEXT: retq
+; X64-SSE2-NEXT: ret{{[l|q]}}
 ;
 ; X64-AVX2-LABEL: length32_eq_const:
@@ -827,7 +956,7 @@
 ; X64-AVX2-NEXT: cmpl $-1, %eax
 ; X64-AVX2-NEXT: setne %al
 ; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
   %c = icmp ne i32 %m, 0
   ret i1 %c
@@ -842,7 +971,7 @@
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: calll memcmp
 ; X86-NEXT: addl $16, %esp
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
 ; X64-LABEL: length64:
 ; X64: # BB#0:
@@ -863,17 +992,39 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length64_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length64_eq:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB26_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB26_3
+; X64-AVX2-NEXT: .LBB26_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB26_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
   %cmp = icmp ne i32 %call, 0
   ret i1 %cmp
@@ -890,18 +1041,40 @@
 ; X86-NEXT: addl $16, %esp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NEXT: ret{{[l|q]}}
 ;
-; X64-LABEL: length64_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2: # BB#0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $.L.str, %esi
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: ret{{[l|q]}}
+;
+; X64-AVX2-LABEL: length64_eq_const:
+; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB27_1
+; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB27_3
+; X64-AVX2-NEXT: .LBB27_1: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB27_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: ret{{[l|q]}}
   %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
   %c = icmp eq i32 %m, 0
   ret i1 %c
Index: test/Transforms/CodeGenPrepare/X86/memcmp.ll
===================================================================
--- test/Transforms/CodeGenPrepare/X86/memcmp.ll
+++ test/Transforms/CodeGenPrepare/X86/memcmp.ll
@@ -753,27 +753,13 @@
 ; X32-NEXT: ret i32 [[CONV]]
 ;
 ; X64-LABEL: @cmp_eq16(
-; X64-NEXT: loadbb:
-; X64-NEXT: [[TMP0:%.*]] = bitcast i8* [[X:%.*]] to i64*
-; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[Y:%.*]] to i64*
-; X64-NEXT: [[TMP2:%.*]] = load i64, i64* [[TMP0]]
-; X64-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]]
-; X64-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2]], [[TMP3]]
-; X64-NEXT: br i1 [[TMP4]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
-; X64: res_block:
-; X64-NEXT: br label [[ENDBLOCK:%.*]]
-; X64: loadbb1:
-; X64-NEXT: [[TMP5:%.*]] = bitcast i8* [[X]] to i64*
-; X64-NEXT: [[TMP6:%.*]] = bitcast i8* [[Y]] to i64*
-; X64-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[TMP5]], i64 1
-; X64-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[TMP6]], i64 1
-; X64-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP7]]
-; X64-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
-; X64-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP9]], [[TMP10]]
-; X64-NEXT: br i1 [[TMP11]], label [[RES_BLOCK]], label [[ENDBLOCK]]
-; X64: endblock:
-; X64-NEXT: [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
-; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; X64-NEXT: [[TMP1:%.*]] = bitcast i8* [[X:%.*]] to i128*
+; X64-NEXT: [[TMP2:%.*]] = bitcast i8* [[Y:%.*]] to i128*
+; X64-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]]
+; X64-NEXT: [[TMP4:%.*]] = load i128, i128* [[TMP2]]
+; X64-NEXT: [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]]
+; X64-NEXT: [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
 ; X64-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT: ret i32 [[CONV]]
 ;