Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -407,6 +407,9 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
+  /// \brief Enable inline expansion of memcmp
+  bool expandMemCmp() const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -732,6 +735,7 @@
   virtual bool shouldBuildLookupTables() = 0;
   virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
+  virtual bool expandMemCmp() = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -922,6 +926,9 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
+  bool expandMemCmp() override {
+    return Impl.expandMemCmp();
+  }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -252,6 +252,8 @@
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
+  bool expandMemCmp() { return false; }
+
   bool enableInterleavedAccessVectorization() { return false; }
 
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -992,6 +992,16 @@
   unsigned getMaxStoresPerMemcpy(bool OptSize) const {
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
+
+  /// \brief Get maximum # of load operations permitted for memcmp
+  ///
+  /// This function returns the maximum number of load operations permitted
+  /// to replace a call to memcmp. The value is set by the target at the
+  /// performance threshold for such a replacement. If OptSize is true,
+  /// return the limit for functions that have OptSize attribute.
+  unsigned getMaxLoadsPerMemcmp(bool OptSize) const {
+    return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
+  }
 
   /// \brief Get maximum # of store operations permitted for llvm.memmove
   ///
@@ -2164,6 +2174,8 @@
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
+  unsigned MaxLoadsPerMemcmp;
+  unsigned MaxLoadsPerMemcmpOptSize;
 
   /// \brief Specify maximum bytes of store instructions per memmove call.
 ///
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -186,6 +186,10 @@
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
+bool TargetTransformInfo::expandMemCmp() const {
+  return TTIImpl->expandMemCmp();
+}
+
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -80,6 +80,12 @@
 STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpNot8ByteMultiples, "Number of memcmp calls with size not a multiple of 8");
+STATISTIC(NumMemCmpGreaterThanMax, "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -1870,9 +1876,172 @@
   return true;
 }
 
+/// This function checks whether an inline expansion of memcmp can be
+/// generated. It requires a constant compare size that is a non-zero
+/// multiple of 8 bytes and no larger than the target's limit. If the
+/// expansion cannot be done, it returns false and the call is left as a
+/// library call. Otherwise, the library call is replaced with a new
+/// sequence of IR instructions.
+///
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 16)
+///
+/// into:
+/// loadb:                                        ; preds = %entry
+///   %2 = bitcast i8* %0 to i64*
+///   %3 = bitcast i8* %1 to i64*
+///   %4 = load i64, i64* %2
+///   %5 = load i64, i64* %3
+///   %6 = xor i64 %4, %5
+///   %7 = icmp ne i64 %6, 0
+///   br i1 %7, label %endblock, label %loadb1
+/// loadb1:                                       ; preds = %loadb
+///   %8 = getelementptr i64, i64* %2, i64 1
+///   %9 = getelementptr i64, i64* %3, i64 1
+///   %10 = load i64, i64* %8
+///   %11 = load i64, i64* %9
+///   %12 = xor i64 %10, %11
+///   br label %endblock
+/// endblock:                                     ; preds = %loadb1, %loadb
+///   %res.phi = phi i64 [ %6, %loadb ], [ %12, %loadb1 ]
+///   %res.phi2 = phi i64 [ %4, %loadb ], [ %10, %loadb1 ]
+///   %res.phi3 = phi i64 [ %5, %loadb ], [ %11, %loadb1 ]
+///   %13 = call i64 @llvm.cttz.i64(i64 %res.phi, i1 false)
+///   %byte_idx = and i64 %13, -8
+///   %src1_sh = ashr i64 %res.phi2, %byte_idx
+///   %src2_sh = ashr i64 %res.phi3, %byte_idx
+///   %14 = and i64 %src1_sh, 255
+///   %15 = and i64 %src2_sh, 255
+///   %16 = sub i64 %14, %15
+///   %17 = trunc i64 %16 to i32
+///   ret i32 %17
+static bool memcmpExpansion(CallInst *CI, const TargetLowering *TLI,
+                            const DataLayout *DL) {
+  // The expansion below only handles 64-bit little-endian targets; the
+  // result calculation relies on the first differing byte being the least
+  // significant differing byte of an 8 byte load.
+  if (!DL->isLittleEndian() || DL->getPointerSizeInBits() != 64) {
+    return false;
+  }
+
+  NumMemCmpCalls++;
+  LLVMContext &C = CI->getContext();
+  IRBuilder<> Builder(C);
+  Type *Int64Ty = Type::getInt64Ty(C);
+  Type *Int64PtrTy = Type::getInt64PtrTy(C);
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+  Value *Size = CI->getArgOperand(2);
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(Size);
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+
+  uint64_t SizeVal = SizeCast->getZExtValue();
+  if (SizeVal == 0 || SizeVal % 8 != 0) {
+    NumMemCmpNot8ByteMultiples++;
+    return false;
+  }
+
+  if (SizeVal > TLI->getMaxLoadsPerMemcmp(0)) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+  unsigned Count = SizeVal / 8;
+
+  std::vector<BasicBlock *> BBList;
+  std::vector<Value *> XorList;
+  std::vector<Value *> Src1List;
+  std::vector<Value *> Src2List;
+
+  BasicBlock *StartBlock = CI->getParent();
+  BasicBlock *EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+  BasicBlock *LoadBlockTemp;
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Create the number of load/compare basic blocks we need for this size.
+  unsigned i;
+  for (i = 0; i < Count; i++) {
+    LoadBlockTemp =
+        BasicBlock::Create(C, "loadb", StartBlock->getParent(), EndBlock);
+    BBList.push_back(LoadBlockTemp);
+  }
+
+  // Remove the previous terminator and add a branch to the first
+  // load/compare block.
+  StartBlock->getTerminator()->eraseFromParent();
+  BranchInst *NewBr = BranchInst::Create(BBList[0]);
+  Builder.SetInsertPoint(StartBlock, StartBlock->end());
+  Builder.Insert(NewBr);
+
+  Value *Source1Cast, *Source2Cast, *LoadSrc1, *LoadSrc2, *Xor;
+  // Generate load, xor, compare and branch instructions for each block.
+  for (i = 0; i < Count; i++) {
+    Builder.SetInsertPoint(BBList[i], BBList[i]->getFirstInsertionPt());
+    // Cast the source pointers to i64* for generating 8 byte loads.
+    if (i == 0) {
+      Source1Cast = Builder.CreateBitCast(Source1, Int64PtrTy);
+      Source2Cast = Builder.CreateBitCast(Source2, Int64PtrTy);
+    } else {
+      // Increment the base pointers for each source.
+      Source1Cast =
+          Builder.CreateGEP(Int64Ty, Source1Cast, ConstantInt::get(Int64Ty, 1));
+      Source2Cast =
+          Builder.CreateGEP(Int64Ty, Source2Cast, ConstantInt::get(Int64Ty, 1));
+    }
+    LoadSrc1 = Builder.CreateLoad(Int64Ty, Source1Cast);
+    LoadSrc2 = Builder.CreateLoad(Int64Ty, Source2Cast);
+    Src1List.push_back(LoadSrc1);
+    Src2List.push_back(LoadSrc2);
+    Xor = Builder.CreateXor(LoadSrc1, LoadSrc2);
+    XorList.push_back(Xor);
+    // All but the last block take an early exit to endblock when a
+    // difference is found; the last block falls through unconditionally.
+    if (i == Count - 1) {
+      BranchInst *NewBr = BranchInst::Create(EndBlock);
+      Builder.Insert(NewBr);
+    } else {
+      Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Xor,
+                                      ConstantInt::get(Xor->getType(), 0));
+      Builder.CreateCondBr(Cmp, EndBlock, BBList[i + 1]);
+    }
+  }
+
+  // PHI nodes collect, from whichever block branched to endblock, the xor
+  // result and the two loaded words that produced it.
+  Builder.SetInsertPoint(&EndBlock->front());
+  PHINode *PhiXor = Builder.CreatePHI(Int64Ty, Count, "res.phi");
+  for (i = 0; i < Count; i++) {
+    PhiXor->addIncoming(XorList[i], BBList[i]);
+  }
+
+  PHINode *PhiSrc1 = Builder.CreatePHI(Int64Ty, Count, "res.phi");
+  for (i = 0; i < Count; i++) {
+    PhiSrc1->addIncoming(Src1List[i], BBList[i]);
+  }
+
+  PHINode *PhiSrc2 = Builder.CreatePHI(Int64Ty, Count, "res.phi");
+  for (i = 0; i < Count; i++) {
+    PhiSrc2->addIncoming(Src2List[i], BBList[i]);
+  }
+
+  // Add instructions to EndBlock for calculating the memcmp return value.
+  Function *F = EndBlock->getParent();
+  Function *CTTZ =
+      Intrinsic::getDeclaration(F->getParent(), Intrinsic::cttz, Int64Ty);
+  Value *CntZeros = Builder.CreateCall(CTTZ, {PhiXor, Builder.getFalse()});
+  // cttz gives the position of the lowest differing bit; round it down to a
+  // byte boundary so the shifts below isolate the first differing byte of
+  // each source rather than an unaligned 8-bit window.
+  Value *ByteIdx =
+      Builder.CreateAnd(CntZeros, ConstantInt::getSigned(Int64Ty, -8),
+                        "byte_idx");
+  Value *Shift1 = Builder.CreateAShr(PhiSrc1, ByteIdx, "src1_sh");
+  Value *Shift2 = Builder.CreateAShr(PhiSrc2, ByteIdx, "src2_sh");
+  Value *And1 = Builder.CreateAnd(Shift1, ConstantInt::get(Int64Ty, 0xFF));
+  Value *And2 = Builder.CreateAnd(Shift2, ConstantInt::get(Int64Ty, 0xFF));
+  Value *Subtract = Builder.CreateSub(And1, And2);
+  Value *Res = Builder.CreateSExtOrTrunc(Subtract, Builder.getInt32Ty());
+
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
-
   // Lower inline assembly if we can.
   // If we found an inline asm expession, and if the target knows how to
   // lower it to normal LLVM code, do so now.
@@ -2054,6 +2223,27 @@
     CI->eraseFromParent();
     return true;
   }
+
+  Function *F = CI->getCalledFunction();
+  LibFunc::Func Func;
+  bool Expanded = false;
+  if (F && !CI->isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() &&
+      TLInfo->getLibFunc(F->getName(), Func) &&
+      TLInfo->hasOptimizedCodeGen(Func)) {
+    switch (Func) {
+    default:
+      break;
+    case LibFunc::memcmp:
+      if (TTI->expandMemCmp()) {
+        Expanded = memcmpExpansion(CI, TLI, DL);
+        if (Expanded) {
+          ModifiedDT = true;
+          return true;
+        }
+      }
+      break;
+    }
+  }
 
   return false;
 }
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -829,9 +829,9 @@
   initActions();
 
   // Perform these initializations only once.
-  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
+  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = MaxLoadsPerMemcmp = 8;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
-    = MaxStoresPerMemmoveOptSize = 4;
+    = MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
   UseUnderscoreSetJmp = false;
   UseUnderscoreLongJmp = false;
   HasMultipleConditionRegisters = false;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -971,6 +971,10 @@
     MaxStoresPerMemset = 128;
     MaxStoresPerMemcpy = 128;
     MaxStoresPerMemmove = 128;
+    MaxLoadsPerMemcmp = 128;
+  } else {
+    MaxLoadsPerMemcmp = 64;
+    MaxLoadsPerMemcmpOptSize = 8;
   }
 }
 
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -60,6 +60,7 @@
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool expandMemCmp();
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -215,6 +215,10 @@
   return LoopHasReductions;
 }
 
+bool PPCTTIImpl::expandMemCmp() {
+  return true;
+}
+
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
   return true;
 }
Index: test/CodeGen/PowerPC/memcmp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/memcmp.ll
@@ -0,0 +1,27 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define signext i32 @foo(double* %x, double* %y) #0 {
+entry:
+  %0 = bitcast double* %x to i8*
+  %1 = bitcast double* %y to i8*
+  %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 16) #2
+  ret i32 %call
+
+; CHECK-LABEL: @foo
+; CHECK: ld
+; CHECK-NEXT: ld
+; CHECK-NEXT: xor.
+; CHECK-NEXT: bne
+; CHECK: popcntd
+; CHECK: srad
+; CHECK-NEXT: srad
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: sub
+}
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8*, i8*, i64) #1
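
Note (not part of the patch): the endblock arithmetic is easiest to check against a small host-side model. The sketch below mirrors what the emitted IR computes for one 8-byte chunk, assuming a 64-bit little-endian host and compare sizes that are multiples of 8; the names cmpWord and cmpModel are illustrative only, and the GCC/Clang builtin __builtin_ctzll stands in for @llvm.cttz.i64.

  #include <cstdint>
  #include <cstring>

  // Model of one load/compare block plus the endblock result calculation.
  static int cmpWord(uint64_t A, uint64_t B) {
    uint64_t Xor = A ^ B;                     // %6 = xor i64 %4, %5
    if (Xor == 0)
      return 0;
    unsigned BitIdx = __builtin_ctzll(Xor);   // @llvm.cttz.i64
    unsigned ByteIdx = BitIdx & ~7u;          // round down to a byte boundary
    int Byte1 = (int)((A >> ByteIdx) & 0xFF); // first differing byte of source 1
    int Byte2 = (int)((B >> ByteIdx) & 0xFF); // first differing byte of source 2
    return Byte1 - Byte2;                     // same sign as memcmp on this chunk
  }

  // Mirrors the chain of loadb blocks: compare 8 bytes at a time and take the
  // early exit to endblock on the first difference.
  static int cmpModel(const unsigned char *S1, const unsigned char *S2,
                      size_t Size) {
    for (size_t I = 0; I + 8 <= Size; I += 8) {
      uint64_t A, B;
      std::memcpy(&A, S1 + I, 8);
      std::memcpy(&B, S2 + I, 8);
      if (int Res = cmpWord(A, B))
        return Res;
    }
    return 0;
  }

Rounding the cttz result down to a byte boundary is what keeps the sign of the final subtraction in line with memcmp when the two words differ in more than one byte; on a little-endian target the lowest set bit of the xor always falls inside the lowest-addressed differing byte.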