Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -419,6 +419,10 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
+  /// \brief Enable inline expansion of memcmp.
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize, bool &ByteSwapLoads,
+                    bool &AllowUnalignedLoads) const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -753,6 +757,8 @@
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                     unsigned VF) = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
+  virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize,
+                            bool &ByteSwapLoads, bool &AllowUnalignedLoads) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -954,6 +960,11 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize, bool &ByteSwapLoads,
+                    bool &AllowUnalignedLoads) override {
+    return Impl.expandMemCmp(I, MaxLoadSize, ByteSwapLoads,
+                             AllowUnalignedLoads);
+  }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -260,6 +260,11 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize, bool &ByteSwapLoads,
+                    bool &AllowUnalignedLoads) {
+    return false;
+  }
+
   bool enableInterleavedAccessVectorization() { return false; }
 
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1006,6 +1006,16 @@
   unsigned getMaxStoresPerMemcpy(bool OptSize) const {
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
+
+  /// \brief Get the maximum size in bytes to load for memcmp.
+  ///
+  /// This function returns the maximum size in bytes to load when
+  /// expanding memcmp. The value is set by the target at the
+  /// performance threshold for such a replacement. If OptSize is true,
+  /// return the limit for functions that have the OptSize attribute.
+  unsigned getMaxLoadSizeMemcmp(bool OptSize) const {
+    return OptSize ? MaxLoadSizeMemcmpOptSize : MaxLoadSizeMemcmp;
+  }
 
   /// \brief Get maximum # of store operations permitted for llvm.memmove
   ///
@@ -2177,6 +2187,8 @@
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
+  unsigned MaxLoadSizeMemcmp;
+  unsigned MaxLoadSizeMemcmpOptSize;
 
   /// \brief Specify maximum bytes of store instructions per memmove call.
   ///
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -197,6 +197,13 @@
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
+bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize,
+                                       bool &ByteSwapLoads,
+                                       bool &AllowUnalignedLoads) const {
+  return TTIImpl->expandMemCmp(I, MaxLoadSize, ByteSwapLoads,
+                               AllowUnalignedLoads);
+}
+
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -80,6 +80,11 @@
 STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax, "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -1869,6 +1874,337 @@
   ModifiedDT = true;
   return true;
 }
+
+// Populates the EndBlock with a sequence to calculate the memcmp result.
+Value *getResult(LLVMContext &C, PHINode *PhiXor, PHINode *PhiSrc1,
+                 PHINode *PhiSrc2, Type *LoadType, BasicBlock *EndBlock) {
+  IRBuilder<> Builder(C);
+  BasicBlock::iterator InsertPt = EndBlock->getFirstInsertionPt();
+  Builder.SetInsertPoint(EndBlock, InsertPt);
+  Function *F = EndBlock->getParent();
+  Function *CTTZ =
+      Intrinsic::getDeclaration(F->getParent(), Intrinsic::cttz, LoadType);
+  Value *CntZerosMasked =
+      Builder.CreateCall(CTTZ, {PhiXor, Builder.getFalse()});
+
+  // Round the trailing-zero count down to a multiple of 8 so the shifts
+  // below isolate the first differing byte.
+  uint64_t Mask =
+      (LoadType == Type::getInt64Ty(C)) ? UINT64_MAX << 3 : UINT_MAX << 3;
+  CntZerosMasked = Builder.CreateAnd(
+      CntZerosMasked,
+      ConstantInt::get((LoadType == Type::getInt64Ty(C)) ? LoadType
+                                                         : Type::getInt32Ty(C),
+                       Mask));
+  Value *Shift1 = Builder.CreateAShr(PhiSrc1, CntZerosMasked, "src1_sh");
+  Value *Shift2 = Builder.CreateAShr(PhiSrc2, CntZerosMasked, "src2_sh");
+  Value *And1 = Builder.CreateAnd(Shift1, ConstantInt::get(LoadType, 0xFF));
+  Value *And2 = Builder.CreateAnd(Shift2, ConstantInt::get(LoadType, 0xFF));
+  Value *Subtract = Builder.CreateSub(And1, And2);
+  Value *Res = Builder.CreateSExtOrTrunc(Subtract, Builder.getInt32Ty());
+  return Res;
+}
+
+// Populates the load compare block for the given LoadType.
+// If LoadType is i8, we can just subtract and return.
+// If LoadType is greater than i8, we need to populate the EndBlock
+// with a sequence that calculates the memcmp result.
+void EmitLoadCompareBlock(LLVMContext &C, BasicBlock *LoadBlockCurr,
+                          BasicBlock *LoadBlockNext, Value *Source1,
+                          Value *Source2, Type *LoadType, Type *LoadPtrTy,
+                          Type *MaxLoadType, BasicBlock *EndBlock,
+                          PHINode *PhiXor, PHINode *PhiSrc1, PHINode *PhiSrc2,
+                          unsigned GEPIndex) {
+  IRBuilder<> Builder(C);
+  Value *Source1Cast, *Source2Cast, *LoadSrc1, *LoadSrc2, *Diff;
+
+  Builder.SetInsertPoint(LoadBlockCurr);
+
+  Source1Cast = Builder.CreateBitCast(Source1, LoadPtrTy);
+  Source2Cast = Builder.CreateBitCast(Source2, LoadPtrTy);
+
+  if (GEPIndex != 0) {
+    Source1Cast = Builder.CreateGEP(LoadType, Source1Cast,
+                                    ConstantInt::get(LoadType, GEPIndex));
+    Source2Cast = Builder.CreateGEP(LoadType, Source2Cast,
+                                    ConstantInt::get(LoadType, GEPIndex));
+  }
+
+  LoadSrc1 = Builder.CreateLoad(LoadType, Source1Cast);
+  LoadSrc2 = Builder.CreateLoad(LoadType, Source2Cast);
+
+  if (LoadType == Type::getInt8Ty(C)) {
+    // Extend i8 to i32 for the memcmp result.
+    LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, Type::getInt32Ty(C));
+    LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, Type::getInt32Ty(C));
+  } else if (LoadType != MaxLoadType) {
+    // Extend all types other than i8 to MaxLoadType.
+    LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+  }
+
+  // If LoadType is i8, we can just subtract and return.
+  if (LoadType == Type::getInt8Ty(C)) {
+    Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+  } else {
+    PhiSrc1->addIncoming(LoadSrc1, LoadBlockCurr);
+    PhiSrc2->addIncoming(LoadSrc2, LoadBlockCurr);
+    Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+    Diff = Builder.CreateSExtOrTrunc(Diff, MaxLoadType);
+  }
+
+  PhiXor->addIncoming(Diff, LoadBlockCurr);
+
+  if (LoadBlockNext == nullptr) {
+    // This is the last block; fall through to the end block unconditionally.
+    BranchInst *NewBr = BranchInst::Create(EndBlock);
+    Builder.Insert(NewBr);
+  } else {
+    // Early exit to the result block if this chunk already differs.
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    Builder.CreateCondBr(Cmp, EndBlock, LoadBlockNext);
+  }
+}
+
+Type *getPtrTypeFromSize(LLVMContext &C, unsigned Size) {
+  switch (Size) {
+  case 8:
+    return Type::getInt64PtrTy(C);
+  case 4:
+    return Type::getInt32PtrTy(C);
+  case 2:
+    return Type::getInt16PtrTy(C);
+  case 1:
+    return Type::getInt8PtrTy(C);
+  default:
+    llvm_unreachable("Unexpected load size");
+  }
+}
+
+Type *getTypeFromSize(LLVMContext &C, unsigned Size) {
+  switch (Size) {
+  case 8:
+    return Type::getInt64Ty(C);
+  case 4:
+    return Type::getInt32Ty(C);
+  case 2:
+    return Type::getInt16Ty(C);
+  case 1:
+    return Type::getInt8Ty(C);
+  default:
+    llvm_unreachable("Unexpected load size");
+  }
+}
+
+// This function checks whether an expansion of memcmp can be generated.
+// It requires a constant compare size that is no larger than the max inline
+// size. If an expansion cannot occur, it returns false to leave memcmp as a
+// library call. Otherwise, the library call is replaced with a new IR
+// instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 13)
+///
+/// loadb: Loads 8 bytes and branches to the result block on early exit
+/// %0 = bitcast i32* %buffer2 to i8*
+/// %1 = bitcast i32* %buffer1 to i8*
+/// %2 = bitcast i8* %1 to i64*
+/// %3 = bitcast i8* %0 to i64*
+/// %4 = load i64, i64* %2
+/// %5 = load i64, i64* %3
+/// %6 = xor i64 %4, %5
+/// %7 = icmp ne i64 %6, 0
+/// br i1 %7, label %endblockx, label %loadb4
+/// loadb4: Loads the next 4 bytes            ; preds = %loadb
+/// %14 = bitcast i32* %buffer2 to i8*
+/// %15 = bitcast i32* %buffer1 to i8*
+/// %16 = bitcast i8* %15 to i32*
+/// %17 = bitcast i8* %14 to i32*
+/// %18 = getelementptr i32, i32* %16, i32 2
+/// %19 = getelementptr i32, i32* %17, i32 2
+/// %20 = load i32, i32* %18
+/// %21 = load i32, i32* %19
+/// %22 = zext i32 %20 to i64
+/// %23 = zext i32 %21 to i64
+/// %24 = xor i64 %22, %23
+/// %25 = icmp ne i64 %24, 0
+/// br i1 %25, label %endblockx, label %loadb5
+/// loadb5: Loads the last remaining byte     ; preds = %loadb4
+/// %26 = bitcast i32* %buffer2 to i8*
+/// %27 = bitcast i32* %buffer1 to i8*
+/// %28 = getelementptr i8, i8* %27, i8 12
+/// %29 = getelementptr i8, i8* %26, i8 12
+/// %30 = load i8, i8* %28
+/// %31 = load i8, i8* %29
+/// %32 = zext i8 %30 to i32
+/// %33 = zext i8 %31 to i32
+/// %34 = sub i32 %32, %33
+/// br label %endblock
+/// endblockx: Calculates the result          ; preds = %loadb4, %loadb
+/// %res.phi = phi i64 [ %6, %loadb ], [ %24, %loadb4 ]
+/// %res.phi1 = phi i64 [ %4, %loadb ], [ %22, %loadb4 ]
+/// %res.phi2 = phi i64 [ %5, %loadb ], [ %23, %loadb4 ]
+/// %8 = call i64 @llvm.cttz.i64(i64 %res.phi, i1 false)
+/// %9 = and i64 %8, -8
+/// %src1_sh = ashr i64 %res.phi1, %9
+/// %src2_sh = ashr i64 %res.phi2, %9
+/// %10 = and i64 %src1_sh, 255
+/// %11 = and i64 %src2_sh, 255
+/// %12 = sub i64 %10, %11
+/// %13 = trunc i64 %12 to i32
+/// br label %endblock
+/// endblock: Returns the result for whichever block we came from
+///                                           ; preds = %endblockx, %loadb5
+/// %res.phi3 = phi i32 [ %34, %loadb5 ], [ %13, %endblockx ]
+/// ret i32 %res.phi3
+static bool memcmpExpansion(CallInst *CI, const TargetLowering *TLI,
+                            const DataLayout *DL, unsigned MaxLoadSize,
+                            bool ByteSwapLoads, bool AllowUnalignedLoads) {
+  NumMemCmpCalls++;
+  LLVMContext &C = CI->getContext();
+  IRBuilder<> Builder(C);
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+  Value *Size = CI->getArgOperand(2);
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(Size);
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+
+  uint64_t SizeVal = SizeCast->getZExtValue();
+  if (SizeVal > TLI->getMaxLoadSizeMemcmp(false)) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  BasicBlock *StartBlock = CI->getParent();
+  // Block to return from.
+  BasicBlock *EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+
+  // Block that calculates the memcmp result.
+  BasicBlock *EndBlockXor =
+      BasicBlock::Create(C, "endblockx", StartBlock->getParent(), EndBlock);
+  Builder.SetInsertPoint(EndBlockXor);
+
+  Type *LoadPtrTy;
+  Type *LoadType;
+  Type *MaxLoadType = getTypeFromSize(C, MaxLoadSize);
+
+  PHINode *PhiXor = Builder.CreatePHI(MaxLoadType, 0, "res.phi");
+  PHINode *PhiSrc1 = Builder.CreatePHI(MaxLoadType, 0, "res.phi");
+  PHINode *PhiSrc2 = Builder.CreatePHI(MaxLoadType, 0, "res.phi");
+
+  // EndBlock phi node to return the final result.
+  Builder.SetInsertPoint(&EndBlock->front());
+  PHINode *PhiRes = Builder.CreatePHI(Type::getInt32Ty(C), 2, "res.phi");
+
+  // An odd size means the expansion ends with a one-byte block, which
+  // computes its result directly and bypasses EndBlockXor.
+  bool HaveOneByteLoads = SizeVal % 2 != 0;
+
+  // Working with two basic blocks at a time.
+  BasicBlock *LoadBlockCurr, *LoadBlockNext;
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  StartBlock->getTerminator()->eraseFromParent();
+
+  Value *Res;
+
+  int CurrLoadSize = MaxLoadSize;
+  int BlockCount;
+  int Remainder = SizeVal;
+  int GEPIndex = 0;
+
+  // Create the first load compare block and make the entry block branch to
+  // it.
+  LoadBlockCurr =
+      BasicBlock::Create(C, "loadb", StartBlock->getParent(), EndBlock);
+  LoadBlockNext = nullptr;
+  BranchInst *NewBr = BranchInst::Create(LoadBlockCurr);
+  Builder.SetInsertPoint(StartBlock, StartBlock->end());
+  Builder.Insert(NewBr);
+
+  // Find the first load size that is no larger than the remainder.
+  while (CurrLoadSize > Remainder)
+    CurrLoadSize = CurrLoadSize >> 1;
+
+  // Continue to create load compare blocks until no remainder is left.
+  while (Remainder) {
+    // Calculate how many blocks we can create with the current load size.
+    BlockCount = Remainder / CurrLoadSize;
+    Remainder = Remainder % CurrLoadSize;
+    bool LastBlock = Remainder == 0 && BlockCount == 1;
+
+    // If this is not the last block, create the next block for the current
+    // block to branch to.
+    if (!LastBlock) {
+      LoadBlockNext =
+          BasicBlock::Create(C, "loadb", StartBlock->getParent(), EndBlock);
+    }
+
+    // Call EmitLoadCompareBlock for the number of blocks needed at
+    // CurrLoadSize.
+    for (int i = 0; i < BlockCount; i++) {
+      LoadType = getTypeFromSize(C, CurrLoadSize);
+      LoadPtrTy = getPtrTypeFromSize(C, CurrLoadSize);
+      if (i == BlockCount - 1 && Remainder == 0)
+        LastBlock = true;
+      EmitLoadCompareBlock(C, LoadBlockCurr, LoadBlockNext, Source1, Source2,
+                           LoadType, LoadPtrTy, MaxLoadType,
+                           CurrLoadSize > 1 ? EndBlockXor : EndBlock,
+                           CurrLoadSize > 1 ? PhiXor : PhiRes, PhiSrc1,
+                           PhiSrc2, GEPIndex);
+      LoadBlockCurr = LoadBlockNext;
+
+      // Two blocks have already been created; check whether a third is
+      // needed.
+      if ((i < BlockCount - 2) || (i == BlockCount - 2 && Remainder != 0)) {
+        LoadBlockNext =
+            BasicBlock::Create(C, "loadb", StartBlock->getParent(), EndBlock);
+      } else {
+        LoadBlockNext = nullptr;
+      }
+      GEPIndex++;
+    }
+
+    // Calculate the next load size to use.
+    if (Remainder > 0) {
+      while (CurrLoadSize > Remainder)
+        CurrLoadSize = CurrLoadSize >> 1;
+    }
+    // New index into the sources, in units of CurrLoadSize.
+    GEPIndex = (SizeVal - Remainder) / CurrLoadSize;
+  }
+
+  // Calculate the memcmp result in EndBlockXor and branch to the final
+  // EndBlock.
+  Res = getResult(C, PhiXor, PhiSrc1, PhiSrc2, MaxLoadType, EndBlockXor);
+  NewBr = BranchInst::Create(EndBlock);
+  Builder.SetInsertPoint(EndBlockXor, EndBlockXor->end());
+  Builder.Insert(NewBr);
+
+  if (HaveOneByteLoads) {
+    PhiRes->addIncoming(Res, EndBlockXor);
+    CI->replaceAllUsesWith(PhiRes);
+  } else {
+    CI->replaceAllUsesWith(Res);
+  }
+
+  CI->eraseFromParent();
+  return true;
+}
 
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
@@ -2054,6 +2390,21 @@
       CI->eraseFromParent();
       return true;
     }
+
+    Function *F = CI->getCalledFunction();
+    LibFunc Func;
+    bool Expanded = false;
+    if (F && TLInfo->getLibFunc(F->getName(), Func) && Func == LibFunc_memcmp) {
+      bool ByteSwapLoads, AllowUnalignedLoads;
+      unsigned MaxLoadSize;
+      if (TTI->expandMemCmp(CI, MaxLoadSize, ByteSwapLoads, AllowUnalignedLoads)) {
+        Expanded = memcmpExpansion(CI, TLI, DL, MaxLoadSize, ByteSwapLoads, AllowUnalignedLoads);
+        if (Expanded) {
+          ModifiedDT = true;
+          return true;
+        }
+      }
+    }
 
     return false;
   }
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -829,9 +829,10 @@
   initActions();
 
   // Perform these initializations only once.
-  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
-  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
-    = MaxStoresPerMemmoveOptSize = 4;
+  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
+      MaxLoadSizeMemcmp = 8;
+  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
+      MaxStoresPerMemmoveOptSize = MaxLoadSizeMemcmpOptSize = 4;
   UseUnderscoreSetJmp = false;
   UseUnderscoreLongJmp = false;
   HasMultipleConditionRegisters = false;
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1020,6 +1020,10 @@
     MaxStoresPerMemset = 128;
     MaxStoresPerMemcpy = 128;
     MaxStoresPerMemmove = 128;
+    MaxLoadSizeMemcmp = 128;
+  } else {
+    MaxLoadSizeMemcmp = 64;
+    MaxLoadSizeMemcmpOptSize = 8;
   }
 }
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -60,6 +60,8 @@
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize, bool &ByteSwapLoads,
+                    bool &AllowUnalignedLoads);
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -215,6 +215,14 @@
   return LoopHasReductions;
 }
 
+bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize,
+                              bool &ByteSwapLoads, bool &AllowUnalignedLoads) {
+  MaxLoadSize = 8;
+  ByteSwapLoads = true;
+  AllowUnalignedLoads = true;
+  return true;
+}
+
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
   return true;
 }
Index: test/CodeGen/PowerPC/memcmp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/memcmp.ll
@@ -0,0 +1,122 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Check multiples of 8
+; Function Attrs: nounwind readonly
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test1
+; CHECK: ld
+; CHECK-NEXT: ld
+; CHECK-NEXT: xor.
+; CHECK-NEXT: bne
+; CHECK: popcntd
+; CHECK-NEXT: andi.
+; CHECK-NEXT: srad
+; CHECK-NEXT: srad
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: sub
+; CHECK-NEXT: blr
+}
+
+; Check less than 8
+; Function Attrs: nounwind readonly
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test2
+; CHECK: lwz
+; CHECK-NEXT: lwz
+; CHECK-NEXT: xor
+; CHECK: popcntd
+; CHECK-NEXT: andi.
+; CHECK-NEXT: srd
+; CHECK-NEXT: srd
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: sub
+; CHECK-NEXT: blr
+}
+
+; Check greater than 8 with remainder that uses each load type. Ex: 15 bytes = 8 + 4 + 2 + 1
+; Function Attrs: nounwind readonly
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test3
+
+; CHECK: ld
+; CHECK-NEXT: ld
+; CHECK-NEXT: xor.
+; CHECK-NEXT: bne
+
+; CHECK: lwz
+; CHECK-NEXT: lwz
+; CHECK-NEXT: xor
+
+; CHECK: lhz
+; CHECK-NEXT: lhz
+; CHECK-NEXT: xor
+
+; CHECK: popcntd
+; CHECK-NEXT: andi.
+; CHECK-NEXT: srad
+; CHECK-NEXT: srad
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: clrldi
+; CHECK-NEXT: sub
+; CHECK-NEXT: extsw
+; CHECK-NEXT: blr
+
+; CHECK: lbz
+; CHECK-NEXT: lbz
+; CHECK-NEXT: subf
+; CHECK-NEXT: extsw
+; CHECK-NEXT: blr
+}
+
+; Check greater than max allowed size = 64
+; Function Attrs: nounwind readonly
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65) #2
+  ret i32 %call
+
+; CHECK: bl memcmp
+}
+
+; Check not a constant size
+; Function Attrs: nounwind readonly
+define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %conv = sext i32 %SIZE to i64
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv) #2
+  ret i32 %call
+
+; CHECK: bl memcmp
+}
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8*, i8*, i64) #1
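
Note on the block layout: memcmpExpansion decomposes the constant compare size greedily into power-of-two load sizes, largest first, which is exactly what test3 above exercises with 15 bytes = 8 + 4 + 2 + 1. Below is a minimal standalone C++ sketch of just that decomposition, assuming (as in the PPC hook) that MaxLoadSize is a power of two; the helper name loadSizes is hypothetical and not part of the patch.

#include <cstdio>
#include <vector>

// Greedy decomposition mirroring the loop in memcmpExpansion: emit as many
// loads of the largest size as fit, then fall to the next power of two,
// down to a single byte.
static std::vector<unsigned> loadSizes(unsigned SizeVal, unsigned MaxLoadSize) {
  std::vector<unsigned> Sizes;
  unsigned CurrLoadSize = MaxLoadSize;
  unsigned Remainder = SizeVal;
  while (Remainder) {
    // Find the first load size that is no larger than the remainder.
    while (CurrLoadSize > Remainder)
      CurrLoadSize >>= 1;
    for (unsigned i = 0; i != Remainder / CurrLoadSize; ++i)
      Sizes.push_back(CurrLoadSize);
    Remainder %= CurrLoadSize;
  }
  return Sizes;
}

int main() {
  for (unsigned S : loadSizes(15, 8)) // prints "8 4 2 1", as in test3
    std::printf("%u ", S);
  std::printf("\n");
  return 0;
}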
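Note on the result calculation: the IR that getResult emits (cttz of the xor, rounded down to a byte boundary, then shift, mask, and subtract) can be modeled in plain C++ as below. This is an illustrative sketch only, not part of the patch; modelGetResult is a hypothetical name, __builtin_ctzll stands in for llvm.cttz, and little-endian loads are assumed, matching the powerpc64le test triple.

#include <cassert>
#include <cstdint>
#include <cstring>

// Given the xor of two 8-byte little-endian loads plus the two loaded
// values, recover a memcmp-style result for the first differing byte.
static int modelGetResult(uint64_t Xor, uint64_t Src1, uint64_t Src2) {
  assert(Xor != 0 && "only reached when the chunks differ");
  // cttz finds the first differing bit; masking with -8 (UINT64_MAX << 3 in
  // the patch) rounds down to the bit offset of the first differing byte.
  unsigned CntZerosMasked = __builtin_ctzll(Xor) & ~7u;
  // Shift that byte to the bottom and compare it as an unsigned byte value.
  int Byte1 = (Src1 >> CntZerosMasked) & 0xFF;
  int Byte2 = (Src2 >> CntZerosMasked) & 0xFF;
  return Byte1 - Byte2;
}

int main() {
  const unsigned char A[8] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
  const unsigned char B[8] = {'a', 'b', 'c', 'x', 'e', 'f', 'g', 'h'};
  uint64_t S1, S2;
  std::memcpy(&S1, A, 8); // little-endian load, as on powerpc64le
  std::memcpy(&S2, B, 8);
  int Res = modelGetResult(S1 ^ S2, S1, S2);
  // memcmp compares unsigned chars; the first difference is 'd' vs 'x'.
  assert((Res < 0) == (std::memcmp(A, B, 8) < 0));
  return 0;
}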