Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -437,6 +437,9 @@ /// \brief Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; + /// \brief Enable inline expansion of memcmp + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const; + /// \brief Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; @@ -772,6 +775,7 @@ virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; + virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -978,6 +982,9 @@ bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) override { + return Impl.expandMemCmp(I, MaxLoadSize); + } bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -264,6 +264,8 @@ bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { return false; } + bool enableInterleavedAccessVectorization() { return false; } bool isFPVectorizationPotentiallyUnsafe() { return false; } Index: include/llvm/IR/Instruction.h 
=================================================================== --- include/llvm/IR/Instruction.h +++ include/llvm/IR/Instruction.h @@ -479,6 +479,9 @@ /// instruction's result is undefined. bool isIdenticalToWhenDefined(const Instruction *I) const; + /// Return true if it only matters that the value is equal or not-equal to zero. + bool isOnlyUsedInZeroEqualityComparison() const; + /// When checking for operation equivalence (using isSameOperationAs) it is /// sometimes useful to ignore certain attributes. enum OperationEquivalenceFlags { Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -1011,6 +1011,16 @@ return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy; } + /// \brief Get maximum size in bytes to load for memcmp + /// + /// This function returns the maximum size in bytes to load when + /// expanding memcmp. The value is set by the target at the + /// performance threshold for such a replacement. If OptSize is true, + /// return the limit for functions that have OptSize attribute. + unsigned getMaxExpandSizeMemcmp(bool OptSize) const { + return OptSize ? MaxExpandSizeMemcmpOptSize : MaxExpandSizeMemcmp; + } + /// \brief Get maximum # of store operations permitted for llvm.memmove /// /// This function returns the maximum number of store operations permitted @@ -2180,6 +2190,8 @@ /// Maximum number of store operations that may be substituted for a call to /// memcpy, used for functions with OptSize attribute. unsigned MaxStoresPerMemcpyOptSize; + unsigned MaxExpandSizeMemcmp; + unsigned MaxExpandSizeMemcmpOptSize; /// \brief Specify maximum bytes of store instructions per memmove call.
/// Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -201,6 +201,10 @@ return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } +bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const { + return TTIImpl->expandMemCmp(I, MaxLoadSize); +} + bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -55,6 +55,7 @@ #include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" + using namespace llvm; using namespace llvm::PatternMatch; @@ -79,6 +80,11 @@ STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed"); +STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); +STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); +STATISTIC(NumMemCmpGreaterThanMax, "Number of memcmp calls with size greater than max size"); +STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); + static cl::opt DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), cl::desc("Disable branch optimizations in CodeGenPrepare")); @@ -1944,6 +1950,507 @@ return true; } +// This class provides helper functions to expand a memcmp library call into an +// inline expansion. 
+class MemCmpExpansion { + struct ResultBlock { + private: + BasicBlock *BB; + PHINode *PhiSrc1; + PHINode *PhiSrc2; + PHINode *PhiDiff; + + public: + ResultBlock(); + void setBB(BasicBlock *BB); + BasicBlock *getBB() const; + PHINode *getPhiSrc1() const; + PHINode *getPhiSrc2() const; + PHINode *getPhiDiff() const; + void setPHINodes(PHINode *PhiSrc1, PHINode *PhiSrc2, PHINode *PhiDiff); + }; + + CallInst *CI; + ResultBlock ResBlock; + unsigned MaxLoadSize; + unsigned NumBlocks; + unsigned NumBlocksNonOneByte; + std::vector LoadCmpBlocks; + BasicBlock *EndBlock; + PHINode *PhiRes; + int calculateNumBlocks(unsigned Size); + void createLoadCmpBlocks(int Count, Function *Parent, BasicBlock *BeforeBB); + void createResultBlock(Function *Parent, BasicBlock *BeforeBB); + void setupResultBlockPHINodes(); + void setupEndBlockPHINodes(); + void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex); + void emitLoadCompareByteBlock(unsigned Index, int GEPIndex); + void emitMemCmpResultBlock(bool IsLittleEndian); + BasicBlock *getFirstBlock() const; + BasicBlock *getLastLoadCmpBlock() const; + void addPhiResEdge(Value *Res, BasicBlock *BB); + +public: + MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize, unsigned Size); + Value *getMemCmpExpansion(unsigned Size, bool IsLittleEndian); +}; + +MemCmpExpansion::ResultBlock::ResultBlock() { + BB = nullptr; + PhiSrc1 = nullptr; + PhiSrc2 = nullptr; + PhiDiff = nullptr; +} + +void MemCmpExpansion::ResultBlock::setBB(BasicBlock *BB) { this->BB = BB; } + +BasicBlock *MemCmpExpansion::ResultBlock::getBB() const { return BB; } +PHINode *MemCmpExpansion::ResultBlock::getPhiSrc1() const { return PhiSrc1; } +PHINode *MemCmpExpansion::ResultBlock::getPhiSrc2() const { return PhiSrc2; } +PHINode *MemCmpExpansion::ResultBlock::getPhiDiff() const { return PhiDiff; } + +void MemCmpExpansion::ResultBlock::setPHINodes(PHINode *PhiSrc1, + PHINode *PhiSrc2, + PHINode *PhiDiff) { + this->PhiSrc1 = PhiSrc1; + this->PhiSrc2 = PhiSrc2; + 
this->PhiDiff = PhiDiff; +} + +// Initialize the basic block structure required for expansion of memcmp call +// with given maximum load size and memcmp size parameter. +// This structure includes: +// 1. A list of load compare blocks - LoadCmpBlocks. +// 2. An EndBlock, split from original instruction point, which is the block to +// return from. +// 3. ResultBlock, block to branch to for early exit when a +// LoadCmpBlock finds a difference. +MemCmpExpansion::MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize, + unsigned Size) { + this->CI = CI; + this->MaxLoadSize = MaxLoadSize; + + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + + // Calculate how many load compare blocks are required for an expansion of + // given Size. + NumBlocks = calculateNumBlocks(Size); + NumBlocksNonOneByte = 0; + + createResultBlock(StartBlock->getParent(), EndBlock); + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation sequence requires the + // two loaded source values, and the xor result of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!CI->isOnlyUsedInZeroEqualityComparison()) + setupResultBlockPHINodes(); + + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(NumBlocks, StartBlock->getParent(), EndBlock); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + StartBlock->getTerminator()->eraseFromParent(); + + // Create a branch to the first load compare block from the entry block. 
+ BranchInst *NewBr = BranchInst::Create(getFirstBlock()); + Builder.SetInsertPoint(StartBlock, StartBlock->end()); + Builder.Insert(NewBr); +} + +BasicBlock *MemCmpExpansion::getFirstBlock() const { return LoadCmpBlocks[0]; } + +BasicBlock *MemCmpExpansion::getLastLoadCmpBlock() const { + return LoadCmpBlocks.back(); +} + +void MemCmpExpansion::createLoadCmpBlocks(int Count, Function *Parent, + BasicBlock *BeforeBB) { + LLVMContext &Context = CI->getContext(); + for (int i = 0; i < Count; i++) { + BasicBlock *BB = BasicBlock::Create(Context, "loadbb", Parent, BeforeBB); + LoadCmpBlocks.push_back(BB); + } +} + +void MemCmpExpansion::createResultBlock(Function *Parent, + BasicBlock *BeforeBB) { + LLVMContext &Context = CI->getContext(); + BasicBlock *BB = BasicBlock::Create(Context, "res_block", Parent, BeforeBB); + ResBlock.setBB(BB); +} + +void MemCmpExpansion::addPhiResEdge(Value *Res, BasicBlock *BB) { + PhiRes->addIncoming(Res, BB); +} + +// This function creates the IR instructions for loading and comparing 1 byte. +// It loads 1 byte from each source of the memcmp parameters with the given +// GEPIndex. It then subtracts the two loaded values and adds this result to the +// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) { + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + Value *LoadSrc1, *LoadSrc2, *Diff; + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Type *LoadSizeType = Type::getInt8Ty(Context); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, Type::getInt32Ty(Context)); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, Type::getInt32Ty(Context)); + Diff = Builder.CreateSub(LoadSrc1, LoadSrc2); + + PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]); + BranchInst *CmpBr = BranchInst::Create(EndBlock); + Builder.Insert(CmpBr); +} + +// This function creates the IR instructions for loading and comparing using the +// given LoadSize. It loads the number of bytes specified by LoadSize from each +// source of the memcmp parameters. It then does an xor to see if there was a +// difference in any of the loaded bits. If a difference is found, it branches +// with an early exit to the ResultBlock for calculating which source was larger +// at the differing bit position. Otherwise, it falls through to either the +// next LoadCmpBlock or the EndBlock if this is the last LoadCmpBlock.
Loading +// 1 byte is handled with a special case through emitLoadCompareByteBlock. +// The special handling can simply subtract the loaded values and does not +// require branching to ResultBlock for finding which byte is larger at the +// differing bit position as there is only 1 byte. +void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize, + int GEPIndex) { + if (LoadSize == 1) { + MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex); + return; + } + + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + Value *LoadSrc1, *LoadSrc2, *Diff; + + Type *LoadSizeType = IntegerType::get(Context, LoadSize * 8); + Type *MaxLoadType = IntegerType::get(Context, MaxLoadSize * 8); + + Value *Source1 = CI->getArgOperand(0); + Value *Source2 = CI->getArgOperand(1); + + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // Cast source to LoadSizeType* + if (Source1->getType() != LoadSizeType) + Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo()); + if (Source2->getType() != LoadSizeType) + Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo()); + + // Get the base address using the GEPIndex + if (GEPIndex != 0) { + Source1 = Builder.CreateGEP(LoadSizeType, Source1, + ConstantInt::get(LoadSizeType, GEPIndex)); + Source2 = Builder.CreateGEP(LoadSizeType, Source2, + ConstantInt::get(LoadSizeType, GEPIndex)); + } + + // Load LoadSizeType from the base address + LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1); + LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2); + + if (LoadSizeType != MaxLoadType) { + LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType); + LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType); + } + + // Add the loaded values to the phi nodes for calculating memcmp result only + // if result is not used in a zero equality.
+ if (!CI->isOnlyUsedInZeroEqualityComparison()) { + ResBlock.getPhiSrc1()->addIncoming(LoadSrc1, LoadCmpBlocks[Index]); + ResBlock.getPhiSrc2()->addIncoming(LoadSrc2, LoadCmpBlocks[Index]); + } + + Diff = Builder.CreateXor(LoadSrc1, LoadSrc2); + Diff = Builder.CreateSExtOrTrunc(Diff, MaxLoadType); + + if (!CI->isOnlyUsedInZeroEqualityComparison()) { + ResBlock.getPhiDiff()->addIncoming(Diff, LoadCmpBlocks[Index]); + } + + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff, + ConstantInt::get(Diff->getType(), 0)); + BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1)) + ? EndBlock + : LoadCmpBlocks[Index + 1]; + // Early exit branch if difference found to ResultBlock, otherwise continue to + // next LoadCmpBlock or EndBlock. + BranchInst *CmpBr = BranchInst::Create(ResBlock.getBB(), NextBB, Cmp); + Builder.Insert(CmpBr); + + // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0 + // since early exit to ResultBlock was not taken (no difference was found in + // any of the bytes) + if (Index == LoadCmpBlocks.size() - 1) { + Value *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0); + addPhiResEdge(Zero, LoadCmpBlocks[Index]); + } +} + +// This function populates the ResultBlock with a sequence to calculate the +// memcmp result. It uses the PhiDiff node to find the first differing byte, +// shifts this byte from PhiSrc1 and PhiSrc2 into the lowest byte position in +// the register, masks all other bytes, and does a subtraction. +void MemCmpExpansion::emitMemCmpResultBlock(bool IsLittleEndian) { + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + + // Special case: if memcmp result is used in a zero equality, result does not + // need to be calculated and can simply return 1. 
+ if (CI->isOnlyUsedInZeroEqualityComparison()) { + BasicBlock::iterator InsertPt = ResBlock.getBB()->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.getBB(), InsertPt); + Value *Res = ConstantInt::get(Type::getInt32Ty(Context), 1); + addPhiResEdge(Res, ResBlock.getBB()); + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + return; + } + + Type *LoadSizeType = IntegerType::get(Context, MaxLoadSize * 8); + + BasicBlock::iterator InsertPt = ResBlock.getBB()->getFirstInsertionPt(); + Builder.SetInsertPoint(ResBlock.getBB(), InsertPt); + Function *F = ResBlock.getBB()->getParent(); + + // Find the first differing bit + Function *CountZeros = Intrinsic::getDeclaration( + F->getParent(), IsLittleEndian ? Intrinsic::cttz : Intrinsic::ctlz, + LoadSizeType); + Value *CntZerosMasked = Builder.CreateCall( + CountZeros, {ResBlock.getPhiDiff(), Builder.getFalse()}); + + if (!IsLittleEndian) + CntZerosMasked = Builder.CreateSub( + ConstantInt::get(LoadSizeType, ((MaxLoadSize * 8) - 1)), + CntZerosMasked); + + // Find which byte this bit belongs in + CntZerosMasked = + Builder.CreateAnd(CntZerosMasked, ConstantInt::get(LoadSizeType, ~7)); + + // Shift this first differing byte to the least significant position of the + // register + Value *Shift1 = + Builder.CreateAShr(ResBlock.getPhiSrc1(), CntZerosMasked, "src1_sh"); + Value *Shift2 = + Builder.CreateAShr(ResBlock.getPhiSrc2(), CntZerosMasked, "src2_sh"); + + // Mask out all other bits of the register + Value *And1 = Builder.CreateAnd(Shift1, ConstantInt::get(LoadSizeType, 0xFF)); + Value *And2 = Builder.CreateAnd(Shift2, ConstantInt::get(LoadSizeType, 0xFF)); + Value *Subtract = Builder.CreateSub(And1, And2); + Value *Res = Builder.CreateSExtOrTrunc(Subtract, Builder.getInt32Ty()); + + BranchInst *NewBr = BranchInst::Create(EndBlock); + Builder.Insert(NewBr); + + addPhiResEdge(Res, ResBlock.getBB()); +} + +int MemCmpExpansion::calculateNumBlocks(unsigned Size) { + int NumBlocks = 0; + bool 
haveOneByteLoad = false; + unsigned RemainingSize = Size; + unsigned LoadSize = MaxLoadSize; + while (RemainingSize) { + if (LoadSize == 1) + haveOneByteLoad = true; + NumBlocks += RemainingSize / LoadSize; + RemainingSize = RemainingSize % LoadSize; + LoadSize = LoadSize / 2; + } + NumBlocksNonOneByte = haveOneByteLoad ? (NumBlocks - 1) : NumBlocks; + return NumBlocks; +} + +void MemCmpExpansion::setupResultBlockPHINodes() { + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + Type *MaxLoadType = IntegerType::get(Context, MaxLoadSize * 8); + Builder.SetInsertPoint(ResBlock.getBB()); + PHINode *PhiDiff = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.diff"); + PHINode *PhiSrc1 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1"); + PHINode *PhiSrc2 = + Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2"); + ResBlock.setPHINodes(PhiSrc1, PhiSrc2, PhiDiff); +} + +void MemCmpExpansion::setupEndBlockPHINodes() { + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + + Builder.SetInsertPoint(&EndBlock->front()); + PhiRes = Builder.CreatePHI(Type::getInt32Ty(Context), 2, "phi.res"); +} + +// This function expands the memcmp call into an inline expansion and returns +// the memcmp result. +Value *MemCmpExpansion::getMemCmpExpansion(unsigned Size, bool IsLittleEndian) { + int LoadSize = MaxLoadSize; + int NumBytesToBeProcessed = Size; + + unsigned Index = 0; + // This loop calls emitLoadCompareBlock for comparing SizeVal bytes of the two + // memcmp source. It starts with loading using the maximum load size set by + // the target. It processes any remaining bytes using a load size which is the + // next smallest power of 2. 
+ while (NumBytesToBeProcessed) { + // Calculate how many blocks we can create with the current load size + int NumBlocks = NumBytesToBeProcessed / LoadSize; + int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize; + NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize; + + // For each NumBlocks, populate the instruction sequence for loading and + // comparing LoadSize bytes + while (NumBlocks--) { + emitLoadCompareBlock(Index, LoadSize, GEPIndex); + Index++; + GEPIndex++; + } + // Get the next LoadSize to use + LoadSize = LoadSize / 2; + } + + emitMemCmpResultBlock(IsLittleEndian); + return PhiRes; +} + +// This function checks to see if an expansion of memcmp can be generated. +// It checks for constant compare size that is less than the max inline size. +// If an expansion cannot occur, returns false to leave as a library call. +// Otherwise, the library call is replaced with new IR instruction sequence. +/// We want to transform: +/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 13) +/// To: +/// loadbb: +/// %0 = bitcast i32* %buffer2 to i8* +/// %1 = bitcast i32* %buffer1 to i8* +/// %2 = bitcast i8* %1 to i64* +/// %3 = bitcast i8* %0 to i64* +/// %4 = load i64, i64* %2 +/// %5 = load i64, i64* %3 +/// %6 = xor i64 %4, %5 +/// %7 = icmp ne i64 %6, 0 +/// br i1 %7, label %res_block, label %loadbb4 +/// res_block: ; preds = %loadbb4, +/// %loadbb +/// %res.phi1 = phi i64 [ %6, %loadbb ], [ %24, %loadbb4 ] +/// %res.phi2 = phi i64 [ %4, %loadbb ], [ %22, %loadbb4 ] +/// %res.phi3 = phi i64 [ %5, %loadbb ], [ %23, %loadbb4 ] +/// %8 = call i64 @llvm.cttz.i64(i64 %res.phi1, i1 false) +/// %9 = and i64 %8, -8 +/// %src1_sh = ashr i64 %res.phi2, %9 +/// %src2_sh = ashr i64 %res.phi3, %9 +/// %10 = and i64 %src1_sh, 255 +/// %11 = and i64 %src2_sh, 255 +/// %12 = sub i64 %10, %11 +/// %13 = trunc i64 %12 to i32 +/// br label %endblock +/// loadbb4: ; preds = %loadbb +/// %14 = bitcast i32* %buffer2 to i8* +/// %15 = bitcast i32* %buffer1 to i8* +/// 
%16 = bitcast i8* %15 to i32* +/// %17 = bitcast i8* %14 to i32* +/// %18 = getelementptr i32, i32* %16, i32 2 +/// %19 = getelementptr i32, i32* %17, i32 2 +/// %20 = load i32, i32* %18 +/// %21 = load i32, i32* %19 +/// %22 = zext i32 %20 to i64 +/// %23 = zext i32 %21 to i64 +/// %24 = xor i64 %22, %23 +/// %25 = icmp ne i64 %24, 0 +/// br i1 %25, label %res_block, label %loadbb5 +/// loadbb5: ; preds = %loadbb4 +/// %26 = bitcast i32* %buffer2 to i8* +/// %27 = bitcast i32* %buffer1 to i8* +/// %28 = getelementptr i8, i8* %27, i8 12 +/// %29 = getelementptr i8, i8* %26, i8 12 +/// %30 = load i8, i8* %28 +/// %31 = load i8, i8* %29 +/// %32 = zext i8 %30 to i32 +/// %33 = zext i8 %31 to i32 +/// %34 = sub i32 %32, %33 +/// br label %endblock +/// endblock: ; preds = %res_block, +/// %loadbb5 +/// %res.phi = phi i32 [ %34, %loadbb5 ], [ %13, %res_block ] +/// ret i32 %res.phi +static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, + const TargetLowering *TLI, const DataLayout *DL) { + NumMemCmpCalls++; + LLVMContext &Context = CI->getContext(); + IRBuilder<> Builder(Context); + + // TTI call to check if target would like to expand memcmp and get the + // MaxLoadSize + unsigned MaxLoadSize; + if (!TTI->expandMemCmp(CI, MaxLoadSize)) + return false; + + // Early exit from expansion if -Oz + if (CI->getCalledFunction()->optForMinSize()) { + return false; + } + + // Early exit from expansion if size is not a constant + ConstantInt *SizeCast = dyn_cast(CI->getArgOperand(2)); + if (!SizeCast) { + NumMemCmpNotConstant++; + return false; + } + + // Early exit from expansion if size greater than max bytes to load + uint64_t SizeVal = SizeCast->getZExtValue(); + if ((SizeVal > + TLI->getMaxExpandSizeMemcmp(CI->getCalledFunction()->optForSize()))) { + NumMemCmpGreaterThanMax++; + return false; + } + + NumMemCmpInlined++; + + // MemCmpHelper object, creates and sets up basic blocks required for + // expanding memcmp with size SizeVal + MemCmpExpansion 
MemCmpHelper(CI, MaxLoadSize, SizeVal); + + Value *Res = MemCmpHelper.getMemCmpExpansion(SizeVal, DL->isLittleEndian()); + + // Replace call with result of expansion and erase call. + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + + return true; +} + bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -2127,6 +2634,15 @@ CI->eraseFromParent(); return true; } + + LibFunc Func; + if (TLInfo->getLibFunc(CI->getCalledFunction()->getName(), Func) && + Func == LibFunc_memcmp) { + if (expandMemCmp(CI, TTI, TLI, DL)) { + ModifiedDT = true; + return true; + } + } return false; } @@ -5074,6 +5590,7 @@ return true; } + namespace { /// \brief Helper class to promote a scalar operation to a vector one. /// This class is used to move downward extractelement transition. Index: lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- lib/CodeGen/TargetLoweringBase.cpp +++ lib/CodeGen/TargetLoweringBase.cpp @@ -829,9 +829,10 @@ initActions(); // Perform these initializations only once. - MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8; - MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize - = MaxStoresPerMemmoveOptSize = 4; + MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = + MaxExpandSizeMemcmp = 8; + MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize = + MaxStoresPerMemmoveOptSize = MaxExpandSizeMemcmpOptSize = 4; UseUnderscoreSetJmp = false; UseUnderscoreLongJmp = false; HasMultipleConditionRegisters = false; Index: lib/IR/Instruction.cpp =================================================================== --- lib/IR/Instruction.cpp +++ lib/IR/Instruction.cpp @@ -412,6 +412,20 @@ return haveSameSpecialState(this, I); } +/// Return true if it only matters that the value is equal or not-equal to zero.
+bool Instruction::isOnlyUsedInZeroEqualityComparison() const { + for (const User *U : users()) { + if (const ICmpInst *IC = dyn_cast(U)) + if (IC->isEquality()) + if (Constant *C = dyn_cast(IC->getOperand(1))) + if (C->isNullValue()) + continue; + // Unknown instruction. + return false; + } + return true; +} + // Keep this in sync with FunctionComparator::cmpOperations in // lib/Transforms/IPO/MergeFunctions.cpp. bool Instruction::isSameOperationAs(const Instruction *I, Index: lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- lib/Target/PowerPC/PPCISelLowering.cpp +++ lib/Target/PowerPC/PPCISelLowering.cpp @@ -1020,6 +1020,10 @@ MaxStoresPerMemset = 128; MaxStoresPerMemcpy = 128; MaxStoresPerMemmove = 128; + MaxExpandSizeMemcmp = 128; + } else { + MaxExpandSizeMemcmp = 64; + MaxExpandSizeMemcmpOptSize = 8; } } Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -60,6 +60,7 @@ /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -215,6 +215,11 @@ return LoopHasReductions; } +bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { + MaxLoadSize = 8; + return true; +} + bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; } Index: lib/Transforms/Utils/SimplifyLibCalls.cpp =================================================================== --- lib/Transforms/Utils/SimplifyLibCalls.cpp +++ 
lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -88,20 +88,6 @@ return false; } -/// Return true if it only matters that the value is equal or not-equal to zero. -static bool isOnlyUsedInZeroEqualityComparison(Value *V) { - for (User *U : V->users()) { - if (ICmpInst *IC = dyn_cast(U)) - if (IC->isEquality()) - if (Constant *C = dyn_cast(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - /// Return true if it is only used in equality comparisons with With. static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) { for (User *U : V->users()) { @@ -504,7 +490,7 @@ // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 - if (isOnlyUsedInZeroEqualityComparison(CI)) + if (CI->isOnlyUsedInZeroEqualityComparison()) return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); return nullptr; @@ -675,7 +661,7 @@ // // memchr("\r\n", C, 2) != nullptr -> (C & ((1 << '\r') | (1 << '\n'))) != 0 // after bounds check. 
- if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) { + if (!CharC && !Str.empty() && CI->isOnlyUsedInZeroEqualityComparison()) { unsigned char Max = *std::max_element(reinterpret_cast(Str.begin()), reinterpret_cast(Str.end())); @@ -750,7 +736,7 @@ } // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0 - if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) { + if (DL.isLegalInteger(Len * 8) && CI->isOnlyUsedInZeroEqualityComparison()) { IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8); unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType); Index: test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -0,0 +1,117 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-unknown-linux-gnu" + +@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4 +@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4 +@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4 +@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4 +@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4 +@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4 +@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 
12, i32 13, i32 13], align 4 + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1 + +; Validate with if(memcmp()) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16) + %not.tobool = icmp ne i32 %call, 0 + %. = zext i1 %not.tobool to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest01 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with if(memcmp() == 0) +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp ne i32 %call, 0 + %. = zext i1 %not.cmp to i32 + ret i32 %. + + ; CHECK-LABEL: @zeroEqualityTest02 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} + +; Validate with > 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16) + %not.cmp = icmp slt i32 %call, 1 + %. = zext i1 %not.cmp to i32 + ret i32 %. 
+ + ; CHECK-LABEL: @zeroEqualityTest03 + ; CHECK-LABEL: %res_block + ; CHECK: popcntd + ; CHECK-NOT: li 3, 1 +} + +; Validate with < 0 +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16) + %call.lobit = lshr i32 %call, 31 + %call.lobit.not = xor i32 %call.lobit, 1 + ret i32 %call.lobit.not + + ; CHECK-LABEL: @zeroEqualityTest04 + ; CHECK-LABEL: %res_block + ; CHECK: popcntd + ; CHECK-NOT: li 3, 1 +} + +; Validate with memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.tobool = icmp eq i32 %call, 0 + %cond = zext i1 %not.tobool to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest05 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK: li 3, 0 +} + +; Validate with !memcmp()?: +; Function Attrs: nounwind readonly +define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 { +entry: + %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16) + %not.lnot = icmp ne i32 %call, 0 + %cond = zext i1 %not.lnot to i32 + ret i32 %cond + + ; CHECK-LABEL: @zeroEqualityTest06 + ; CHECK-LABEL: %res_block + ; CHECK: li 3, 1 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr + ; CHECK: li 3, 0 + ; CHECK-NEXT: clrldi + ; CHECK-NEXT: blr +} Index: test/CodeGen/PowerPC/memcmp.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/memcmp.ll @@ -0,0 +1,104 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s 
| FileCheck %s -check-prefix=CHECK + +; Check size 8 +; Function Attrs: nounwind readonly +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2 + ret i32 %call + +; CHECK-LABEL: @test1 +; CHECK: ld [[LOAD1:[0-9]+]] +; CHECK-NEXT: ld [[LOAD2:[0-9]+]] +; CHECK-NEXT: xor. [[XOR:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: beq +; CHECK: popcntd [[POPCNTD:[0-9]+]] +; CHECK-NEXT: andi. [[ANDI:[0-9]+]], [[POPCNTD]], 120 +; CHECK-NEXT: srad [[SRAD1:[0-9]+]], [[LOAD1]], [[ANDI]] +; CHECK-NEXT: srad [[SRAD2:[0-9]+]], [[LOAD2]], [[ANDI]] +; CHECK-NEXT: clrldi [[CLR1:[0-9]+]], [[SRAD1]], 56 +; CHECK-NEXT: clrldi [[CLR2:[0-9]+]], [[SRAD2]], 56 +; CHECK-NEXT: sub [[SUB:[0-9]+]], [[CLR1]], [[CLR2]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +; CHECK: li [[LI:[0-9]+]], 0 +; CHECK-NEXT: extsw 3, [[LI]] +} + +; Check size 4 +; Function Attrs: nounwind readonly +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2 + ret i32 %call + +; CHECK-LABEL: @test2 +; CHECK: lwz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lwz [[LOAD2:[0-9]+]] +; CHECK-NEXT: xor [[XOR:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: cmplwi [[CMPLWI:[0-9]+]], 0 +; CHECK-NEXT: beq +; CHECK: popcntd [[POPCNTD:[0-9]+]] +; CHECK-NEXT: andi. 
[[ANDI:[0-9]+]], [[POPCNTD]], 120 +; CHECK-NEXT: srd [[SRD1:[0-9]+]], [[LOAD1]], [[ANDI]] +; CHECK-NEXT: srd [[SRAD2:[0-9]+]], [[LOAD2]], [[ANDI]] +; CHECK-NEXT: clrldi [[CLR1:[0-9]+]], [[SRD1]], 56 +; CHECK-NEXT: clrldi [[CLR2:[0-9]+]], [[SRAD2]], 56 +; CHECK-NEXT: sub [[SUB:[0-9]+]], [[CLR1]], [[CLR2]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +; CHECK: li [[LI:[0-9]+]], 0 +; CHECK-NEXT: extsw 3, [[LI]] +} + +; Check size 2 +; Function Attrs: nounwind readonly +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2 + ret i32 %call + +; CHECK-LABEL: @test3 +; CHECK: lhz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lhz [[LOAD2:[0-9]+]] +; CHECK-NEXT: xor [[XOR:[0-9]+]], [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: rlwinm. [[RLWINM:[0-9]+]], [[XOR]], 0, 16, 31 +; CHECK-NEXT: beq +; CHECK: popcntd [[POPCNTD:[0-9]+]] +; CHECK-NEXT: andi. 
[[ANDI:[0-9]+]], [[POPCNTD]], 120 +; CHECK-NEXT: srd [[SRD1:[0-9]+]], [[LOAD1]], [[ANDI]] +; CHECK-NEXT: srd [[SRAD2:[0-9]+]], [[LOAD2]], [[ANDI]] +; CHECK-NEXT: clrldi [[CLR1:[0-9]+]], [[SRD1]], 56 +; CHECK-NEXT: clrldi [[CLR2:[0-9]+]], [[SRAD2]], 56 +; CHECK-NEXT: sub [[SUB:[0-9]+]], [[CLR1]], [[CLR2]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +; CHECK: li [[LI:[0-9]+]], 0 +; CHECK-NEXT: extsw 3, [[LI]] +} + +; Check size 1 +; Function Attrs: nounwind readonly +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2 + ret i32 %call + +; CHECK-LABEL: @test4 +; CHECK: lbz [[LOAD1:[0-9]+]] +; CHECK-NEXT: lbz [[LOAD2:[0-9]+]] +; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: extsw 3, [[SUB]] +; CHECK-NEXT: blr +} + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8*, i8*, i64) #1 Index: test/CodeGen/PowerPC/memcmpIR.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/memcmpIR.ll @@ -0,0 +1,213 @@ +; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s +; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE + +; Check multiples of 8 +; Function Attrs: nounwind readonly +define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16) #2 + ret i32 %call + + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne 
i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[CTTZ:%[0-9]+]] = call i64 @llvm.cttz.i64 + ; CHECK: [[AND:%[0-9]+]] = and i64 [[CTTZ]], -8 + ; CHECK: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[CTLZ:%[0-9]+]] = call i64 @llvm.ctlz.i64 + ; CHECK-BE: [[SUB:%[0-9]+]] = sub i64 63, [[CTLZ]] + ; CHECK-BE: [[AND:%[0-9]+]] = and i64 [[SUB]], -8 + ; CHECK-BE: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-BE-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1 + ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]] 
+ ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]] + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label +} + +define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2 + ret i32 %call + + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[CTTZ:%[0-9]+]] = call i64 @llvm.cttz.i64 + ; CHECK: [[AND:%[0-9]+]] = and i64 [[CTTZ]], -8 + ; CHECK: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %endblock + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[CTLZ:%[0-9]+]] = call i64 @llvm.ctlz.i64 + ; CHECK-BE: [[SUB:%[0-9]+]] = sub i64 63, [[CTLZ]] + ; CHECK-BE: 
[[AND:%[0-9]+]] = and i64 [[SUB]], -8 + ; CHECK-BE: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-BE-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] +} + +define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15) #2 + ret i32 %call + + ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK-LABEL: res_block:{{.*}} + ; CHECK: [[CTTZ:%[0-9]+]] = call i64 @llvm.cttz.i64 + ; CHECK: [[AND:%[0-9]+]] = and i64 [[CTTZ]], -8 + ; CHECK: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64 + ; CHECK-NEXT: 
[[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64 + ; CHECK-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-NEXT: br label %endblock + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[LOAD1]], [[LOAD2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK-BE-LABEL: res_block:{{.*}} + ; CHECK-BE: [[CTLZ:%[0-9]+]] = call i64 @llvm.ctlz.i64 + ; CHECK-BE: [[SUB:%[0-9]+]] = sub i64 63, [[CTLZ]] + ; CHECK-BE: [[AND:%[0-9]+]] = and i64 [[SUB]], -8 + ; CHECK-BE: [[ASHR1:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE: [[ASHR2:%[a-z]+[0-9]+_[a-z]+]] = ashr i64 {{.*}}, [[AND]] + ; CHECK-BE-NEXT: [[AND1:%[0-9]+]] = and i64 [[ASHR1]], 255 + ; CHECK-BE-NEXT: [[AND2:%[0-9]+]] = and i64 [[ASHR2]], 255 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[AND1]], [[AND2]] + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] 
to i64 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64 + ; CHECK-BE-NEXT: [[XOR:%[0-9]+]] = xor i64 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[XOR]], 0 + ; CHECK-BE-NEXT: br i1 [[ICMP]], label %res_block, label %loadbb{{[0-9]+}} + + ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8* + ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32 + ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32 + ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]] + ; CHECK-BE-NEXT: br label %endblock +} + +define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65) #2 + ret i32 %call + + ; CHECK: %call = tail call signext i32 @memcmp + ; CHECK-BE: %call = tail call signext i32 @memcmp +} + +define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) local_unnamed_addr #0 { +entry: + %0 = bitcast i32* %buffer1 to i8* + %1 = bitcast i32* %buffer2 to i8* + %conv = sext i32 %SIZE to i64 + %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv) #2 + ret i32 %call + + ; CHECK: call = tail call signext i32 @memcmp + ; CHECK-BE: call = tail call signext i32 @memcmp +} + +; Function Attrs: nounwind readonly +declare signext i32 @memcmp(i8*, i8*, i64) #1