Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -454,6 +454,9 @@
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
+  /// \brief Enable inline expansion of memcmp
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -828,6 +831,7 @@
                                                     unsigned VF) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
+  virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -1047,6 +1051,9 @@
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) override {
+    return Impl.expandMemCmp(I, MaxLoadSize);
+  }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -274,6 +274,8 @@
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { return false; }
+
   bool enableInterleavedAccessVectorization() { return false; }
 
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
Index: llvm/trunk/include/llvm/Analysis/ValueTracking.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/ValueTracking.h
+++ llvm/trunk/include/llvm/Analysis/ValueTracking.h
@@ -85,6 +85,8 @@
                               const Instruction *CxtI = nullptr,
                               const DominatorTree *DT = nullptr);
 
+  bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI);
+  
   /// Return true if the given value is known to be non-zero when defined. For
   /// vectors, return true if every element is known to be non-zero when
   /// defined. For pointers, if the context instruction and dominator tree are
Index: llvm/trunk/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h
+++ llvm/trunk/include/llvm/Target/TargetLowering.h
@@ -1143,6 +1143,16 @@
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
 
+  /// Get maximum # of load operations permitted for memcmp
+  ///
+  /// This function returns the maximum number of load operations permitted
+  /// to replace a call to memcmp. The value is set by the target at the
+  /// performance threshold for such a replacement. If OptSize is true,
+  /// return the limit for functions that have OptSize attribute.
+  unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
+    return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
+  }
+
   /// \brief Get maximum # of store operations permitted for llvm.memmove
   ///
   /// This function returns the maximum number of store operations permitted
@@ -2330,6 +2340,8 @@
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
+  unsigned MaxLoadsPerMemcmp;
+  unsigned MaxLoadsPerMemcmpOptSize;
 
   /// \brief Specify maximum bytes of store instructions per memmove call.
   ///
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -215,6 +215,10 @@
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
+bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const {
+  return TTIImpl->expandMemCmp(I, MaxLoadSize);
+}
+
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
Index: llvm/trunk/lib/Analysis/ValueTracking.cpp
===================================================================
--- llvm/trunk/lib/Analysis/ValueTracking.cpp
+++ llvm/trunk/lib/Analysis/ValueTracking.cpp
@@ -172,6 +172,18 @@
 }
 
 
+bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) {
+  for (const User *U : CxtI->users()) {
+    if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
+      if (IC->isEquality())
+        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+          if (C->isNullValue())
+            continue;
+    return false;
+  }
+  return true;
+}
+
 static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
                                    const Query &Q);
 
Index: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
@@ -24,12 +24,13 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -60,6 +61,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -84,6 +86,12 @@
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+          "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -144,6 +152,11 @@
     cl::desc("Enable merging of redundant sexts when one is dominating"
     " the other."), cl::init(true));
 
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+    cl::desc("The number of loads per basic block for inline expansion of "
+             "memcmp that is only being compared against zero."));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
@@ -1629,6 +1642,593 @@
   return true;
 }
 
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB;
+    PHINode *PhiSrc1;
+    PHINode *PhiSrc2;
+    ResultBlock();
+  };
+
+  CallInst *CI;
+  ResultBlock ResBlock;
+  unsigned MaxLoadSize;
+  unsigned NumBlocks;
+  unsigned NumBlocksNonOneByte;
+  unsigned NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  bool IsUsedForZeroCmp;
+  int calculateNumBlocks(unsigned Size);
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex,
+                            bool IsLittleEndian);
+  void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size,
+                                         unsigned &NumBytesProcessed);
+  void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
+  void emitMemCmpResultBlock(bool IsLittleEndian);
+  Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
+  unsigned getLoadSize(unsigned Size);
+  unsigned getNumLoads(unsigned Size);
+
+public:
+  MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize,
+                  unsigned NumLoadsPerBlock);
+  Value *getMemCmpExpansion(bool IsLittleEndian);
+};
+
+MemCmpExpansion::ResultBlock::ResultBlock()
+    : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {}
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize,
+                                 unsigned NumLoadsPerBlock)
+    : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(NumLoadsPerBlock) {
+
+  IRBuilder<> Builder(CI->getContext());
+
+  BasicBlock *StartBlock = CI->getParent();
+  EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+  setupEndBlockPHINodes();
+  IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  uint64_t Size = SizeCast->getZExtValue();
+
+  // Calculate how many load compare blocks are required for an expansion of
+  // given Size.
+  NumBlocks = calculateNumBlocks(Size);
+  createResultBlock();
+
+  // If return value of memcmp is not used in a zero equality, we need to
+  // calculate which source was larger. The calculation requires the
+  // two loaded source values of each load compare block.
+  // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+  if (!IsUsedForZeroCmp)
+    setupResultBlockPHINodes();
+
+  // Create the number of required load compare basic blocks.
+  createLoadCmpBlocks();
+
+  // Update the terminator added by splitBasicBlock to branch to the first
+  // LoadCmpBlock.
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < NumBlocks; i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp paramters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) {
+  IRBuilder<> Builder(CI->getContext());
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+  // Cast source to LoadSizeType*
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]);
+
+  if (Index < (LoadCmpBlocks.size() - 1)) {
+    // Early exit branch if difference found to EndBlock, otherwise continue to
+    // next LoadCmpBlock
+
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    BranchInst *CmpBr =
+        BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp);
+    Builder.Insert(CmpBr);
+  } else {
+    // The last block has an unconditional branch to EndBlock
+    BranchInst *CmpBr = BranchInst::Create(EndBlock);
+    Builder.Insert(CmpBr);
+  }
+}
+
+unsigned MemCmpExpansion::getNumLoads(unsigned Size) {
+  return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize);
+}
+
+unsigned MemCmpExpansion::getLoadSize(unsigned Size) {
+  return MinAlign(PowerOf2Floor(Size), MaxLoadSize);
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(
+    unsigned Index, unsigned Size, unsigned &NumBytesProcessed) {
+
+  IRBuilder<> Builder(CI->getContext());
+
+  std::vector<Value *> XorList, OrList;
+  Value *Diff;
+
+  unsigned RemainingBytes = Size - NumBytesProcessed;
+  unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
+  unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    unsigned LoadSize = getLoadSize(RemainingBytes);
+    unsigned GEPIndex = NumBytesProcessed / LoadSize;
+    NumBytesProcessed += LoadSize;
+    RemainingBytes -= LoadSize;
+
+    Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
+    Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
+    Value *Source1 = CI->getArgOperand(0);
+    Value *Source2 = CI->getArgOperand(1);
+
+    // Cast source to LoadSizeType*
+    if (Source1->getType() != LoadSizeType)
+      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+    if (Source2->getType() != LoadSizeType)
+      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+    // Get the base address using the GEPIndex
+    if (GEPIndex != 0) {
+      Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                  ConstantInt::get(LoadSizeType, GEPIndex));
+      Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                  ConstantInt::get(LoadSizeType, GEPIndex));
+    }
+
+    // Load LoadSizeType from the base address
+    Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+    Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+    if (LoadSizeType != MaxLoadType) {
+      LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+      LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+    }
+    Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+    Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType);
+    XorList.push_back(Diff);
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  // Pair wise OR the XOR results
+  OrList = pairWiseOr(XorList);
+
+  // Pair wise OR the OR results until one result left
+  while (OrList.size() != 1) {
+    OrList = pairWiseOr(OrList);
+  }
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, OrList[0],
+                                  ConstantInt::get(Diff->getType(), 0));
+  BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[Index + 1];
+  // Early exit branch if difference found to ResultBlock, otherwise continue to
+  // next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes)
+  if (Index == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+  }
+}
+
+// This function creates the IR intructions for loading and comparing using the
+// given LoadSize. It loads the number of bytes specified by LoadSize from each
+// source of the memcmp parameters. It then does a subtract to see if there was
+// a difference in the loaded values. If a difference is found, it branches
+// with an early exit to the ResultBlock for calculating which source was
+// larger. Otherwise, it falls through to the either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
+// a special case through emitLoadCompareByteBlock. The special handling can
+// simply subtract the loaded values and add it to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize,
+                                           int GEPIndex, bool IsLittleEndian) {
+  if (LoadSize == 1) {
+    MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex);
+    return;
+  }
+
+  IRBuilder<> Builder(CI->getContext());
+
+  Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+  // Cast source to LoadSizeType*
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  // Load LoadSizeType from the base address
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (IsLittleEndian) {
+    Function *F = LoadCmpBlocks[Index]->getParent();
+
+    Function *Bswap = Intrinsic::getDeclaration(F->getParent(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (LoadSizeType != MaxLoadType) {
+    LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+  }
+
+  // Add the loaded values to the phi nodes for calculating memcmp result only
+  // if result is not used in a zero equality.
+  if (!IsUsedForZeroCmp) {
+    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]);
+    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]);
+  }
+
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                  ConstantInt::get(Diff->getType(), 0));
+  BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[Index + 1];
+  // Early exit branch if difference found to ResultBlock, otherwise continue to
+  // next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes)
+  if (Index == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+  }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock(bool IsLittleEndian) {
+  IRBuilder<> Builder(CI->getContext());
+
+  // Special case: if memcmp result is used in a zero equality, result does not
+  // need to be calculated and can simply return 1.
+  if (IsUsedForZeroCmp) {
+    BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+    Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+    Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+    PhiRes->addIncoming(Res, ResBlock.BB);
+    BranchInst *NewBr = BranchInst::Create(EndBlock);
+    Builder.Insert(NewBr);
+    return;
+  }
+  BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+  Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+                                  ResBlock.PhiSrc2);
+
+  Value *Res =
+      Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+                           ConstantInt::get(Builder.getInt32Ty(), 1));
+
+  BranchInst *NewBr = BranchInst::Create(EndBlock);
+  Builder.Insert(NewBr);
+  PhiRes->addIncoming(Res, ResBlock.BB);
+}
+
+int MemCmpExpansion::calculateNumBlocks(unsigned Size) {
+  int NumBlocks = 0;
+  bool haveOneByteLoad = false;
+  unsigned RemainingSize = Size;
+  unsigned LoadSize = MaxLoadSize;
+  while (RemainingSize) {
+    if (LoadSize == 1)
+      haveOneByteLoad = true;
+    NumBlocks += RemainingSize / LoadSize;
+    RemainingSize = RemainingSize % LoadSize;
+    LoadSize = LoadSize / 2;
+  }
+  NumBlocksNonOneByte = haveOneByteLoad ? (NumBlocks - 1) : NumBlocks;
+
+  if (IsUsedForZeroCmp)
+    NumBlocks = NumBlocks / NumLoadsPerBlock +
+                (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0);
+
+  return NumBlocks;
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+  IRBuilder<> Builder(CI->getContext());
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  Builder.SetInsertPoint(ResBlock.BB);
+  ResBlock.PhiSrc1 =
+      Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1");
+  ResBlock.PhiSrc2 =
+      Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+  IRBuilder<> Builder(CI->getContext());
+
+  Builder.SetInsertPoint(&EndBlock->front());
+  PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size,
+                                                   bool IsLittleEndian) {
+  unsigned NumBytesProcessed = 0;
+  // This loop populates each of the LoadCmpBlocks with IR sequence to handle
+  // multiple loads per block
+  for (unsigned i = 0; i < NumBlocks; ++i) {
+    emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed);
+  }
+
+  emitMemCmpResultBlock(IsLittleEndian);
+  return PhiRes;
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion(bool IsLittleEndian) {
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  uint64_t Size = SizeCast->getZExtValue();
+
+  int LoadSize = MaxLoadSize;
+  int NumBytesToBeProcessed = Size;
+
+  if (IsUsedForZeroCmp) {
+    return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
+  }
+
+  unsigned Index = 0;
+  // This loop calls emitLoadCompareBlock for comparing SizeVal bytes of the two
+  // memcmp source. It starts with loading using the maximum load size set by
+  // the target. It processes any remaining bytes using a load size which is the
+  // next smallest power of 2.
+  while (NumBytesToBeProcessed) {
+    // Calculate how many blocks we can create with the current load size
+    int NumBlocks = NumBytesToBeProcessed / LoadSize;
+    int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize;
+    NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize;
+
+    // For each NumBlocks, populate the instruction sequence for loading and
+    // comparing LoadSize bytes
+    while (NumBlocks--) {
+      emitLoadCompareBlock(Index, LoadSize, GEPIndex, IsLittleEndian);
+      Index++;
+      GEPIndex++;
+    }
+    // Get the next LoadSize to use
+    LoadSize = LoadSize / 2;
+  }
+
+  emitMemCmpResultBlock(IsLittleEndian);
+  return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave as a library call.
+// Otherwise, the library call is replaced wtih new IR instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+///  %0 = bitcast i32* %buffer2 to i8*
+///  %1 = bitcast i32* %buffer1 to i8*
+///  %2 = bitcast i8* %1 to i64*
+///  %3 = bitcast i8* %0 to i64*
+///  %4 = load i64, i64* %2
+///  %5 = load i64, i64* %3
+///  %6 = call i64 @llvm.bswap.i64(i64 %4)
+///  %7 = call i64 @llvm.bswap.i64(i64 %5)
+///  %8 = sub i64 %6, %7
+///  %9 = icmp ne i64 %8, 0
+///  br i1 %9, label %res_block, label %loadbb1
+/// res_block:                                        ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+///  %10 = icmp ult i64 %phi.src1, %phi.src2
+///  %11 = select i1 %10, i32 -1, i32 1
+///  br label %endblock
+/// loadbb1:                                          ; preds = %loadbb
+///  %12 = bitcast i32* %buffer2 to i8*
+///  %13 = bitcast i32* %buffer1 to i8*
+///  %14 = bitcast i8* %13 to i32*
+///  %15 = bitcast i8* %12 to i32*
+///  %16 = getelementptr i32, i32* %14, i32 2
+///  %17 = getelementptr i32, i32* %15, i32 2
+///  %18 = load i32, i32* %16
+///  %19 = load i32, i32* %17
+///  %20 = call i32 @llvm.bswap.i32(i32 %18)
+///  %21 = call i32 @llvm.bswap.i32(i32 %19)
+///  %22 = zext i32 %20 to i64
+///  %23 = zext i32 %21 to i64
+///  %24 = sub i64 %22, %23
+///  %25 = icmp ne i64 %24, 0
+///  br i1 %25, label %res_block, label %loadbb2
+/// loadbb2:                                          ; preds = %loadbb1
+///  %26 = bitcast i32* %buffer2 to i8*
+///  %27 = bitcast i32* %buffer1 to i8*
+///  %28 = bitcast i8* %27 to i16*
+///  %29 = bitcast i8* %26 to i16*
+///  %30 = getelementptr i16, i16* %28, i16 6
+///  %31 = getelementptr i16, i16* %29, i16 6
+///  %32 = load i16, i16* %30
+///  %33 = load i16, i16* %31
+///  %34 = call i16 @llvm.bswap.i16(i16 %32)
+///  %35 = call i16 @llvm.bswap.i16(i16 %33)
+///  %36 = zext i16 %34 to i64
+///  %37 = zext i16 %35 to i64
+///  %38 = sub i64 %36, %37
+///  %39 = icmp ne i64 %38, 0
+///  br i1 %39, label %res_block, label %loadbb3
+/// loadbb3:                                          ; preds = %loadbb2
+///  %40 = bitcast i32* %buffer2 to i8*
+///  %41 = bitcast i32* %buffer1 to i8*
+///  %42 = getelementptr i8, i8* %41, i8 14
+///  %43 = getelementptr i8, i8* %40, i8 14
+///  %44 = load i8, i8* %42
+///  %45 = load i8, i8* %43
+///  %46 = zext i8 %44 to i32
+///  %47 = zext i8 %45 to i32
+///  %48 = sub i32 %46, %47
+///  br label %endblock
+/// endblock:                                         ; preds = %res_block,
+/// %loadbb3
+///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+///  ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+                         const TargetLowering *TLI, const DataLayout *DL) {
+  NumMemCmpCalls++;
+  IRBuilder<> Builder(CI->getContext());
+
+  // TTI call to check if target would like to expand memcmp and get the
+  // MaxLoadSize
+  unsigned MaxLoadSize;
+  if (!TTI->expandMemCmp(CI, MaxLoadSize))
+    return false;
+
+  // Early exit from expansion if -Oz
+  if (CI->getParent()->getParent()->optForMinSize()) {
+    return false;
+  }
+
+  // Early exit from expansion if size is not a constant
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+
+  // Early exit from expansion if size greater than max bytes to load
+  uint64_t SizeVal = SizeCast->getZExtValue();
+
+  unsigned NumLoads = 0;
+  unsigned RemainingSize = SizeVal;
+  unsigned LoadSize = MaxLoadSize;
+  while (RemainingSize) {
+    NumLoads += RemainingSize / LoadSize;
+    RemainingSize = RemainingSize % LoadSize;
+    LoadSize = LoadSize / 2;
+  }
+
+  if (NumLoads >
+      TLI->getMaxExpandSizeMemcmp(CI->getParent()->getParent()->optForSize())) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  // MemCmpHelper object, creates and sets up basic blocks required for
+  // expanding memcmp with size SizeVal
+  unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock;
+  MemCmpExpansion MemCmpHelper(CI, MaxLoadSize, NumLoadsPerBlock);
+
+  Value *Res = MemCmpHelper.getMemCmpExpansion(DL->isLittleEndian());
+
+  // Replace call with result of expansion and erarse call.
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -1780,6 +2380,15 @@
     CI->eraseFromParent();
     return true;
   }
+
+  LibFunc Func;
+  if (TLInfo->getLibFunc(*CI->getCalledFunction(), Func) &&
+      Func == LibFunc_memcmp) {
+    if (expandMemCmp(CI, TTI, TLI, DL)) {
+      ModifiedDT = true;
+      return true;
+    }
+  }
   return false;
 }
 
@@ -4927,6 +5536,7 @@
   return true;
 }
 
+
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move downward extractelement transition.
Index: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
@@ -842,9 +842,10 @@
   initActions();
 
   // Perform these initializations only once.
-  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
-  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
-    = MaxStoresPerMemmoveOptSize = 4;
+  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
+      MaxLoadsPerMemcmp = 8;
+  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
+      MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
   UseUnderscoreSetJmp = false;
   UseUnderscoreLongJmp = false;
   HasMultipleConditionRegisters = false;
Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1041,6 +1041,10 @@
     MaxStoresPerMemset = 128;
     MaxStoresPerMemcpy = 128;
     MaxStoresPerMemmove = 128;
+    MaxLoadsPerMemcmp = 128;
+  } else {
+    MaxLoadsPerMemcmp = 8;
+    MaxLoadsPerMemcmpOptSize = 4;
   }
 }
 
Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -60,6 +60,7 @@
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
Index: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -215,6 +215,11 @@
   return LoopHasReductions;
 }
 
+bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+  MaxLoadSize = 8;
+  return true;
+}
+
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
   return true;
 }
Index: llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -85,20 +85,6 @@
   return false;
 }
 
-/// Return true if it only matters that the value is equal or not-equal to zero.
-static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
-  for (User *U : V->users()) {
-    if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
-      if (IC->isEquality())
-        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
-          if (C->isNullValue())
-            continue;
-    // Unknown instruction.
-    return false;
-  }
-  return true;
-}
-
 /// Return true if it is only used in equality comparisons with With.
 static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
   for (User *U : V->users()) {
Index: llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
+++ llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -0,0 +1,121 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
+@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
+@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
+@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
+@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
+@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
+@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
+@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
+
+; Validate with if(memcmp())
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16)
+  %not.tobool = icmp ne i32 %call, 0
+  %. = zext i1 %not.tobool to i32
+  ret i32 %.
+
+  ; CHECK-LABEL: @zeroEqualityTest01
+  ; CHECK-LABEL: %res_block
+  ; CHECK: li 3, 1
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+  ; CHECK: li 3, 0
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+}
+
+; Validate with if(memcmp() == 0)
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
+  %not.cmp = icmp ne i32 %call, 0
+  %. = zext i1 %not.cmp to i32
+  ret i32 %.
+
+  ; CHECK-LABEL: @zeroEqualityTest02
+  ; CHECK-LABEL: %res_block
+  ; CHECK: li 3, 1
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+  ; CHECK: li 3, 0
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+}
+
+; Validate with > 0
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
+  %not.cmp = icmp slt i32 %call, 1
+  %. = zext i1 %not.cmp to i32
+  ret i32 %.
+
+  ; CHECK-LABEL: @zeroEqualityTest03
+  ; CHECK-LABEL: %res_block
+  ; CHECK: cmpld
+  ; CHECK-NEXT: li [[LI:[0-9]+]], 1
+  ; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+  ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
+}
+
+; Validate with < 0
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16)
+  %call.lobit = lshr i32 %call, 31
+  %call.lobit.not = xor i32 %call.lobit, 1
+  ret i32 %call.lobit.not
+
+  ; CHECK-LABEL: @zeroEqualityTest04
+  ; CHECK-LABEL: %res_block
+  ; CHECK: cmpld
+  ; CHECK-NEXT: li [[LI:[0-9]+]], 1
+  ; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+  ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
+}
+
+; Validate with memcmp()?:
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
+  %not.tobool = icmp eq i32 %call, 0
+  %cond = zext i1 %not.tobool to i32
+  ret i32 %cond
+
+  ; CHECK-LABEL: @zeroEqualityTest05
+  ; CHECK-LABEL: %res_block
+  ; CHECK: li 3, 1
+  ; CHECK: li 3, 0
+}
+
+; Validate with !memcmp()?:
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
+  %not.lnot = icmp ne i32 %call, 0
+  %cond = zext i1 %not.lnot to i32
+  ret i32 %cond
+
+  ; CHECK-LABEL: @zeroEqualityTest06
+  ; CHECK-LABEL: %res_block
+  ; CHECK: li 3, 1
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+  ; CHECK: li 3, 0
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+}
Index: llvm/trunk/test/CodeGen/PowerPC/memcmp.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memcmp.ll
+++ llvm/trunk/test/CodeGen/PowerPC/memcmp.ll
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux  < %s | FileCheck %s -check-prefix=CHECK
+
+; Check size 8
+; Function Attrs: nounwind readonly
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test1
+; CHECK: ldbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 4
+; Function Attrs: nounwind readonly
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test2
+; CHECK: lwbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 2
+; Function Attrs: nounwind readonly
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test3
+; CHECK: lhbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 1
+; Function Attrs: nounwind readonly
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test4
+; CHECK: lbz [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lbz [[LOAD2:[0-9]+]]
+; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: extsw 3, [[SUB]]
+; CHECK-NEXT: blr
+}
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8*, i8*, i64) #1
Index: llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll
+++ llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll
@@ -0,0 +1,194 @@
+; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s
+; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE
+
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2)  {
+entry:
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-LABEL: res_block:{{.*}}
+  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-NEXT: br label %endblock
+
+  ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+  ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+  ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label %endblock
+
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+  ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+  ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+  ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label %endblock
+
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16)
+  ret i32 %call
+}
+
+declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
+
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2)  {
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
+  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
+  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label %endblock
+
+  ; CHECK-LABEL: res_block:{{.*}}
+  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-NEXT: br label %endblock
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label %endblock
+
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4)
+  ret i32 %call
+}
+
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2)  {
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-LABEL: res_block:{{.*}}
+  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-NEXT: br label %endblock
+
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
+  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
+  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]])
+  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64
+  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
+  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-NEXT:  br label %endblock
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT:  br label %endblock
+
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+  ret i32 %call
+}
+  ; CHECK: call = tail call signext i32 @memcmp
+  ; CHECK-BE: call = tail call signext i32 @memcmp
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2)  {
+
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65)
+  ret i32 %call
+}
+
+define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE)  {
+  ; CHECK: call = tail call signext i32 @memcmp
+  ; CHECK-BE: call = tail call signext i32 @memcmp
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %conv = sext i32 %SIZE to i64
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv)
+  ret i32 %call
+}