Index: lib/Target/PowerPC/CMakeLists.txt
===================================================================
--- lib/Target/PowerPC/CMakeLists.txt
+++ lib/Target/PowerPC/CMakeLists.txt
@@ -27,6 +27,7 @@
   PPCFastISel.cpp
   PPCFrameLowering.cpp
   PPCLoopPreIncPrep.cpp
+  PPCLowerMemIntrinsics.cpp
   PPCMCInstLower.cpp
   PPCMachineFunctionInfo.cpp
   PPCMIPeephole.cpp
Index: lib/Target/PowerPC/PPC.h
===================================================================
--- lib/Target/PowerPC/PPC.h
+++ lib/Target/PowerPC/PPC.h
@@ -27,6 +27,7 @@
   class FunctionPass;
   class MachineInstr;
   class MachineOperand;
+  class ModulePass;
   class AsmPrinter;
   class MCInst;
   class MCOperand;
@@ -50,6 +51,9 @@
   FunctionPass *createPPCTLSDynamicCallPass();
   FunctionPass *createPPCBoolRetToIntPass();
   FunctionPass *createPPCExpandISELPass();
+
+  ModulePass *createPPCLowerMemIntrinsicsPass();
+
   void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                     AsmPrinter &AP, bool isDarwin);
   bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
@@ -60,6 +64,7 @@
   void initializePPCBoolRetToIntPass(PassRegistry&);
   void initializePPCExpandISELPass(PassRegistry &);
   void initializePPCTLSDynamicCallPass(PassRegistry &);
+  void initializePPCLowerMemIntrinsicsPass(PassRegistry &);
   extern char &PPCVSXFMAMutateID;
 
   namespace PPCII {
Index: lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp
===================================================================
--- /dev/null
+++ lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp
@@ -0,0 +1,163 @@
+//===-------- PPCLowerMemIntrinsics.cpp - Expand memory intrinsics  -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// An IR-to-IR pass that expands llvm.memcpy intrinsics into the equivalent
+/// load-store loops.
+///
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+
+#define DEBUG_TYPE "ppc-memcpy-loop-lowering"
+
+// This pass will loop over all MemCpyInsts and expand some of them into loops.
+// For known compile-time sizes, calls where the size belongs to
+// [MemcpyLoopFloor, MemcpyLoopCeil] will be expanded. For unknown sizes we are
+// expanding all call sites.
+
+STATISTIC(MemCpyLoopExpansions, "Number of memcpy calls expanded into a loop.");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMemcpyExpansionPass(
+    "ppc-enable-memcpy-loops",
+    cl::desc("Enable the PPC pass that lowers memcpy calls into loops."),
+    cl::init(false), cl::Hidden);
+
+// Options used to tune the size range where memcpy expansions occur.
+static cl::opt<unsigned> MemcpyLoopFloor(
+    "ppc-memcpy-loop-floor", cl::Hidden, cl::init(129),
+    cl::desc(
+        "The lower size bound of memcpy calls to get expanded into a loop"));
+
+static cl::opt<unsigned> MemcpyLoopCeil(
+    "ppc-memcpy-loop-ceil", cl::Hidden, cl::init(256),
+    cl::desc("The upper size bound of memcpy calls to get expanded in a loop"));
+
+namespace {
+class PPCLowerMemIntrinsics : public ModulePass {
+public:
+  static char ID;
+
+  PPCLowerMemIntrinsics() : ModulePass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+  bool runOnModule(Module &M) override;
+  /// Loops over all uses of llvm.memcpy and expands each call if warranted.
+  /// \p MemcpyDecl is the llvm.memcpy declaration; returns true on any change.
+  bool expandMemcopies(Function &MemcpyDecl);
+
+  StringRef getPassName() const override {
+    return "PPC Lower memcpy into loops";
+  }
+};
+} // end anonymous namespace
+
+
+// Checks whether the cpu arch is one where we want to expand memcpy calls.
+// Takes a StringRef so callers holding one need not build a std::string.
+static bool CPUCheck(StringRef CpuStr) {
+  return StringSwitch<bool>(CpuStr)
+      .Case("pwr8", true)
+      .Case("pwr9", true)
+      .Case("ppc64le", true) // The default cpu for little-endian.
+      .Default(false);
+}
+
+// Returns true if the memcpy call \p MC should be expanded into a loop.
+static bool shouldExpandMemCpy(MemCpyInst *MC) {
+  // If compiling for -O0, -Oz or -Os we don't want to expand.
+  Function *ParentFunc = MC->getParent()->getParent();
+  if (ParentFunc->optForSize() ||
+      ParentFunc->hasFnAttribute(Attribute::OptimizeNone))
+    return false;
+
+  // See if the cpu arch is one we want to expand for. If there is no
+  // target-cpu attribute assume we don't want to expand.
+  Attribute CPUAttr = ParentFunc->getFnAttribute("target-cpu");
+  if (CPUAttr.hasAttribute(Attribute::None) ||
+      !CPUCheck(CPUAttr.getValueAsString())) {
+    return false;
+  }
+
+  // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil].
+  ConstantInt *CISize = dyn_cast<ConstantInt>(MC->getLength());
+  if (CISize) {
+    return CISize->getZExtValue() >= MemcpyLoopFloor &&
+           CISize->getZExtValue() <= MemcpyLoopCeil;
+  }
+
+  // Otherwise the size is not a compile-time constant: always expand.
+  return true;
+}
+
+/// Expands qualifying calls of declaration \p F; returns true on any change.
+bool PPCLowerMemIntrinsics::expandMemcopies(Function &F) {
+  bool AnyExpanded = false;
+  assert(Intrinsic::memcpy == F.getIntrinsicID() &&
+         "expandMemcopies called on wrong function declaration.");
+  // Step the iterator before erasing: eraseFromParent() unlinks the use.
+  for (auto UI = F.user_begin(), UE = F.user_end(); UI != UE;) {
+    MemCpyInst *MC = cast<MemCpyInst>(*UI++);
+    if (!shouldExpandMemCpy(MC))
+      continue;
+    Function *ParentFunc = MC->getParent()->getParent();
+    const TargetTransformInfo &TTI =
+        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
+    expandMemCpyAsLoop(MC, TTI);
+    MC->eraseFromParent();
+    AnyExpanded = true;
+    ++MemCpyLoopExpansions;
+  }
+  return AnyExpanded;
+}
+
+/// Expands qualifying llvm.memcpy calls in \p M; returns true if any call was
+/// rewritten. Does nothing unless -ppc-enable-memcpy-loops is set.
+bool PPCLowerMemIntrinsics::runOnModule(Module &M) {
+  if (!EnableMemcpyExpansionPass || skipModule(M))
+    return false;
+
+  bool Modified = false;
+  for (Function &F : M) {
+    // Only intrinsic declarations are of interest; skip any definition.
+    if (!F.isDeclaration())
+      continue;
+
+    // llvm.memcpy is overloaded, so a module may hold several declarations.
+    // Accumulate with |= so a later declaration with nothing to expand does
+    // not discard an earlier change.
+    if (F.getIntrinsicID() == Intrinsic::memcpy)
+      Modified |= expandMemcopies(F);
+  }
+
+  return Modified;
+}
+
+ModulePass *llvm::createPPCLowerMemIntrinsicsPass() {
+  return new PPCLowerMemIntrinsics(); // Handed to addPass() by the target.
+}
+
+char PPCLowerMemIntrinsics::ID = 0;
+INITIALIZE_PASS(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics",
+                "Lower mem intrinsics into loops", false, false)
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -102,6 +102,7 @@
   initializePPCBoolRetToIntPass(PR);
   initializePPCExpandISELPass(PR);
   initializePPCTLSDynamicCallPass(PR);
+  initializePPCLowerMemIntrinsicsPass(PR);
 }
 
 /// Return the datalayout string of a subtarget.
@@ -337,6 +338,10 @@
   if (UsePrefetching)
     addPass(createLoopDataPrefetchPass());
 
+
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCLowerMemIntrinsicsPass());
+
   if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) {
     // Call SeparateConstOffsetFromGEP pass to extract constants within indices
     // and lower a GEP with multiple indices to either arithmetic operations or
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -91,7 +91,13 @@
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
                                  unsigned AddressSpace);
-
+  Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                                  unsigned SrcAlign, unsigned DestAlign) const;
+  void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
+                                         LLVMContext &Context,
+                                         unsigned RemainingBytes,
+                                         unsigned SrcAlign,
+                                         unsigned DestAlign) const;
   /// @}
 };
 
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -480,3 +480,31 @@
   return Cost;
 }
 
+Type *PPCTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+                                            unsigned SrcAlign,
+                                            unsigned DestAlign) const {
+  return Type::getInt64Ty(Context); // Always i64; other params are unused.
+}
+
+/// Appends to \p OpsOut the integer types, widest first (i64, i32, i16, i8),
+/// whose byte sizes add up to \p RemainingBytes. Alignments are not consulted.
+void PPCTTIImpl::getMemcpyLoopResidualLoweringType(
+    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+    unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
+  // Candidate operand types, ordered from widest to narrowest.
+  IntegerType *CopyTypes[] = {
+      Type::getInt64Ty(Context), Type::getInt32Ty(Context),
+      Type::getInt16Ty(Context), Type::getInt8Ty(Context)};
+
+  // Greedily emit the widest type that still fits in the remaining bytes.
+  for (auto OpTy : CopyTypes) {
+    unsigned OpSize = OpTy->getBitWidth() / 8;
+    // Loop rather than test once: the remaining bytes may be at least twice
+    // the size of the widest copy operand type.
+    while (RemainingBytes >= OpSize) {
+      RemainingBytes -= OpSize;
+      OpsOut.push_back(OpTy);
+    }
+  }
+}
+
Index: test/CodeGen/PowerPC/memcpy-loop-expansion.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/memcpy-loop-expansion.ll
@@ -0,0 +1,163 @@
+; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true   \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu -ppc-memcpy-loop-floor=0 \
+; RUN: -mcpu=pwr8 %s| FileCheck -check-prefix=OPT %s
+; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \
+; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 %s | \
+; RUN: FileCheck %s --check-prefix PWR7
+; RUN: llc < %s  -ppc-enable-memcpy-loops=true   \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8  -O0  | \
+; RUN: FileCheck %s --check-prefix OPTNONE
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #0
+
+; Check that memcpy calls with a known zero length are removed.
+define i8* @memcpy_zero_size(i8* %dst, i8* %src) {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 0, i32 1, i1 false)
+  ret i8* %dst
+
+; OPT-LABEL: @memcpy_zero_size
+; OPT-NEXT:  entry:
+; OPT-NEXT:  ret i8* %dst
+}
+
+; Check that a memcpy call with a known size smaller than the loop operand
+; type is handled properly.
+define i8* @memcpy_small_size(i8* %dst, i8* %src) {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 7, i32 1, i1 false)
+  ret i8* %dst
+
+; OPT-LABEL: @memcpy_small_size
+; OPT-NEXT:  entry:
+; OPT-NEXT:  [[SrcAsi32:%[0-9]+]] = bitcast i8* %src to i32*
+; OPT-NEXT:  [[SrcGep:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 0
+; OPT-NEXT:  [[Load:%[0-9]+]] = load i32, i32* [[SrcGep]]
+; OPT-NEXT:  [[DstAsi32:%[0-9]+]] = bitcast i8* %dst to i32*
+; OPT-NEXT:  [[DstGep:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 0
+; OPT-NEXT:  store i32 [[Load]], i32* [[DstGep]]
+; OPT-NEXT:  [[SrcAsi16:%[0-9]+]] = bitcast i8* %src to i16*
+; OPT-NEXT:  [[SrcGep2:%[0-9]+]] = getelementptr inbounds i16, i16* [[SrcAsi16]], i64 2
+; OPT-NEXT:  [[Load2:%[0-9]+]] = load i16, i16* [[SrcGep2]]
+; OPT-NEXT:  [[DstAsi16:%[0-9]+]] = bitcast i8* %dst to i16*
+; OPT-NEXT:  [[DstGep2:%[0-9]+]] = getelementptr inbounds i16, i16* [[DstAsi16]], i64 2
+; OPT-NEXT:  store i16 [[Load2]], i16* [[DstGep2]]
+; OPT-NEXT:  [[SrcGep3:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 6
+; OPT-NEXT:  [[Load3:%[0-9]+]] = load i8, i8* [[SrcGep3]]
+; OPT-NEXT:  [[DstGep3:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 6
+; OPT-NEXT:  store i8 [[Load3]], i8* [[DstGep3]]
+; OPT-NEXT:  ret i8* %dst
+}
+
+; Check the expansion of a memcpy call with compile-time size.
+define i8* @memcpy_known_size(i8* %dst, i8* %src) {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 100, i32 1, i1 false)
+  ret i8* %dst
+; OPT-LABEL: @memcpy_known_size
+; OPT:       entry:
+; OPT-NEXT:  [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64*
+; OPT-NEXT:  [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64*
+; OPT-NEXT:  br label %load-store-loop
+
+; OPT:       load-store-loop:
+; OPT-NEXT:  %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; OPT-NEXT:  [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index
+; OPT-NEXT:  [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]]
+; OPT-NEXT:  [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index
+; OPT-NEXT:  store i64 [[Load]], i64* [[DstGep]]
+; OPT-NEXT:  [[IndexInc]] = add i64 %loop-index, 1
+; OPT-NEXT:  [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 12
+; OPT-NEXT:  br i1 [[CMP]], label %load-store-loop, label %memcpy-split
+
+; OPT:       memcpy-split:
+; OPT-NEXT:  [[SrcAsi32:%[0-9]+]] = bitcast i64* [[SrcCast]] to i32*
+; OPT-NEXT:  [[SrcGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24
+; OPT-NEXT:  [[Load2:%[0-9]+]] = load i32, i32* [[SrcGep2]]
+; OPT-NEXT:  [[DstAsi32:%[0-9]+]] = bitcast i64* [[DstCast]] to i32*
+; OPT-NEXT:  [[DstGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24
+; OPT-NEXT:  store i32 [[Load2]], i32* [[DstGep2]]
+; OPT-NEXT:  ret i8* %dst
+}
+
+
+; Check the expansion of a memcpy whose size argument is not a compile time
+; constant.
+define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
+  ret i8* %dst
+
+; OPT-LABEL: @memcpy_unkown_size
+; OPT:       entry:
+; OPT-NEXT:  [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64*
+; OPT-NEXT:  [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64*
+; OPT-NEXT:  [[LoopCount:%[0-9]+]] = udiv i64 %len, 8
+; OPT-NEXT:  [[ResBytes:%[0-9]+]] = urem i64 %len, 8
+; OPT-NEXT:  [[BytesCopied:%[0-9]+]] = sub i64 %len, [[ResBytes]]
+; OPT-NEXT:  [[Cmp:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; OPT-NEXT:  br i1 [[Cmp]], label %loop-memcpy-expansion, label %loop-memcpy-residual-header
+
+; OPT:       post-loop-memcpy-expansion:
+; OPT-NEXT:  ret i8* %dst
+
+; OPT:       loop-memcpy-expansion:
+; OPT-NEXT:  %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; OPT-NEXT:  [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index
+; OPT-NEXT:  [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]]
+; OPT-NEXT:  [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index
+; OPT-NEXT:  store i64 [[Load]], i64* [[DstGep]]
+; OPT-NEXT:  [[IndexInc]] = add i64 %loop-index, 1
+; OPT-NEXT:  [[LoopCmp:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; OPT-NEXT:  br i1 [[LoopCmp]], label %loop-memcpy-expansion, label %loop-memcpy-residual-header
+
+; OPT:       loop-memcpy-residual:
+; OPT-NEXT:  %residual-loop-index = phi i64 [ 0, %loop-memcpy-residual-header ], [ [[ResIndexInc:%[0-9]+]], %loop-memcpy-residual ]
+; OPT-NEXT:  [[SrcAsi8:%[0-9]+]] = bitcast i64* [[SrcCast]] to i8*
+; OPT-NEXT:  [[DstAsi8:%[0-9]+]] = bitcast i64* [[DstCast]] to i8*
+; OPT-NEXT:  [[ResIndex:%[0-9]+]] = add i64 [[BytesCopied]], %residual-loop-index
+; OPT-NEXT:  [[SrcGep2:%[0-9]+]] = getelementptr inbounds i8, i8* [[SrcAsi8]], i64 [[ResIndex]]
+; OPT-NEXT:  [[Load2:%[0-9]+]] = load i8, i8* [[SrcGep2]]
+; OPT-NEXT:  [[DstGep2:%[0-9]+]] = getelementptr inbounds i8, i8* [[DstAsi8]], i64 [[ResIndex]]
+; OPT-NEXT:  store i8 [[Load2]], i8* [[DstGep2]]
+; OPT-NEXT:  [[ResIndexInc]] = add i64 %residual-loop-index, 1
+; OPT-NEXT:  [[RCmp:%[0-9]+]] = icmp ult i64 [[ResIndexInc]], [[ResBytes]]
+; OPT-NEXT:  br i1 [[RCmp]], label %loop-memcpy-residual, label %post-loop-memcpy-expansion
+
+; OPT:       loop-memcpy-residual-header:
+; OPT-NEXT:  [[RHCmp:%[0-9]+]] = icmp ne i64 [[ResBytes]], 0
+; OPT-NEXT:  br i1 [[RHCmp]], label %loop-memcpy-residual, label %post-loop-memcpy-expansion
+}
+
+; Ensure the pass doesn't expand memcpy calls when compiling a function with an
+; unsupported target-cpu attribute.
+define i8* @memcpy_power7(i8* %dst, i8* %src, i64 %len) #1 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
+  ret i8* %dst
+; PWR7-LABEL: @memcpy_power7
+; PWR7:       tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
+}
+
+; Ensure the pass doesn't expand calls in a function compiled for size.
+define i8* @memcpy_opt_small(i8* %dst, i8* %src, i64 %len) #2 {
+  entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
+  ret i8* %dst
+; OPT-LABEL: @memcpy_opt_small
+; OPT:       tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
+}
+
+; Ensure the pass doesn't expand calls on functions not compiled with
+; optimizations.
+define i8* @memcpy_opt_none(i8* %dst, i8* %src, i64 %len) {
+  entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
+  ret i8* %dst
+; OPTNONE-LABEL: @memcpy_opt_none
+; OPTNONE:       bl memcpy
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { "target-cpu"="pwr7" }
+attributes #2 = { "target-cpu"="pwr8" optsize }