Index: lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp
===================================================================
--- lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp
+++ lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp
@@ -15,7 +15,10 @@
 #include "PPC.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -24,23 +27,26 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 #define DEBUG_TYPE "ppc-memcpy-loop-lowering"
 
 // This pass will loop over all MemCpyInstrs and expand some of them into loops.
 // For known compile time sizes, calls where the size belongs to
 // [MemcpyLoopFloor, MemcpyLoopCeil] will be expanded. For unknown sizes we are
-// currently expanding all call sites. The pass is off by default and can be
-// enabled with 'ppc-enable-memcpy-loops=true'.
+// currently not expanding memcpy calls by default. memcpy calls of unknown
+// sizes in hot paths can be expanded with 'ppc-memcpy-unknown-loops=true'. The
+// pass is on by default; disable it with 'ppc-enable-memcpy-loops=false'.
 
-STATISTIC(MemCpyLoopExpansions, "Number of memcpy calls expanded into a loop.");
+STATISTIC(MemCpyLoopExpansions,
+		          "Total number of memcpy calls expanded into a loop.");
 
 using namespace llvm;
 
 static cl::opt<bool> EnableMemcpyExpansionPass(
     "ppc-enable-memcpy-loops",
     cl::desc("Enable the PPC pass that lowers memcpy calls into loops."),
-    cl::init(false), cl::Hidden);
+    cl::init(true), cl::Hidden);
 
 // Options used to tune the size range where memcpy expansions occur.
 static cl::opt<unsigned> MemcpyLoopFloor(
@@ -49,9 +55,28 @@
         "The lower size bound of memcpy calls to get expanded into a loop"));
 
 static cl::opt<unsigned> MemcpyLoopCeil(
-    "ppc-memcpy-loop-ceil", cl::Hidden, cl::init(256),
+    "ppc-memcpy-loop-ceil", cl::Hidden, cl::init(512),
     cl::desc("The upper size bound of memcpy calls to get expanded in a loop"));
 
+static cl::opt<unsigned> MemcpyLoopUnknownThreshold(
+    "ppc-memcpy-loop-unknown-threshold", cl::Hidden, cl::init(128),
+    cl::desc("The upper size bound of memcpy calls to get expanded in a loop for unknown sizes"));
+
+static cl::opt<bool> MemcpyLoopDoKnown(
+    "ppc-memcpy-known-loops",
+    cl::desc("Enable memcpy loop expansion for known size loops."),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<bool> MemcpyLoopDoUnknown(
+    "ppc-memcpy-unknown-loops", 
+    cl::desc("Enable memcpy loop expansion for unknown size loops."),
+    cl::init(false), cl::Hidden);
+
+static cl::opt<bool> MemcpyLoopDoUnknownNonPGO(
+    "ppc-memcpy-non-pgo-unknown-loops",
+    cl::desc("Enable memcpy loop expansion for unknown size loops even without PGO information."),
+    cl::init(false), cl::Hidden);
+
 namespace {
 class PPCLowerMemIntrinsics : public ModulePass {
 public:
@@ -61,8 +86,12 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<ProfileSummaryInfoWrapperPass>();
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
   }
 
+  bool shouldExpandMemCpy(MemCpyInst *MC, ProfileSummaryInfo *PSI, 
+		  BlockFrequencyInfo &BFI);
   bool runOnModule(Module &M) override;
   /// Loops over all uses of llvm.memcpy and expands the call if warranted.
   //  \p MemcpyDecl is the function declaration of llvm.memcpy.
@@ -74,7 +103,6 @@
 };
 } // end anonymous namespace
 
-
 // Checks whether the cpu arch is one where we want to expand
 // memcpy calls.
 static bool CPUCheck(const std::string &CpuStr) {
@@ -86,7 +114,8 @@
 }
 
 // Determines if we want to expand a specific memcpy call.
-static bool shouldExpandMemCpy(MemCpyInst *MC) {
+bool PPCLowerMemIntrinsics::shouldExpandMemCpy(MemCpyInst *MC,
+		ProfileSummaryInfo *PSI, BlockFrequencyInfo &BFI) {
   // If compiling for -O0, -Oz or -Os we don't want to expand.
   Function *ParentFunc = MC->getParent()->getParent();
   if (ParentFunc->optForSize() ||
@@ -101,15 +130,42 @@
     return false;
   }
 
-  // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil].
+  // Check if the memcpy call has a known size.
   ConstantInt *CISize = dyn_cast<ConstantInt>(MC->getLength());
-  if (CISize) {
-    return CISize->getZExtValue() >= MemcpyLoopFloor &&
-           CISize->getZExtValue() <= MemcpyLoopCeil;
+  if (CISize && !MemcpyLoopDoKnown)
+      return false;
+
+  if (!CISize && !MemcpyLoopDoUnknown)
+      return false;
+
+  // Do not expand memcpy calls within cold call sites.
+  bool HasPGOInfo = false;
+  if (PSI) {
+    Optional<uint64_t> Count = PSI->getProfileCount(MC, &BFI);
+    if (Count.hasValue()) {
+      HasPGOInfo = true;
+      if (PSI->isColdCallSite(CallSite(MC), &BFI))
+        return false;
+    }
   }
 
-  // Otherwise expand unkown sizes ...
-  return true;
+  // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil].
+  if (CISize)
+    return CISize->getZExtValue() >= MemcpyLoopFloor &&
+              CISize->getZExtValue() <= MemcpyLoopCeil;
+
+  // Unknown sizes: expand only with PGO info or when the non-PGO flag is set.
+  return (HasPGOInfo || MemcpyLoopDoUnknownNonPGO);
+}
+
+// Return the i1 guard (length <= threshold) for unknown-size memcpy expansion.
+static Value *getExpandUnknownSizeMemCpyCond(MemCpyInst *MI) {
+  // Emit the unsigned compare immediately before the memcpy call itself.
+  IRBuilder<> Builder(MI);
+  Value *Length = MI->getLength();
+  Value *Threshold =
+      ConstantInt::get(Length->getType(), MemcpyLoopUnknownThreshold);
+  return Builder.CreateICmpULE(Length, Threshold);
 }
 
 // Wrapper function that determines which expansion to call depending on if the
@@ -122,10 +178,20 @@
                               ConstLen, MI->getAlignment(), MI->getAlignment(),
                               MI->isVolatile(), MI->isVolatile(), TTI);
   } else {
-    createMemCpyLoopUnknownSize(MI, MI->getRawSource(), MI->getRawDest(),
+    // Create an if-then-else block and insert it before the memCpy instruction.
+    TerminatorInst *ThenTerm, *ElseTerm;
+    SplitBlockAndInsertIfThenElse(getExpandUnknownSizeMemCpyCond(MI),
+                                  MI, &ThenTerm, &ElseTerm, nullptr);
+    // Generate the memCpy expansion loop in the then-block.
+    createMemCpyLoopUnknownSize(ThenTerm, MI->getRawSource(), MI->getRawDest(),
                                 MI->getLength(), MI->getAlignment(),
                                 MI->getAlignment(), MI->isVolatile(),
                                 MI->isVolatile(), TTI);
+
+    // Create a copy of MI and insert it into the else-block.
+    IRBuilder<> Builder(MI);
+    Builder.SetInsertPoint(ElseTerm);
+    Builder.Insert(MI->clone());
   }
 }
 
@@ -133,18 +199,27 @@
   bool AnyExpanded = false;
   assert(Intrinsic::memcpy == F.getIntrinsicID() &&
          "expandMemcopies called on wrong function declaration.");
-  // loop over all memcpy calls
+
+  // Obtain profiling information.
+  ProfileSummaryInfo *PSI =
+      getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+  // Loop over all of the memcpy calls.
   for (auto I : F.users()) {
     MemCpyInst *MC = dyn_cast<MemCpyInst>(I);
     assert(MC && "Must be a MemcpyInst!");
-    if (shouldExpandMemCpy(MC)) {
-      Function *ParentFunc = MC->getParent()->getParent();
+
+    Function *ParentFunc = MC->getParent()->getParent();
+    BlockFrequencyInfo &BFI =
+	          getAnalysis<BlockFrequencyInfoWrapperPass>(*ParentFunc).getBFI();
+
+    if (shouldExpandMemCpy(MC, PSI, BFI)) {
       const TargetTransformInfo &TTI =
           getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
       ppcExpandMemCpyAsLoop(MC, TTI);
       MC->eraseFromParent();
       AnyExpanded = true;
-      MemCpyLoopExpansions += 1;
+      ++MemCpyLoopExpansions;
     }
   }
   return AnyExpanded;
@@ -177,5 +252,9 @@
 }
 
 char PPCLowerMemIntrinsics::ID = 0;
-INITIALIZE_PASS(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics",
-                "Lower mem intrinsics into loops", false, false)
+INITIALIZE_PASS_BEGIN(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics",
+		                                      "Lower mem intrinsics into loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics",
+				                      "Lower mem intrinsics into loops", false, false)
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -483,7 +483,7 @@
 Type *PPCTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                                             unsigned SrcAlign,
                                             unsigned DestAlign) const {
-  return Type::getInt64Ty(Context);
+  return VectorType::get(Type::getInt64Ty(Context),8); // <8 x i64>: 64 bytes
 }
 
 /// Decomposes a copy operation with size \p RemainingBytes into the individual
@@ -492,13 +492,16 @@
     SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
     unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
   // Types to use in copy operations.
-  IntegerType *CopyTypes[] = {
+  Type *CopyTypes[] = {
+      VectorType::get(Type::getInt64Ty(Context),2),
       Type::getInt64Ty(Context), Type::getInt32Ty(Context),
       Type::getInt16Ty(Context), Type::getInt8Ty(Context)};
 
   // Deconstructs the remaining bytes into individual operands.
   for (auto OpTy : CopyTypes) {
-    unsigned OpSize = OpTy->getBitWidth() / 8;
+    unsigned OpSize = OpTy->getScalarSizeInBits() / 8;
+    if (OpTy->isVectorTy())
+      OpSize *= OpTy->getVectorNumElements();
     // Loops just in case the remaining bytes are greater or equal to
     // twice the largest copy operand type.
     while (RemainingBytes >= OpSize) {
Index: lib/Transforms/Utils/LowerMemIntrinsics.cpp
===================================================================
--- lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -98,7 +98,7 @@
     SrcAlign = std::min(SrcAlign, LoopOpSize);
     DestAlign = std::min(DestAlign, LoopOpSize);
 
-    SmallVector<Type *, 5> RemainingOps;
+    SmallVector<Type *, 10> RemainingOps;
     TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
                                           SrcAlign, DestAlign);
 
Index: test/CodeGen/PowerPC/memcpy-loop-expansion.ll
===================================================================
--- test/CodeGen/PowerPC/memcpy-loop-expansion.ll
+++ test/CodeGen/PowerPC/memcpy-loop-expansion.ll
@@ -1,10 +1,17 @@
 ; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true   \
+; RUN: -ppc-memcpy-unknown-loops=true \
 ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -ppc-memcpy-loop-floor=0 \
 ; RUN: -mcpu=pwr8 %s| FileCheck -check-prefix=OPT %s
 ; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \
+; RUN: -ppc-memcpy-unknown-loops=true \
 ; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 %s | \
 ; RUN: FileCheck %s --check-prefix PWR7
+; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true   \
+; RUN: -ppc-memcpy-unknown-loops=true \
+; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8   %s | \
+; RUN: FileCheck %s --check-prefix OPTSMALL
 ; RUN: llc < %s  -ppc-enable-memcpy-loops=true   \
+; RUN: -ppc-memcpy-unknown-loops=true \
 ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8  -O0  | \
 ; RUN: FileCheck %s --check-prefix OPTNONE
 
@@ -56,65 +63,89 @@
   ret i8* %dst
 ; OPT-LABEL: @memcpy_known_size
 ; OPT:       entry:
-; OPT-NEXT:  [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64*
-; OPT-NEXT:  [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64*
+; OPT-NEXT:  [[SrcCast:%[0-9]+]] = bitcast i8* %src to <8 x i64>*
+; OPT-NEXT:  [[DstCast:%[0-9]+]] = bitcast i8* %dst to <8 x i64>*
 ; OPT-NEXT:  br label %load-store-loop
 
 ; OPT:       load-store-loop:
 ; OPT-NEXT:  %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
-; OPT-NEXT:  [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index
-; OPT-NEXT:  [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]]
-; OPT-NEXT:  [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index
-; OPT-NEXT:  store i64 [[Load]], i64* [[DstGep]]
+; OPT-NEXT:  [[SrcGep:%[0-9]+]] = getelementptr inbounds <8 x i64>, <8 x i64>* [[SrcCast]], i64 %loop-index
+; OPT-NEXT:  [[Load:%[0-9]+]] = load <8 x i64>, <8 x i64>* [[SrcGep]]
+; OPT-NEXT:  [[DstGep:%[0-9]+]] = getelementptr inbounds <8 x i64>, <8 x i64>* [[DstCast]], i64 %loop-index
+; OPT-NEXT:  store <8 x i64> [[Load]], <8 x i64>* [[DstGep]]
 ; OPT-NEXT:  [[IndexInc]] = add i64 %loop-index, 1
-; OPT-NEXT:  [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 12
+; OPT-NEXT:  [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 1
 ; OPT-NEXT:  br i1 [[CMP]], label %load-store-loop, label %memcpy-split
 
 ; OPT:       memcpy-split:
-; OPT-NEXT:  [[SrcAsi32:%[0-9]+]] = bitcast i64* [[SrcCast]] to i32*
-; OPT-NEXT:  [[SrcGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24
-; OPT-NEXT:  [[Load2:%[0-9]+]] = load i32, i32* [[SrcGep2]]
-; OPT-NEXT:  [[DstAsi32:%[0-9]+]] = bitcast i64* [[DstCast]] to i32*
-; OPT-NEXT:  [[DstGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24
-; OPT-NEXT:  store i32 [[Load2]], i32* [[DstGep2]]
+; OPT-NEXT:  [[SrcAs2Xi64:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to <2 x i64>*
+; OPT-NEXT:  [[SrcGep2:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[SrcAs2Xi64]], i64 4
+; OPT-NEXT:  [[Load2:%[0-9]+]] = load <2 x i64>, <2 x i64>* [[SrcGep2]]
+; OPT-NEXT:  [[DstAs2xi64:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to <2 x i64>*
+; OPT-NEXT:  [[DstGep2:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[DstAs2xi64]], i64 4
+; OPT-NEXT:  store <2 x i64> [[Load2]], <2 x i64>* [[DstGep2]]
+; OPT-NEXT:  [[SrcAs2Xi642:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to <2 x i64>*
+; OPT-NEXT:  [[SrcGep3:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[SrcAs2Xi642]], i64 5
+; OPT-NEXT:  [[Load3:%[0-9]+]] = load <2 x i64>, <2 x i64>* [[SrcGep3]]
+; OPT-NEXT:  [[DstAs2Xi642:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to <2 x i64>*
+; OPT-NEXT:  [[DstGep3:%[0-9]+]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[DstAs2Xi642]], i64 5
+; OPT-NEXT:  store <2 x i64> [[Load3]], <2 x i64>* [[DstGep3]]
+; OPT-NEXT:  [[SrcAsi32:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to i32*
+; OPT-NEXT:  [[SrcGep4:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24
+; OPT-NEXT:  [[Load4:%[0-9]+]] = load i32, i32* [[SrcGep4]]
+; OPT-NEXT:  [[DstAsi32:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to i32*
+; OPT-NEXT:  [[DstGep4:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24
+; OPT-NEXT:  store i32 [[Load4]], i32* [[DstGep4]]
 ; OPT-NEXT:  ret i8* %dst
 }
 
 
 ; Check the expansion of a memcpy whose size argument is not a compile time
 ; constant.
-define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) {
+define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) !prof !29 {
 entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
   ret i8* %dst
 
 ; OPT-LABEL: @memcpy_unkown_size
 ; OPT:       entry:
-; OPT-NEXT:  [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64*
-; OPT-NEXT:  [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64*
-; OPT-NEXT:  [[LoopCount:%[0-9]+]] = udiv i64 %len, 8
-; OPT-NEXT:  [[ResBytes:%[0-9]+]] = urem i64 %len, 8
+; OPT-NEXT:  [[SizeCmp:%[0-9]+]] = icmp ule i64 %len, 128
+; OPT-NEXT:  br i1 [[SizeCmp]], label %[[ExpLabel:[0-9]+]], label %[[NoExpLabel:[0-9]+]]
+
+; OPT:       <label>:[[ExpLabel]]:
+; OPT-NEXT:  [[SrcCast:%[0-9]+]] = bitcast i8* %src to <8 x i64>*
+; OPT-NEXT:  [[DstCast:%[0-9]+]] = bitcast i8* %dst to <8 x i64>*
+; OPT-NEXT:  [[LoopCount:%[0-9]+]] = udiv i64 %len, 64
+; OPT-NEXT:  [[ResBytes:%[0-9]+]] = urem i64 %len, 64
 ; OPT-NEXT:  [[BytesCopied:%[0-9]+]] = sub i64 %len, [[ResBytes]]
 ; OPT-NEXT:  [[Cmp:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
 ; OPT-NEXT:  br i1 [[Cmp]], label %loop-memcpy-expansion, label %loop-memcpy-residual-header
 
 ; OPT:       post-loop-memcpy-expansion:
+; OPT-NEXT:  br label %[[RetBlock:[0-9]+]]
+
+
+; OPT:       <label>:[[NoExpLabel]]:
+; OPT-NEXT:  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
+; OPT-NEXT:  br label %[[RetBlock]]
+
+; OPT:       <label>:[[RetBlock]]:
 ; OPT-NEXT:  ret i8* %dst
 
 ; OPT:       loop-memcpy-expansion:
-; OPT-NEXT:  %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
-; OPT-NEXT:  [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index
-; OPT-NEXT:  [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]]
-; OPT-NEXT:  [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index
-; OPT-NEXT:  store i64 [[Load]], i64* [[DstGep]]
+; OPT-NEXT:  %loop-index = phi i64 [ 0, %[[ExpLabel]] ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; OPT-NEXT:  [[SrcGep:%[0-9]+]] = getelementptr inbounds <8 x i64>, <8 x i64>* [[SrcCast]], i64 %loop-index
+; OPT-NEXT:  [[Load:%[0-9]+]] = load <8 x i64>, <8 x i64>* [[SrcGep]]
+; OPT-NEXT:  [[DstGep:%[0-9]+]] = getelementptr inbounds <8 x i64>, <8 x i64>* [[DstCast]], i64 %loop-index
+; OPT-NEXT:  store <8 x i64> [[Load]], <8 x i64>* [[DstGep]]
 ; OPT-NEXT:  [[IndexInc]] = add i64 %loop-index, 1
 ; OPT-NEXT:  [[LoopCmp:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
 ; OPT-NEXT:  br i1 [[LoopCmp]], label %loop-memcpy-expansion, label %loop-memcpy-residual-header
 
 ; OPT:       loop-memcpy-residual:
 ; OPT-NEXT:  %residual-loop-index = phi i64 [ 0, %loop-memcpy-residual-header ], [ [[ResIndexInc:%[0-9]+]], %loop-memcpy-residual ]
-; OPT-NEXT:  [[SrcAsi8:%[0-9]+]] = bitcast i64* [[SrcCast]] to i8*
-; OPT-NEXT:  [[DstAsi8:%[0-9]+]] = bitcast i64* [[DstCast]] to i8*
+; OPT-NEXT:  [[SrcAsi8:%[0-9]+]] = bitcast <8 x i64>* [[SrcCast]] to i8*
+; OPT-NEXT:  [[DstAsi8:%[0-9]+]] = bitcast <8 x i64>* [[DstCast]] to i8*
 ; OPT-NEXT:  [[ResIndex:%[0-9]+]] = add i64 [[BytesCopied]], %residual-loop-index
 ; OPT-NEXT:  [[SrcGep2:%[0-9]+]] = getelementptr inbounds i8, i8* [[SrcAsi8]], i64 [[ResIndex]]
 ; OPT-NEXT:  [[Load2:%[0-9]+]] = load i8, i8* [[SrcGep2]]
@@ -129,6 +160,31 @@
 ; OPT-NEXT:  br i1 [[RHCmp]], label %loop-memcpy-residual, label %post-loop-memcpy-expansion
 }
 
+; Check that we don't expand cold calls
+; Function Attrs: nounwind
+define void @cold_test(i8* nocapture %dst, i8* nocapture readonly %src, i64 %size, i32 signext %cond) !prof !29 {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %if.then, !prof !30
+
+  if.then:                                          ; preds = %entry
+    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i32 1, i1 false)
+    br label %if.end
+
+  if.end:                                           ; preds = %entry, %if.then
+    ret void
+
+; OPT-LABEL: @cold_test
+; OPT-NEXT:  entry:
+; OPT-NEXT:  %tobool = icmp eq i32 %cond, 0
+; OPT-NEXT:  br i1 %tobool, label %if.end, label %if.then
+; OPT:       if.then:
+; OPT-NEXT:  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i32 1, i1 false)
+; OPT-NEXT:  br label %if.end
+; OPT:       if.end:
+; OPT-NEXT:  ret void
+}
+
 ; Ensure the pass doens't expand memcpy calls when compiling a function with an
 ; unspported target_cpu attribute.
 define i8* @memcpy_power7(i8* %dst, i8* %src, i64 %len) #1 {
@@ -141,17 +197,17 @@
 
 ; Ensure the pass doens't expand calls in a function compiled for size.
 define i8* @memcpy_opt_small(i8* %dst, i8* %src, i64 %len) #2 {
-  entry:
+entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
   ret i8* %dst
-; OPT-LABEL: @memcpy_opt_small
-; OPT:       tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
+; OPTSMALL-LABEL: @memcpy_opt_small
+; OPTSMALL:       tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
 }
 
 ; Ensure the pass doesn't expand calls on functions not compiled with
 ; optimizations.
 define i8* @memcpy_opt_none(i8* %dst, i8* %src, i64 %len) {
-  entry:
+entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
   ret i8* %dst
 ; OPTNONE-LABEL: @memcpy_opt_none
@@ -161,3 +217,38 @@
 attributes #0 = { argmemonly nounwind }
 attributes #1 = { "target-cpu"="pwr7" }
 attributes #2 = { "target-cpu"="pwr8" optsize }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 43218}
+!5 = !{!"MaxCount", i64 33153}
+!6 = !{!"MaxInternalCount", i64 33153}
+!7 = !{!"MaxFunctionCount", i64 8256}
+!8 = !{!"NumCounts", i64 30}
+!9 = !{!"NumFunctions", i64 11}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14, !15, !16, !17, !17, !18, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27}
+!12 = !{i32 10000, i64 33153, i32 1}
+!13 = !{i32 100000, i64 33153, i32 1}
+!14 = !{i32 200000, i64 33153, i32 1}
+!15 = !{i32 300000, i64 33153, i32 1}
+!16 = !{i32 400000, i64 33153, i32 1}
+!17 = !{i32 500000, i64 33153, i32 1}
+!18 = !{i32 600000, i64 33153, i32 1}
+!19 = !{i32 700000, i64 33153, i32 1}
+!20 = !{i32 800000, i64 8256, i32 2}
+!21 = !{i32 900000, i64 8256, i32 2}
+!22 = !{i32 950000, i64 8256, i32 2}
+!23 = !{i32 990000, i64 258, i32 9}
+!24 = !{i32 999000, i64 258, i32 9}
+!25 = !{i32 999900, i64 258, i32 9}
+!26 = !{i32 999990, i64 1, i32 12}
+!27 = !{i32 999999, i64 1, i32 12}
+!29 = !{!"function_entry_count", i64 258}
+!30 = !{!"branch_weights", i32 258, i32 0}
+!31 = !{!"function_entry_count", i64 0}
+
Index: test/CodeGen/PowerPC/memcpy.nopgo.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/memcpy.nopgo.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-enable-memcpy-loops=true -ppc-memcpy-unknown-loops=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @testMemcpyVersion(i8* nocapture %src, i8* nocapture readonly %dest, i32 signext %size) local_unnamed_addr #0 {
+; CHECK-LABEL: testMemcpyVersion:
+; CHECK:    mflr 0
+; CHECK-NEXT:    std 0, 16(1)
+; CHECK-NEXT:    stdu 1, -32(1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl memcpy
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi 1, 1, 32
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
+entry:
+  %conv = sext i32 %size to i64
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %src, i8* %dest, i64 %conv, i32 1, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
Index: test/CodeGen/PowerPC/memcpy.pgo.hot.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/memcpy.pgo.hot.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-enable-memcpy-loops=true -ppc-memcpy-unknown-loops=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@X = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: inlinehint noinline nounwind
+define signext i32 @_Z3fooPiS_i(i32* nocapture %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) local_unnamed_addr #2 !prof !29 {
+; CHECK-LABEL: _Z3fooPiS_i:
+; CHECK:    mflr 0
+; CHECK-NEXT:    std 0, 16(1)
+; CHECK-NEXT:    stdu 1, -64(1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    .cfi_offset r29, -24
+; CHECK-NEXT:    .cfi_offset r30, -16
+; CHECK-NEXT:    std 29, 40(1) # 8-byte Folded Spill
+; CHECK-NEXT:    addis 29, 2, .LC0@toc@ha
+; CHECK-NEXT:    std 30, 48(1) # 8-byte Folded Spill
+; CHECK-NEXT:    mr 30, 5
+; CHECK-NEXT:    ld 5, .LC0@toc@l(29)
+; CHECK-NEXT:    mr 29, 3
+; CHECK-NEXT:    lwz 5, 0(5)
+; CHECK-NEXT:    cmpw 5, 30
+; CHECK-NEXT:    ble 0, .LBB0_9
+; CHECK:    cmpldi 30, 128
+; CHECK-NEXT:    bgt 0, .LBB0_8
+; CHECK:    rldicl. 5, 30, 58, 6
+; CHECK-NEXT:    clrldi 3, 30, 58
+; CHECK-NEXT:    beq 0, .LBB0_5
+; CHECK:    mtctr 5
+; CHECK-NEXT:    li 5, 16
+; CHECK-NEXT:    li 6, 32
+; CHECK-NEXT:    li 7, 48
+; CHECK-NEXT:    mr 8, 4
+; CHECK-NEXT:    mr 9, 29
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_4: # %loop-memcpy-expansion
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lxvd2x 0, 8, 6
+; CHECK-NEXT:    lxvd2x 1, 8, 7
+; CHECK-NEXT:    lxvd2x 2, 8, 5
+; CHECK-NEXT:    lxvd2x 3, 0, 8
+; CHECK-NEXT:    addi 8, 8, 64
+; CHECK-NEXT:    stxvd2x 1, 9, 7
+; CHECK-NEXT:    stxvd2x 0, 9, 6
+; CHECK-NEXT:    stxvd2x 2, 9, 5
+; CHECK-NEXT:    stxvd2x 3, 0, 9
+; CHECK-NEXT:    addi 9, 9, 64
+; CHECK-NEXT:    bdnz .LBB0_4
+; CHECK-NEXT:  .LBB0_5: # %loop-memcpy-residual-header
+; CHECK-NEXT:    cmpldi 3, 0
+; CHECK-NEXT:    beq 0, .LBB0_9
+; CHECK:    addi 5, 30, -1
+; CHECK-NEXT:    mtctr 3
+; CHECK-NEXT:    sub 5, 5, 3
+; CHECK-NEXT:    add 4, 4, 5
+; CHECK-NEXT:    add 5, 29, 5
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_7: # %loop-memcpy-residual
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lbzu 3, 1(4)
+; CHECK-NEXT:    stbu 3, 1(5)
+; CHECK-NEXT:    bdnz .LBB0_7
+; CHECK-NEXT:    b .LBB0_9
+; CHECK-NEXT:  .LBB0_8:
+; CHECK-NEXT:    mr 3, 29
+; CHECK-NEXT:    mr 5, 30
+; CHECK-NEXT:    bl memcpy
+; CHECK-NEXT:    nop
+; CHECK-NEXT:  .LBB0_9: # %if.end
+; CHECK-NEXT:    sldi 3, 30, 2
+; CHECK-NEXT:    ld 30, 48(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lwax 3, 29, 3
+; CHECK-NEXT:    ld 29, 40(1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 64
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
+entry:
+  %0 = load i32, i32* @X, align 4, !tbaa !30
+  %cmp = icmp sgt i32 %0, %SIZE
+  %conv = sext i32 %SIZE to i64
+  br i1 %cmp, label %if.then, label %if.end, !prof !34
+
+if.then:                                          ; preds = %entry
+  %1 = bitcast i32* %buffer1 to i8*
+  %2 = bitcast i32* %buffer2 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 %conv, i32 4, i1 false)
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %arrayidx = getelementptr inbounds i32, i32* %buffer1, i64 %conv
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !30
+  ret i32 %3
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #3
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!28}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 396}
+!5 = !{!"MaxCount", i64 200}
+!6 = !{!"MaxInternalCount", i64 20}
+!7 = !{!"MaxFunctionCount", i64 200}
+!8 = !{!"NumCounts", i64 16}
+!9 = !{!"NumFunctions", i64 5}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14, !15, !16, !17, !17, !18, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27}
+!12 = !{i32 10000, i64 200, i32 1}
+!13 = !{i32 100000, i64 200, i32 1}
+!14 = !{i32 200000, i64 200, i32 1}
+!15 = !{i32 300000, i64 200, i32 1}
+!16 = !{i32 400000, i64 200, i32 1}
+!17 = !{i32 500000, i64 200, i32 1}
+!18 = !{i32 600000, i64 100, i32 2}
+!19 = !{i32 700000, i64 100, i32 2}
+!20 = !{i32 800000, i64 30, i32 3}
+!21 = !{i32 900000, i64 18, i32 5}
+!22 = !{i32 950000, i64 10, i32 7}
+!23 = !{i32 990000, i64 2, i32 10}
+!24 = !{i32 999000, i64 2, i32 10}
+!25 = !{i32 999900, i64 2, i32 10}
+!26 = !{i32 999990, i64 2, i32 10}
+!27 = !{i32 999999, i64 2, i32 10}
+!28 = !{!"clang version 5.0.0 (git@github.ibm.com:llvm/clang.git d10c298f0fbab013ff45d395a3de8aa14085ccae) (llvm/llvm.git c3fbf2fa5a1161b7ddb3a64220e99a7ff2cb4bc6)"}
+!29 = !{!"function_entry_count", i64 30}
+!30 = !{!31, !31, i64 0}
+!31 = !{!"int", !32, i64 0}
+!32 = !{!"omnipotent char", !33, i64 0}
+!33 = !{!"Simple C++ TBAA"}
+!34 = !{!"branch_weights", i32 30, i32 10}
+!35 = !{!"function_entry_count", i64 2}
+!36 = !{!"branch_weights", i32 2, i32 200}
+!37 = !{!38, !38, i64 0}
+!38 = !{!"vtable pointer", !33, i64 0}
+!39 = !{!40, !41, i64 240}
+!40 = !{!"_ZTSSt9basic_iosIcSt11char_traitsIcEE", !41, i64 216, !32, i64 224, !42, i64 225, !41, i64 232, !41, i64 240, !41, i64 248, !41, i64 256}
+!41 = !{!"any pointer", !32, i64 0}
+!42 = !{!"bool", !32, i64 0}
+!43 = !{!"branch_weights", i32 0, i32 20}
+!44 = !{!45, !32, i64 56}
+!45 = !{!"_ZTSSt5ctypeIcE", !41, i64 16, !42, i64 24, !41, i64 32, !41, i64 40, !41, i64 48, !32, i64 56, !32, i64 57, !32, i64 313, !32, i64 569}
+!46 = !{!"branch_weights", i32 2, i32 18}
+!47 = !{!32, !32, i64 0}
+!48 = !{!"VP", i32 0, i64 1, i64 0, i64 1}
+!49 = !{!"branch_weights", i32 20, i32 2}
+!50 = !{!"function_entry_count", i64 3}
+!51 = !{!"memcpy2.cpp:_GLOBAL__sub_I_memcpy2.cpp"}
Index: test/CodeGen/PowerPC/memcpy.pgo.cold.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/memcpy.pgo.cold.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-enable-memcpy-loops=true -ppc-memcpy-unknown-loops=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@X = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: inlinehint noinline nounwind
+define signext i32 @_Z3fooPiS_i(i32* nocapture %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE) local_unnamed_addr #2 !prof !29 {
+; CHECK-LABEL: _Z3fooPiS_i:
+; CHECK:    mflr 0
+; CHECK-NEXT:    std 0, 16(1)
+; CHECK-NEXT:    stdu 1, -64(1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    .cfi_offset r29, -24
+; CHECK-NEXT:    .cfi_offset r30, -16
+; CHECK-NEXT:    std 29, 40(1) # 8-byte Folded Spill
+; CHECK-NEXT:    addis 29, 2, .LC0@toc@ha
+; CHECK-NEXT:    std 30, 48(1) # 8-byte Folded Spill
+; CHECK-NEXT:    mr 30, 5
+; CHECK-NEXT:    ld 5, .LC0@toc@l(29)
+; CHECK-NEXT:    mr 29, 3
+; CHECK-NEXT:    lwz 5, 0(5)
+; CHECK-NEXT:    cmpw 5, 30
+; CHECK-NEXT:    bgt- 0, .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # %if.end
+; CHECK-NEXT:    sldi 3, 30, 2
+; CHECK-NEXT:    ld 30, 48(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lwax 3, 29, 3
+; CHECK-NEXT:    ld 29, 40(1) # 8-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 64
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB0_2: # %if.then
+; CHECK-NEXT:    mr 3, 29
+; CHECK-NEXT:    mr 5, 30
+; CHECK-NEXT:    bl memcpy
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    b .LBB0_1
+entry:
+  %0 = load i32, i32* @X, align 4, !tbaa !30
+  %cmp = icmp sgt i32 %0, %SIZE
+  %conv = sext i32 %SIZE to i64
+  br i1 %cmp, label %if.then, label %if.end, !prof !34
+
+if.then:                                          ; preds = %entry
+  %1 = bitcast i32* %buffer1 to i8*
+  %2 = bitcast i32* %buffer2 to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 %conv, i32 4, i1 false)
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %arrayidx = getelementptr inbounds i32, i32* %buffer1, i64 %conv
+  %3 = load i32, i32* %arrayidx, align 4, !tbaa !30
+  ret i32 %3
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #3
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!28}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 396}
+!5 = !{!"MaxCount", i64 200}
+!6 = !{!"MaxInternalCount", i64 20}
+!7 = !{!"MaxFunctionCount", i64 200}
+!8 = !{!"NumCounts", i64 16}
+!9 = !{!"NumFunctions", i64 5}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14, !15, !16, !17, !17, !18, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27}
+!12 = !{i32 10000, i64 200, i32 1}
+!13 = !{i32 100000, i64 200, i32 1}
+!14 = !{i32 200000, i64 200, i32 1}
+!15 = !{i32 300000, i64 200, i32 1}
+!16 = !{i32 400000, i64 200, i32 1}
+!17 = !{i32 500000, i64 200, i32 1}
+!18 = !{i32 600000, i64 100, i32 2}
+!19 = !{i32 700000, i64 100, i32 2}
+!20 = !{i32 800000, i64 30, i32 3}
+!21 = !{i32 900000, i64 18, i32 5}
+!22 = !{i32 950000, i64 10, i32 7}
+!23 = !{i32 990000, i64 2, i32 10}
+!24 = !{i32 999000, i64 2, i32 10}
+!25 = !{i32 999900, i64 2, i32 10}
+!26 = !{i32 999990, i64 2, i32 10}
+!27 = !{i32 999999, i64 2, i32 10}
+!28 = !{!"clang version 5.0.0 (git@github.ibm.com:llvm/clang.git d10c298f0fbab013ff45d395a3de8aa14085ccae) (llvm/llvm.git c3fbf2fa5a1161b7ddb3a64220e99a7ff2cb4bc6)"}
+!29 = !{!"function_entry_count", i64 30}
+!30 = !{!31, !31, i64 0}
+!31 = !{!"int", !32, i64 0}
+!32 = !{!"omnipotent char", !33, i64 0}
+!33 = !{!"Simple C++ TBAA"}
+!34 = !{!"branch_weights", i32 0, i32 30}
+!35 = !{!"function_entry_count", i64 2}
+!36 = !{!"branch_weights", i32 2, i32 200}
+!37 = !{!38, !38, i64 0}
+!38 = !{!"vtable pointer", !33, i64 0}
+!39 = !{!40, !41, i64 240}
+!40 = !{!"_ZTSSt9basic_iosIcSt11char_traitsIcEE", !41, i64 216, !32, i64 224, !42, i64 225, !41, i64 232, !41, i64 240, !41, i64 248, !41, i64 256}
+!41 = !{!"any pointer", !32, i64 0}
+!42 = !{!"bool", !32, i64 0}
+!43 = !{!"branch_weights", i32 0, i32 20}
+!44 = !{!45, !32, i64 56}
+!45 = !{!"_ZTSSt5ctypeIcE", !41, i64 16, !42, i64 24, !41, i64 32, !41, i64 40, !41, i64 48, !32, i64 56, !32, i64 57, !32, i64 313, !32, i64 569}
+!46 = !{!"branch_weights", i32 2, i32 18}
+!47 = !{!32, !32, i64 0}
+!48 = !{!"VP", i32 0, i64 1, i64 0, i64 1}
+!49 = !{!"branch_weights", i32 20, i32 2}
+!50 = !{!"function_entry_count", i64 3}
+!51 = !{!"memcpy2.cpp:_GLOBAL__sub_I_memcpy2.cpp"}