Index: lib/Transforms/Utils/LowerMemIntrinsics.cpp
===================================================================
--- lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -168,11 +168,12 @@
   IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
   assert(ILengthType &&
          "expected size argument to memcpy to be an integer type!");
+  Type *Int8Type = Type::getInt8Ty(Ctx);
+  bool LoopOpIsInt8 = LoopOpType == Int8Type;
   ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
-  Value *RuntimeLoopCount = PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
-  Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
-  Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
-
+  Value *RuntimeLoopCount = LoopOpIsInt8 ?
+                            CopyLen :
+                            PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
   BasicBlock *LoopBB =
       BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
   IRBuilder<> LoopBuilder(LoopBB);
@@ -189,8 +190,11 @@
       LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
   LoopIndex->addIncoming(NewIndex, LoopBB);
 
-  Type *Int8Type = Type::getInt8Ty(Ctx);
-  if (LoopOpType != Int8Type) {
+  if (!LoopOpIsInt8) {
+   // Compute the residual size and main-loop byte count for non-i8 loop ops.
+   Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+   Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
+
     // Loop body for the residual copy.
     BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
                                                PreLoopBB->getParent(),
Index: test/CodeGen/NVPTX/lower-aggr-copies.ll
===================================================================
--- test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -36,9 +36,7 @@
 
 ; WIR-LABEL:   @memcpy_caller
 ; WIR:         entry:
-; WIR:         [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
-; WIR:         [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
-; WIR:         [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR:         [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
 ; WIR:         br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 
 ; WIR:         loop-memcpy-expansion:
@@ -48,7 +46,7 @@
 ; WIR:         [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
 ; WIR:         store i8 [[Load]], i8* [[DstGep]]
 ; WIR:         [[IndexInc]] = add i64 %loop-index, 1
-; WIR:         [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR:         [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
 ; WIR:         br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 
 ; WIR-LABEL:   post-loop-memcpy-expansion:
@@ -74,9 +72,7 @@
 
 ; WIR-LABEL:   @memcpy_volatile_caller
 ; WIR:         entry:
-; WIR:         [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
-; WIR:         [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
-; WIR:         [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR:         [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
 ; WIR:         br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 
 ; WIR:         loop-memcpy-expansion:
@@ -86,7 +82,7 @@
 ; WIR:         [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
 ; WIR:         store volatile i8 [[Load]], i8* [[DstGep]]
 ; WIR:         [[IndexInc]] = add i64 %loop-index, 1
-; WIR:         [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR:         [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
 ; WIR:         br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 
 ; WIR-LABEL:   post-loop-memcpy-expansion: