Avoid emitting a udiv/urem by 1 when expanding a memcpy whose loop operand
type is i8: the runtime loop count is then CopyLen itself, and the residual
size / bytes-copied values are only computed on the path that builds the
residual copy loop. The NVPTX lower-aggr-copies expectations are updated to
match the simpler IR.

Index: lib/Transforms/Utils/LowerMemIntrinsics.cpp
===================================================================
--- lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -168,11 +168,12 @@
   IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
   assert(ILengthType &&
          "expected size argument to memcpy to be an integer type!");
+  Type *Int8Type = Type::getInt8Ty(Ctx);
+  bool LoopOpIsInt8 = LoopOpType == Int8Type;
   ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
-  Value *RuntimeLoopCount = PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
-  Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
-  Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
-
+  Value *RuntimeLoopCount = LoopOpIsInt8 ?
+                            CopyLen :
+                            PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
   BasicBlock *LoopBB = BasicBlock::Create(Ctx, "loop-memcpy-expansion",
                                           ParentFunc, PostLoopBB);
   IRBuilder<> LoopBuilder(LoopBB);
@@ -189,8 +190,11 @@
       LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
   LoopIndex->addIncoming(NewIndex, LoopBB);
 
-  Type *Int8Type = Type::getInt8Ty(Ctx);
-  if (LoopOpType != Int8Type) {
+  if (!LoopOpIsInt8) {
+    // Split CopyLen into the residual (CopyLen % LoopOpSize) and the rest.
+    Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+    Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
+
     // Loop body for the residual copy.
     BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
                                                PreLoopBB->getParent(),
Index: test/CodeGen/NVPTX/lower-aggr-copies.ll
===================================================================
--- test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -36,9 +36,7 @@
 
 ; WIR-LABEL: @memcpy_caller
 ; WIR: entry:
-; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
-; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
-; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
 ; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 
 ; WIR: loop-memcpy-expansion:
@@ -48,7 +46,7 @@
 ; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
 ; WIR: store i8 [[Load]], i8* [[DstGep]]
 ; WIR: [[IndexInc]] = add i64 %loop-index, 1
-; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
 ; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 
 ; WIR-LABEL: post-loop-memcpy-expansion:
@@ -74,9 +72,7 @@
 
 ; WIR-LABEL: @memcpy_volatile_caller
 ; WIR: entry:
-; WIR: [[LoopCount:%[0-9]+]] = udiv i64 %n, 1
-; WIR: [[ResidualSize:%[0-9]+]] = urem i64 %n, 1
-; WIR: [[Cond:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
+; WIR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
 ; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 
 ; WIR: loop-memcpy-expansion:
@@ -86,7 +82,7 @@
 ; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
 ; WIR: store volatile i8 [[Load]], i8* [[DstGep]]
 ; WIR: [[IndexInc]] = add i64 %loop-index, 1
-; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
+; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
 ; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
 
 ; WIR-LABEL: post-loop-memcpy-expansion: