Index: lib/Transforms/Utils/LowerMemIntrinsics.cpp =================================================================== --- lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -174,7 +174,7 @@ Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual); BasicBlock *LoopBB = - BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, nullptr); + BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB); IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index"); @@ -193,7 +193,8 @@ if (LoopOpType != Int8Type) { // Loop body for the residual copy. BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual", - PreLoopBB->getParent(), nullptr); + PreLoopBB->getParent(), + PostLoopBB); // Residual loop header. BasicBlock *ResHeaderBB = BasicBlock::Create( Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr); Index: test/CodeGen/NVPTX/lower-aggr-copies.ll =================================================================== --- test/CodeGen/NVPTX/lower-aggr-copies.ll +++ test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -50,6 +50,9 @@ ; WIR: [[IndexInc]] = add i64 %loop-index, 1 ; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]] ; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion + +; WIR-LABEL: post-loop-memcpy-expansion: +; WIR: ret i8* %dst } define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 { @@ -85,6 +88,9 @@ ; WIR: [[IndexInc]] = add i64 %loop-index, 1 ; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]] ; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion + +; WIR-LABEL: post-loop-memcpy-expansion: +; WIR: ret i8* %dst } define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {