diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -943,6 +943,8 @@
   I.replaceAllUsesWith(Offset);
   I.eraseFromParent();
 
+  SmallVector<IntrinsicInst *> DeferredIntrs;
+
   for (Value *V : WorkList) {
     CallInst *Call = dyn_cast<CallInst>(V);
     if (!Call) {
@@ -997,22 +999,13 @@
       // These intrinsics are for address space 0 only
       Intr->eraseFromParent();
       continue;
-    case Intrinsic::memcpy: {
-      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
-      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
-                           MemCpy->getRawSource(), MemCpy->getSourceAlign(),
-                           MemCpy->getLength(), MemCpy->isVolatile());
-      Intr->eraseFromParent();
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove:
+      // These have 2 pointer operands. In case the second pointer also needs
+      // to be replaced, we defer processing of these intrinsics until all
+      // other values are processed.
+      DeferredIntrs.push_back(Intr);
       continue;
-    }
-    case Intrinsic::memmove: {
-      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
-      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
-                            MemMove->getRawSource(), MemMove->getSourceAlign(),
-                            MemMove->getLength(), MemMove->isVolatile());
-      Intr->eraseFromParent();
-      continue;
-    }
     case Intrinsic::memset: {
       MemSetInst *MemSet = cast<MemSetInst>(Intr);
       Builder.CreateMemSet(
@@ -1050,6 +1043,27 @@
       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
     }
   }
+
+  for (IntrinsicInst *Intr : DeferredIntrs) {
+    Builder.SetInsertPoint(Intr);
+    Intrinsic::ID ID = Intr->getIntrinsicID();
+    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
+
+    MemTransferInst *MI = cast<MemTransferInst>(Intr);
+    auto *B =
+      Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
+                                    MI->getRawSource(), MI->getSourceAlign(),
+                                    MI->getLength(), MI->isVolatile());
+
+    for (unsigned I = 1; I != 3; ++I) {
+      if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) {
+        B->addDereferenceableAttr(I, Bytes);
+      }
+    }
+
+    Intr->eraseFromParent();
+  }
+
   return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -2,9 +2,11 @@
 
 declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
 declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0
 
 declare void @llvm.memmove.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
 declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0
 
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1) #0
 
@@ -61,5 +63,35 @@
   ret void
 }
 
+; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy(
+; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
+; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) {
+entry:
+  %r = alloca double, align 8
+  %arrayidx1 = getelementptr inbounds double, double* %r, i32 1
+  %i = bitcast double* %arrayidx1 to i8*
+  %arrayidx2 = getelementptr inbounds double, double* %r, i32 %c
+  %i1 = bitcast double* %arrayidx2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @promote_alloca_used_twice_in_memmove(
+; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
+; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
+; CHECK: call void @llvm.memmove.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+define amdgpu_kernel void @promote_alloca_used_twice_in_memmove(i32 %c) {
+entry:
+  %r = alloca double, align 8
+  %arrayidx1 = getelementptr inbounds double, double* %r, i32 1
+  %i = bitcast double* %arrayidx1 to i8*
+  %arrayidx2 = getelementptr inbounds double, double* %r, i32 %c
+  %i1 = bitcast double* %arrayidx2 to i8*
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+  ret void
+}
+
 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
 attributes #1 = { nounwind readnone }
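
Note on the deferred handling in the patch above: memcpy and memmove are the only promoted intrinsics with two pointer operands, and both operands may derive from the same alloca (as in the new tests), so they are recreated only after every pointer value in the worklist has been rewritten to the LDS address space. The sketch below restates that second pass in isolation; it is illustrative only (the free-standing helper name and the explicit includes are not part of the patch), under the assumption that the caller's IRBuilder and the deferred-intrinsic list come from the surrounding pass.

    // Illustrative sketch: recreate each deferred memcpy/memmove once all
    // pointers derived from the promoted alloca have been rewritten.
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/IntrinsicInst.h"
    #include <cassert>

    static void rewriteDeferredMemTransfers(llvm::IRBuilderBase &Builder,
                                            llvm::ArrayRef<llvm::IntrinsicInst *> Deferred) {
      using namespace llvm;
      for (IntrinsicInst *Intr : Deferred) {
        Builder.SetInsertPoint(Intr);
        Intrinsic::ID ID = Intr->getIntrinsicID();
        assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

        // Both getRawDest() and getRawSource() now return the promoted
        // (addrspace(3)) pointers, so the recreated call uses them directly.
        auto *MI = cast<MemTransferInst>(Intr);
        auto *NewCall = Builder.CreateMemTransferInst(
            ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
            MI->getSourceAlign(), MI->getLength(), MI->isVolatile());

        // Carry over dereferenceable(N) from the dest (1) and source (2)
        // operands so that information survives the rewrite.
        for (unsigned ArgNo = 1; ArgNo != 3; ++ArgNo)
          if (uint64_t Bytes = Intr->getDereferenceableBytes(ArgNo))
            NewCall->addDereferenceableAttr(ArgNo, Bytes);

        Intr->eraseFromParent();
      }
    }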