Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -600,8 +600,12 @@
     Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
 
   for (User *User : Val->users()) {
-    if (is_contained(WorkList, User))
+    auto UI = llvm::find(WorkList, User);
+    if (UI != WorkList.end()) {
+      WorkList.erase(UI);
+      WorkList.push_back(User);
       continue;
+    }
 
     if (CallInst *CI = dyn_cast<CallInst>(User)) {
       if (!isCallPromotable(CI))
Index: llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -2,6 +2,7 @@
 
 declare void @llvm.memcpy.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
 declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) #0
 
 declare void @llvm.memmove.p0i8.p1i8.i32(i8* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
 declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i1) #0
@@ -61,5 +62,20 @@
   ret void
 }
 
+; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy(
+; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
+; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 %i, i8 addrspace(3)* align 8 %i1, i64 16, i1 false)
+define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) {
+entry:
+  %r = alloca double, align 8
+  %arrayidx1 = getelementptr inbounds double, double* %r, i32 1
+  %i = bitcast double* %arrayidx1 to i8*
+  %arrayidx2 = getelementptr inbounds double, double* %r, i32 %c
+  %i1 = bitcast double* %arrayidx2 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 dereferenceable(16) %i, i8* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+  ret void
+}
+
 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
 attributes #1 = { nounwind readnone }