Index: lib/Transforms/Scalar/InferAddressSpaces.cpp
===================================================================
--- lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -249,8 +249,16 @@
       pushPtrOperand(RMW->getPointerOperand());
     else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
       pushPtrOperand(CmpX->getPointerOperand());
+    else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+      // For memset/memcpy/memmove, any pointer operand can be replaced.
+      pushPtrOperand(MI->getRawDest());
 
-    // TODO: Support intrinsics
+      // Handle 2nd operand for memcpy/memmove.
+      if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+        pushPtrOperand(MTI->getRawSource());
+    }
+
+    // TODO: Support target intrinsics
   }
 
   std::vector<Value *> Postorder; // The resultant postorder.
@@ -559,6 +567,64 @@
   return false;
 }
 
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI,
+                                     Value *OldV, Value *NewV) {
+  IRBuilder<> B(MI);
+  MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+  MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+  MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+  if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+    B.CreateMemSet(NewV, MSI->getValue(),
+                   MSI->getLength(), MSI->getAlignment(),
+                   false, TBAA, ScopeMD, NoAliasMD);
+  } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+    Value *Src = MTI->getRawSource();
+    Value *Dest = MTI->getRawDest();
+
+    // Be careful in case this is a self-to-self copy.
+    if (Src == OldV)
+      Src = NewV;
+
+    if (Dest == OldV)
+      Dest = NewV;
+
+    if (isa<MemCpyInst>(MTI)) {
+      MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+      B.CreateMemCpy(Dest, Src, MTI->getLength(),
+                     MTI->getAlignment(),
+                     false, // isVolatile
+                     TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+    } else {
+      assert(isa<MemMoveInst>(MTI));
+      B.CreateMemMove(Dest, Src, MTI->getLength(),
+                      MTI->getAlignment(),
+                      false, // isVolatile
+                      TBAA, ScopeMD, NoAliasMD);
+    }
+  } else
+    llvm_unreachable("unhandled MemIntrinsic");
+
+  MI->eraseFromParent();
+  return true;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+                                          Value::use_iterator E) {
+  User *CurUser = I->getUser();
+  ++I;
+
+  // Some users may see the same pointer operand in multiple operands. Skip to
+  // the next instruction.
+  while (I != E && I->getUser() == CurUser)
+    ++I;
+
+  return I;
+}
+
 bool InferAddressSpaces::rewriteWithNewAddressSpaces(
     const std::vector<Value *> &Postorder,
     const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
@@ -594,20 +660,30 @@
     if (NewV == nullptr)
       continue;
 
-    SmallVector<Use *, 4> Uses;
-    for (Use &U : V->uses())
-      Uses.push_back(&U);
 
     DEBUG(dbgs() << "Replacing the uses of " << *V
                  << "\n  with\n  " << *NewV << '\n');
 
-    for (Use *U : Uses) {
-      if (isSimplePointerUseValidToReplace(*U)) {
+    Value::use_iterator I, E, Next;
+    for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+      Use &U = *I;
+      I = skipToNextUser(I, E);
+
+      if (isSimplePointerUseValidToReplace(U)) {
         // If V is used as the pointer operand of a compatible memory operation,
         // sets the pointer operand to NewV. This replacement does not change
         // the element type, so the resultant load/store is still valid.
-        U->set(NewV);
-      } else if (isa<Instruction>(U->getUser())) {
+        U.set(NewV);
+        continue;
+      }
+
+      User *CurUser = U.getUser();
+      // Handle more complex cases like intrinsics that need to be remangled.
+      if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+        if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+          continue;
+      }
+
+      if (isa<Instruction>(CurUser)) {
         // Otherwise, replaces the use with generic(NewV).
         // TODO: Some optimization opportunities are missed. For example, in
         //   %0 = icmp eq float* %p, %q
@@ -621,13 +697,14 @@
           BasicBlock::iterator InsertPos = std::next(I->getIterator());
           while (isa<PHINode>(InsertPos))
             ++InsertPos;
-          U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+          U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
         } else {
-          U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
-                                                V->getType()));
+          U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+                                               V->getType()));
         }
       }
     }
+
     if (V->use_empty())
       RecursivelyDeleteTriviallyDeadInstructions(V);
   }
Index: test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
===================================================================
--- /dev/null
+++ test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
@@ -0,0 +1,134 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; CHECK-LABEL: @memset_group_to_flat(
+; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memset_global_to_flat(
+; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memset_group_to_flat_no_md(
+; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
+define void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @memset_global_to_flat_no_md(
+; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
+define void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group(
+; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 {
+  %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group(
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %src.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global(
+; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)*
+  %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global(
+; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 {
+  %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7
+define void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+define void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+define void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; Check for iterator problems if the pointer has 2 uses in the same call
+; CHECK-LABEL: @memcpy_group_flat_to_flat_self(
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group(
+; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1
+declare void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
+declare void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"A", !2}
+!2 = !{!"tbaa root"}
+!3 = !{!"B", !2}
+!4 = !{!5}
+!5 = distinct !{!5, !6, !"some scope"}
+!6 = distinct !{!6, !"some domain"}
+!7 = !{i64 0, i64 8, null}
\ No newline at end of file
Index: test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
===================================================================
--- test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
+++ test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
@@ -115,4 +115,26 @@
   ret { i32, i1 } %ret
 }
 
-attributes #0 = { nounwind }
\ No newline at end of file
+; FIXME: Shouldn't be losing names
+; CHECK-LABEL: @volatile_memset_group_to_flat(
+; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
+define void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
+  ret void
+}
+
+; CHECK-LABEL: @volatile_memset_global_to_flat(
+; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
+define void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
+  ret void
+}
+
+declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
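
For context on why handleMemIntrinsicPtrUse rebuilds the call through IRBuilder instead of just calling U.set(NewV): llvm.memset/memcpy/memmove are overloaded intrinsics whose names are mangled from the pointer operand (and length) types, so changing an operand's address space requires a differently named declaration, e.g. @llvm.memcpy.p4i8.p4i8.i64 becomes @llvm.memcpy.p3i8.p1i8.i64. Recreating the call lets the builder pick up the right declaration and re-attach the metadata. The standalone sketch below is only an illustration of that mangling behavior, not part of the patch; the module, function, and variable names are made up, and it assumes the single-alignment CreateMemCpy overload this patch calls (later LLVM releases split the destination and source alignments).

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("remangle-demo", Ctx);
  IRBuilder<> B(Ctx);

  // i8 pointers in the AMDGPU group (3) and global (1) address spaces.
  Type *I8 = B.getInt8Ty();
  Type *GroupPtrTy = I8->getPointerTo(3);
  Type *GlobalPtrTy = I8->getPointerTo(1);

  FunctionType *FTy =
      FunctionType::get(B.getVoidTy(), {GroupPtrTy, GlobalPtrTy}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "demo", &M);
  B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

  auto AI = F->arg_begin();
  Value *Dest = &*AI++; // i8 addrspace(3)*
  Value *Src = &*AI;    // i8 addrspace(1)*

  // The intrinsic declaration is created (and mangled) from these operand
  // types, yielding @llvm.memcpy.p3i8.p1i8.i64 rather than the flat variant.
  B.CreateMemCpy(Dest, Src, B.getInt64(32), /*Align=*/4, /*isVolatile=*/false);
  B.CreateRetVoid();

  M.print(outs(), nullptr);
  return 0;
}

Printing the module shows the re-mangled declaration, which is the same renaming the CHECK lines in the new mem-intrinsics.ll test expect once the pass has rewritten the pointer operands.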