Index: lib/Transforms/Scalar/InferAddressSpaces.cpp
===================================================================
--- lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -247,9 +247,19 @@
     } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(&I)) {
       appendsGenericAddressExpressionToPostorderStack(
         CmpX->getPointerOperand(), &PostorderStack, &Visited);
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&I)) {
+      // For memset/memcpy/memmove, any pointer operand can be replaced.
+      appendsGenericAddressExpressionToPostorderStack(
+        MI->getRawDest(), &PostorderStack, &Visited);
+
+      // Handle 2nd operand for memcpy/memmove.
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+        appendsGenericAddressExpressionToPostorderStack(
+          MTI->getRawSource(), &PostorderStack, &Visited);
+      }
     }
-    // TODO: Support intrinsics
+    // TODO: Support target intrinsics
   }
 
   std::vector<Value *> Postorder; // The resultant postorder.
@@ -534,7 +544,7 @@
   return NewAS;
 }
 
-static bool isCompatiblePtrUse(Use &U) {
+static bool isSimpleCompatiblePtrUse(Use &U) {
   User *Inst = U.getUser();
   unsigned OpNo = U.getOperandNo();
 
@@ -555,6 +565,56 @@
   return false;
 }
 
+/// Handle updating uses that require more complex processing than simply
+/// replacing an operand, e.g. intrinsic uses that need to be re-mangled.
+/// \returns true on success but does not remove the user instruction \p U.
+static bool handleComplexPtrUse(User &U, Value *OldV, Value *NewV) {
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&U)) {
+    if (MI->isVolatile())
+      return false;
+
+    CallInst *NewCall;
+    IRBuilder<> B(MI);
+    MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+    MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+    MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
+      NewCall = B.CreateMemSet(NewV, MSI->getValue(),
+                               MSI->getLength(), MSI->getAlignment(),
+                               false, TBAA, ScopeMD, NoAliasMD);
+    } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+      // Be careful in case this is a self to self copy.
+      Value *Src = MTI->getRawSource();
+      Value *Dest = MTI->getRawDest();
+
+      if (Src == OldV)
+        Src = NewV;
+
+      if (Dest == OldV)
+        Dest = NewV;
+
+      if (isa<MemCpyInst>(MTI)) {
+        MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+        NewCall = B.CreateMemCpy(Dest, Src, MTI->getLength(),
+                                 MTI->getAlignment(), false,
+                                 TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+      } else {
+        assert(isa<MemMoveInst>(MTI));
+        NewCall = B.CreateMemMove(Dest, Src, MTI->getLength(),
+                                  MTI->getAlignment(), false,
+                                  TBAA, ScopeMD, NoAliasMD);
+      }
+    } else
+      llvm_unreachable("unhandled MemIntrinsic");
+
+    MI->eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
 bool InferAddressSpaces::rewriteWithNewAddressSpaces(
     const std::vector<Value *> &Postorder,
     const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
@@ -590,20 +650,34 @@
     if (NewV == nullptr)
       continue;
 
-    SmallVector<Use *, 4> Uses;
-    for (Use &U : V->uses())
-      Uses.push_back(&U);
-
     DEBUG(dbgs() << "Replacing the uses of " << *V << "\n  with\n  " << *NewV
                  << '\n');
 
-    for (Use *U : Uses) {
-      if (isCompatiblePtrUse(*U)) {
+    Value::use_iterator I, E, Next;
+    for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+      Use &U = *I;
+      ++I;
+
+      if (isSimpleCompatiblePtrUse(U)) {
         // If V is used as the pointer operand of a compatible memory operation,
         // sets the pointer operand to NewV. This replacement does not change
         // the element type, so the resultant load/store is still valid.
-        U->set(NewV);
-      } else if (isa<Instruction>(U->getUser())) {
+        U.set(NewV);
+        continue;
+      }
+
+      // Intrinsic users may see the same pointer operand in multiple
+      // operands. Skip to the next instruction.
+      User *CurUser = U.getUser();
+      while (I != E && I->getUser() == CurUser) {
+        ++I;
+      }
+
+      // Handle more complex cases like intrinsics that need to be re-mangled.
+      if (handleComplexPtrUse(*CurUser, V, NewV))
+        continue;
+
+      if (isa<Instruction>(U.getUser())) {
         // Otherwise, replaces the use with generic(NewV).
         // TODO: Some optimization opportunities are missed. For example, in
         //   %0 = icmp eq float* %p, %q
@@ -617,13 +691,14 @@
           BasicBlock::iterator InsertPos = std::next(I->getIterator());
           while (isa<PHINode>(InsertPos))
             ++InsertPos;
-          U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+          U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
         } else {
-          U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
-                                                V->getType()));
+          U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+                                               V->getType()));
         }
       }
     }
+
     if (V->use_empty())
       RecursivelyDeleteTriviallyDeadInstructions(V);
   }
Index: test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
===================================================================
--- /dev/null
+++ test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
@@ -0,0 +1,134 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; CHECK-LABEL: @memset_group_to_flat(
+; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memset_global_to_flat(
+; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memset_group_to_flat_no_md(
+; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
+define void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @memset_global_to_flat_no_md(
+; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
+define void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group(
+; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 {
+  %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group(
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %src.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global(
+; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)*
+  %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global(
+; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 {
+  %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7
+define void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+define void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+define void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; Check for iterator problems if the pointer has 2 uses in the same call
+; CHECK-LABEL: @memcpy_group_flat_to_flat_self(
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group(
+; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1
+declare void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
+declare void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"A", !2}
+!2 = !{!"tbaa root"}
+!3 = !{!"B", !2}
+!4 = !{!5}
+!5 = distinct !{!5, !6, !"some scope"}
+!6 = distinct !{!6, !"some domain"}
+!7 = !{i64 0, i64 8, null}
\ No newline at end of file
Index: test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
===================================================================
--- test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
+++ test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
@@ -115,4 +115,26 @@
   ret { i32, i1 } %ret
 }
 
-attributes #0 = { nounwind }
\ No newline at end of file
+; FIXME: Shouldn't be losing names
+; CHECK-LABEL: @volatile_memset_group_to_flat(
+; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
+define void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
+  ret void
+}
+
+; CHECK-LABEL: @volatile_memset_global_to_flat(
+; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
+define void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
+  ret void
+}
+
+declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }