Index: llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -282,6 +283,21 @@
     // so remove the variables from these lists before replaceAllUsesWith
     removeFromUsedLists(M, LocalVars);
 
+    // Create alias.scope and their lists. Each field in the new structure
+    // does not alias with all other fields.
+    SmallVector<MDNode *> AliasScopes;
+    SmallVector<Metadata *> NoAliasList;
+    if (LocalVars.size() > 1) {
+      MDBuilder MDB(Ctx);
+      AliasScopes.reserve(LocalVars.size());
+      MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
+      for (size_t I = 0; I < LocalVars.size(); I++) {
+        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
+        AliasScopes.push_back(Scope);
+      }
+      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
+    }
+
     // Replace uses of ith variable with a constantexpr to the ith field of the
     // instance that will be allocated by AMDGPUMachineFunction
     Type *I32 = Type::getInt32Ty(Ctx);
@@ -313,7 +329,15 @@
 
       uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
       Align A = commonAlignment(StructAlign, Off);
-      refineUsesAlignment(GEP, A, DL);
+
+      if (I)
+        NoAliasList[I - 1] = AliasScopes[I - 1];
+      MDNode *NoAlias =
+          NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
+      MDNode *AliasScope =
+          AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
+
+      refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
     }
 
     // Mark kernels with asm that reads the address of the allocated structure
@@ -334,12 +358,25 @@
     return true;
   }
 
-  void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
-                           unsigned MaxDepth = 5) {
-    if (!MaxDepth || A == 1)
+  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
+                                MDNode *AliasScope, MDNode *NoAlias,
+                                unsigned MaxDepth = 5) {
+    if (!MaxDepth || (A == 1 && !AliasScope))
       return;
 
     for (User *U : Ptr->users()) {
+      if (auto *I = dyn_cast<Instruction>(U)) {
+        if (AliasScope && I->mayReadOrWriteMemory()) {
+          MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
+          AS = MDNode::concatenate(AS, AliasScope);
+          I->setMetadata(LLVMContext::MD_alias_scope, AS);
+
+          MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
+          NA = MDNode::concatenate(NA, NoAlias);
+          I->setMetadata(LLVMContext::MD_noalias, NA);
+        }
+      }
+
       if (auto *LI = dyn_cast<LoadInst>(U)) {
         LI->setAlignment(std::max(A, LI->getAlign()));
         continue;
@@ -364,17 +401,19 @@
       if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
         unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
         APInt Off(BitWidth, 0);
-        if (GEP->getPointerOperand() == Ptr &&
-            GEP->accumulateConstantOffset(DL, Off)) {
-          Align GA = commonAlignment(A, Off.getLimitedValue());
-          refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+        if (GEP->getPointerOperand() == Ptr) {
+          Align GA;
+          if (GEP->accumulateConstantOffset(DL, Off))
+            GA = commonAlignment(A, Off.getLimitedValue());
+          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
+                                   MaxDepth - 1);
         }
         continue;
       }
       if (auto *I = dyn_cast<Instruction>(U)) {
         if (I->getOpcode() == Instruction::BitCast ||
             I->getOpcode() == Instruction::AddrSpaceCast)
-          refineUsesAlignment(I, A, DL, MaxDepth - 1);
+          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias, MaxDepth - 1);
       }
     }
   }
Index: llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+@a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x2_preexisting_aa
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !tbaa !0, !alias.scope !5, !noalias !10
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !tbaa !0, !alias.scope !5, !noalias !10
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !tbaa !0, !alias.scope !10, !noalias !5
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !tbaa !0, !alias.scope !10, !noalias !5
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+  store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4, !alias.scope !0, !noalias !3, !tbaa !5
+  %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+  %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3, !tbaa !5
+  store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4, !alias.scope !3, !noalias !0, !tbaa !5
+  %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+  %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0, !tbaa !5
+  %val = add i32 %val.a, %val.b
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+!0 = !{!1}
+!1 = distinct !{!1, !2}
+!2 = distinct !{!2}
+!3 = !{!4}
+!4 = distinct !{!4, !2}
+!5 = !{!6, !7, i64 0}
+!6 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !7, i64 0}
+!7 = !{!"int", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+
+; CHECK:!0 = !{!1, !2, i64 0}
+; CHECK:!1 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !2, i64 0}
+; CHECK:!2 = !{!"int", !3, i64 0}
+; CHECK:!3 = !{!"omnipotent char", !4, i64 0}
+; CHECK:!4 = !{!"Simple C++ TBAA"}
+; CHECK:!5 = !{!6, !8}
+; CHECK:!6 = distinct !{!6, !7}
+; CHECK:!7 = distinct !{!7}
+; CHECK:!8 = distinct !{!8, !9}
+; CHECK:!9 = distinct !{!9}
+; CHECK:!10 = !{!11, !12}
+; CHECK:!11 = distinct !{!11, !7}
+; CHECK:!12 = distinct !{!12, !9}
Index: llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+@a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
+; GCN: ds_write2st64_b32
+; GCN: ds_read2st64_b32
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x2
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !3, !noalias !0
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x2(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+  store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+  %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+  %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
+  store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+  %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+  %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
+  %val = add i32 %val.a, %val.b
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
+; GCN-DAG: ds_write2st64_b32
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_read2st64_b32
+; GCN-DAG: ds_read_b32
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x3
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !5, !noalias !8
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !11, !noalias !12
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
+; CHECK: store i32 3, i32 addrspace(3)* %2, align 16, !alias.scope !13, !noalias !14
+; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x3(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+  store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+  %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+  %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
+  store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+  %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+  %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
+  store i32 3, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 0), align 4
+  %gep.c = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 %i
+  %val.c = load i32, i32 addrspace(3)* %gep.c, align 4
+  %val.1 = add i32 %val.a, %val.b
+  %val = add i32 %val.1, %val.c
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; CHECK: !0 = !{!1}
+; CHECK: !1 = distinct !{!1, !2}
+; CHECK: !2 = distinct !{!2}
+; CHECK: !3 = !{!4}
+; CHECK: !4 = distinct !{!4, !2}
+; CHECK: !5 = !{!6}
+; CHECK: !6 = distinct !{!6, !7}
+; CHECK: !7 = distinct !{!7}
+; CHECK: !8 = !{!9, !10}
+; CHECK: !9 = distinct !{!9, !7}
+; CHECK: !10 = distinct !{!10, !7}
+; CHECK: !11 = !{!9}
+; CHECK: !12 = !{!6, !10}
+; CHECK: !13 = !{!10}
+; CHECK: !14 = !{!6, !9}