Index: llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -282,6 +283,23 @@
     // so remove the variables from these lists before replaceAllUsesWith
     removeFromUsedLists(M, LocalVars);
 
+    // Create alias.scope and their lists. Each field in the new structure
+    // does not alias any other field.
+    SmallVector<MDNode *> AliasScopes;
+    SmallVector<Metadata *> NoAliasList;
+    if (LocalVars.size() > 1) {
+      MDBuilder MDB(Ctx);
+      AliasScopes.reserve(LocalVars.size());
+      for (size_t I = 0; I < LocalVars.size(); I++) {
+        MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
+        MDNode *Scope = MDB.createAnonymousAliasScope(Domain);
+        AliasScopes.push_back(Scope);
+      }
+      NoAliasList.append(&AliasScopes[1], AliasScopes.end());
+    } else {
+      AliasScopes.push_back(nullptr);
+    }
+
     // Replace uses of ith variable with a constantexpr to the ith field of the
     // instance that will be allocated by AMDGPUMachineFunction
     Type *I32 = Type::getInt32Ty(Ctx);
@@ -313,7 +331,13 @@
       uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
       Align A = commonAlignment(StructAlign, Off);
-      refineUsesAlignment(GEP, A, DL);
+
+      if (I)
+        NoAliasList[I - 1] = AliasScopes[I - 1];
+      MDNode *NoAlias = NoAliasList.empty() ? nullptr
+                                            : MDNode::get(Ctx, NoAliasList);
+
+      refineUsesAlignmentAndAA(GEP, A, DL, AliasScopes[I], NoAlias);
     }
 
     // Mark kernels with asm that reads the address of the allocated structure
@@ -334,12 +358,26 @@
     return true;
   }
 
-  void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
-                           unsigned MaxDepth = 5) {
-    if (!MaxDepth || A == 1)
+  void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
+                                MDNode *AliasScope, MDNode *NoAlias,
+                                unsigned MaxDepth = 5) {
+    if (!MaxDepth || (A == 1 && !AliasScope))
       return;
 
     for (User *U : Ptr->users()) {
+      if (auto *I = dyn_cast<Instruction>(U)) {
+        if (AliasScope && I->mayReadOrWriteMemory()) {
+          MDNode *AS =
+              MDNode::concatenate(I->getMetadata(LLVMContext::MD_alias_scope),
+                                  AliasScope);
+          I->setMetadata(LLVMContext::MD_alias_scope, AS);
+          MDNode *NA =
+              MDNode::concatenate(I->getMetadata(LLVMContext::MD_noalias),
+                                  NoAlias);
+          I->setMetadata(LLVMContext::MD_noalias, NA);
+        }
+      }
+
       if (auto *LI = dyn_cast<LoadInst>(U)) {
         LI->setAlignment(std::max(A, LI->getAlign()));
         continue;
@@ -364,17 +402,20 @@
       if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
         unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
         APInt Off(BitWidth, 0);
-        if (GEP->getPointerOperand() == Ptr &&
-            GEP->accumulateConstantOffset(DL, Off)) {
-          Align GA = commonAlignment(A, Off.getLimitedValue());
-          refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
+        if (GEP->getPointerOperand() == Ptr) {
+          Align GA;
+          if (GEP->accumulateConstantOffset(DL, Off))
+            GA = commonAlignment(A, Off.getLimitedValue());
+          refineUsesAlignmentAndAA(GEP, GA, DL, AliasScope, NoAlias,
+                                   MaxDepth - 1);
         }
         continue;
       }
       if (auto *I = dyn_cast<Instruction>(U)) {
         if (I->getOpcode() == Instruction::BitCast ||
             I->getOpcode() == Instruction::AddrSpaceCast)
-          refineUsesAlignment(I, A, DL, MaxDepth - 1);
+          refineUsesAlignmentAndAA(I, A, DL, AliasScope, NoAlias,
+                                   MaxDepth - 1);
       }
     }
   }
Index: llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+@a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
+; GCN: ds_write2st64_b32
+; GCN: ds_read2st64_b32
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x2
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !2
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !2
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !3, !noalias !5
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !5
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x2(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+  store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+  %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+  %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
+  store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+  %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+  %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
+  %val = add i32 %val.a, %val.b
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
+; GCN-DAG: ds_write2st64_b32
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_read2st64_b32
+; GCN-DAG: ds_read_b32
+
+; CHECK-LABEL: @no_clobber_ds_load_stores_x3
+; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !6, !noalias !8
+; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !6, !noalias !8
+; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !9, !noalias !13
+; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !9, !noalias !13
+; CHECK: store i32 3, i32 addrspace(3)* %2, align 16, !alias.scope !11, !noalias !14
+; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !11, !noalias !14
+
+define amdgpu_kernel void @no_clobber_ds_load_stores_x3(i32 addrspace(1)* %arg, i32 %i) {
+bb:
+  store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
+  %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
+  %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
+  store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
+  %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
+  %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
+  store i32 3, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 0), align 4
+  %gep.c = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 %i
+  %val.c = load i32, i32 addrspace(3)* %gep.c, align 4
+  %val.1 = add i32 %val.a, %val.b
+  %val = add i32 %val.1, %val.c
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; CHECK: !0 = distinct !{!0, !1}
+; CHECK: !1 = distinct !{!1}
+; CHECK: !2 = !{!3}
+; CHECK: !3 = distinct !{!3, !4}
+; CHECK: !4 = distinct !{!4}
+; CHECK: !5 = !{!0}
+; CHECK: !6 = distinct !{!6, !7}
+; CHECK: !7 = distinct !{!7}
+; CHECK: !8 = !{!9, !11}
+; CHECK: !9 = distinct !{!9, !10}
+; CHECK: !10 = distinct !{!10}
+; CHECK: !11 = distinct !{!11, !12}
+; CHECK: !12 = distinct !{!12}
+; CHECK: !13 = !{!6, !11}
+; CHECK: !14 = !{!6, !9}
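
Note on the NoAliasList update in the patch: the list is seeded once as {S1 .. SN-1}, i.e. every scope except field 0's, and the per-iteration write NoAliasList[I - 1] = AliasScopes[I - 1] restores the previous field's scope into the slot that until then held the current field's scope, so the list is always "every scope except S[I]" without being rebuilt. Below is a minimal standalone sketch of that invariant, assuming only the MDBuilder/MDNode APIs the patch already uses; the helper name buildFieldScopes is hypothetical, and unlike the pass (which skips metadata creation entirely for a single variable) it creates a scope even for one field.

#include <utility>
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Hypothetical helper mirroring the pass: for each of NumFields struct
// fields, produce the pair (alias.scope node, noalias list node). The
// noalias list for field I holds every scope except AliasScopes[I].
static SmallVector<std::pair<MDNode *, MDNode *>, 4>
buildFieldScopes(LLVMContext &Ctx, size_t NumFields) {
  MDBuilder MDB(Ctx);
  SmallVector<MDNode *, 4> AliasScopes;
  for (size_t I = 0; I < NumFields; I++) {
    // One anonymous domain per scope, matching the metadata the test
    // expects (!0 = distinct !{!0, !1}, !1 = distinct !{!1}, ...).
    MDNode *Domain = MDB.createAnonymousAliasScopeDomain();
    AliasScopes.push_back(MDB.createAnonymousAliasScope(Domain));
  }

  // Seed with {S1 .. SN-1}: the noalias list for field 0.
  SmallVector<Metadata *, 4> NoAliasList;
  if (NumFields > 1)
    NoAliasList.append(AliasScopes.begin() + 1, AliasScopes.end());

  SmallVector<std::pair<MDNode *, MDNode *>, 4> Scopes;
  for (size_t I = 0; I < NumFields; I++) {
    // Restoring S[I-1] overwrites the slot that held S[I], preserving the
    // invariant "NoAliasList == every scope except S[I]".
    if (I)
      NoAliasList[I - 1] = AliasScopes[I - 1];
    MDNode *NoAlias =
        NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
    Scopes.push_back({AliasScopes[I], NoAlias});
  }
  return Scopes;
}

MDNode::concatenate in refineUsesAlignmentAndAA unions the new scope with whatever !alias.scope / !noalias metadata an instruction already carries, so pre-existing scoped-AA information is preserved rather than replaced. The per-field scopes are what lets ScopedNoAliasAA prove the @a/@b/@c accesses independent once they share one LDS struct, which is what allows the backend to form the ds_read2/ds_write2 pairs the GCN checks above expect.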