Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -119,6 +119,9 @@
 void initializeSIInsertWaitsPass(PassRegistry&);
 extern char &SIInsertWaitsID;
 
+ImmutablePass *createAMDGPUAAWrapperPass();
+void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+
 Target &getTheAMDGPUTarget();
 Target &getTheGCNTarget();
 
Index: lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -0,0 +1,85 @@
+/// \file
+/// Declares the AMDGPU alias analysis result together with its new pass
+/// manager analysis and its legacy wrapper pass.
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include <memory>
+#include <utility>
+
+namespace llvm {
+
+/// A simple AA result that uses pointer address spaces to answer queries.
+///
+/// Both queries are implemented in AMDGPUAliasAnalysis.cpp.
+class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
+  friend AAResultBase<AMDGPUAAResult>;
+
+  const DataLayout &DL;
+
+public:
+  explicit AMDGPUAAResult(const DataLayout &DL) : AAResultBase(), DL(DL) {}
+  AMDGPUAAResult(AMDGPUAAResult &&Arg)
+      : AAResultBase(std::move(Arg)), DL(Arg.DL) {}
+
+  /// Handle invalidation events from the new pass manager.
+  ///
+  /// By definition, this result is stateless and so remains valid.
+  bool invalidate(Function &, const PreservedAnalyses &) { return false; }
+
+  AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
+  bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal);
+};
+
+/// Analysis pass providing a never-invalidated alias analysis result.
+///
+/// NOTE(review): PassID is declared below but never defined in this patch.
+class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> {
+  friend AnalysisInfoMixin<AMDGPUAA>;
+  static char PassID;
+
+public:
+  typedef AMDGPUAAResult Result;
+
+  AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) {
+    return AMDGPUAAResult(F.getParent()->getDataLayout());
+  }
+};
+
+/// Legacy wrapper pass to provide the AMDGPUAAResult object.
+///
+/// The result is created in doInitialization() and released in
+/// doFinalization(); getResult() must only be called in between.
+class AMDGPUAAWrapperPass : public ImmutablePass {
+  std::unique_ptr<AMDGPUAAResult> Result;
+
+public:
+  static char ID;
+
+  AMDGPUAAWrapperPass() : ImmutablePass(ID) {
+    initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  AMDGPUAAResult &getResult() { return *Result; }
+  const AMDGPUAAResult &getResult() const { return *Result; }
+
+  bool doInitialization(Module &M) override {
+    Result.reset(new AMDGPUAAResult(M.getDataLayout()));
+    return false;
+  }
+  bool doFinalization(Module &M) override {
+    Result.reset();
+    return false;
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
Index: lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -0,0 +1,111 @@
+/// \file
+/// Address-space based alias analysis for the AMDGPU target.
+
+#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPU.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-aa"
+
+// Hidden switch, on by default. When it is off the queries in this file defer
+// to the AAResultBase defaults instead of applying the AMDGPU rules.
+static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+    cl::desc("Enable AMDGPU Alias Analysis"),
+    cl::init(true));
+
+// Register this pass...
+char AMDGPUAAWrapperPass::ID = 0;
+INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
+                "AMDGPU Address space based Alias Analysis", false, true)
+
+ImmutablePass *llvm::createAMDGPUAAWrapperPass() {
+  return new AMDGPUAAWrapperPass();
+}
+
+void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+}
+
+/// Answer an alias query from the address spaces of the two pointers. Pairs
+/// the table cannot separate are forwarded to AAResultBase::alias().
+AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
+                                  const MemoryLocation &LocB) {
+  if (!EnableAMDGPUAliasAnalysis)
+    return AAResultBase::alias(LocA, LocB);
+
+  // This array is indexed by the AMDGPUAS::AddressSpaces
+  // enum elements PRIVATE_ADDRESS ... to FLAT_ADDRESS.
+  static const AliasResult ASAliasRules[5][5] = {
+    /*             Private    Global    Constant  Group     Flat */
+    /* Private  */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias},
+    /* Global   */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias},
+    /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , NoAlias },
+    /* Group    */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias},
+    /* Flat     */ {MayAlias, MayAlias, NoAlias , MayAlias, MayAlias}
+  };
+  unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
+  unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
+  if (asA > AMDGPUAS::AddressSpaces::FLAT_ADDRESS ||
+      asB > AMDGPUAS::AddressSpaces::FLAT_ADDRESS)
+    report_fatal_error("Pointer address space out of range");
+
+  AliasResult Result = ASAliasRules[asA][asB];
+  if (Result == NoAlias)
+    return Result;
+
+  // NOTE(review): two arguments are reported NoAlias when exactly one of their
+  // pointee types is a vector. Confirm that this type-based rule is sound.
+  if (isa<Argument>(LocA.Ptr) && isa<Argument>(LocB.Ptr)) {
+    // cast<>, not dyn_cast<>: the result is dereferenced unconditionally.
+    Type *T1 = cast<PointerType>(LocA.Ptr->getType())->getElementType();
+    Type *T2 = cast<PointerType>(LocB.Ptr->getType())->getElementType();
+    if (T1->isVectorTy() != T2->isVectorTy())
+      return NoAlias;
+  }
+
+  // Forward the query to the next alias analysis.
+  return AAResultBase::alias(LocA, LocB);
+}
+
+/// Report constant memory for the constant address space, for constant
+/// globals, and for noalias arguments that are readonly or readnone. Anything
+/// else is forwarded to AAResultBase::pointsToConstantMemory().
+bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
+                                            bool OrLocal) {
+  if (!EnableAMDGPUAliasAnalysis)
+    return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+
+  const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
+  assert(Base->getType()->isPointerTy());
+
+  if (Base->getType()->getPointerAddressSpace() ==
+      AMDGPUAS::AddressSpaces::CONSTANT_ADDRESS)
+    return true;
+
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
+    if (GV->isConstant())
+      return true;
+  } else if (const Argument *Arg = dyn_cast<Argument>(Base)) {
+    // On an argument, ReadOnly indicates that the function does not write
+    // through this pointer argument, even though it may write to the memory
+    // that the pointer points to. ReadNone indicates that the function does
+    // not dereference that pointer argument, even though it may read or write
+    // the memory that the pointer points to if accessed through other
+    // pointers. Argument::onlyReadsMemory() accepts either attribute.
+    if (Arg->hasNoAliasAttr() && Arg->onlyReadsMemory())
+      return true;
+  }
+
+  return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -15,6 +15,7 @@
 
 #include "AMDGPUTargetMachine.h"
 #include "AMDGPU.h"
+#include "AMDGPUAliasAnalysis.h"
 #include "AMDGPUCallLowering.h"
 #include "AMDGPUInstructionSelector.h"
 #include "AMDGPULegalizerInfo.h"
@@ -119,6 +120,7 @@
   initializeSIInsertSkipsPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
+  initializeAMDGPUAAWrapperPassPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -507,6 +509,12 @@
       addPass(createSROAPass());
 
     addStraightLineScalarOptimizationPasses();
+
+    addPass(createAMDGPUAAWrapperPass());
+    addPass(createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
+      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+        AAR.addAAResult(WrapperPass->getResult());
+    }));
   }
 
   TargetPassConfig::addIRPasses();
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -36,6 +36,7 @@
 
 add_llvm_target(AMDGPUCodeGen
   AMDILCFGStructurizer.cpp
+  AMDGPUAliasAnalysis.cpp
   AMDGPUAlwaysInlinePass.cpp
   AMDGPUAnnotateKernelFeatures.cpp
   AMDGPUAnnotateUniformValues.cpp
Index: test/CodeGen/AMDGPU/amdgpu.private-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -1,9 +1,9 @@
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca
-amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s
 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s
 
Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
 ; GCN: s_load_dword [[VEC:s[0-9]+]]
Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
===================================================================
--- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
 
 declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
 declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
Index: test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
===================================================================
--- test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s
 
 @sPrivateStorage = internal addrspace(3) global [256 x [8 x <4 x i64>]] undef
 
Index: test/CodeGen/AMDGPU/vectorize-global-local.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; Sixteen dword loads from addrspace(1) are interleaved with sixteen dword
+; stores to addrspace(3); the checks expect four dwordx4 loads and eight
+; ds_write2_b32 stores.
+
+; CHECK-LABEL: {{^}}vectorize_global_local:
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+
+define void @vectorize_global_local(i32 addrspace(1)* nocapture readonly, i32 addrspace(3)* nocapture) {
+  %3 = load i32, i32 addrspace(1)* %0, align 4
+  store i32 %3, i32 addrspace(3)* %1, align 4
+  %4 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 1
+  %5 = load i32, i32 addrspace(1)* %4, align 4
+  %6 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 1
+  store i32 %5, i32 addrspace(3)* %6, align 4
+  %7 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 2
+  %8 = load i32, i32 addrspace(1)* %7, align 4
+  %9 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 2
+  store i32 %8, i32 addrspace(3)* %9, align 4
+  %10 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 3
+  %11 = load i32, i32 addrspace(1)* %10, align 4
+  %12 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 3
+  store i32 %11, i32 addrspace(3)* %12, align 4
+  %13 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 4
+  %14 = load i32, i32 addrspace(1)* %13, align 4
+  %15 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 4
+  store i32 %14, i32 addrspace(3)* %15, align 4
+  %16 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 5
+  %17 = load i32, i32 addrspace(1)* %16, align 4
+  %18 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 5
+  store i32 %17, i32 addrspace(3)* %18, align 4
+  %19 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 6
+  %20 = load i32, i32 addrspace(1)* %19, align 4
+  %21 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 6
+  store i32 %20, i32 addrspace(3)* %21, align 4
+  %22 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 7
+  %23 = load i32, i32 addrspace(1)* %22, align 4
+  %24 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 7
+  store i32 %23, i32 addrspace(3)* %24, align 4
+  %25 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 8
+  %26 = load i32, i32 addrspace(1)* %25, align 4
+  %27 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 8
+  store i32 %26, i32 addrspace(3)* %27, align 4
+  %28 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 9
+  %29 = load i32, i32 addrspace(1)* %28, align 4
+  %30 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 9
+  store i32 %29, i32 addrspace(3)* %30, align 4
+  %31 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 10
+  %32 = load i32, i32 addrspace(1)* %31, align 4
+  %33 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 10
+  store i32 %32, i32 addrspace(3)* %33, align 4
+  %34 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 11
+  %35 = load i32, i32 addrspace(1)* %34, align 4
+  %36 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 11
+  store i32 %35, i32 addrspace(3)* %36, align 4
+  %37 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 12
+  %38 = load i32, i32 addrspace(1)* %37, align 4
+  %39 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 12
+  store i32 %38, i32 addrspace(3)* %39, align 4
+  %40 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 13
+  %41 = load i32, i32 addrspace(1)* %40, align 4
+  %42 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 13
+  store i32 %41, i32 addrspace(3)* %42, align 4
+  %43 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 14
+  %44 = load i32, i32 addrspace(1)* %43, align 4
+  %45 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 14
+  store i32 %44, i32 addrspace(3)* %45, align 4
+  %46 = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 15
+  %47 = load i32, i32 addrspace(1)* %46, align 4
+  %48 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 15
+  store i32 %47, i32 addrspace(3)* %48, align 4
+  ret void
+}