Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -38,6 +38,7 @@
 FunctionPass *createAMDGPUCFGStructurizerPass();
 
 // SI Passes
+FunctionPass *createGCNOptimizeUniformMemOps();
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
@@ -104,6 +105,9 @@
 void initializeSIInsertWaitsPass(PassRegistry&);
 extern char &SIInsertWaitsID;
 
+void initializeGCNOptimizeUniformMemOpsPass(PassRegistry&);
+extern char &GCNOptimizeUniformMemOpsID;
+
 extern Target TheAMDGPUTarget;
 extern Target TheGCNTarget;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -58,6 +58,7 @@
   initializeSIInsertWaitsPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
+  initializeGCNOptimizeUniformMemOpsPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -312,6 +313,7 @@
   addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
   addPass(createSinkingPass());
   addPass(createSITypeRewriter());
+  addPass(createGCNOptimizeUniformMemOps());
   addPass(createAMDGPUAnnotateUniformValues());
   addPass(createSIAnnotateControlFlowPass());
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -47,6 +47,7 @@
   AMDGPUInstrInfo.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
+  GCNOptimizeUniformMemOps.cpp
   R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
Index: lib/Target/AMDGPU/GCNOptimizeUniformMemOps.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/GCNOptimizeUniformMemOps.cpp
@@ -0,0 +1,124 @@
+//===-- GCNOptimizeUniformMemOps.cpp - Optimize Uniform Loads -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass scans the program for uniform loads from the global address
+/// space and tries to promote them to constant address space loads.
+///
+/// Uniform loads from the constant address space are selected to SMRD/SMEM
+/// instructions, which are executed by the SALU and read data from the
+/// Scalar Data Cache. Any time we can promote a load from global to constant
+/// address space, we potentially reduce the latency of the instruction,
+/// since it now reads through the cache, and we use less memory bandwidth,
+/// since SMRD/SMEM instructions are executed once per wave rather than once
+/// per thread.
+///
+/// This optimization is safe only as long as the memory pointed to by the
+/// load's pointer has not been updated by the current kernel. Writes to
+/// global memory do not update the Scalar Data Cache, so in that case we
+/// can only read the value back by fetching it from global memory.
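+///
+/// As an illustrative sketch of the rewrite (not taken from the tests below,
+/// and assuming the current AMDGPU address-space numbering where 1 is global
+/// and 2 is constant):
+///
+/// \code
+///   ; Before: uniform load from global memory, selected to a vector load.
+///   %val = load i32, i32 addrspace(1)* %uniform.ptr
+///
+///   ; After: the pointer is cast to the constant address space, so the
+///   ; load can be selected to an SMRD/SMEM instruction.
+///   %cast = addrspacecast i32 addrspace(1)* %uniform.ptr to i32 addrspace(2)*
+///   %val = load i32, i32 addrspace(2)* %cast
+/// \endcode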
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "gcn-opt-uniform-mem"
+
+using namespace llvm;
+
+namespace {
+
+// FIXME: This can create globals so should be a module pass.
+class GCNOptimizeUniformMemOps : public FunctionPass,
+                                 public InstVisitor<GCNOptimizeUniformMemOps> {
+private:
+  Module *Mod;
+  DivergenceAnalysis *DA;
+  MemoryDependenceResults *MemDep;
+
+public:
+  static char ID;
+
+  GCNOptimizeUniformMemOps() :
+    FunctionPass(ID),
+    Mod(nullptr) { }
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+  const char *getPassName() const override {
+    return "GCN Optimize Uniform Mem Ops";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<MemoryDependenceWrapperPass>();
+  }
+
+  void visitLoadInst(LoadInst &I);
+
+};
+
+} // End anonymous namespace
+
+char GCNOptimizeUniformMemOps::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GCNOptimizeUniformMemOps, DEBUG_TYPE,
+                      "GCN Optimize Uniform Mem Ops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_END(GCNOptimizeUniformMemOps, DEBUG_TYPE,
+                    "GCN Optimize Uniform Mem Ops", false, false)
+
+char &llvm::GCNOptimizeUniformMemOpsID = GCNOptimizeUniformMemOps::ID;
+
+bool GCNOptimizeUniformMemOps::doInitialization(Module &M) {
+  Mod = &M;
+  return false;
+}
+
+void GCNOptimizeUniformMemOps::visitLoadInst(LoadInst &I) {
+  if (I.getPointerAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS ||
+      !DA->isUniform(I.getPointerOperand()))
+    return;
+
+  // A non-function-local dependency means the scan reached the function entry
+  // without seeing an instruction that could clobber this load, so nothing in
+  // this kernel has written the memory and the promotion is safe.
+  MemDepResult Dep = MemDep->getDependency(cast<Instruction>(&I));
+  if (Dep.isNonFuncLocal()) {
+    IRBuilder<> Builder(&I);
+    Value *Ptr = I.getPointerOperand();
+    Type *ElTy = cast<PointerType>(Ptr->getType())->getElementType();
+    Type *NewTy = PointerType::get(ElTy, AMDGPUAS::CONSTANT_ADDRESS);
+    // FIXME: Add update methods to divergence analysis and update it here.
+    Value *NewPtr = Builder.CreateAddrSpaceCast(Ptr, NewTy);
+    I.replaceUsesOfWith(Ptr, NewPtr);
+  }
+}
+
+bool GCNOptimizeUniformMemOps::runOnFunction(Function &F) {
+  if (F.hasFnAttribute(Attribute::OptimizeNone))
+    return false;
+
+  DA = &getAnalysis<DivergenceAnalysis>();
+  MemDep = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+  visit(F);
+  return true;
+}
+
+FunctionPass *llvm::createGCNOptimizeUniformMemOps() {
+  return new GCNOptimizeUniformMemOps();
+}
Index: test/CodeGen/AMDGPU/uniform-mem-opt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/uniform-mem-opt.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}no_memdep:
+; GCN: s_load_dword [[SVAL:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @no_memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+  %val = load i32, i32 addrspace(1)* %in
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_memdep_alias_arg:
+; GCN: buffer_store_dword
+; GCN: s_load_dword [[SVAL:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  store i32 0, i32 addrspace(1)* %out0
+  %val = load i32, i32 addrspace(1)* %in
+  store i32 %val, i32 addrspace(1)* %out1
+  ret void
+}
+
+; GCN-LABEL: {{^}}memdep:
+; GCN: buffer_store_dword
+; GCN: buffer_load_dword [[VVAL:v[0-9]+]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  store i32 0, i32 addrspace(1)* %out0
+  %val = load i32, i32 addrspace(1)* %in
+  store i32 %val, i32 addrspace(1)* %out1
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_memdep_diff_addrspace:
+; GCN-DAG: ds_write
+; FIXME: We should be able to use s_load_dword here; we need to teach alias analysis about address spaces.
+; GCN-DAG: buffer_load_dword [[VVAL:v[0-9]+]]
+; GCN-DAG: buffer_store_dword [[VVAL]]
define void @no_memdep_diff_addrspace(i32 addrspace(1)* %in, i32 addrspace(3)* %out0, i32 addrspace(1)* %out1) {
+  store i32 0, i32 addrspace(3)* %out0
+  %val = load i32, i32 addrspace(1)* %in
+  store i32 %val, i32 addrspace(1)* %out1
+  ret void
+}
+
+; Make sure we don't try to promote LDS loads to the constant address space.
+; GCN-LABEL: {{^}}no_memdep_wrong_addrspace:
+; GCN-DAG: buffer_store_dword
+; GCN-DAG: ds_read_b32 [[VVAL:v[0-9]+]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @no_memdep_wrong_addrspace(i32 addrspace(3)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  store i32 0, i32 addrspace(1)* %out0
+  %val = load i32, i32 addrspace(3)* %in
+  store i32 %val, i32 addrspace(1)* %out1
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_memdep_divergent_load:
+; GCN: buffer_store_dword
+; GCN: flat_load_dword [[VVAL:v[0-9]+]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @no_memdep_divergent_load(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  store i32 0, i32 addrspace(1)* %out0
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep
+  store i32 %val, i32 addrspace(1)* %out1
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_memdep_divergent_store:
+; GCN: flat_store_dword
+; GCN: s_load_dword [[SVAL:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; GCN: buffer_store_dword [[VVAL]]
+define void @no_memdep_divergent_store(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %gep = getelementptr i32, i32 addrspace(1)* %out0, i32 %tid
+  store i32 0, i32 addrspace(1)* %gep
+  %val = load i32, i32 addrspace(1)* %in
+  store i32 %val, i32 addrspace(1)* %out1
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #1 = { nounwind readnone }