Index: llvm/include/llvm/InitializePasses.h
===================================================================
--- llvm/include/llvm/InitializePasses.h
+++ llvm/include/llvm/InitializePasses.h
@@ -189,6 +189,7 @@
 void initializeIndVarSimplifyLegacyPassPass(PassRegistry&);
 void initializeIndirectBrExpandPassPass(PassRegistry&);
 void initializeInferAddressSpacesPass(PassRegistry&);
+void initializeInferArgAddressSpacesPass(PassRegistry&);
 void initializeInferFunctionAttrsLegacyPassPass(PassRegistry&);
 void initializeInjectTLIMappingsLegacyPass(PassRegistry &);
 void initializeInlineCostAnalysisPass(PassRegistry&);
Index: llvm/include/llvm/Transforms/IPO/InferArgumentAddressSpaces.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/Transforms/IPO/InferArgumentAddressSpaces.h
@@ -0,0 +1,34 @@
+//===-- InferArgumentAddressSpaces.h --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Infer pointer argument address spaces for local functions based on their
+/// call sites. When all calls pass an actual argument in the same address
+/// space, we can infer that address space for the formal argument in the
+/// callee.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_INFERARGUMENTADDRESSSPACES_H
+#define LLVM_TRANSFORMS_IPO_INFERARGUMENTADDRESSSPACES_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Pass;
+
+/// Create a pass to infer function arguments' address spaces.
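+/// As an illustrative sketch (mirroring the accompanying tests; names are for
+/// illustration only): for an internal callee `void @f(ptr %p)` whose every
+/// call site passes `addrspacecast ptr addrspace(1) ... to ptr`, the pass
+/// emits in @f's entry block
+///   %p.coerce = addrspacecast ptr %p to ptr addrspace(1)
+///   %p.ptr = addrspacecast ptr addrspace(1) %p.coerce to ptr
+/// and rewrites the other uses of %p to %p.ptr, so that a subsequent
+/// infer-address-spaces run can fold the accesses into addrspace(1).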
+Pass *createInferArgAddressSpacesPass();
+
+struct InferArgAddressSpacesPass : PassInfoMixin<InferArgAddressSpacesPass> {
+  InferArgAddressSpacesPass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_IPO_INFERARGUMENTADDRESSSPACES_H
Index: llvm/lib/Passes/PassBuilder.cpp
===================================================================
--- llvm/lib/Passes/PassBuilder.cpp
+++ llvm/lib/Passes/PassBuilder.cpp
@@ -107,6 +107,7 @@
 #include "llvm/Transforms/IPO/GlobalSplit.h"
 #include "llvm/Transforms/IPO/HotColdSplitting.h"
 #include "llvm/Transforms/IPO/IROutliner.h"
+#include "llvm/Transforms/IPO/InferArgumentAddressSpaces.h"
 #include "llvm/Transforms/IPO/InferFunctionAttrs.h"
 #include "llvm/Transforms/IPO/Inliner.h"
 #include "llvm/Transforms/IPO/Internalize.h"
Index: llvm/lib/Passes/PassRegistry.def
===================================================================
--- llvm/lib/Passes/PassRegistry.def
+++ llvm/lib/Passes/PassRegistry.def
@@ -65,6 +65,7 @@
 MODULE_PASS("globalsplit", GlobalSplitPass())
 MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
 MODULE_PASS("inferattrs", InferFunctionAttrsPass())
+MODULE_PASS("infer-argument-address-spaces", InferArgAddressSpacesPass())
 MODULE_PASS("inliner-wrapper", ModuleInlinerWrapperPass())
 MODULE_PASS("inliner-ml-advisor-release", ModuleInlinerWrapperPass(getInlineParams(), true, {}, InliningAdvisorMode::Release, 0))
 MODULE_PASS("print<inline-advisor>", InlineAdvisorAnalysisPrinterPass(dbgs()))
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -50,6 +50,7 @@
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/Transforms/IPO/InferArgumentAddressSpaces.h"
 #include "llvm/Transforms/IPO/Internalize.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Scalar.h"
@@ -401,6 +402,7 @@
   initializeAMDGPUResourceUsageAnalysisPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeGCNPreRAOptimizationsPass(*PR);
+  initializeInferArgAddressSpacesPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -1025,6 +1027,9 @@
   // without ever running any passes on the second.
   addPass(createBarrierNoopPass());
 
+  if (TM.getOptLevel() > CodeGenOpt::Less)
+    addPass(createInferArgAddressSpacesPass());
+
   // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
   if (TM.getTargetTriple().getArch() == Triple::r600)
     addPass(createR600OpenCLImageTypeLoweringPass());
Index: llvm/lib/Transforms/IPO/CMakeLists.txt
===================================================================
--- llvm/lib/Transforms/IPO/CMakeLists.txt
+++ llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -22,6 +22,7 @@
   HotColdSplitting.cpp
   IPO.cpp
   IROutliner.cpp
+  InferArgumentAddressSpaces.cpp
   InferFunctionAttrs.cpp
   InlineSimple.cpp
   Inliner.cpp
Index: llvm/lib/Transforms/IPO/IPO.cpp
===================================================================
--- llvm/lib/Transforms/IPO/IPO.cpp
+++ llvm/lib/Transforms/IPO/IPO.cpp
@@ -39,6 +39,7 @@
   initializeIROutlinerLegacyPassPass(Registry);
   initializeAlwaysInlinerLegacyPassPass(Registry);
   initializeSimpleInlinerPass(Registry);
+  initializeInferArgAddressSpacesPass(Registry);
   initializeInferFunctionAttrsLegacyPassPass(Registry);
   initializeInternalizeLegacyPassPass(Registry);
   initializeLoopExtractorLegacyPassPass(Registry);
Index: llvm/lib/Transforms/IPO/InferArgumentAddressSpaces.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Transforms/IPO/InferArgumentAddressSpaces.cpp
@@ -0,0 +1,220 @@
+//===- InferArgumentAddressSpaces.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Infer pointer argument address spaces for local functions based on their
+/// call sites. When all calls pass an actual argument in the same address
+/// space, we can infer that address space for the formal argument in the
+/// callee.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/InferArgumentAddressSpaces.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "infer-argument-address-spaces"
+
+using namespace llvm;
+
+namespace {
+class InferArgAddressSpacesImpl {
+private:
+  bool handleFunction(Function &F);
+
+public:
+  bool run(const CallGraph &CG);
+};
+
+class InferArgAddressSpaces : public ModulePass {
+public:
+  static char ID;
+
+  InferArgAddressSpaces() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override {
+    if (skipModule(M))
+      return false;
+
+    CallGraphWrapperPass &CGPass = getAnalysis<CallGraphWrapperPass>();
+    const CallGraph &CG = CGPass.getCallGraph();
+
+    return InferArgAddressSpacesImpl().run(CG);
+  }
+
+  StringRef getPassName() const override {
+    return "Infer Argument Address Spaces";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    AU.addRequired<CallGraphWrapperPass>();
+    ModulePass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char InferArgAddressSpaces::ID = 0;
+
+INITIALIZE_PASS(InferArgAddressSpaces, DEBUG_TYPE,
+                "Infer Argument Address Spaces", false, false)
+
+// Return the address space of a pointer \p V if we can deduce it, or UINT_MAX
+// otherwise.
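+//
+// The walk below strips bitcasts and GEPs (both instructions and constant
+// expressions) until it reaches an addrspacecast. For example, assuming a
+// global @g in addrspace(1), the call operand
+//   getelementptr i32, ptr addrspacecast (ptr addrspace(1) @g to ptr), i64 1
+// is traced back through the GEP to the addrspacecast and yields 1.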
+static unsigned getPointerAddressSpace(const Value *V) {
+  for ( ; ; ) {
+    if (auto *BC = dyn_cast<BitCastInst>(V)) {
+      V = BC->getOperand(0);
+      continue;
+    }
+
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
+      V = GEP->getPointerOperand();
+      continue;
+    }
+
+    if (auto *CE = dyn_cast<ConstantExpr>(V)) {
+      switch (CE->getOpcode()) {
+      case Instruction::AddrSpaceCast:
+        return CE->getOperand(0)->getType()->getPointerAddressSpace();
+      case Instruction::BitCast:
+      case Instruction::GetElementPtr:
+        V = CE->getOperand(0);
+        continue;
+      default:
+        break;
+      }
+    }
+
+    break;
+  }
+
+  if (auto *AC = dyn_cast<AddrSpaceCastInst>(V)) {
+    return AC->getSrcAddressSpace();
+  }
+
+  return UINT_MAX;
+}
+
+bool InferArgAddressSpacesImpl::handleFunction(Function &F) {
+  LLVM_DEBUG(dbgs() << "Infer argument address spaces running on '"
+                    << F.getName() << "'\n");
+
+  if (F.use_empty())
+    return false;
+
+  // Pairs of (argument number, inferred address space) for pointer arguments.
+  std::list<std::pair<unsigned, unsigned>> PointerArgs;
+  unsigned I = 0;
+  for (Argument &Arg : F.args()) {
+    if (auto *PtrTy = dyn_cast<PointerType>(Arg.getType()))
+      // Skip pointers which already have a specific address space and generic
+      // pointers already coerced to a specific address space. Essentially
+      // make sure coercion only goes in one direction.
+      if (!PtrTy->getAddressSpace() &&
+          !(Arg.hasOneUse() && isa<AddrSpaceCastInst>(*Arg.user_begin())))
+        PointerArgs.push_back(std::make_pair(I, UINT_MAX));
+    ++I;
+  }
+
+  if (PointerArgs.empty())
+    return false;
+
+  FunctionType *FTy = F.getFunctionType();
+  for (auto *U : F.users()) {
+    CallInst *CI = dyn_cast<CallInst>(U);
+    if (!CI || CI->getCalledFunction() != &F)
+      return false;
+
+    decltype(PointerArgs)::iterator Next;
+    for (auto I = PointerArgs.begin(), E = PointerArgs.end(); I != E;
+         I = Next) {
+      Next = std::next(I);
+      Value *Op = CI->getArgOperand(I->first);
+      unsigned AS = getPointerAddressSpace(Op);
+      unsigned RecordedAS = I->second;
+
+      // Drop arguments for which we cannot determine the address space at
+      // all, for which different calls use different address spaces, or for
+      // which a call already uses the address space this parameter has.
+      if (AS == UINT_MAX || (AS != RecordedAS && RecordedAS != UINT_MAX) ||
+          AS == FTy->getParamType(I->first)->getPointerAddressSpace()) {
+        PointerArgs.erase(I);
+        if (PointerArgs.empty())
+          return false;
+        continue;
+      }
+      I->second = AS;
+    }
+  }
+
+  // We have a list of pointer arguments with a unique address space across
+  // all call sites. Now we can actually infer their types.
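+  //
+  // For each remaining argument, emit at the top of the entry block
+  //   %arg.coerce = addrspacecast ptr %arg to ptr addrspace(N)
+  //   %arg.ptr = addrspacecast ptr addrspace(N) %arg.coerce to ptr
+  // and redirect all other uses of %arg to %arg.ptr; infer-address-spaces is
+  // expected to fold away the round trip afterwards.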
+  bool Changed = false;
+  BasicBlock &Entry = F.getEntryBlock();
+  IRBuilder<> B(&Entry, Entry.getFirstInsertionPt());
+  for (auto I : PointerArgs) {
+    Argument *Arg = F.getArg(I.first);
+    LLVM_DEBUG(dbgs() << "  Coerce argument '" << Arg->getName()
+                      << "' to address space " << I.second << '\n');
+
+    PointerType *PT = cast<PointerType>(Arg->getType());
+    PointerType *NewPT = PointerType::getWithSamePointeeType(PT, I.second);
+    Value *Cast =
+        B.CreateAddrSpaceCast(Arg, NewPT, Twine(Arg->getName(), ".coerce"));
+    Value *CastBack =
+        B.CreateAddrSpaceCast(Cast, PT, Twine(Arg->getName(), ".ptr"));
+    Arg->replaceUsesWithIf(CastBack,
+                           [Cast](Use &U) { return U.getUser() != Cast; });
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool InferArgAddressSpacesImpl::run(const CallGraph &CG) {
+  SetVector<Function *> Worklist;
+  for (scc_iterator<const CallGraph *> CGI = scc_begin(&CG); !CGI.isAtEnd();
+       ++CGI) {
+    for (const CallGraphNode *I : *CGI) {
+      Function *F = I->getFunction();
+      if (!F || F->isDeclaration() || F->hasExternalLinkage())
+        continue;
+
+      Worklist.insert(F);
+    }
+  }
+
+  bool Changed = false;
+
+  // SCCs come out of scc_begin in post order (callees before callers), so
+  // popping from the back processes callers first and lets inferred address
+  // spaces propagate down the call graph.
+  while (!Worklist.empty())
+    Changed |= handleFunction(*Worklist.pop_back_val());
+
+  return Changed;
+}
+
+Pass *llvm::createInferArgAddressSpacesPass() {
+  return new InferArgAddressSpaces();
+}
+
+PreservedAnalyses InferArgAddressSpacesPass::run(Module &M,
+                                                 ModuleAnalysisManager &AM) {
+  const CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
+
+  bool Changed = InferArgAddressSpacesImpl().run(CG);
+  if (Changed) {
+    PreservedAnalyses PA;
+    PA.preserveSet<CFGAnalyses>();
+    return PA;
+  }
+  return PreservedAnalyses::all();
+}
Index: llvm/test/CodeGen/AMDGPU/infer-arg-addrspaces.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/infer-arg-addrspaces.ll
@@ -0,0 +1,407 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-- -S -passes=infer-argument-address-spaces,infer-address-spaces < %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -S -infer-argument-address-spaces -infer-address-spaces < %s | FileCheck %s
+
+; GCN-LABEL: {{^}}external_bar:
+; GCN: flat_store_dword
+define void @external_bar(ptr %p) #0 {
+; CHECK-LABEL: @external_bar(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i32 0, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i32 0, ptr %p, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}internal_bar:
+; GCN: global_store_dword
+define internal void @internal_bar(ptr %p) #0 {
+; CHECK-LABEL: @internal_bar(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P_COERCE:%.*]] = addrspacecast ptr [[P:%.*]] to ptr addrspace(1)
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[P_COERCE]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i32 0, ptr %p, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}single_caller_foo:
+define internal void @single_caller_foo(ptr %p) #0 {
+; CHECK-LABEL: @single_caller_foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P_COERCE:%.*]] = addrspacecast ptr [[P:%.*]] to ptr addrspace(1)
+; CHECK-NEXT:    [[P_PTR:%.*]] = addrspacecast ptr addrspace(1) [[P_COERCE]] to ptr
+; CHECK-NEXT:    tail call void @internal_bar(ptr [[P_PTR]])
+; CHECK-NEXT:    tail call void @external_bar(ptr [[P_PTR]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void @internal_bar(ptr %p)
+  tail call void @external_bar(ptr %p)
+  ret void
+}
+
+; GCN-LABEL: {{^}}caller1:
+define amdgpu_kernel void 
@caller1(ptr addrspace(1) %p) { +; CHECK-LABEL: @caller1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @single_caller_foo(ptr [[C]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + tail call void @single_caller_foo(ptr %c) + ret void +} + +; GCN-LABEL: {{^}}multiple_caller_same_as_foo: +; GCN: global_load_dword +; GCN: global_store_dword +define internal void @multiple_caller_same_as_foo(ptr %p) #0 { +; CHECK-LABEL: @multiple_caller_same_as_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P_COERCE:%.*]] = addrspacecast ptr [[P:%.*]] to ptr addrspace(1) +; CHECK-NEXT: [[V:%.*]] = load i32, ptr addrspace(1) [[P_COERCE]], align 4 +; CHECK-NEXT: [[I:%.*]] = add i32 [[V]], 1 +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P_COERCE]], i64 1 +; CHECK-NEXT: store i32 [[I]], ptr addrspace(1) [[P1]], align 4 +; CHECK-NEXT: ret void +; +entry: + %v = load i32, ptr %p, align 4 + %i = add i32 %v, 1 + %p1 = getelementptr inbounds i32, ptr %p, i64 1 + store i32 %i, ptr %p1, align 4 + ret void +} + +; GCN-LABEL: {{^}}caller2: +define amdgpu_kernel void @caller2(ptr addrspace(1) %p) { +; CHECK-LABEL: @caller2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @multiple_caller_same_as_foo(ptr [[C]]) +; CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 1 +; CHECK-NEXT: tail call void @multiple_caller_same_as_foo(ptr [[C1]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + tail call void @multiple_caller_same_as_foo(ptr %c) + %c1 = getelementptr inbounds i32, ptr %c, i64 1 + tail call void @multiple_caller_same_as_foo(ptr %c1) + ret void +} + +; GCN-LABEL: {{^}}caller3: +define amdgpu_kernel void @caller3(ptr addrspace(1) %p) { +; CHECK-LABEL: @caller3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @multiple_caller_same_as_foo(ptr [[C]]) +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 1 +; CHECK-NEXT: [[C1:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr +; CHECK-NEXT: tail call void @multiple_caller_same_as_foo(ptr [[C1]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + tail call void @multiple_caller_same_as_foo(ptr %c) + %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 1 + %c1 = addrspacecast ptr addrspace(1) %p1 to ptr + tail call void @multiple_caller_same_as_foo(ptr %c1) + ret void +} + +; GCN-LABEL: {{^}}multiple_caller_different_as_foo: +; GCN: flat_store_dword +define internal void @multiple_caller_different_as_foo(ptr %p) #0 { +; CHECK-LABEL: @multiple_caller_different_as_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 0, ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}caller4: +define amdgpu_kernel void @caller4(ptr addrspace(1) %p) { +; CHECK-LABEL: @caller4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @multiple_caller_different_as_foo(ptr [[C]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + tail call void @multiple_caller_different_as_foo(ptr %c) + ret void +} + +; GCN-LABEL: {{^}}caller5: +define amdgpu_kernel void @caller5(ptr addrspace(3) %p) { +; CHECK-LABEL: 
@caller5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(3) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @multiple_caller_different_as_foo(ptr [[C]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(3) %p to ptr + tail call void @multiple_caller_different_as_foo(ptr %c) + ret void +} + +; GCN-LABEL: {{^}}single_caller_different_as_foo: +; GCN: flat_store_dword +define internal void @single_caller_different_as_foo(ptr %p) #0 { +; CHECK-LABEL: @single_caller_different_as_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 0, ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}caller6: +define amdgpu_kernel void @caller6(ptr addrspace(1) %p, ptr %p1) { +; CHECK-LABEL: @caller6( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @single_caller_different_as_foo(ptr [[C]]) +; CHECK-NEXT: tail call void @single_caller_different_as_foo(ptr [[P1:%.*]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + tail call void @single_caller_different_as_foo(ptr %c) + tail call void @single_caller_different_as_foo(ptr %p1) + ret void +} + +; GCN-LABEL: {{^}}aliased_foo: +; GCN: flat_store_dword +define internal void @aliased_foo(ptr %p) #0 { +; CHECK-LABEL: @aliased_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 0, ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr %p, align 4 + ret void +} + +@alias1 = hidden alias void (ptr), void (ptr addrspace(3))* @aliased_foo + +; GCN-LABEL: {{^}}caller7: +define amdgpu_kernel void @caller7(ptr %p) { +; CHECK-LABEL: @caller7( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr [[P:%.*]] to ptr addrspace(3) +; CHECK-NEXT: tail call void @alias1(ptr addrspace(3) [[C]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr %p to ptr addrspace(3) + tail call void @alias1(ptr addrspace(3) %c) + ret void +} + +; GCN-LABEL: {{^}}bitcasted_foo: +; GCN: flat_store_dword +define internal void @bitcasted_foo(i32 %i, ptr %p) #0 { +; CHECK-LABEL: @bitcasted_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 [[I:%.*]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 %i, ptr %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}caller8: +define amdgpu_kernel void @caller8(ptr addrspace(1) %p) { +; CHECK-LABEL: @caller8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @bitcasted_foo(float 1.000000e+00, ptr [[C]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + tail call void bitcast (void (i32, ptr)* @bitcasted_foo to void (float, ptr)*)(float 1.0, ptr %c) + ret void +} + +; GCN-LABEL: {{^}}one_of_two_foo: +; GCN: global_store_dword +; GCN: flat_store_dword +define internal void @one_of_two_foo(ptr %p, ptr %p1) #0 { +; CHECK-LABEL: @one_of_two_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P_COERCE:%.*]] = addrspacecast ptr [[P:%.*]] to ptr addrspace(1) +; CHECK-NEXT: store i32 0, ptr addrspace(1) [[P_COERCE]], align 4 +; CHECK-NEXT: store i32 1, ptr [[P1:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr %p, align 4 + store i32 1, ptr %p1, align 4 + ret void +} + +; GCN-LABEL: {{^}}caller9: +define amdgpu_kernel void @caller9(ptr addrspace(1) %p, ptr %p1) { +; CHECK-LABEL: @caller9( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr 
addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @one_of_two_foo(ptr [[C]], ptr [[P1:%.*]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + tail call void @one_of_two_foo(ptr %c, ptr %p1) + ret void +} + +; GCN-LABEL: {{^}}two_of_two_foo: +; GCN: global_store_dword +; GCN: global_store_dword +define internal void @two_of_two_foo(ptr %p, ptr %p1) #0 { +; CHECK-LABEL: @two_of_two_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P_COERCE:%.*]] = addrspacecast ptr [[P:%.*]] to ptr addrspace(1) +; CHECK-NEXT: [[P1_COERCE:%.*]] = addrspacecast ptr [[P1:%.*]] to ptr addrspace(1) +; CHECK-NEXT: store i32 0, ptr addrspace(1) [[P_COERCE]], align 4 +; CHECK-NEXT: store i32 1, ptr addrspace(1) [[P1_COERCE]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr %p, align 4 + store i32 1, ptr %p1, align 4 + ret void +} + +; GCN-LABEL: {{^}}caller10: +define amdgpu_kernel void @caller10(ptr addrspace(1) %p, ptr addrspace(1) %p1) { +; CHECK-LABEL: @caller10( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: [[C1:%.*]] = addrspacecast ptr addrspace(1) [[P1:%.*]] to ptr +; CHECK-NEXT: tail call void @two_of_two_foo(ptr [[C]], ptr [[C1]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + %c1 = addrspacecast ptr addrspace(1) %p1 to ptr + tail call void @two_of_two_foo(ptr %c, ptr %c1) + ret void +} + +; GCN-LABEL: {{^}}escaped_foo: +; GCN: flat_store_dword +define internal void @escaped_foo(ptr %p) #0 { +; CHECK-LABEL: @escaped_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 0, ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr %p, align 4 + ret void +} + +@ptr_to_escaped_foo = internal global void (ptr)* @escaped_foo + +; GCN-LABEL: {{^}}caller11: +define amdgpu_kernel void @caller11(ptr addrspace(1) %p) { +; CHECK-LABEL: @caller11( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: tail call void @escaped_foo(ptr [[C]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast ptr addrspace(1) %p to ptr + tail call void @escaped_foo(ptr %c) + ret void +} + +; GCN-LABEL: {{^}}bitcast_arg_foo: +; GCN: global_store_dword +define internal void @bitcast_arg_foo(i32* %p) #0 { +; CHECK-LABEL: @bitcast_arg_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P_COERCE:%.*]] = addrspacecast ptr [[P:%.*]] to ptr addrspace(1) +; CHECK-NEXT: store i32 0, ptr addrspace(1) [[P_COERCE]], align 4 +; CHECK-NEXT: ret void +; +entry: + store i32 0, i32* %p, align 4 + ret void +} + +; GCN-LABEL: {{^}}caller12: +define amdgpu_kernel void @caller12(float addrspace(1)* %p) { +; CHECK-LABEL: @caller12( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = addrspacecast ptr addrspace(1) [[P:%.*]] to ptr +; CHECK-NEXT: [[C1:%.*]] = bitcast ptr [[C]] to ptr +; CHECK-NEXT: [[C2:%.*]] = bitcast ptr [[C1]] to ptr +; CHECK-NEXT: tail call void @bitcast_arg_foo(ptr [[C2]]) +; CHECK-NEXT: ret void +; +entry: + %c = addrspacecast float addrspace(1)* %p to float* + %c1 = bitcast float* %c to i64* + %c2 = bitcast i64* %c1 to i32* + tail call void @bitcast_arg_foo(i32* %c2) + ret void +} + +; GCN-LABEL: {{^}}constexpr_cast_arg_foo: +; GCN: global_store_dword +define internal void @constexpr_cast_arg_foo(i32* %p) #0 { +; CHECK-LABEL: @constexpr_cast_arg_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P_COERCE:%.*]] = addrspacecast ptr [[P:%.*]] to ptr addrspace(1) +; CHECK-NEXT: store i32 0, ptr addrspace(1) [[P_COERCE]], align 4 
+; CHECK-NEXT: ret void +; +entry: + store i32 0, i32* %p, align 4 + ret void +} + +@ptr_global_float = addrspace(1) global float addrspace(1)* undef +@ptr_global_i32 = addrspace(1) global i32* undef + +; GCN-LABEL: {{^}}caller13: +define amdgpu_kernel void @caller13(float addrspace(1)* %p1) { +; CHECK-LABEL: @caller13( +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call void @constexpr_cast_arg_foo(ptr addrspacecast (ptr addrspace(1) null to ptr)) +; CHECK-NEXT: tail call void @constexpr_cast_arg_foo(ptr addrspacecast (ptr addrspace(1) @ptr_global_float to ptr)) +; CHECK-NEXT: tail call void @constexpr_cast_arg_foo(ptr getelementptr (i32, ptr addrspacecast (ptr addrspace(1) @ptr_global_i32 to ptr), i64 1)) +; CHECK-NEXT: ret void +; +entry: + tail call void @constexpr_cast_arg_foo(i32* addrspacecast (i32 addrspace(1)* null to i32*)) + tail call void @constexpr_cast_arg_foo(i32* bitcast (float* addrspacecast (float addrspace(1)* @ptr_global_float to float*) to i32*)) + tail call void @constexpr_cast_arg_foo(i32* getelementptr (i32, i32* addrspacecast (i32 addrspace(1)* @ptr_global_i32 to i32*), i64 1)) + ret void +} + +attributes #0 = { noinline } Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -720,6 +720,8 @@ ; GCN-O2-NEXT: Call Graph SCC Pass Manager ; GCN-O2-NEXT: Inliner for always_inline functions ; GCN-O2-NEXT: A No-Op Barrier Pass +; GCN-O2-NEXT: CallGraph Construction +; GCN-O2-NEXT: Infer Argument Address Spaces ; GCN-O2-NEXT: Lower OpenCL enqueued blocks ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1013,6 +1015,8 @@ ; GCN-O3-NEXT: Call Graph SCC Pass Manager ; GCN-O3-NEXT: Inliner for always_inline functions ; GCN-O3-NEXT: A No-Op Barrier Pass +; GCN-O3-NEXT: CallGraph Construction +; GCN-O3-NEXT: Infer Argument Address Spaces ; GCN-O3-NEXT: Lower OpenCL enqueued blocks ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager