Index: llvm/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -102,6 +102,15 @@
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
 
+FunctionPass *createAMDGPUPromoteKernelArgumentsPass();
+void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &);
+extern char &AMDGPUPromoteKernelArgumentsID;
+
+struct AMDGPUPromoteKernelArgumentsPass
+    : PassInfoMixin<AMDGPUPromoteKernelArgumentsPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 ModulePass *createAMDGPULowerKernelAttributesPass();
 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
 extern char &AMDGPULowerKernelAttributesID;
Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -0,0 +1,194 @@
+//===-- AMDGPUPromoteKernelArguments.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass recursively promotes generic pointer arguments of a kernel
+/// into the global address space.
+///
+/// The pass walks the kernel's pointer arguments and the loads from them. If a
+/// loaded value is a pointer and the loaded pointer is unmodified in the
+/// kernel before the load, the loaded pointer is promoted to global, and the
+/// process continues recursively from it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/IRBuilder.h"
+
+#define DEBUG_TYPE "amdgpu-promote-kernel-arguments"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUPromoteKernelArguments : public FunctionPass {
+  MemorySSA *MSSA;
+
+  Instruction *ArgCastInsertPt;
+
+  SmallVector<Value *> Ptrs;
+
+  void enqueueUsers(Value *Ptr);
+
+  bool promotePointer(Value *Ptr);
+
+public:
+  static char ID;
+
+  AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
+
+  bool run(Function &F, MemorySSA &MSSA);
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MemorySSAWrapperPass>();
+    AU.setPreservesAll();
+  }
+};
+
+} // end anonymous namespace
+
+void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
+  SmallVector<User *> PtrUsers(Ptr->users());
+
+  while (!PtrUsers.empty()) {
+    Instruction *U = dyn_cast<Instruction>(PtrUsers.pop_back_val());
+    if (!U)
+      continue;
+
+    switch (U->getOpcode()) {
+    default:
+      break;
+    case Instruction::Load: {
+      LoadInst *LD = cast<LoadInst>(U);
+      PointerType *PT = dyn_cast<PointerType>(LD->getType());
+      if (!PT ||
+          (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
+           PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
+           PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) ||
+          LD->getPointerOperand()->stripInBoundsOffsets() != Ptr)
+        break;
+      const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(LD);
+      // TODO: This load probably can be promoted to constant address space.
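+      // Enqueue the loaded pointer only if its clobbering access is the
+      // live-on-entry definition, i.e. MemorySSA proves the location it is
+      // loaded from is not modified in the kernel before this load.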
+      if (MSSA->isLiveOnEntryDef(MA))
+        Ptrs.push_back(LD);
+      break;
+    }
+    case Instruction::GetElementPtr:
+    case Instruction::AddrSpaceCast:
+    case Instruction::BitCast:
+      if (U->getOperand(0)->stripInBoundsOffsets() == Ptr)
+        PtrUsers.append(U->user_begin(), U->user_end());
+      break;
+    }
+  }
+}
+
+bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
+  enqueueUsers(Ptr);
+
+  PointerType *PT = cast<PointerType>(Ptr->getType());
+  if (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return false;
+
+  bool IsArg = isa<Argument>(Ptr);
+  IRBuilder<> B(IsArg ? ArgCastInsertPt
+                      : &*std::next(cast<Instruction>(Ptr)->getIterator()));
+
+  // Cast the pointer to the global address space and back to flat, and let
+  // the Infer Address Spaces pass do all necessary rewriting.
+  PointerType *NewPT =
+      PointerType::getWithSamePointeeType(PT, AMDGPUAS::GLOBAL_ADDRESS);
+  Value *Cast =
+      B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
+  Value *CastBack =
+      B.CreateAddrSpaceCast(Cast, PT, Twine(Ptr->getName(), ".flat"));
+  Ptr->replaceUsesWithIf(CastBack,
+                         [Cast](Use &U) { return U.getUser() != Cast; });
+
+  return true;
+}
+
+// Skip static allocas when choosing the insertion point for argument casts.
+static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
+  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
+  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
+    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
+
+    // If this is a dynamic alloca, the value may depend on the loaded
+    // kernargs, so loads will need to be inserted before it.
+    if (!AI || !AI->isStaticAlloca())
+      break;
+  }
+
+  return InsPt;
+}
+
+bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
+  if (skipFunction(F))
+    return false;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
+    return false;
+
+  ArgCastInsertPt = &*getInsertPt(*F.begin());
+  this->MSSA = &MSSA;
+
+  for (Argument &Arg : F.args()) {
+    if (Arg.use_empty())
+      continue;
+
+    PointerType *PT = dyn_cast<PointerType>(Arg.getType());
+    if (!PT || (PT->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS &&
+                PT->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
+                PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS))
+      continue;
+
+    Ptrs.push_back(&Arg);
+  }
+
+  bool Changed = false;
+  while (!Ptrs.empty()) {
+    Value *Ptr = Ptrs.pop_back_val();
+    Changed |= promotePointer(Ptr);
+  }
+
+  return Changed;
+}
+
+bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
+  MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+  return run(F, MSSA);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
+                      "AMDGPU Promote Kernel Arguments", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
+                    "AMDGPU Promote Kernel Arguments", false, false)
+
+char AMDGPUPromoteKernelArguments::ID = 0;
+
+FunctionPass *llvm::createAMDGPUPromoteKernelArgumentsPass() {
+  return new AMDGPUPromoteKernelArguments();
+}
+
+PreservedAnalyses
+AMDGPUPromoteKernelArgumentsPass::run(Function &F,
+                                      FunctionAnalysisManager &AM) {
+  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+  if (AMDGPUPromoteKernelArguments().run(F, MSSA)) {
+    PreservedAnalyses PA;
+    PA.preserveSet<CFGAnalyses>();
+    PA.preserve<MemorySSAAnalysis>();
+    return PA;
+  }
+  return PreservedAnalyses::all();
+}
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -306,6 +306,11 @@
     cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool> EnablePromoteKernelArguments(
+    "amdgpu-enable-promote-kernel-arguments",
+    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
+    cl::Hidden, cl::init(true));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -339,6 +344,7 @@
   initializeAMDGPUArgumentUsageInfoPass(*PR);
   initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
+  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPULowerIntrinsicsPass(*PR);
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
@@ -533,6 +539,8 @@
   bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
   bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
   bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
+  bool PromoteKernelArguments = EnablePromoteKernelArguments &&
+                                getOptLevel() > CodeGenOpt::Less;
 
   if (EnableFunctionCalls) {
     delete Builder.Inliner;
@@ -574,7 +582,14 @@
 
   Builder.addExtension(
       PassManagerBuilder::EP_CGSCCOptimizerLate,
-      [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+      [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
+                                          legacy::PassManagerBase &PM) {
+        // Add the promote kernel arguments pass to the opt pipeline right
+        // before infer address spaces, which is needed to do the actual
+        // address space rewriting.
+        if (PromoteKernelArguments)
+          PM.add(createAMDGPUPromoteKernelArgumentsPass());
+
         // Add infer address spaces pass to the opt pipeline after inlining
         // but before SROA to increase SROA opportunities.
         PM.add(createInferAddressSpacesPass());
@@ -651,6 +666,10 @@
           PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
           return true;
         }
+        if (PassName == "amdgpu-promote-kernel-arguments") {
+          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
+          return true;
+        }
         return false;
       });
 
@@ -702,6 +721,13 @@
 
         FunctionPassManager FPM;
 
+        // Add the promote kernel arguments pass to the opt pipeline right
+        // before infer address spaces, which is needed to do the actual
+        // address space rewriting.
+        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
+            EnablePromoteKernelArguments)
+          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
+
         // Add infer address spaces pass to the opt pipeline after inlining
         // but before SROA to increase SROA opportunities.
FPM.addPass(InferAddressSpacesPass()); Index: llvm/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -83,6 +83,7 @@ AMDGPUPrintfRuntimeBinding.cpp AMDGPUPromoteAlloca.cpp AMDGPUPropagateAttributes.cpp + AMDGPUPromoteKernelArguments.cpp AMDGPURegBankCombiner.cpp AMDGPURegisterBankInfo.cpp AMDGPUReplaceLDSUseWithPointer.cpp Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -408,6 +408,11 @@ ; GCN-O2-NEXT: OpenMP specific optimizations ; GCN-O2-NEXT: Deduce function attributes ; GCN-O2-NEXT: FunctionPass Manager +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Memory SSA +; GCN-O2-NEXT: AMDGPU Promote Kernel Arguments ; GCN-O2-NEXT: Infer address spaces ; GCN-O2-NEXT: AMDGPU Kernel Attributes ; GCN-O2-NEXT: FunctionPass Manager @@ -766,6 +771,11 @@ ; GCN-O3-NEXT: Deduce function attributes ; GCN-O3-NEXT: Promote 'by reference' arguments to scalars ; GCN-O3-NEXT: FunctionPass Manager +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Memory SSA +; GCN-O3-NEXT: AMDGPU Promote Kernel Arguments ; GCN-O3-NEXT: Infer address spaces ; GCN-O3-NEXT: AMDGPU Kernel Attributes ; GCN-O3-NEXT: FunctionPass Manager Index: llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/promote-kernel-arguments.ll @@ -0,0 +1,317 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -passes=amdgpu-promote-kernel-arguments,infer-address-spaces | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: ptr_nest_3: +; GCN-COUNT-2: global_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @ptr_nest_3(float** addrspace(1)* nocapture readonly %Arg) { +; CHECK-LABEL: @ptr_nest_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]] +; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8 +; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)* +; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8 +; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)* +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i + %p2 = load float**, float** addrspace(1)* %p1, align 8 + %p3 = load float*, float** %p2, align 8 + store float 0.000000e+00, float* %p3, align 4 + ret void +} + +; GCN-LABEL: ptr_bitcast: +; GCN: 
global_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @ptr_bitcast(float** nocapture readonly %Arg) { +; CHECK-LABEL: @ptr_bitcast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I]] +; CHECK-NEXT: [[P1_CAST:%.*]] = bitcast float* addrspace(1)* [[P1]] to i32* addrspace(1)* +; CHECK-NEXT: [[P2:%.*]] = load i32*, i32* addrspace(1)* [[P1_CAST]], align 8 +; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)* +; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[P2_GLOBAL]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %p1 = getelementptr inbounds float*, float** %Arg, i32 %i + %p1.cast = bitcast float** %p1 to i32** + %p2 = load i32*, i32** %p1.cast, align 8 + store i32 0, i32* %p2, align 4 + ret void +} + +%struct.S = type { float* } + +; GCN-LABEL: ptr_in_struct: +; GCN: s_load_dwordx2 +; GCN: global_store_dword +define amdgpu_kernel void @ptr_in_struct(%struct.S addrspace(1)* nocapture readonly %Arg) { +; CHECK-LABEL: @ptr_in_struct( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], [[STRUCT_S]] addrspace(1)* [[ARG:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[P1:%.*]] = load float*, float* addrspace(1)* [[P]], align 8 +; CHECK-NEXT: [[P1_GLOBAL:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)* +; CHECK-NEXT: [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[P1_GLOBAL]], i32 [[ID]] +; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[ARRAYIDX]], align 4 +; CHECK-NEXT: ret void +; +entry: + %p = getelementptr inbounds %struct.S, %struct.S addrspace(1)* %Arg, i64 0, i32 0 + %p1 = load float*, float* addrspace(1)* %p, align 8 + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %arrayidx = getelementptr inbounds float, float* %p1, i32 %id + store float 0.000000e+00, float* %arrayidx, align 4 + ret void +} + +@LDS = internal unnamed_addr addrspace(3) global [4 x float] undef, align 16 + +; GCN-LABEL: flat_ptr_arg: +; GCN-COUNT-2: global_load_dwordx2 +; GCN: global_load_dwordx4 +; GCN: global_store_dword +define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) { +; CHECK-LABEL: @flat_ptr_arg( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[OUT_GLOBAL:%.*]] = addrspacecast float** [[OUT:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)* +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i64 [[IDXPROM]] +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]] +; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float 
addrspace(1)* [[I1_GLOBAL]], i64 1 +; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1 +; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]] +; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2 +; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2 +; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]] +; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3 +; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3 +; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]] +; CHECK-NEXT: store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 +; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] +; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[OUT_GLOBAL]], i64 [[IDXPROM]] +; CHECK-NEXT: [[I7:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX11]], align 8 +; CHECK-NEXT: [[I7_GLOBAL:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)* +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I7_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %i to i64 + %arrayidx10 = getelementptr inbounds float*, float** %Arg, i64 %idxprom + %i1 = load float*, float** %arrayidx10, align 8 + %i2 = load float, float* %i1, align 4 + %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X + store float %i2, float addrspace(3)* %arrayidx512, align 4 + %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1 + %i3 = load float, float* %arrayidx3.1, align 4 + %add.1 = add nsw i32 %X, 1 + %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1 + store float %i3, float addrspace(3)* %arrayidx512.1, align 4 + %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2 + %i4 = load float, float* %arrayidx3.2, align 4 + %add.2 = add nsw i32 %X, 2 + %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2 + store float %i4, float addrspace(3)* %arrayidx512.2, align 4 + %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3 + %i5 = load float, float* %arrayidx3.3, align 4 + %add.3 = add nsw i32 %X, 3 + %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3 + store float %i5, float addrspace(3)* %arrayidx512.3, align 4 + %sub = add nsw i32 %X, -1 + %arrayidx711 = getelementptr inbounds [4 x float], [4 x 
float] addrspace(3)* @LDS, i32 0, i32 %sub + %i6 = load float, float addrspace(3)* %arrayidx711, align 4 + %arrayidx11 = getelementptr inbounds float*, float** %Out, i64 %idxprom + %i7 = load float*, float** %arrayidx11, align 8 + %idxprom8 = sext i32 %X to i64 + %arrayidx9 = getelementptr inbounds float, float* %i7, i64 %idxprom8 + store float %i6, float* %arrayidx9, align 4 + ret void +} + +; GCN-LABEL: global_ptr_arg: +; GCN: global_load_dwordx2 +; GCN: global_load_dwordx4 +; GCN: global_store_dword +define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) { +; CHECK-LABEL: @global_ptr_arg( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]] +; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1 +; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4 +; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1 +; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]] +; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2 +; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4 +; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2 +; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]] +; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3 +; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4 +; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3 +; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]] +; CHECK-NEXT: store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 +; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] +; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %i to i64 + %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom + %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8 + %i2 = load float, float* %i1, align 4 + 
%arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X + store float %i2, float addrspace(3)* %arrayidx512, align 4 + %arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1 + %i3 = load float, float* %arrayidx3.1, align 4 + %add.1 = add nsw i32 %X, 1 + %arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1 + store float %i3, float addrspace(3)* %arrayidx512.1, align 4 + %arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2 + %i4 = load float, float* %arrayidx3.2, align 4 + %add.2 = add nsw i32 %X, 2 + %arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2 + store float %i4, float addrspace(3)* %arrayidx512.2, align 4 + %arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3 + %i5 = load float, float* %arrayidx3.3, align 4 + %add.3 = add nsw i32 %X, 3 + %arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3 + store float %i5, float addrspace(3)* %arrayidx512.3, align 4 + %sub = add nsw i32 %X, -1 + %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub + %i6 = load float, float addrspace(3)* %arrayidx711, align 4 + %idxprom8 = sext i32 %X to i64 + %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8 + store float %i6, float* %arrayidx9, align 4 + ret void +} + +; GCN-LABEL: global_ptr_arg_clobbered: +; GCN: global_store_dwordx2 +; GCN: global_load_dwordx2 +; GCN: flat_load_dword +; GCN: flat_store_dword +define amdgpu_kernel void @global_ptr_arg_clobbered(float* addrspace(1)* nocapture readonly %Arg, i32 %X) { +; CHECK-LABEL: @global_ptr_arg_clobbered( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]] +; CHECK-NEXT: store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[I2:%.*]] = load float, float* [[I1]], align 4 +; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]] +; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 +; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] +; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[I1]], i64 [[IDXPROM8]] +; CHECK-NEXT: store float [[I6]], float* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %i to i64 + %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom + %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X + store float* null, float* addrspace(1)* %arrayidx11, align 4 + %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8 + %i2 = load float, float* %i1, align 4 + %arrayidx512 = getelementptr inbounds [4 x float], [4 x 
float] addrspace(3)* @LDS, i32 0, i32 %X + store float %i2, float addrspace(3)* %arrayidx512, align 4 + %sub = add nsw i32 %X, -1 + %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub + %i6 = load float, float addrspace(3)* %arrayidx711, align 4 + %idxprom8 = sext i32 %X to i64 + %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8 + store float %i6, float* %arrayidx9, align 4 + ret void +} + +; GCN-LABEL: global_ptr_arg_clobbered_after_load: +; GCN: global_load_dwordx2 +; GCN: global_store_dwordx2 +; GCN: global_load_dword +; GCN: global_store_dword +define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(float* addrspace(1)* nocapture readonly %Arg, i32 %X) { +; CHECK-LABEL: @global_ptr_arg_clobbered_after_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)* +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]] +; CHECK-NEXT: store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4 +; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]] +; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1 +; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]] +; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4 +; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]] +; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %i to i64 + %arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom + %i1 = load float*, float* addrspace(1)* %arrayidx10, align 8 + %arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X + store float* null, float* addrspace(1)* %arrayidx11, align 4 + %i2 = load float, float* %i1, align 4 + %arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X + store float %i2, float addrspace(3)* %arrayidx512, align 4 + %sub = add nsw i32 %X, -1 + %arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub + %i6 = load float, float addrspace(3)* %arrayidx711, align 4 + %idxprom8 = sext i32 %X to i64 + %arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8 + store float %i6, float* %arrayidx9, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x()