Index: lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- lib/Target/NVPTX/CMakeLists.txt
+++ lib/Target/NVPTX/CMakeLists.txt
@@ -21,6 +21,7 @@
   NVPTXInstrInfo.cpp
   NVPTXLowerAggrCopies.cpp
   NVPTXLowerKernelArgs.cpp
+  NVPTXLowerAlloca.cpp
   NVPTXMCExpr.cpp
   NVPTXPrologEpilogPass.cpp
   NVPTXRegisterInfo.cpp
Index: lib/Target/NVPTX/NVPTX.h
===================================================================
--- lib/Target/NVPTX/NVPTX.h
+++ lib/Target/NVPTX/NVPTX.h
@@ -70,6 +70,7 @@
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
 FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
+BasicBlockPass *createNVPTXLowerAllocaPass();
 
 bool isImageOrSamplerVal(const Value *, const Module *);
 
Index: lib/Target/NVPTX/NVPTXLowerAlloca.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -0,0 +1,131 @@
+//===-- NVPTXLowerAlloca.cpp - Make alloca to use local memory =====--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// For every alloca instruction, add a pair of address space casts: one to the
+// local address space and one back to the generic address space. For example,
+//
+//   %A = alloca i32
+//   store i32 0, i32* %A ; emits st.u32
+//
+// will be transformed to
+//
+//   %A = alloca i32
+//   %Local = addrspacecast i32* %A to i32 addrspace(5)*
+//   %Generic = addrspacecast i32 addrspace(5)* %Local to i32*
+//   store i32 0, i32* %Generic
+//
+// We then rely on NVPTXFavorNonGenericAddrSpaces to fold the last two
+// instructions into
+//
+//   store i32 0, i32 addrspace(5)* %Local ; emits st.local.u32
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXLowerAllocaPass(PassRegistry &);
+}
+
+namespace {
+/// Basic block pass that makes accesses to alloca'ed memory go through a
+/// pair of addrspacecasts (generic -> local -> generic); see the file header.
+class NVPTXLowerAlloca : public BasicBlockPass {
+  bool runOnBasicBlock(BasicBlock &BB) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXLowerAlloca() : BasicBlockPass(ID) {}
+  const char *getPassName() const override {
+    return "convert address space of alloca'ed memory to local";
+  }
+};
+} // namespace
+
+char NVPTXLowerAlloca::ID = 0;
+
+INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca",
+                "Lower Alloca", false, false)
+
+// =============================================================================
+// Main function for this pass.
+//
+// For each alloca in BB, insert the local and generic addrspacecasts right
+// after it and make eligible users go through the generic cast. Returns true
+// if BB contains at least one alloca, i.e. if any cast was inserted.
+// =============================================================================
+bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
+  bool Changed = false;
+  for (auto &I : BB) {
+    if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
+      Changed = true;
+      // The result of an alloca is always a pointer, so use cast<> (which
+      // asserts) instead of dyn_cast<> followed by an unchecked dereference.
+      auto PTy = cast<PointerType>(allocaInst->getType());
+      auto ETy = PTy->getElementType();
+      auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
+      auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
+      auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
+      auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal,
+                                                    GenericAddrTy, "");
+      NewASCToLocal->insertAfter(allocaInst);
+      NewASCToGeneric->insertAfter(NewASCToLocal);
+      for (Value::use_iterator UI = allocaInst->use_begin(),
+                                UE = allocaInst->use_end();
+            UI != UE; ) {
+        // Check Load, Store, GEP, and BitCast Uses on alloca and make them
+        // use the converted generic address, in order to expose non-generic
+        // addrspacecast to NVPTXFavorNonGenericAddrSpaces. For other types
+        // of instructions this is unnecessary and may introduce redundant
+        // address cast. Volatile loads and stores are left untouched, and so
+        // is NewASCToLocal, which is itself a user of the alloca but matches
+        // none of the cases below.
+        //
+        // UI is advanced before the user is touched so that rewriting the
+        // operand does not disturb the iteration.
+        const auto &AllocaUse = *UI++;
+        auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
+        if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) {
+          LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric);
+          continue;
+        }
+        auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
+        if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) {
+          SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
+          continue;
+        }
+        auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
+        if (GI && GI->getPointerOperand() == allocaInst) {
+          GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric);
+          continue;
+        }
+        auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser());
+        if (BI && BI->getOperand(0) == allocaInst) {
+          BI->setOperand(0, NewASCToGeneric);
+          continue;
+        }
+      }
+    }
+  }
+  return Changed;
+}
+
+/// Create a new instance of the NVPTXLowerAlloca pass.
+BasicBlockPass *llvm::createNVPTXLowerAllocaPass() {
+  return new NVPTXLowerAlloca();
+}
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -54,6 +54,7 @@
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
 void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
 void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
+void initializeNVPTXLowerAllocaPass(PassRegistry &);
 }
 
 extern "C" void LLVMInitializeNVPTXTarget() {
@@ -70,6 +71,7 @@
   initializeNVPTXFavorNonGenericAddrSpacesPass(
       *PassRegistry::getPassRegistry());
   initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry());
 }
 
 static std::string computeDataLayout(bool is64Bit) {
@@ -166,12 +168,11 @@
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
   addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
-  addPass(createNVPTXFavorNonGenericAddrSpacesPass());
   // NVPTXLowerKernelArgs emits alloca for byval parameters which can often
-  // be eliminated by SROA. We do not run SROA right after NVPTXLowerKernelArgs
-  // because we plan to merge NVPTXLowerKernelArgs and
-  // NVPTXFavorNonGenericAddrSpaces into one pass.
+  // be eliminated by SROA.
   addPass(createSROAPass());
+  addPass(createNVPTXLowerAllocaPass());
+  addPass(createNVPTXFavorNonGenericAddrSpacesPass());
   // FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave
   // them unused. We could remove dead code in an ad-hoc manner, but that
   // requires manual work and might be error-prone.
Index: test/CodeGen/NVPTX/call-with-alloca-buffer.ll
===================================================================
--- test/CodeGen/NVPTX/call-with-alloca-buffer.ll
+++ test/CodeGen/NVPTX/call-with-alloca-buffer.ll
@@ -27,8 +27,9 @@
 ; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
 ; FIXME: casting A1_REG to A2_REG is unnecessary; A2_REG is essentially A_REG
 ; CHECK: cvta.global.u64 %rd[[A2_REG:[0-9]+]], %rd[[A1_REG]]
+; CHECK: cvta.local.u64 %rd[[SP_REG:[0-9]+]]
 ; CHECK: ld.global.f32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
-; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
+; CHECK: st.local.f32 [{{%rd[0-9]+}}], %f[[A0_REG]]
 
   %0 = load float, float* %a, align 4
   %1 = bitcast [16 x i8]* %buf to float*
@@ -49,7 +50,6 @@
   %7 = bitcast i8* %arrayidx7 to float*
   store float %6, float* %7, align 4
 
-; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
 ; CHECK: .param .b64 param0;
 ; CHECK-NEXT: st.param.b64 [param0+0], %rd[[A2_REG]]
 ; CHECK-NEXT: .param .b64 param1;
Index: test/CodeGen/NVPTX/lower-alloca.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/lower-alloca.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -S -nvptx-lower-alloca -nvptx-favor-non-generic -dce | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() {
+; CHECK-LABEL: @kernel(
+; PTX-LABEL: .visible .entry kernel(
+  %A = alloca i32
+; CHECK: addrspacecast i32* %A to i32 addrspace(5)*
+; CHECK: store i32 0, i32 addrspace(5)* {{%.+}}
+; PTX: st.local.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+  store i32 0, i32* %A
+  call void @callee(i32* %A)
+  ret void
+}
+
+declare void @callee(i32*)
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}