Skip to content

Commit cd3afea

Browse files
author
Jingyue Wu
committedJun 17, 2015
Add NVPTXLowerAlloca pass to convert alloca'ed memory to local address
Summary: This is done by first adding two additional instructions to convert the alloca returned address to local and convert it back to generic. Then replace all uses of the alloca instruction with the converted generic address. Then we can rely on the NVPTXFavorNonGenericAddrSpace pass to combine the generic addrspacecast and the corresponding Load, Store, Bitcast, GEP Instruction together. Patched by Xuetian Weng (xweng@google.com). Test Plan: test/CodeGen/NVPTX/lower-alloca.ll Reviewers: jholewinski, jingyue Reviewed By: jingyue Subscribers: meheff, broune, eliben, jholewinski, llvm-commits Differential Revision: http://reviews.llvm.org/D10483 llvm-svn: 239964
1 parent d0e87eb commit cd3afea

File tree

6 files changed

+146
-6
lines changed

6 files changed

+146
-6
lines changed
 

‎llvm/lib/Target/NVPTX/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ set(NVPTXCodeGen_sources
2121
NVPTXInstrInfo.cpp
2222
NVPTXLowerAggrCopies.cpp
2323
NVPTXLowerKernelArgs.cpp
24+
NVPTXLowerAlloca.cpp
2425
NVPTXMCExpr.cpp
2526
NVPTXPrologEpilogPass.cpp
2627
NVPTXRegisterInfo.cpp

‎llvm/lib/Target/NVPTX/NVPTX.h

+1
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ MachineFunctionPass *createNVPTXPrologEpilogPass();
7070
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
7171
FunctionPass *createNVPTXImageOptimizerPass();
7272
FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
73+
BasicBlockPass *createNVPTXLowerAllocaPass();
7374

7475
bool isImageOrSamplerVal(const Value *, const Module *);
7576

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
//===-- NVPTXLowerAlloca.cpp - Make alloca to use local memory =====--===//
2+
//
3+
// The LLVM Compiler Infrastructure
4+
//
5+
// This file is distributed under the University of Illinois Open Source
6+
// License. See LICENSE.TXT for details.
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
// For every alloca instruction, add a pair of address space casts (to local
11+
// and back to generic) right after it. For example,
12+
//
13+
// %A = alloca i32
14+
// store i32 0, i32* %A ; emits st.u32
15+
//
16+
// will be transformed to
17+
//
18+
// %A = alloca i32
19+
// %Local = addrspacecast i32* %A to i32 addrspace(5)*
20+
// %Generic = addrspacecast i32 addrspace(5)* %Local to i32*
21+
// store i32 0, i32* %Generic ; emits st.local.u32
22+
//
23+
// And we will rely on NVPTXFavorNonGenericAddrSpace to combine the last
24+
// two instructions.
25+
//
26+
//===----------------------------------------------------------------------===//
27+
28+
#include "NVPTX.h"
29+
#include "NVPTXUtilities.h"
30+
#include "llvm/IR/Function.h"
31+
#include "llvm/IR/Instructions.h"
32+
#include "llvm/IR/IntrinsicInst.h"
33+
#include "llvm/IR/Module.h"
34+
#include "llvm/IR/Type.h"
35+
#include "llvm/Pass.h"
36+
37+
using namespace llvm;
38+
39+
// Forward declaration of the registration hook for this pass, placed in
// namespace llvm so other translation units can declare and call it.
// NOTE(review): presumably the definition is produced by the INITIALIZE_PASS
// expansion for NVPTXLowerAlloca later in this file -- confirm against
// PassSupport.h.
namespace llvm {
40+
void initializeNVPTXLowerAllocaPass(PassRegistry &);
41+
}
42+
43+
namespace {
44+
// Basic-block pass that routes accesses to alloca'ed memory through a pair of
// address space casts (generic -> local -> generic); see the file header
// comment for an example. Kept in an anonymous namespace so the class is local
// to this file.
class NVPTXLowerAlloca : public BasicBlockPass {
45+
// Pass entry point, invoked once per basic block. Defined out of line.
bool runOnBasicBlock(BasicBlock &BB) override;
46+
47+
public:
48+
static char ID; // Pass identification, replacement for typeid
49+
// Constructs the pass, handing ID to the BasicBlockPass base.
NVPTXLowerAlloca() : BasicBlockPass(ID) {}
50+
// Returns the human-readable description of this pass.
const char *getPassName() const override {
51+
return "convert address space of alloca'ed memory to local";
52+
}
53+
};
54+
} // namespace
55+
56+
// Out-of-class definition of the static pass identifier declared in
// NVPTXLowerAlloca.
// NOTE(review): presumably only the address of ID is significant, not its
// value -- confirm before relying on the initializer.
char NVPTXLowerAlloca::ID = 1;
57+
58+
// Registers the pass under the command-line name "nvptx-lower-alloca" with the
// description "Lower Alloca".
// NOTE(review): the trailing `false, false` are presumably the CFG-only and
// is-analysis flags -- confirm against the INITIALIZE_PASS definition.
INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca",
59+
"Lower Alloca", false, false)
60+
61+
// =============================================================================
62+
// Main function for this pass.
63+
// =============================================================================
64+
// Rewrites every alloca in BB so that its memory is reached through the local
// address space.
//
// For each alloca, two addrspacecast instructions are inserted directly after
// it: alloca -> ADDRESS_SPACE_LOCAL, and that result -> ADDRESS_SPACE_GENERIC.
// Non-volatile loads and stores whose pointer operand is the alloca, plus GEPs
// and bitcasts of the alloca, are then redirected to the generic cast. All
// other users (including volatile loads/stores) keep using the alloca itself.
//
// Returns true if BB contained at least one alloca (the casts are inserted
// even when no user ends up being redirected), false otherwise.
bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
65+
bool Changed = false;
66+
for (auto &I : BB) {
67+
if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
68+
Changed = true;
69+
// cast<>, not dyn_cast<>: the result is dereferenced unconditionally on the
// next line, so a type mismatch should assert rather than yield a null
// pointer.
auto PTy = cast<PointerType>(allocaInst->getType());
70+
auto ETy = PTy->getElementType();
71+
auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
72+
auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
73+
auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
74+
auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal,
75+
GenericAddrTy, "");
76+
NewASCToLocal->insertAfter(allocaInst);
77+
NewASCToGeneric->insertAfter(NewASCToLocal);
78+
for (Value::use_iterator UI = allocaInst->use_begin(),
79+
UE = allocaInst->use_end();
80+
UI != UE; ) {
81+
// Check Load, Store, GEP, and BitCast uses of the alloca and make them
82+
// use the converted generic address, in order to expose the non-generic
83+
// addrspacecast to NVPTXFavorNonGenericAddrSpace. For other types
84+
// of instructions this is unnecessary and may introduce redundant
85+
// address casts.
86+
// The iterator is advanced before the use is rewritten because setOperand
// below changes which value this use belongs to.
const auto &AllocaUse = *UI++;
87+
auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
88+
if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) {
89+
LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric);
90+
continue;
91+
}
92+
auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
93+
// Only the pointer operand is redirected; a store whose *value* operand is
// the alloca is left untouched.
if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) {
94+
SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
95+
continue;
96+
}
97+
auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
98+
if (GI && GI->getPointerOperand() == allocaInst) {
99+
GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric);
100+
continue;
101+
}
102+
auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser());
103+
if (BI && BI->getOperand(0) == allocaInst) {
104+
BI->setOperand(0, NewASCToGeneric);
105+
continue;
106+
}
107+
}
108+
}
109+
}
110+
return Changed;
111+
}
112+
113+
// Factory for the NVPTXLowerAlloca pass: returns a newly heap-allocated
// instance as a BasicBlockPass pointer.
// NOTE(review): presumably the caller (e.g. a pass manager's addPass) takes
// ownership of the returned object -- confirm.
BasicBlockPass *llvm::createNVPTXLowerAllocaPass() {
114+
return new NVPTXLowerAlloca();
115+
}

‎llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ void initializeNVPTXAllocaHoistingPass(PassRegistry &);
5454
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
5555
void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
5656
void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
57+
void initializeNVPTXLowerAllocaPass(PassRegistry &);
5758
}
5859

5960
extern "C" void LLVMInitializeNVPTXTarget() {
@@ -70,6 +71,7 @@ extern "C" void LLVMInitializeNVPTXTarget() {
7071
initializeNVPTXFavorNonGenericAddrSpacesPass(
7172
*PassRegistry::getPassRegistry());
7273
initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());
74+
initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry());
7375
}
7476

7577
static std::string computeDataLayout(bool is64Bit) {
@@ -166,12 +168,11 @@ void NVPTXPassConfig::addIRPasses() {
166168
addPass(createNVPTXAssignValidGlobalNamesPass());
167169
addPass(createGenericToNVVMPass());
168170
addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine()));
169-
addPass(createNVPTXFavorNonGenericAddrSpacesPass());
170171
// NVPTXLowerKernelArgs emits alloca for byval parameters which can often
171-
// be eliminated by SROA. We do not run SROA right after NVPTXLowerKernelArgs
172-
// because we plan to merge NVPTXLowerKernelArgs and
173-
// NVPTXFavorNonGenericAddrSpaces into one pass.
172+
// be eliminated by SROA.
174173
addPass(createSROAPass());
174+
addPass(createNVPTXLowerAllocaPass());
175+
addPass(createNVPTXFavorNonGenericAddrSpacesPass());
175176
// FavorNonGenericAddrSpaces shortcuts unnecessary addrspacecasts, and leave
176177
// them unused. We could remove dead code in an ad-hoc manner, but that
177178
// requires manual work and might be error-prone.

‎llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ entry:
2727
; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
2828
; FIXME: casting A1_REG to A2_REG is unnecessary; A2_REG is essentially A_REG
2929
; CHECK: cvta.global.u64 %rd[[A2_REG:[0-9]+]], %rd[[A1_REG]]
30+
; CHECK: cvta.local.u64 %rd[[SP_REG:[0-9]+]]
3031
; CHECK: ld.global.f32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
31-
; CHECK: st.f32 [%SP+0], %f[[A0_REG]]
32+
; CHECK: st.local.f32 [{{%rd[0-9]+}}], %f[[A0_REG]]
3233

3334
%0 = load float, float* %a, align 4
3435
%1 = bitcast [16 x i8]* %buf to float*
@@ -49,7 +50,6 @@ entry:
4950
%7 = bitcast i8* %arrayidx7 to float*
5051
store float %6, float* %7, align 4
5152

52-
; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
5353
; CHECK: .param .b64 param0;
5454
; CHECK-NEXT: st.param.b64 [param0+0], %rd[[A2_REG]]
5555
; CHECK-NEXT: .param .b64 param1;
+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; RUN: opt < %s -S -nvptx-lower-alloca -nvptx-favor-non-generic -dce | FileCheck %s
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
3+
4+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
5+
target triple = "nvptx64-unknown-unknown"
6+
7+
define void @kernel() {
8+
; LABEL: @lower_alloca
9+
; PTX-LABEL: .visible .entry kernel(
10+
%A = alloca i32
11+
; CHECK: addrspacecast i32* %A to i32 addrspace(5)*
12+
; CHECK: store i32 0, i32 addrspace(5)* {{%.+}}
13+
; PTX: st.local.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
14+
store i32 0, i32* %A
15+
call void @callee(i32* %A)
16+
ret void
17+
}
18+
19+
declare void @callee(i32*)
20+
21+
!nvvm.annotations = !{!0}
22+
!0 = !{void ()* @kernel, !"kernel", i32 1}

0 commit comments

Comments
 (0)
Please sign in to comment.