Skip to content

Commit 29bbdc1

Browse files
committed Feb 21, 2017
[NVPTX] Unify vectorization of load/stores of aggregate arguments and return values.
Original code only used vector loads/stores for explicit vector arguments. It could also do more loads/stores than necessary (e.g. v5f32 would touch 8 f32 values). Aggregate types were loaded one element at a time, even the vectors contained within.

This change attempts to generalize (and simplify) parameter space loads/stores so that vector loads/stores can be used more broadly. Functionality of the patch has been verified by compiling the thrust test suite and manually checking the differences between PTX generated by llvm with and without the patch.

General algorithm:
* ComputePTXValueVTs() flattens input/output argument into a flat list of scalars to load/store and returns their types and offsets.
* VectorizePTXValueVTs() uses that data to create a vectorization plan which returns an array of flags marking boundaries of vectorized load/stores. Scalars are represented as 1-element vectors.
* Code that generates loads/stores implements a simple state machine that constructs a vector according to the plan.

Differential Revision: https://reviews.llvm.org/D30011

llvm-svn: 295784
1 parent 7d6b71d commit 29bbdc1

File tree

9 files changed

+1384
-746
lines changed

9 files changed

+1384
-746
lines changed
 

‎llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

+420-710
Large diffs are not rendered by default.

‎llvm/test/CodeGen/NVPTX/aggregate-return.ll

+27-8
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,40 @@
11
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
22

33
declare <2 x float> @barv(<2 x float> %input)
4+
declare <3 x float> @barv3(<3 x float> %input)
45
declare [2 x float] @bara([2 x float] %input)
56
declare {float, float} @bars({float, float} %input)
67

7-
define void @foov(<2 x float> %input, <2 x float>* %output) {
8-
; CHECK-LABEL: @foov
8+
define void @test_v2f32(<2 x float> %input, <2 x float>* %output) {
9+
; CHECK-LABEL: @test_v2f32
910
%call = tail call <2 x float> @barv(<2 x float> %input)
1011
; CHECK: .param .align 8 .b8 retval0[8];
11-
; CHECK: ld.param.v2.f32 {[[ELEMV1:%f[0-9]+]], [[ELEMV2:%f[0-9]+]]}, [retval0+0];
12+
; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0];
1213
store <2 x float> %call, <2 x float>* %output, align 8
13-
; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEMV1]], [[ELEMV2]]}
14+
; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
1415
ret void
1516
}
1617

17-
define void @fooa([2 x float] %input, [2 x float]* %output) {
18-
; CHECK-LABEL: @fooa
18+
define void @test_v3f32(<3 x float> %input, <3 x float>* %output) {
19+
; CHECK-LABEL: @test_v3f32
20+
;
21+
%call = tail call <3 x float> @barv3(<3 x float> %input)
22+
; CHECK: .param .align 16 .b8 retval0[16];
23+
; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0];
24+
; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8];
25+
; Make sure we don't load more values than we need to.
26+
; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
27+
store <3 x float> %call, <3 x float>* %output, align 8
28+
; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
29+
; -- This is suboptimal. We should do st.v2.f32 instead
30+
; of combining 2xf32 into i64.
31+
; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
32+
; CHECK: ret;
33+
ret void
34+
}
35+
36+
define void @test_a2f32([2 x float] %input, [2 x float]* %output) {
37+
; CHECK-LABEL: @test_a2f32
1938
%call = tail call [2 x float] @bara([2 x float] %input)
2039
; CHECK: .param .align 4 .b8 retval0[8];
2140
; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0];
@@ -28,8 +47,8 @@ define void @fooa([2 x float] %input, [2 x float]* %output) {
2847
; CHECK: ret
2948
}
3049

31-
define void @foos({float, float} %input, {float, float}* %output) {
32-
; CHECK-LABEL: @foos
50+
define void @test_s2f32({float, float} %input, {float, float}* %output) {
51+
; CHECK-LABEL: @test_s2f32
3352
%call = tail call {float, float} @bars({float, float} %input)
3453
; CHECK: .param .align 4 .b8 retval0[8];
3554
; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0];

‎llvm/test/CodeGen/NVPTX/f16-instructions.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ define half @test_tailcall_flipped(half %a, half %b) #0 {
229229
; CHECK-LABEL: test_select(
230230
; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0];
231231
; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1];
232-
; CHECK: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
232+
; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
233233
; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
234234
; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
235235
; CHECK-NEXT: ret;

‎llvm/test/CodeGen/NVPTX/ldparam-v4.ll

+4-1
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22

33
declare <4 x float> @bar()
44

5+
; CHECK-LABEL: .func foo(
56
define void @foo(<4 x float>* %ptr) {
6-
; CHECK: ld.param.v4.f32
7+
; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0];
8+
; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0];
9+
; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
710
%val = tail call <4 x float> @bar()
811
store <4 x float> %val, <4 x float>* %ptr
912
ret void

‎llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll

+14-13
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
1+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
22
; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
33

44
; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
@@ -27,9 +27,9 @@ entry:
2727
; PTX: LBB[[LABEL:[_0-9]+]]:
2828
; PTX: ld.u8 %rs[[REG:[0-9]+]]
2929
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
30-
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
31-
; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
32-
; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
30+
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
31+
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
32+
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
3333
}
3434

3535
define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -45,9 +45,9 @@ entry:
4545
; PTX: LBB[[LABEL:[_0-9]+]]:
4646
; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]]
4747
; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]]
48-
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
49-
; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
50-
; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
48+
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
49+
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
50+
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
5151
}
5252

5353
define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -78,12 +78,13 @@ entry:
7878
; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]
7979

8080
; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller(
81-
; PTX: ld.param.u8 %rs[[REG:[0-9]+]]
81+
; PTX: ld.param.u32 %r[[C:[0-9]+]]
82+
; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]];
8283
; PTX: LBB[[LABEL:[_0-9]+]]:
8384
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
84-
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
85-
; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
86-
; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
85+
; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
86+
; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
87+
; PTX: @%p[[PRED]] bra LBB[[LABEL]]
8788
}
8889

8990
define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
@@ -118,15 +119,15 @@ entry:
118119
; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]]
119120
; -- this is the backwards copying BB
120121
; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]]
121-
; PTX: add.s64 %rd[[N]], %rd[[N]], -1
122+
; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1
122123
; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]]
123124
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]]
124125
; -- this is the forwards copying BB
125126
; PTX: LBB[[FORWARD_BB]]:
126127
; PTX: @%p[[NEQ0]] bra LBB[[EXIT]]
127128
; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]]
128129
; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]]
129-
; PTX: add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1
130+
; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1
130131
; -- exit block
131132
; PTX: LBB[[EXIT]]:
132133
; PTX-NEXT: st.param.b64 [func_retval0

‎llvm/test/CodeGen/NVPTX/param-load-store.ll

+813
Large diffs are not rendered by default.

‎llvm/test/CodeGen/NVPTX/vec-param-load.ll

+76-7
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,81 @@
22

33
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
44

5-
6-
define <16 x float> @foo(<16 x float> %a) {
7-
; Make sure we index into vectors properly
8-
; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+48];
9-
; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32];
10-
; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16];
11-
; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0];
5+
define <16 x float> @test_v16f32(<16 x float> %a) {
6+
; CHECK-LABEL: test_v16f32(
7+
; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
8+
; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
9+
; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
10+
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
11+
; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]}
12+
; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
13+
; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]}
14+
; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]}
15+
; CHECK: ret;
1216
ret <16 x float> %a
1317
}
18+
19+
define <8 x float> @test_v8f32(<8 x float> %a) {
20+
; CHECK-LABEL: test_v8f32(
21+
; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
22+
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
23+
; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]}
24+
; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]}
25+
; CHECK: ret;
26+
ret <8 x float> %a
27+
}
28+
29+
define <4 x float> @test_v4f32(<4 x float> %a) {
30+
; CHECK-LABEL: test_v4f32(
31+
; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
32+
; CHECK-DAG: st.param.v4.f32 [func_retval0+0], {[[V_0_3]]}
33+
; CHECK: ret;
34+
ret <4 x float> %a
35+
}
36+
37+
define <2 x float> @test_v2f32(<2 x float> %a) {
38+
; CHECK-LABEL: test_v2f32(
39+
; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
40+
; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_3]]}
41+
; CHECK: ret;
42+
ret <2 x float> %a
43+
}
44+
45+
; Oddly shaped vectors should not load any extra elements.
46+
define <3 x float> @test_v3f32(<3 x float> %a) {
47+
; CHECK-LABEL: test_v3f32(
48+
; CHECK-DAG: ld.param.f32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8];
49+
; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0];
50+
; CHECK-DAG: st.param.v2.f32 [func_retval0+0], {[[V_0_1]]}
51+
; CHECK-DAG: st.param.f32 [func_retval0+8], [[V_2]]
52+
; CHECK: ret;
53+
ret <3 x float> %a
54+
}
55+
56+
define <8 x i64> @test_v8i64(<8 x i64> %a) {
57+
; CHECK-LABEL: test_v8i64(
58+
; CHECK-DAG: ld.param.v2.u64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48];
59+
; CHECK-DAG: ld.param.v2.u64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32];
60+
; CHECK-DAG: ld.param.v2.u64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16];
61+
; CHECK-DAG: ld.param.v2.u64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0];
62+
; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[V_0_1]]}
63+
; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]}
64+
; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]}
65+
; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]}
66+
; CHECK: ret;
67+
ret <8 x i64> %a
68+
}
69+
70+
define <16 x i16> @test_v16i16(<16 x i16> %a) {
71+
; CHECK-LABEL: test_v16i16(
72+
; CHECK-DAG: ld.param.v4.u16 {[[V_12_15:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+24];
73+
; CHECK-DAG: ld.param.v4.u16 {[[V_8_11:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16];
74+
; CHECK-DAG: ld.param.v4.u16 {[[V_4_7:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+8];
75+
; CHECK-DAG: ld.param.v4.u16 {[[V_0_3:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0];
76+
; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[V_0_3]]}
77+
; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[V_4_7]]}
78+
; CHECK-DAG: st.param.v4.b16 [func_retval0+16], {[[V_8_11]]}
79+
; CHECK-DAG: st.param.v4.b16 [func_retval0+24], {[[V_12_15]]}
80+
; CHECK: ret;
81+
ret <16 x i16> %a
82+
}

‎llvm/test/CodeGen/NVPTX/vec8.ll

+9-4
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,15 @@ target triple = "nvptx-unknown-cuda"
44

55
; CHECK: .visible .func foo
66
define void @foo(<8 x i8> %a, i8* %b) {
7-
%t0 = extractelement <8 x i8> %a, i32 0
8-
; CHECK-DAG: ld.param.v4.u8
9-
; CHECK-DAG: ld.param.u32
10-
store i8 %t0, i8* %b
7+
; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
8+
; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
9+
; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1]
10+
; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]];
11+
; CHECK: st.u8 [%[[B]]], [[T]];
12+
%t0 = extractelement <8 x i8> %a, i32 1
13+
%t1 = extractelement <8 x i8> %a, i32 6
14+
%t = add i8 %t0, %t1
15+
store i8 %t, i8* %b
1116
ret void
1217
}
1318

‎llvm/test/CodeGen/NVPTX/vector-call.ll

+20-2
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,27 @@ target triple = "nvptx-unknown-cuda"
44

55
declare void @bar(<4 x i32>)
66

7-
; CHECK-LABEL: @foo
7+
; CHECK-LABEL: .func foo(
8+
; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
9+
; CHECK: .param .align 16 .b8 param0[16];
10+
; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
11+
; CHECK: call.uni
12+
; CHECK: ret;
813
define void @foo(<4 x i32> %a) {
9-
; CHECK: st.param.v4.b32
1014
tail call void @bar(<4 x i32> %a)
1115
ret void
1216
}
17+
18+
; CHECK-LABEL: .func foo3(
19+
; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
20+
; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
21+
; CHECK: .param .align 16 .b8 param0[16];
22+
; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
23+
; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
24+
; CHECK: call.uni
25+
; CHECK: ret;
26+
declare void @bar3(<3 x i32>)
27+
define void @foo3(<3 x i32> %a) {
28+
tail call void @bar3(<3 x i32> %a)
29+
ret void
30+
}

0 commit comments

Comments
 (0)
Please sign in to comment.