- ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
- ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+ ; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI
+ ; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI

- ; GCN-LABEL: {{^}}s_sext_i1_to_i32:
- ; GCN: v_cndmask_b32_e64
- ; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ ; SI-LABEL: s_sext_i1_to_i32:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s3, 0xf000
+ ; SI-NEXT: s_mov_b32 s2, -1
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s5
+ ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+ ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+ ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: s_sext_i1_to_i32:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
+ ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+ ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

- ; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
- ; GCN: s_ashr_i32
- ; GCN: s_endpgm
define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+ ; SI-LABEL: test_s_sext_i32_to_i64:
+ ; SI: ; %bb.0: ; %entry
+ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+ ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s3, 0xf000
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: s_mul_i32 s4, s4, s5
+ ; SI-NEXT: s_add_i32 s4, s4, s2
+ ; SI-NEXT: s_ashr_i32 s5, s4, 31
+ ; SI-NEXT: s_mov_b32 s2, -1
+ ; SI-NEXT: v_mov_b32_e32 v0, s4
+ ; SI-NEXT: v_mov_b32_e32 v1, s5
+ ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: test_s_sext_i32_to_i64:
+ ; VI: ; %bb.0: ; %entry
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+ ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: s_mul_i32 s1, s2, s3
+ ; VI-NEXT: s_add_i32 s1, s1, s0
+ ; VI-NEXT: s_ashr_i32 s0, s1, 31
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: v_mov_b32_e32 v1, s0
+ ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
@@ -23,50 +74,170 @@ entry:
  ret void
}

- ; GCN-LABEL: {{^}}s_sext_i1_to_i64:
- ; GCN: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
- ; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
- ; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
- ; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ ; SI-LABEL: s_sext_i1_to_i64:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s3, 0xf000
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s5
+ ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+ ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+ ; SI-NEXT: s_mov_b32 s2, -1
+ ; SI-NEXT: v_mov_b32_e32 v1, v0
+ ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: s_sext_i1_to_i64:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
+ ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+ ; VI-NEXT: v_mov_b32_e32 v1, v0
+ ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

- ; GCN-LABEL: {{^}}s_sext_i32_to_i64:
- ; GCN: s_ashr_i32
- ; GCN: s_endpgm
define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
+ ; SI-LABEL: s_sext_i32_to_i64:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s3, 0xf000
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: s_ashr_i32 s5, s4, 31
+ ; SI-NEXT: s_mov_b32 s2, -1
+ ; SI-NEXT: v_mov_b32_e32 v0, s4
+ ; SI-NEXT: v_mov_b32_e32 v1, s5
+ ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: s_sext_i32_to_i64:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: s_ashr_i32 s1, s0, 31
+ ; VI-NEXT: v_mov_b32_e32 v0, s0
+ ; VI-NEXT: v_mov_b32_e32 v1, s1
+ ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

- ; GCN-LABEL: {{^}}v_sext_i32_to_i64:
- ; GCN: v_ashr
- ; GCN: s_endpgm
define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ ; SI-LABEL: v_sext_i32_to_i64:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s7, 0xf000
+ ; SI-NEXT: s_mov_b32 s6, -1
+ ; SI-NEXT: s_mov_b32 s10, s6
+ ; SI-NEXT: s_mov_b32 s11, s7
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: s_mov_b32 s8, s2
+ ; SI-NEXT: s_mov_b32 s9, s3
+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+ ; SI-NEXT: s_mov_b32 s4, s0
+ ; SI-NEXT: s_mov_b32 s5, s1
+ ; SI-NEXT: s_waitcnt vmcnt(0)
+ ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+ ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: v_sext_i32_to_i64:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+ ; VI-NEXT: s_mov_b32 s3, 0xf000
+ ; VI-NEXT: s_mov_b32 s2, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: s_mov_b32 s0, s4
+ ; VI-NEXT: s_mov_b32 s1, s5
+ ; VI-NEXT: s_mov_b32 s4, s6
+ ; VI-NEXT: s_mov_b32 s5, s7
+ ; VI-NEXT: s_mov_b32 s6, s2
+ ; VI-NEXT: s_mov_b32 s7, s3
+ ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+ ; VI-NEXT: s_waitcnt vmcnt(0)
+ ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+ ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+ ; VI-NEXT: s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

- ; GCN-LABEL: {{^}}s_sext_i16_to_i64:
- ; GCN: s_load_dword [[VAL:s[0-9]+]]
- ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
+ ; SI-LABEL: s_sext_i16_to_i64:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s3, 0xf000
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
+ ; SI-NEXT: s_mov_b32 s2, -1
+ ; SI-NEXT: v_mov_b32_e32 v0, s4
+ ; SI-NEXT: v_mov_b32_e32 v1, s5
+ ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: s_sext_i16_to_i64:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+ ; VI-NEXT: v_mov_b32_e32 v0, s0
+ ; VI-NEXT: v_mov_b32_e32 v1, s1
+ ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

- ; GCN-LABEL: {{^}}s_sext_i1_to_i16:
- ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
- ; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+ ; SI-LABEL: s_sext_i1_to_i16:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s3, 0xf000
+ ; SI-NEXT: s_mov_b32 s2, -1
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s5
+ ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+ ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+ ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: s_sext_i1_to_i16:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
+ ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+ ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
@@ -77,10 +248,38 @@ define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32
; makes it all the way through the legalizer/optimizer to make sure
; we select this correctly. In s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
- ; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and:
- ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
- ; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
+ ; SI-LABEL: s_sext_i1_to_i16_with_and:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+ ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s11, 0xf000
+ ; SI-NEXT: s_mov_b32 s10, -1
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s5
+ ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+ ; SI-NEXT: v_mov_b32_e32 v0, s7
+ ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s6, v0
+ ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+ ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+ ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: s_sext_i1_to_i16_with_and:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: v_mov_b32_e32 v1, s3
+ ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
+ ; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
+ ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+ ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+ ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
@@ -89,10 +288,38 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
  ret void
}

- ; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and:
- ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
- ; GCN-NEXT: buffer_store_short [[RESULT]]
define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+ ; SI-LABEL: v_sext_i1_to_i16_with_and:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+ ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
+ ; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+ ; SI-NEXT: s_mov_b32 s7, 0xf000
+ ; SI-NEXT: s_mov_b32 s6, -1
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
+ ; SI-NEXT: v_mov_b32_e32 v0, s0
+ ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0
+ ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+ ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+ ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: v_sext_i1_to_i16_with_and:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+ ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
+ ; VI-NEXT: v_mov_b32_e32 v0, s0
+ ; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0
+ ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+ ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+ ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
@@ -102,13 +329,6 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
  ret void
}

- ; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32:
- ; GCN: s_load_dword [[VAL:s[0-9]+]]
- ; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010
- ; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24
- ; SI-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
- ; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]
-
; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
@@ -117,21 +337,51 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
-
- ; VI-DAG: v_bfe_i32 [[VEXT1:v[0-9]+]], v{{[0-9]+}}, 0, 8
-
- ; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]]
- ; SI-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
- ; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]]
- ; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]]
-
- ; GCN-DAG: buffer_store_dword [[VEXT0]]
- ; GCN-DAG: buffer_store_dword [[VEXT1]]
- ; GCN-DAG: buffer_store_dword [[VEXT2]]
- ; GCN-DAG: buffer_store_dword [[VEXT3]]
-
- ; GCN: s_endpgm
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+ ; SI-LABEL: s_sext_v4i8_to_v4i32:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+ ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s3, 0xf000
+ ; SI-NEXT: s_mov_b32 s2, -1
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: s_ashr_i32 s5, s4, 24
+ ; SI-NEXT: s_bfe_i32 s6, s4, 0x80010
+ ; SI-NEXT: s_sext_i32_i8 s7, s4
+ ; SI-NEXT: s_bfe_i32 s4, s4, 0x80008
+ ; SI-NEXT: v_mov_b32_e32 v0, s7
+ ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; SI-NEXT: s_waitcnt expcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s4
+ ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; SI-NEXT: s_waitcnt expcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s6
+ ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; SI-NEXT: s_waitcnt expcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s5
+ ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: s_sext_v4i8_to_v4i32:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+ ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+ ; VI-NEXT: s_mov_b32 s7, 0xf000
+ ; VI-NEXT: s_mov_b32 s6, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+ ; VI-NEXT: s_ashr_i32 s1, s0, 24
+ ; VI-NEXT: s_bfe_i32 s2, s0, 0x80010
+ ; VI-NEXT: s_sext_i32_i8 s0, s0
+ ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
+ ; VI-NEXT: v_mov_b32_e32 v1, s0
+ ; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+ ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; VI-NEXT: v_mov_b32_e32 v0, s2
+ ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; VI-NEXT: v_mov_b32_e32 v0, s1
+ ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; VI-NEXT: s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
@@ -145,25 +395,57 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a)
  ret void
}

- ; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32:
- ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
; FIXME: need to optimize same sequence as above test to avoid
; this shift.
- ; VI-DAG: v_lshrrev_b16_e32 [[SH16:v[0-9]+]], 8, [[VAL]]
- ; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]]
- ; VI-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
- ; VI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
- ; VI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[SH16]], 0, 8
-
- ; SI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
- ; SI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
- ; SI: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
-
- ; GCN: buffer_store_dword [[EXT0]]
- ; GCN: buffer_store_dword [[EXT1]]
- ; GCN: buffer_store_dword [[EXT2]]
- ; GCN: buffer_store_dword [[EXT3]]
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ ; SI-LABEL: v_sext_v4i8_to_v4i32:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s7, 0xf000
+ ; SI-NEXT: s_mov_b32 s6, -1
+ ; SI-NEXT: s_mov_b32 s10, s6
+ ; SI-NEXT: s_mov_b32 s11, s7
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: s_mov_b32 s8, s2
+ ; SI-NEXT: s_mov_b32 s9, s3
+ ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+ ; SI-NEXT: s_mov_b32 s4, s0
+ ; SI-NEXT: s_mov_b32 s5, s1
+ ; SI-NEXT: s_waitcnt vmcnt(0)
+ ; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
+ ; SI-NEXT: v_bfe_i32 v2, v0, 16, 8
+ ; SI-NEXT: v_bfe_i32 v3, v0, 8, 8
+ ; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0
+ ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+ ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: v_sext_v4i8_to_v4i32:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+ ; VI-NEXT: s_mov_b32 s3, 0xf000
+ ; VI-NEXT: s_mov_b32 s2, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: s_mov_b32 s0, s4
+ ; VI-NEXT: s_mov_b32 s1, s5
+ ; VI-NEXT: s_mov_b32 s4, s6
+ ; VI-NEXT: s_mov_b32 s5, s7
+ ; VI-NEXT: s_mov_b32 s6, s2
+ ; VI-NEXT: s_mov_b32 s7, s3
+ ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+ ; VI-NEXT: s_waitcnt vmcnt(0)
+ ; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
+ ; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
+ ; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
+ ; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
+ ; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
+ ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+ ; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
+ ; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
+ ; VI-NEXT: s_endpgm
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
@@ -179,18 +461,53 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
}

; FIXME: s_bfe_i64, same on SI and VI
- ; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
- ; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
- ; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-
- ; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
- ; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-
-
- ; GCN-DAG: s_sext_i32_i16
- ; GCN-DAG: s_sext_i32_i16
- ; GCN: s_endpgm
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
+ ; SI-LABEL: s_sext_v4i16_to_v4i32:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s7, 0xf000
+ ; SI-NEXT: s_mov_b32 s6, -1
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: s_mov_b32 s4, s0
+ ; SI-NEXT: s_mov_b32 s5, s1
+ ; SI-NEXT: s_ashr_i64 s[0:1], s[2:3], 48
+ ; SI-NEXT: s_ashr_i32 s1, s2, 16
+ ; SI-NEXT: s_sext_i32_i16 s2, s2
+ ; SI-NEXT: s_sext_i32_i16 s3, s3
+ ; SI-NEXT: v_mov_b32_e32 v0, s2
+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; SI-NEXT: s_waitcnt expcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s1
+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; SI-NEXT: s_waitcnt expcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s3
+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; SI-NEXT: s_waitcnt expcnt(0)
+ ; SI-NEXT: v_mov_b32_e32 v0, s0
+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: s_sext_v4i16_to_v4i32:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+ ; VI-NEXT: s_mov_b32 s3, 0xf000
+ ; VI-NEXT: s_mov_b32 s2, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: s_mov_b32 s1, s5
+ ; VI-NEXT: s_ashr_i32 s5, s6, 16
+ ; VI-NEXT: s_sext_i32_i16 s6, s6
+ ; VI-NEXT: s_mov_b32 s0, s4
+ ; VI-NEXT: v_mov_b32_e32 v0, s6
+ ; VI-NEXT: s_ashr_i32 s4, s7, 16
+ ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; VI-NEXT: v_mov_b32_e32 v0, s5
+ ; VI-NEXT: s_sext_i32_i16 s7, s7
+ ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; VI-NEXT: v_mov_b32_e32 v0, s7
+ ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; VI-NEXT: v_mov_b32_e32 v0, s4
+ ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; VI-NEXT: s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
@@ -204,13 +521,54 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
  ret void
}

- ; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
- ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
- ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
- ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
- ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
- ; GCN: s_endpgm
define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+ ; SI-LABEL: v_sext_v4i16_to_v4i32:
+ ; SI: ; %bb.0:
+ ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+ ; SI-NEXT: s_mov_b32 s7, 0xf000
+ ; SI-NEXT: s_mov_b32 s6, -1
+ ; SI-NEXT: s_mov_b32 s10, s6
+ ; SI-NEXT: s_mov_b32 s11, s7
+ ; SI-NEXT: s_waitcnt lgkmcnt(0)
+ ; SI-NEXT: s_mov_b32 s8, s2
+ ; SI-NEXT: s_mov_b32 s9, s3
+ ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+ ; SI-NEXT: s_mov_b32 s4, s0
+ ; SI-NEXT: s_mov_b32 s5, s1
+ ; SI-NEXT: s_waitcnt vmcnt(0)
+ ; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48
+ ; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
+ ; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
+ ; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+ ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+ ; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0
+ ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+ ; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+ ; SI-NEXT: s_endpgm
+ ;
+ ; VI-LABEL: v_sext_v4i16_to_v4i32:
+ ; VI: ; %bb.0:
+ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+ ; VI-NEXT: s_mov_b32 s3, 0xf000
+ ; VI-NEXT: s_mov_b32 s2, -1
+ ; VI-NEXT: s_waitcnt lgkmcnt(0)
+ ; VI-NEXT: s_mov_b32 s0, s4
+ ; VI-NEXT: s_mov_b32 s1, s5
+ ; VI-NEXT: s_mov_b32 s4, s6
+ ; VI-NEXT: s_mov_b32 s5, s7
+ ; VI-NEXT: s_mov_b32 s6, s2
+ ; VI-NEXT: s_mov_b32 s7, s3
+ ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+ ; VI-NEXT: s_waitcnt vmcnt(0)
+ ; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
+ ; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
+ ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
+ ; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
+ ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+ ; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
+ ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+ ; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
+ ; VI-NEXT: s_endpgm
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>