Skip to content

Commit a6013c0

Browse files
committed Mar 12, 2019
Regenerate sign_extend.ll test.
This will change as part of the fix for the regressions in D58017. llvm-svn: 355933
1 parent 9f0a5ca commit a6013c0

File tree

1 file changed

+443
-85
lines changed

1 file changed

+443
-85
lines changed
 

llvm/test/CodeGen/AMDGPU/sign_extend.ll

+443-85
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,71 @@
1-
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
2-
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI
3+
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI
34

4-
; GCN-LABEL: {{^}}s_sext_i1_to_i32:
5-
; GCN: v_cndmask_b32_e64
6-
; GCN: s_endpgm
75
define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
6+
; SI-LABEL: s_sext_i1_to_i32:
7+
; SI: ; %bb.0:
8+
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
9+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
10+
; SI-NEXT: s_mov_b32 s3, 0xf000
11+
; SI-NEXT: s_mov_b32 s2, -1
12+
; SI-NEXT: s_waitcnt lgkmcnt(0)
13+
; SI-NEXT: v_mov_b32_e32 v0, s5
14+
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
15+
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
16+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
17+
; SI-NEXT: s_endpgm
18+
;
19+
; VI-LABEL: s_sext_i1_to_i32:
20+
; VI: ; %bb.0:
21+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
22+
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
23+
; VI-NEXT: s_mov_b32 s7, 0xf000
24+
; VI-NEXT: s_mov_b32 s6, -1
25+
; VI-NEXT: s_waitcnt lgkmcnt(0)
26+
; VI-NEXT: v_mov_b32_e32 v0, s1
27+
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
28+
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
29+
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
30+
; VI-NEXT: s_endpgm
831
%cmp = icmp eq i32 %a, %b
932
%sext = sext i1 %cmp to i32
1033
store i32 %sext, i32 addrspace(1)* %out, align 4
1134
ret void
1235
}
1336

14-
; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
15-
; GCN: s_ashr_i32
16-
; GCN: s_endpg
1737
define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
38+
; SI-LABEL: test_s_sext_i32_to_i64:
39+
; SI: ; %bb.0: ; %entry
40+
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
41+
; SI-NEXT: s_load_dword s2, s[0:1], 0xd
42+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
43+
; SI-NEXT: s_mov_b32 s3, 0xf000
44+
; SI-NEXT: s_waitcnt lgkmcnt(0)
45+
; SI-NEXT: s_mul_i32 s4, s4, s5
46+
; SI-NEXT: s_add_i32 s4, s4, s2
47+
; SI-NEXT: s_ashr_i32 s5, s4, 31
48+
; SI-NEXT: s_mov_b32 s2, -1
49+
; SI-NEXT: v_mov_b32_e32 v0, s4
50+
; SI-NEXT: v_mov_b32_e32 v1, s5
51+
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
52+
; SI-NEXT: s_endpgm
53+
;
54+
; VI-LABEL: test_s_sext_i32_to_i64:
55+
; VI: ; %bb.0: ; %entry
56+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
57+
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
58+
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
59+
; VI-NEXT: s_mov_b32 s7, 0xf000
60+
; VI-NEXT: s_mov_b32 s6, -1
61+
; VI-NEXT: s_waitcnt lgkmcnt(0)
62+
; VI-NEXT: s_mul_i32 s1, s2, s3
63+
; VI-NEXT: s_add_i32 s1, s1, s0
64+
; VI-NEXT: s_ashr_i32 s0, s1, 31
65+
; VI-NEXT: v_mov_b32_e32 v0, s1
66+
; VI-NEXT: v_mov_b32_e32 v1, s0
67+
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
68+
; VI-NEXT: s_endpgm
1869
entry:
1970
%mul = mul i32 %a, %b
2071
%add = add i32 %mul, %c
@@ -23,50 +74,170 @@ entry:
2374
ret void
2475
}
2576

26-
; GCN-LABEL: {{^}}s_sext_i1_to_i64:
27-
; GCN: v_cndmask_b32_e64 v[[LOREG:[0-9]+]], 0, -1, vcc
28-
; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
29-
; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
30-
; GCN: s_endpgm
3177
define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
78+
; SI-LABEL: s_sext_i1_to_i64:
79+
; SI: ; %bb.0:
80+
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
81+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
82+
; SI-NEXT: s_mov_b32 s3, 0xf000
83+
; SI-NEXT: s_waitcnt lgkmcnt(0)
84+
; SI-NEXT: v_mov_b32_e32 v0, s5
85+
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
86+
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
87+
; SI-NEXT: s_mov_b32 s2, -1
88+
; SI-NEXT: v_mov_b32_e32 v1, v0
89+
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
90+
; SI-NEXT: s_endpgm
91+
;
92+
; VI-LABEL: s_sext_i1_to_i64:
93+
; VI: ; %bb.0:
94+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
95+
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
96+
; VI-NEXT: s_mov_b32 s7, 0xf000
97+
; VI-NEXT: s_mov_b32 s6, -1
98+
; VI-NEXT: s_waitcnt lgkmcnt(0)
99+
; VI-NEXT: v_mov_b32_e32 v0, s1
100+
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
101+
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
102+
; VI-NEXT: v_mov_b32_e32 v1, v0
103+
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
104+
; VI-NEXT: s_endpgm
32105
%cmp = icmp eq i32 %a, %b
33106
%sext = sext i1 %cmp to i64
34107
store i64 %sext, i64 addrspace(1)* %out, align 8
35108
ret void
36109
}
37110

38-
; GCN-LABEL: {{^}}s_sext_i32_to_i64:
39-
; GCN: s_ashr_i32
40-
; GCN: s_endpgm
41111
define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
112+
; SI-LABEL: s_sext_i32_to_i64:
113+
; SI: ; %bb.0:
114+
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
115+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
116+
; SI-NEXT: s_mov_b32 s3, 0xf000
117+
; SI-NEXT: s_waitcnt lgkmcnt(0)
118+
; SI-NEXT: s_ashr_i32 s5, s4, 31
119+
; SI-NEXT: s_mov_b32 s2, -1
120+
; SI-NEXT: v_mov_b32_e32 v0, s4
121+
; SI-NEXT: v_mov_b32_e32 v1, s5
122+
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
123+
; SI-NEXT: s_endpgm
124+
;
125+
; VI-LABEL: s_sext_i32_to_i64:
126+
; VI: ; %bb.0:
127+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
128+
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
129+
; VI-NEXT: s_mov_b32 s7, 0xf000
130+
; VI-NEXT: s_mov_b32 s6, -1
131+
; VI-NEXT: s_waitcnt lgkmcnt(0)
132+
; VI-NEXT: s_ashr_i32 s1, s0, 31
133+
; VI-NEXT: v_mov_b32_e32 v0, s0
134+
; VI-NEXT: v_mov_b32_e32 v1, s1
135+
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
136+
; VI-NEXT: s_endpgm
42137
%sext = sext i32 %a to i64
43138
store i64 %sext, i64 addrspace(1)* %out, align 8
44139
ret void
45140
}
46141

47-
; GCN-LABEL: {{^}}v_sext_i32_to_i64:
48-
; GCN: v_ashr
49-
; GCN: s_endpgm
50142
define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
143+
; SI-LABEL: v_sext_i32_to_i64:
144+
; SI: ; %bb.0:
145+
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
146+
; SI-NEXT: s_mov_b32 s7, 0xf000
147+
; SI-NEXT: s_mov_b32 s6, -1
148+
; SI-NEXT: s_mov_b32 s10, s6
149+
; SI-NEXT: s_mov_b32 s11, s7
150+
; SI-NEXT: s_waitcnt lgkmcnt(0)
151+
; SI-NEXT: s_mov_b32 s8, s2
152+
; SI-NEXT: s_mov_b32 s9, s3
153+
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
154+
; SI-NEXT: s_mov_b32 s4, s0
155+
; SI-NEXT: s_mov_b32 s5, s1
156+
; SI-NEXT: s_waitcnt vmcnt(0)
157+
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
158+
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
159+
; SI-NEXT: s_endpgm
160+
;
161+
; VI-LABEL: v_sext_i32_to_i64:
162+
; VI: ; %bb.0:
163+
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
164+
; VI-NEXT: s_mov_b32 s3, 0xf000
165+
; VI-NEXT: s_mov_b32 s2, -1
166+
; VI-NEXT: s_waitcnt lgkmcnt(0)
167+
; VI-NEXT: s_mov_b32 s0, s4
168+
; VI-NEXT: s_mov_b32 s1, s5
169+
; VI-NEXT: s_mov_b32 s4, s6
170+
; VI-NEXT: s_mov_b32 s5, s7
171+
; VI-NEXT: s_mov_b32 s6, s2
172+
; VI-NEXT: s_mov_b32 s7, s3
173+
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
174+
; VI-NEXT: s_waitcnt vmcnt(0)
175+
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
176+
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
177+
; VI-NEXT: s_endpgm
51178
%val = load i32, i32 addrspace(1)* %in, align 4
52179
%sext = sext i32 %val to i64
53180
store i64 %sext, i64 addrspace(1)* %out, align 8
54181
ret void
55182
}
56183

57-
; GCN-LABEL: {{^}}s_sext_i16_to_i64:
58-
; GCN: s_load_dword [[VAL:s[0-9]+]]
59-
; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
60184
define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
185+
; SI-LABEL: s_sext_i16_to_i64:
186+
; SI: ; %bb.0:
187+
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
188+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
189+
; SI-NEXT: s_mov_b32 s3, 0xf000
190+
; SI-NEXT: s_waitcnt lgkmcnt(0)
191+
; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
192+
; SI-NEXT: s_mov_b32 s2, -1
193+
; SI-NEXT: v_mov_b32_e32 v0, s4
194+
; SI-NEXT: v_mov_b32_e32 v1, s5
195+
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
196+
; SI-NEXT: s_endpgm
197+
;
198+
; VI-LABEL: s_sext_i16_to_i64:
199+
; VI: ; %bb.0:
200+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
201+
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
202+
; VI-NEXT: s_mov_b32 s7, 0xf000
203+
; VI-NEXT: s_mov_b32 s6, -1
204+
; VI-NEXT: s_waitcnt lgkmcnt(0)
205+
; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
206+
; VI-NEXT: v_mov_b32_e32 v0, s0
207+
; VI-NEXT: v_mov_b32_e32 v1, s1
208+
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
209+
; VI-NEXT: s_endpgm
61210
%sext = sext i16 %a to i64
62211
store i64 %sext, i64 addrspace(1)* %out, align 8
63212
ret void
64213
}
65214

66-
; GCN-LABEL: {{^}}s_sext_i1_to_i16:
67-
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
68-
; GCN-NEXT: buffer_store_short [[RESULT]]
69215
define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
216+
; SI-LABEL: s_sext_i1_to_i16:
217+
; SI: ; %bb.0:
218+
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
219+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
220+
; SI-NEXT: s_mov_b32 s3, 0xf000
221+
; SI-NEXT: s_mov_b32 s2, -1
222+
; SI-NEXT: s_waitcnt lgkmcnt(0)
223+
; SI-NEXT: v_mov_b32_e32 v0, s5
224+
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
225+
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
226+
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
227+
; SI-NEXT: s_endpgm
228+
;
229+
; VI-LABEL: s_sext_i1_to_i16:
230+
; VI: ; %bb.0:
231+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
232+
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
233+
; VI-NEXT: s_mov_b32 s7, 0xf000
234+
; VI-NEXT: s_mov_b32 s6, -1
235+
; VI-NEXT: s_waitcnt lgkmcnt(0)
236+
; VI-NEXT: v_mov_b32_e32 v0, s1
237+
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
238+
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
239+
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
240+
; VI-NEXT: s_endpgm
70241
%cmp = icmp eq i32 %a, %b
71242
%sext = sext i1 %cmp to i16
72243
store i16 %sext, i16 addrspace(1)* %out
@@ -77,10 +248,38 @@ define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32
77248
; makes it all the way through the legalizer/optimizer to make sure
78249
; we select this correctly. In the s_sext_i1_to_i16, the sign_extend node
79250
; is optimized to a select very early.
80-
; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and:
81-
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
82-
; GCN-NEXT: buffer_store_short [[RESULT]]
83251
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
252+
; SI-LABEL: s_sext_i1_to_i16_with_and:
253+
; SI: ; %bb.0:
254+
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
255+
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
256+
; SI-NEXT: s_mov_b32 s11, 0xf000
257+
; SI-NEXT: s_mov_b32 s10, -1
258+
; SI-NEXT: s_waitcnt lgkmcnt(0)
259+
; SI-NEXT: v_mov_b32_e32 v0, s5
260+
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
261+
; SI-NEXT: v_mov_b32_e32 v0, s7
262+
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s6, v0
263+
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
264+
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
265+
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
266+
; SI-NEXT: s_endpgm
267+
;
268+
; VI-LABEL: s_sext_i1_to_i16_with_and:
269+
; VI: ; %bb.0:
270+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
271+
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
272+
; VI-NEXT: s_mov_b32 s7, 0xf000
273+
; VI-NEXT: s_mov_b32 s6, -1
274+
; VI-NEXT: s_waitcnt lgkmcnt(0)
275+
; VI-NEXT: v_mov_b32_e32 v0, s1
276+
; VI-NEXT: v_mov_b32_e32 v1, s3
277+
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
278+
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
279+
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
280+
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
281+
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
282+
; VI-NEXT: s_endpgm
84283
%cmp0 = icmp eq i32 %a, %b
85284
%cmp1 = icmp eq i32 %c, %d
86285
%cmp = and i1 %cmp0, %cmp1
@@ -89,10 +288,38 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
89288
ret void
90289
}
91290

92-
; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and:
93-
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
94-
; GCN-NEXT: buffer_store_short [[RESULT]]
95291
define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
292+
; SI-LABEL: v_sext_i1_to_i16_with_and:
293+
; SI: ; %bb.0:
294+
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
295+
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
296+
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
297+
; SI-NEXT: s_mov_b32 s7, 0xf000
298+
; SI-NEXT: s_mov_b32 s6, -1
299+
; SI-NEXT: s_waitcnt lgkmcnt(0)
300+
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
301+
; SI-NEXT: v_mov_b32_e32 v0, s0
302+
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0
303+
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
304+
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
305+
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
306+
; SI-NEXT: s_endpgm
307+
;
308+
; VI-LABEL: v_sext_i1_to_i16_with_and:
309+
; VI: ; %bb.0:
310+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
311+
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
312+
; VI-NEXT: s_load_dword s0, s[0:1], 0x34
313+
; VI-NEXT: s_mov_b32 s7, 0xf000
314+
; VI-NEXT: s_mov_b32 s6, -1
315+
; VI-NEXT: s_waitcnt lgkmcnt(0)
316+
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
317+
; VI-NEXT: v_mov_b32_e32 v0, s0
318+
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0
319+
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
320+
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
321+
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
322+
; VI-NEXT: s_endpgm
96323
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
97324
%cmp0 = icmp eq i32 %a, %tid
98325
%cmp1 = icmp eq i32 %b, %c
@@ -102,13 +329,6 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
102329
ret void
103330
}
104331

105-
; GCN-LABEL: {{^}}s_sext_v4i8_to_v4i32:
106-
; GCN: s_load_dword [[VAL:s[0-9]+]]
107-
; GCN-DAG: s_bfe_i32 [[EXT2:s[0-9]+]], [[VAL]], 0x80010
108-
; GCN-DAG: s_ashr_i32 [[EXT3:s[0-9]+]], [[VAL]], 24
109-
; SI-DAG: s_bfe_i32 [[EXT1:s[0-9]+]], [[VAL]], 0x80008
110-
; GCN-DAG: s_sext_i32_i8 [[EXT0:s[0-9]+]], [[VAL]]
111-
112332
; FIXME: We end up with a v_bfe instruction, because the i16 srl
113333
; gets selected to a v_lshrrev_b16 instructions, so the input to
114334
; the bfe is a vector registers. To fix this we need to be able to
@@ -117,21 +337,51 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
117337
; t55: i16 = srl t29, Constant:i32<8>
118338
; t63: i32 = any_extend t55
119339
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
120-
121-
; VI-DAG: v_bfe_i32 [[VEXT1:v[0-9]+]], v{{[0-9]+}}, 0, 8
122-
123-
; GCN-DAG: v_mov_b32_e32 [[VEXT0:v[0-9]+]], [[EXT0]]
124-
; SI-DAG: v_mov_b32_e32 [[VEXT1:v[0-9]+]], [[EXT1]]
125-
; GCN-DAG: v_mov_b32_e32 [[VEXT2:v[0-9]+]], [[EXT2]]
126-
; GCN-DAG: v_mov_b32_e32 [[VEXT3:v[0-9]+]], [[EXT3]]
127-
128-
; GCN-DAG: buffer_store_dword [[VEXT0]]
129-
; GCN-DAG: buffer_store_dword [[VEXT1]]
130-
; GCN-DAG: buffer_store_dword [[VEXT2]]
131-
; GCN-DAG: buffer_store_dword [[VEXT3]]
132-
133-
; GCN: s_endpgm
134340
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
341+
; SI-LABEL: s_sext_v4i8_to_v4i32:
342+
; SI: ; %bb.0:
343+
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
344+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
345+
; SI-NEXT: s_mov_b32 s3, 0xf000
346+
; SI-NEXT: s_mov_b32 s2, -1
347+
; SI-NEXT: s_waitcnt lgkmcnt(0)
348+
; SI-NEXT: s_ashr_i32 s5, s4, 24
349+
; SI-NEXT: s_bfe_i32 s6, s4, 0x80010
350+
; SI-NEXT: s_sext_i32_i8 s7, s4
351+
; SI-NEXT: s_bfe_i32 s4, s4, 0x80008
352+
; SI-NEXT: v_mov_b32_e32 v0, s7
353+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
354+
; SI-NEXT: s_waitcnt expcnt(0)
355+
; SI-NEXT: v_mov_b32_e32 v0, s4
356+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
357+
; SI-NEXT: s_waitcnt expcnt(0)
358+
; SI-NEXT: v_mov_b32_e32 v0, s6
359+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
360+
; SI-NEXT: s_waitcnt expcnt(0)
361+
; SI-NEXT: v_mov_b32_e32 v0, s5
362+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
363+
; SI-NEXT: s_endpgm
364+
;
365+
; VI-LABEL: s_sext_v4i8_to_v4i32:
366+
; VI: ; %bb.0:
367+
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
368+
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
369+
; VI-NEXT: s_mov_b32 s7, 0xf000
370+
; VI-NEXT: s_mov_b32 s6, -1
371+
; VI-NEXT: s_waitcnt lgkmcnt(0)
372+
; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s0
373+
; VI-NEXT: s_ashr_i32 s1, s0, 24
374+
; VI-NEXT: s_bfe_i32 s2, s0, 0x80010
375+
; VI-NEXT: s_sext_i32_i8 s0, s0
376+
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
377+
; VI-NEXT: v_mov_b32_e32 v1, s0
378+
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
379+
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
380+
; VI-NEXT: v_mov_b32_e32 v0, s2
381+
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
382+
; VI-NEXT: v_mov_b32_e32 v0, s1
383+
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
384+
; VI-NEXT: s_endpgm
135385
%cast = bitcast i32 %a to <4 x i8>
136386
%ext = sext <4 x i8> %cast to <4 x i32>
137387
%elt0 = extractelement <4 x i32> %ext, i32 0
@@ -145,25 +395,57 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a)
145395
ret void
146396
}
147397

148-
; GCN-LABEL: {{^}}v_sext_v4i8_to_v4i32:
149-
; GCN: buffer_load_dword [[VAL:v[0-9]+]]
150398
; FIXME: need to optimize same sequence as above test to avoid
151399
; this shift.
152-
; VI-DAG: v_lshrrev_b16_e32 [[SH16:v[0-9]+]], 8, [[VAL]]
153-
; GCN-DAG: v_ashrrev_i32_e32 [[EXT3:v[0-9]+]], 24, [[VAL]]
154-
; VI-DAG: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
155-
; VI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
156-
; VI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[SH16]], 0, 8
157-
158-
; SI-DAG: v_bfe_i32 [[EXT2:v[0-9]+]], [[VAL]], 16, 8
159-
; SI-DAG: v_bfe_i32 [[EXT1:v[0-9]+]], [[VAL]], 8, 8
160-
; SI: v_bfe_i32 [[EXT0:v[0-9]+]], [[VAL]], 0, 8
161-
162-
; GCN: buffer_store_dword [[EXT0]]
163-
; GCN: buffer_store_dword [[EXT1]]
164-
; GCN: buffer_store_dword [[EXT2]]
165-
; GCN: buffer_store_dword [[EXT3]]
166400
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
401+
; SI-LABEL: v_sext_v4i8_to_v4i32:
402+
; SI: ; %bb.0:
403+
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
404+
; SI-NEXT: s_mov_b32 s7, 0xf000
405+
; SI-NEXT: s_mov_b32 s6, -1
406+
; SI-NEXT: s_mov_b32 s10, s6
407+
; SI-NEXT: s_mov_b32 s11, s7
408+
; SI-NEXT: s_waitcnt lgkmcnt(0)
409+
; SI-NEXT: s_mov_b32 s8, s2
410+
; SI-NEXT: s_mov_b32 s9, s3
411+
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
412+
; SI-NEXT: s_mov_b32 s4, s0
413+
; SI-NEXT: s_mov_b32 s5, s1
414+
; SI-NEXT: s_waitcnt vmcnt(0)
415+
; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
416+
; SI-NEXT: v_bfe_i32 v2, v0, 16, 8
417+
; SI-NEXT: v_bfe_i32 v3, v0, 8, 8
418+
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
419+
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
420+
; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0
421+
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
422+
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
423+
; SI-NEXT: s_endpgm
424+
;
425+
; VI-LABEL: v_sext_v4i8_to_v4i32:
426+
; VI: ; %bb.0:
427+
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
428+
; VI-NEXT: s_mov_b32 s3, 0xf000
429+
; VI-NEXT: s_mov_b32 s2, -1
430+
; VI-NEXT: s_waitcnt lgkmcnt(0)
431+
; VI-NEXT: s_mov_b32 s0, s4
432+
; VI-NEXT: s_mov_b32 s1, s5
433+
; VI-NEXT: s_mov_b32 s4, s6
434+
; VI-NEXT: s_mov_b32 s5, s7
435+
; VI-NEXT: s_mov_b32 s6, s2
436+
; VI-NEXT: s_mov_b32 s7, s3
437+
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
438+
; VI-NEXT: s_waitcnt vmcnt(0)
439+
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
440+
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
441+
; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
442+
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
443+
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
444+
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
445+
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
446+
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
447+
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
448+
; VI-NEXT: s_endpgm
167449
%a = load i32, i32 addrspace(1)* %in
168450
%cast = bitcast i32 %a to <4 x i8>
169451
%ext = sext <4 x i8> %cast to <4 x i32>
@@ -179,18 +461,53 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
179461
}
180462

181463
; FIXME: s_bfe_i64, same on SI and VI
182-
; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32:
183-
; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48
184-
; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
185-
186-
; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
187-
; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16
188-
189-
190-
; GCN-DAG: s_sext_i32_i16
191-
; GCN-DAG: s_sext_i32_i16
192-
; GCN: s_endpgm
193464
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
465+
; SI-LABEL: s_sext_v4i16_to_v4i32:
466+
; SI: ; %bb.0:
467+
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
468+
; SI-NEXT: s_mov_b32 s7, 0xf000
469+
; SI-NEXT: s_mov_b32 s6, -1
470+
; SI-NEXT: s_waitcnt lgkmcnt(0)
471+
; SI-NEXT: s_mov_b32 s4, s0
472+
; SI-NEXT: s_mov_b32 s5, s1
473+
; SI-NEXT: s_ashr_i64 s[0:1], s[2:3], 48
474+
; SI-NEXT: s_ashr_i32 s1, s2, 16
475+
; SI-NEXT: s_sext_i32_i16 s2, s2
476+
; SI-NEXT: s_sext_i32_i16 s3, s3
477+
; SI-NEXT: v_mov_b32_e32 v0, s2
478+
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
479+
; SI-NEXT: s_waitcnt expcnt(0)
480+
; SI-NEXT: v_mov_b32_e32 v0, s1
481+
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
482+
; SI-NEXT: s_waitcnt expcnt(0)
483+
; SI-NEXT: v_mov_b32_e32 v0, s3
484+
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
485+
; SI-NEXT: s_waitcnt expcnt(0)
486+
; SI-NEXT: v_mov_b32_e32 v0, s0
487+
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
488+
; SI-NEXT: s_endpgm
489+
;
490+
; VI-LABEL: s_sext_v4i16_to_v4i32:
491+
; VI: ; %bb.0:
492+
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
493+
; VI-NEXT: s_mov_b32 s3, 0xf000
494+
; VI-NEXT: s_mov_b32 s2, -1
495+
; VI-NEXT: s_waitcnt lgkmcnt(0)
496+
; VI-NEXT: s_mov_b32 s1, s5
497+
; VI-NEXT: s_ashr_i32 s5, s6, 16
498+
; VI-NEXT: s_sext_i32_i16 s6, s6
499+
; VI-NEXT: s_mov_b32 s0, s4
500+
; VI-NEXT: v_mov_b32_e32 v0, s6
501+
; VI-NEXT: s_ashr_i32 s4, s7, 16
502+
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
503+
; VI-NEXT: v_mov_b32_e32 v0, s5
504+
; VI-NEXT: s_sext_i32_i16 s7, s7
505+
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
506+
; VI-NEXT: v_mov_b32_e32 v0, s7
507+
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
508+
; VI-NEXT: v_mov_b32_e32 v0, s4
509+
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
510+
; VI-NEXT: s_endpgm
194511
%cast = bitcast i64 %a to <4 x i16>
195512
%ext = sext <4 x i16> %cast to <4 x i32>
196513
%elt0 = extractelement <4 x i32> %ext, i32 0
@@ -204,13 +521,54 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a)
204521
ret void
205522
}
206523

207-
; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32:
208-
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
209-
; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
210-
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
211-
; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
212-
; GCN: s_endpgm
213524
define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
525+
; SI-LABEL: v_sext_v4i16_to_v4i32:
526+
; SI: ; %bb.0:
527+
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
528+
; SI-NEXT: s_mov_b32 s7, 0xf000
529+
; SI-NEXT: s_mov_b32 s6, -1
530+
; SI-NEXT: s_mov_b32 s10, s6
531+
; SI-NEXT: s_mov_b32 s11, s7
532+
; SI-NEXT: s_waitcnt lgkmcnt(0)
533+
; SI-NEXT: s_mov_b32 s8, s2
534+
; SI-NEXT: s_mov_b32 s9, s3
535+
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
536+
; SI-NEXT: s_mov_b32 s4, s0
537+
; SI-NEXT: s_mov_b32 s5, s1
538+
; SI-NEXT: s_waitcnt vmcnt(0)
539+
; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48
540+
; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
541+
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
542+
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
543+
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
544+
; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0
545+
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
546+
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
547+
; SI-NEXT: s_endpgm
548+
;
549+
; VI-LABEL: v_sext_v4i16_to_v4i32:
550+
; VI: ; %bb.0:
551+
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
552+
; VI-NEXT: s_mov_b32 s3, 0xf000
553+
; VI-NEXT: s_mov_b32 s2, -1
554+
; VI-NEXT: s_waitcnt lgkmcnt(0)
555+
; VI-NEXT: s_mov_b32 s0, s4
556+
; VI-NEXT: s_mov_b32 s1, s5
557+
; VI-NEXT: s_mov_b32 s4, s6
558+
; VI-NEXT: s_mov_b32 s5, s7
559+
; VI-NEXT: s_mov_b32 s6, s2
560+
; VI-NEXT: s_mov_b32 s7, s3
561+
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
562+
; VI-NEXT: s_waitcnt vmcnt(0)
563+
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
564+
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
565+
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
566+
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
567+
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
568+
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
569+
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
570+
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
571+
; VI-NEXT: s_endpgm
214572
%a = load i64, i64 addrspace(1)* %in
215573
%cast = bitcast i64 %a to <4 x i16>
216574
%ext = sext <4 x i16> %cast to <4 x i32>

0 commit comments

Comments (0)
Please sign in to comment.