Skip to content

Commit 5cd09ad

Browse files
committed Jan 5, 2016
AMDGPU/SI: Select non-uniform constant addrspace loads to flat instructions for HSA
Summary: This fixes a regression caused by r256282. Reviewers: arsenm, cfang Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D15736 llvm-svn: 256810
1 parent 0d80888 commit 5cd09ad

File tree

3 files changed

+246
-166
lines changed

3 files changed

+246
-166
lines changed
 

‎llvm/lib/Target/AMDGPU/SIInstrInfo.td

+2-1
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,8 @@ def SIconstdata_ptr : SDNode<
141141
class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
142142
(ld node:$ptr), [{
143143
return isFlatLoad(dyn_cast<LoadSDNode>(N)) ||
144-
isGlobalLoad(dyn_cast<LoadSDNode>(N));
144+
isGlobalLoad(dyn_cast<LoadSDNode>(N)) ||
145+
isConstantLoad(cast<LoadSDNode>(N), -1);
145146
}]>;
146147

147148
def flat_load : flat_ld <load>;

‎llvm/test/CodeGen/AMDGPU/load.ll

+93-47
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
22
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
3-
; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
4-
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
3+
; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
4+
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck --check-prefix=FUNC --check-prefix=CI-HSA --check-prefix=SI %s
5+
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI-NOHSA --check-prefix=FUNC %s
56

67
;===------------------------------------------------------------------------===;
78
; GLOBAL ADDRESS SPACE
@@ -11,7 +12,8 @@
1112
; FUNC-LABEL: {{^}}load_i8:
1213
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
1314

14-
; SI: buffer_load_ubyte v{{[0-9]+}},
15+
; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
16+
; CI-HSA: flat_load_ubyte
1517
define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
1618
%1 = load i8, i8 addrspace(1)* %in
1719
%2 = zext i8 %1 to i32
@@ -23,7 +25,8 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
2325
; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
2426
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
2527
; R600: 8
26-
; SI: buffer_load_sbyte
28+
; SI-NOHSA: buffer_load_sbyte
29+
; CI-HSA: flat_load_sbyte
2730
define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
2831
entry:
2932
%0 = load i8, i8 addrspace(1)* %in
@@ -35,8 +38,10 @@ entry:
3538
; FUNC-LABEL: {{^}}load_v2i8:
3639
; R600: VTX_READ_8
3740
; R600: VTX_READ_8
38-
; SI: buffer_load_ubyte
39-
; SI: buffer_load_ubyte
41+
; SI-NOHSA: buffer_load_ubyte
42+
; SI-NOHSA: buffer_load_ubyte
43+
; CI-HSA: flat_load_ubyte
44+
; CI-HSA: flat_load_ubyte
4045
define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
4146
entry:
4247
%0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
@@ -53,8 +58,10 @@ entry:
5358
; R600-DAG: 8
5459
; R600-DAG: 8
5560

56-
; SI: buffer_load_sbyte
57-
; SI: buffer_load_sbyte
61+
; SI-NOHSA: buffer_load_sbyte
62+
; SI-NOHSA: buffer_load_sbyte
63+
; CI-HSA: flat_load_sbyte
64+
; CI-HSA: flat_load_sbyte
5865
define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
5966
entry:
6067
%0 = load <2 x i8>, <2 x i8> addrspace(1)* %in
@@ -68,10 +75,14 @@ entry:
6875
; R600: VTX_READ_8
6976
; R600: VTX_READ_8
7077
; R600: VTX_READ_8
71-
; SI: buffer_load_ubyte
72-
; SI: buffer_load_ubyte
73-
; SI: buffer_load_ubyte
74-
; SI: buffer_load_ubyte
78+
; SI-NOHSA: buffer_load_ubyte
79+
; SI-NOHSA: buffer_load_ubyte
80+
; SI-NOHSA: buffer_load_ubyte
81+
; SI-NOHSA: buffer_load_ubyte
82+
; CI-HSA: flat_load_ubyte
83+
; CI-HSA: flat_load_ubyte
84+
; CI-HSA: flat_load_ubyte
85+
; CI-HSA: flat_load_ubyte
7586
define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
7687
entry:
7788
%0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
@@ -93,10 +104,14 @@ entry:
93104
; R600-DAG: 8
94105
; R600-DAG: 8
95106
; R600-DAG: 8
96-
; SI: buffer_load_sbyte
97-
; SI: buffer_load_sbyte
98-
; SI: buffer_load_sbyte
99-
; SI: buffer_load_sbyte
107+
; SI-NOHSA: buffer_load_sbyte
108+
; SI-NOHSA: buffer_load_sbyte
109+
; SI-NOHSA: buffer_load_sbyte
110+
; SI-NOHSA: buffer_load_sbyte
111+
; CI-HSA: flat_load_sbyte
112+
; CI-HSA: flat_load_sbyte
113+
; CI-HSA: flat_load_sbyte
114+
; CI-HSA: flat_load_sbyte
100115
define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
101116
entry:
102117
%0 = load <4 x i8>, <4 x i8> addrspace(1)* %in
@@ -108,7 +123,8 @@ entry:
108123
; Load an i16 value from the global address space.
109124
; FUNC-LABEL: {{^}}load_i16:
110125
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
111-
; SI: buffer_load_ushort
126+
; SI-NOHSA: buffer_load_ushort
127+
; CI-HSA: flat_load_ushort
112128
define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
113129
entry:
114130
%0 = load i16 , i16 addrspace(1)* %in
@@ -121,7 +137,8 @@ entry:
121137
; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
122138
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
123139
; R600: 16
124-
; SI: buffer_load_sshort
140+
; SI-NOHSA: buffer_load_sshort
141+
; CI-HSA: flat_load_sshort
125142
define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
126143
entry:
127144
%0 = load i16, i16 addrspace(1)* %in
@@ -133,8 +150,10 @@ entry:
133150
; FUNC-LABEL: {{^}}load_v2i16:
134151
; R600: VTX_READ_16
135152
; R600: VTX_READ_16
136-
; SI: buffer_load_ushort
137-
; SI: buffer_load_ushort
153+
; SI-NOHSA: buffer_load_ushort
154+
; SI-NOHSA: buffer_load_ushort
155+
; CI-HSA: flat_load_ushort
156+
; CI-HSA: flat_load_ushort
138157
define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
139158
entry:
140159
%0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -150,8 +169,10 @@ entry:
150169
; R600-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST_Y]], 0.0, literal
151170
; R600-DAG: 16
152171
; R600-DAG: 16
153-
; SI: buffer_load_sshort
154-
; SI: buffer_load_sshort
172+
; SI-NOHSA: buffer_load_sshort
173+
; SI-NOHSA: buffer_load_sshort
174+
; CI-HSA: flat_load_sshort
175+
; CI-HSA: flat_load_sshort
155176
define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
156177
entry:
157178
%0 = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -165,10 +186,14 @@ entry:
165186
; R600: VTX_READ_16
166187
; R600: VTX_READ_16
167188
; R600: VTX_READ_16
168-
; SI: buffer_load_ushort
169-
; SI: buffer_load_ushort
170-
; SI: buffer_load_ushort
171-
; SI: buffer_load_ushort
189+
; SI-NOHSA: buffer_load_ushort
190+
; SI-NOHSA: buffer_load_ushort
191+
; SI-NOHSA: buffer_load_ushort
192+
; SI-NOHSA: buffer_load_ushort
193+
; CI-HSA: flat_load_ushort
194+
; CI-HSA: flat_load_ushort
195+
; CI-HSA: flat_load_ushort
196+
; CI-HSA: flat_load_ushort
172197
define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
173198
entry:
174199
%0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
@@ -190,10 +215,14 @@ entry:
190215
; R600-DAG: 16
191216
; R600-DAG: 16
192217
; R600-DAG: 16
193-
; SI: buffer_load_sshort
194-
; SI: buffer_load_sshort
195-
; SI: buffer_load_sshort
196-
; SI: buffer_load_sshort
218+
; SI-NOHSA: buffer_load_sshort
219+
; SI-NOHSA: buffer_load_sshort
220+
; SI-NOHSA: buffer_load_sshort
221+
; SI-NOHSA: buffer_load_sshort
222+
; CI-HSA: flat_load_sshort
223+
; CI-HSA: flat_load_sshort
224+
; CI-HSA: flat_load_sshort
225+
; CI-HSA: flat_load_sshort
197226
define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
198227
entry:
199228
%0 = load <4 x i16>, <4 x i16> addrspace(1)* %in
@@ -206,7 +235,8 @@ entry:
206235
; FUNC-LABEL: {{^}}load_i32:
207236
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
208237

209-
; SI: buffer_load_dword v{{[0-9]+}}
238+
; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
239+
; CI-HSA: flat_load_dword
210240
define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
211241
entry:
212242
%0 = load i32, i32 addrspace(1)* %in
@@ -218,7 +248,8 @@ entry:
218248
; FUNC-LABEL: {{^}}load_f32:
219249
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
220250

221-
; SI: buffer_load_dword v{{[0-9]+}}
251+
; SI-NOHSA: buffer_load_dword v{{[0-9]+}}
252+
; CI-HSA: flat_load_dword
222253
define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
223254
entry:
224255
%0 = load float, float addrspace(1)* %in
@@ -230,7 +261,8 @@ entry:
230261
; FUNC-LABEL: {{^}}load_v2f32:
231262
; R600: MEM_RAT
232263
; R600: VTX_READ_64
233-
; SI: buffer_load_dwordx2
264+
; SI-NOHSA: buffer_load_dwordx2
265+
; CI-HSA: flat_load_dwordx2
234266
define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
235267
entry:
236268
%0 = load <2 x float>, <2 x float> addrspace(1)* %in
@@ -240,7 +272,8 @@ entry:
240272

241273
; FUNC-LABEL: {{^}}load_i64:
242274
; R600: VTX_READ_64
243-
; SI: buffer_load_dwordx2
275+
; SI-NOHSA: buffer_load_dwordx2
276+
; CI-HSA: flat_load_dwordx2
244277
define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
245278
entry:
246279
%0 = load i64, i64 addrspace(1)* %in
@@ -253,7 +286,8 @@ entry:
253286
; R600: MEM_RAT
254287
; R600: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
255288
; R600: 31
256-
; SI: buffer_load_dword
289+
; SI-NOHSA: buffer_load_dword
290+
; CI-HSA: flat_load_dword
257291

258292
define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
259293
entry:
@@ -278,8 +312,10 @@ entry:
278312
; R600: VTX_READ_128
279313
; R600: VTX_READ_128
280314

281-
; SI: buffer_load_dwordx4
282-
; SI: buffer_load_dwordx4
315+
; SI-NOHSA: buffer_load_dwordx4
316+
; SI-NOHSA: buffer_load_dwordx4
317+
; CI-HSA: flat_load_dwordx4
318+
; CI-HSA: flat_load_dwordx4
283319
define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
284320
entry:
285321
%0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
@@ -293,10 +329,14 @@ entry:
293329
; R600: VTX_READ_128
294330
; R600: VTX_READ_128
295331

296-
; SI: buffer_load_dwordx4
297-
; SI: buffer_load_dwordx4
298-
; SI: buffer_load_dwordx4
299-
; SI: buffer_load_dwordx4
332+
; SI-NOHSA: buffer_load_dwordx4
333+
; SI-NOHSA: buffer_load_dwordx4
334+
; SI-NOHSA: buffer_load_dwordx4
335+
; SI-NOHSA: buffer_load_dwordx4
336+
; CI-HSA: flat_load_dwordx4
337+
; CI-HSA: flat_load_dwordx4
338+
; CI-HSA: flat_load_dwordx4
339+
; CI-HSA: flat_load_dwordx4
300340
define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
301341
entry:
302342
%0 = load <16 x i32>, <16 x i32> addrspace(1)* %in
@@ -313,7 +353,8 @@ entry:
313353
; R600: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
314354
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
315355
; R600: 8
316-
; SI: buffer_load_sbyte v{{[0-9]+}},
356+
; SI-NOHSA: buffer_load_sbyte v{{[0-9]+}},
357+
; CI-HSA: flat_load_sbyte v{{[0-9]+}},
317358
define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
318359
entry:
319360
%0 = load i8, i8 addrspace(2)* %in
@@ -325,7 +366,8 @@ entry:
325366
; Load an aligned i8 value
326367
; FUNC-LABEL: {{^}}load_const_i8_aligned:
327368
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
328-
; SI: buffer_load_ubyte v{{[0-9]+}},
369+
; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
370+
; CI-HSA: flat_load_ubyte v{{[0-9]+}},
329371
define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
330372
entry:
331373
%0 = load i8, i8 addrspace(2)* %in
@@ -337,7 +379,8 @@ entry:
337379
; Load an un-aligned i8 value
338380
; FUNC-LABEL: {{^}}load_const_i8_unaligned:
339381
; R600: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
340-
; SI: buffer_load_ubyte v{{[0-9]+}},
382+
; SI-NOHSA: buffer_load_ubyte v{{[0-9]+}},
383+
; CI-HSA: flat_load_ubyte v{{[0-9]+}},
341384
define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
342385
entry:
343386
%0 = getelementptr i8, i8 addrspace(2)* %in, i32 1
@@ -352,7 +395,8 @@ entry:
352395
; R600: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
353396
; R600: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
354397
; R600: 16
355-
; SI: buffer_load_sshort
398+
; SI-NOHSA: buffer_load_sshort
399+
; CI-HSA: flat_load_sshort
356400
define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
357401
entry:
358402
%0 = load i16, i16 addrspace(2)* %in
@@ -364,7 +408,8 @@ entry:
364408
; Load an aligned i16 value
365409
; FUNC-LABEL: {{^}}load_const_i16_aligned:
366410
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
367-
; SI: buffer_load_ushort
411+
; SI-NOHSA: buffer_load_ushort
412+
; CI-HSA: flat_load_ushort
368413
define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
369414
entry:
370415
%0 = load i16, i16 addrspace(2)* %in
@@ -376,7 +421,8 @@ entry:
376421
; Load an un-aligned i16 value
377422
; FUNC-LABEL: {{^}}load_const_i16_unaligned:
378423
; R600: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
379-
; SI: buffer_load_ushort
424+
; SI-NOHSA: buffer_load_ushort
425+
; CI-HSA: flat_load_ushort
380426
define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
381427
entry:
382428
%0 = getelementptr i16, i16 addrspace(2)* %in, i32 1

‎llvm/test/CodeGen/AMDGPU/salu-to-valu.ll

+151-118
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2-
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
1+
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
2+
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
3+
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
34

45
declare i32 @llvm.r600.read.tidig.x() #0
56
declare i32 @llvm.r600.read.tidig.y() #0
@@ -18,8 +19,10 @@ declare i32 @llvm.r600.read.tidig.y() #0
1819

1920
; Make sure we aren't using VGPRs for the srsrc operand of BUFFER_LOAD_*
2021
; instructions
21-
; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
22-
; GCN: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
22+
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
23+
; GCN-NOHSA: buffer_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
24+
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
25+
; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
2326

2427
define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
2528
entry:
@@ -50,8 +53,10 @@ done: ; preds = %loop
5053
; Test moving an SMRD instruction to the VALU
5154

5255
; GCN-LABEL: {{^}}smrd_valu:
56+
; FIXME: We should be using flat load for HSA.
5357
; GCN: buffer_load_dword [[OUT:v[0-9]+]]
54-
; GCN: buffer_store_dword [[OUT]]
58+
; GCN-NOHSA: buffer_store_dword [[OUT]]
59+
; GCN-HSA: flat_store_dword [[OUT]]
5560
define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
5661
entry:
5762
%tmp = icmp ne i32 %a, 0
@@ -77,8 +82,9 @@ endif: ; preds = %else, %if
7782
; Test moving an SMRD with an immediate offset to the VALU
7883

7984
; GCN-LABEL: {{^}}smrd_valu2:
80-
; GCN-NOT: v_add
81-
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
85+
; GCN-NOHSA-NOT: v_add
86+
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
87+
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
8288
define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
8389
entry:
8490
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -91,12 +97,14 @@ entry:
9197

9298
; Use a big offset that will use the SMRD literal offset on CI
9399
; GCN-LABEL: {{^}}smrd_valu_ci_offset:
94-
; GCN-NOT: v_add
95-
; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
96-
; GCN-NOT: v_add
97-
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
98-
; GCN: v_add_i32_e32
99-
; GCN: buffer_store_dword
100+
; GCN-NOHSA-NOT: v_add
101+
; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
102+
; GCN-NOHSA-NOT: v_add
103+
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
104+
; GCN-NOHSA: v_add_i32_e32
105+
; GCN-NOHSA: buffer_store_dword
106+
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
107+
; GCN-HSA: flat_store_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
100108
define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
101109
entry:
102110
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -109,13 +117,14 @@ entry:
109117
}
110118

111119
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2:
112-
; GCN-NOT: v_add
113-
; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
114-
; GCN-NOT: v_add
115-
; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
116-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
117-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
118-
; GCN: buffer_store_dwordx2
120+
; GCN-NOHSA-NOT: v_add
121+
; GCN-NOHSA: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
122+
; GCN-NOHSA-NOT: v_add
123+
; GCN-NOHSA: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
124+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
125+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
126+
; GCN-NOHSA: buffer_store_dwordx2
127+
; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
119128
define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
120129
entry:
121130
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -128,15 +137,16 @@ entry:
128137
}
129138

130139
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4:
131-
; GCN-NOT: v_add
132-
; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
133-
; GCN-NOT: v_add
134-
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
135-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
136-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
137-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
138-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
139-
; GCN: buffer_store_dwordx4
140+
; GCN-NOHSA-NOT: v_add
141+
; GCN-NOHSA: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
142+
; GCN-NOHSA-NOT: v_add
143+
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
144+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
145+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
146+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
147+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
148+
; GCN-NOHSA: buffer_store_dwordx4
149+
; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
140150
define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
141151
entry:
142152
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -152,25 +162,27 @@ entry:
152162
; CI.
153163

154164
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
155-
; GCN-NOT: v_add
156-
; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
157-
; GCN-NOT: v_add
158-
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
159-
; GCN-NOT: v_add
160-
; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
161-
; GCN-NOT: v_add
162-
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
163-
164-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
165-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
166-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
167-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
168-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
169-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
170-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
171-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
172-
; GCN: buffer_store_dwordx4
173-
; GCN: buffer_store_dwordx4
165+
; GCN-NOHSA-NOT: v_add
166+
; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
167+
; GCN-NOHSA-NOT: v_add
168+
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
169+
; GCN-NOHSA-NOT: v_add
170+
; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
171+
; GCN-NOHSA-NOT: v_add
172+
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
173+
174+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
175+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
176+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
177+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
178+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
179+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
180+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
181+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
182+
; GCN-NOHSA: buffer_store_dwordx4
183+
; GCN-NOHSA: buffer_store_dwordx4
184+
; GCN-HSA: flat_load_dwordx4
185+
; GCN-HSA: flat_load_dwordx4
174186
define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
175187
entry:
176188
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -184,35 +196,40 @@ entry:
184196

185197
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
186198

187-
; GCN-NOT: v_add
188-
; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
189-
; GCN-NOT: v_add
190-
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
191-
; GCN-NOT: v_add
192-
; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
193-
; GCN-NOT: v_add
194-
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
195-
; GCN-NOT: v_add
196-
; GCN: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
197-
; GCN-NOT: v_add
198-
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
199-
; GCN-NOT: v_add
200-
; GCN: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
201-
; GCN-NOT: v_add
202-
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
203-
204-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
205-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
206-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
207-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
208-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
209-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
210-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
211-
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
212-
; GCN: buffer_store_dwordx4
213-
; GCN: buffer_store_dwordx4
214-
; GCN: buffer_store_dwordx4
215-
; GCN: buffer_store_dwordx4
199+
; GCN-NOHSA-NOT: v_add
200+
; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
201+
; GCN-NOHSA-NOT: v_add
202+
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
203+
; GCN-NOHSA-NOT: v_add
204+
; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
205+
; GCN-NOHSA-NOT: v_add
206+
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
207+
; GCN-NOHSA-NOT: v_add
208+
; GCN-NOHSA: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
209+
; GCN-NOHSA-NOT: v_add
210+
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
211+
; GCN-NOHSA-NOT: v_add
212+
; GCN-NOHSA: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
213+
; GCN-NOHSA-NOT: v_add
214+
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
215+
216+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
217+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
218+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
219+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
220+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
221+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
222+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
223+
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
224+
; GCN-NOHSA: buffer_store_dwordx4
225+
; GCN-NOHSA: buffer_store_dwordx4
226+
; GCN-NOHSA: buffer_store_dwordx4
227+
; GCN-NOHSA: buffer_store_dwordx4
228+
229+
; GCN-HSA: flat_load_dwordx4
230+
; GCN-HSA: flat_load_dwordx4
231+
; GCN-HSA: flat_load_dwordx4
232+
; GCN-HSA: flat_load_dwordx4
216233

217234
; GCN: s_endpgm
218235
define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
@@ -227,9 +244,11 @@ entry:
227244
}
228245

229246
; GCN-LABEL: {{^}}smrd_valu2_salu_user:
230-
; GCN: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
247+
; GCN-NOHSA: buffer_load_dword [[MOVED:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
248+
; GCN-HSA: flat_load_dword [[MOVED:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
231249
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
232-
; GCN: buffer_store_dword [[ADD]]
250+
; GCN-NOHSA: buffer_store_dword [[ADD]]
251+
; GCN-HSA: flat_store_dword [[ADD]]
233252
define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
234253
entry:
235254
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -242,7 +261,8 @@ entry:
242261
}
243262

244263
; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
245-
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
264+
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
265+
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
246266
define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
247267
entry:
248268
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -254,8 +274,9 @@ entry:
254274
}
255275

256276
; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset:
257-
; GCN-NOT: v_add
258-
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
277+
; GCN-NOHSA-NOT: v_add
278+
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
279+
; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
259280
define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
260281
entry:
261282
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@@ -267,8 +288,10 @@ entry:
267288
}
268289

269290
; GCN-LABEL: {{^}}s_load_imm_v8i32:
270-
; GCN: buffer_load_dwordx4
271-
; GCN: buffer_load_dwordx4
291+
; GCN-NOHSA: buffer_load_dwordx4
292+
; GCN-NOHSA: buffer_load_dwordx4
293+
; GCN-HSA: flat_load_dwordx4
294+
; GCN-HSA: flat_load_dwordx4
272295
define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
273296
entry:
274297
%tmp0 = tail call i32 @llvm.r600.read.tidig.x()
@@ -280,16 +303,18 @@ entry:
280303
}
281304

282305
; GCN-LABEL: {{^}}s_load_imm_v8i32_salu_user:
283-
; GCN: buffer_load_dwordx4
284-
; GCN: buffer_load_dwordx4
285-
; GCN: v_add_i32_e32
286-
; GCN: v_add_i32_e32
287-
; GCN: v_add_i32_e32
288-
; GCN: v_add_i32_e32
289-
; GCN: v_add_i32_e32
290-
; GCN: v_add_i32_e32
291-
; GCN: v_add_i32_e32
292-
; GCN: buffer_store_dword
306+
; GCN-NOHSA: buffer_load_dwordx4
307+
; GCN-NOHSA: buffer_load_dwordx4
308+
; GCN-NOHSA: v_add_i32_e32
309+
; GCN-NOHSA: v_add_i32_e32
310+
; GCN-NOHSA: v_add_i32_e32
311+
; GCN-NOHSA: v_add_i32_e32
312+
; GCN-NOHSA: v_add_i32_e32
313+
; GCN-NOHSA: v_add_i32_e32
314+
; GCN-NOHSA: v_add_i32_e32
315+
; GCN-NOHSA: buffer_store_dword
316+
; GCN-HSA: flat_load_dwordx4
317+
; GCN-HSA: flat_load_dwordx4
293318
define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
294319
entry:
295320
%tmp0 = tail call i32 @llvm.r600.read.tidig.x()
@@ -319,10 +344,14 @@ entry:
319344
}
320345

321346
; GCN-LABEL: {{^}}s_load_imm_v16i32:
322-
; GCN: buffer_load_dwordx4
323-
; GCN: buffer_load_dwordx4
324-
; GCN: buffer_load_dwordx4
325-
; GCN: buffer_load_dwordx4
347+
; GCN-NOHSA: buffer_load_dwordx4
348+
; GCN-NOHSA: buffer_load_dwordx4
349+
; GCN-NOHSA: buffer_load_dwordx4
350+
; GCN-NOHSA: buffer_load_dwordx4
351+
; GCN-HSA: flat_load_dwordx4
352+
; GCN-HSA: flat_load_dwordx4
353+
; GCN-HSA: flat_load_dwordx4
354+
; GCN-HSA: flat_load_dwordx4
326355
define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
327356
entry:
328357
%tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1
@@ -334,26 +363,30 @@ entry:
334363
}
335364

336365
; GCN-LABEL: {{^}}s_load_imm_v16i32_salu_user:
337-
; GCN: buffer_load_dwordx4
338-
; GCN: buffer_load_dwordx4
339-
; GCN: buffer_load_dwordx4
340-
; GCN: buffer_load_dwordx4
341-
; GCN: v_add_i32_e32
342-
; GCN: v_add_i32_e32
343-
; GCN: v_add_i32_e32
344-
; GCN: v_add_i32_e32
345-
; GCN: v_add_i32_e32
346-
; GCN: v_add_i32_e32
347-
; GCN: v_add_i32_e32
348-
; GCN: v_add_i32_e32
349-
; GCN: v_add_i32_e32
350-
; GCN: v_add_i32_e32
351-
; GCN: v_add_i32_e32
352-
; GCN: v_add_i32_e32
353-
; GCN: v_add_i32_e32
354-
; GCN: v_add_i32_e32
355-
; GCN: v_add_i32_e32
356-
; GCN: buffer_store_dword
366+
; GCN-NOHSA: buffer_load_dwordx4
367+
; GCN-NOHSA: buffer_load_dwordx4
368+
; GCN-NOHSA: buffer_load_dwordx4
369+
; GCN-NOHSA: buffer_load_dwordx4
370+
; GCN-NOHSA: v_add_i32_e32
371+
; GCN-NOHSA: v_add_i32_e32
372+
; GCN-NOHSA: v_add_i32_e32
373+
; GCN-NOHSA: v_add_i32_e32
374+
; GCN-NOHSA: v_add_i32_e32
375+
; GCN-NOHSA: v_add_i32_e32
376+
; GCN-NOHSA: v_add_i32_e32
377+
; GCN-NOHSA: v_add_i32_e32
378+
; GCN-NOHSA: v_add_i32_e32
379+
; GCN-NOHSA: v_add_i32_e32
380+
; GCN-NOHSA: v_add_i32_e32
381+
; GCN-NOHSA: v_add_i32_e32
382+
; GCN-NOHSA: v_add_i32_e32
383+
; GCN-NOHSA: v_add_i32_e32
384+
; GCN-NOHSA: v_add_i32_e32
385+
; GCN-NOHSA: buffer_store_dword
386+
; GCN-HSA: flat_load_dwordx4
387+
; GCN-HSA: flat_load_dwordx4
388+
; GCN-HSA: flat_load_dwordx4
389+
; GCN-HSA: flat_load_dwordx4
357390
define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
358391
entry:
359392
%tmp0 = tail call i32 @llvm.r600.read.tidig.x() #1

0 commit comments

Comments
 (0)
Please sign in to comment.