
Commit 20d4795
committed Jun 29, 2018
[AMDGPU] Enable LICM in the BE pipeline
This allows the code that computes the reciprocal of a loop-invariant denominator in integer division to be hoisted out of the loop after the codegen-prepare expansion.

Differential Revision: https://reviews.llvm.org/D48604

llvm-svn: 335988
1 parent 3994baf commit 20d4795
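As a rough illustration of the intent (a minimal sketch modeled on the new idiv-licm.ll test added below; the function name and constants here are illustrative and not part of the commit): the codegen-prepare expansion of a 32-bit integer division builds the quotient from a floating-point reciprocal of the denominator. When the denominator is loop-invariant, that reciprocal sequence is loop-invariant too, so with LICM in the pipeline it can be hoisted into the preheader and v_rcp no longer appears inside the loop body.

; Illustrative IR only (not from the commit). %n is loop-invariant, so the
; reciprocal portion of the expanded udiv is expected to be hoisted out of
; %loop; the new test below checks that no v_rcp is emitted inside the loop.
define amdgpu_kernel void @div_invariant_denom(i32 addrspace(1)* %out, i32 %n) {
entry:
  br label %loop

loop:                                             ; preds = %loop, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %q = udiv i32 %i, %n
  %idx = zext i32 %i to i64
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx
  store i32 %q, i32 addrspace(1)* %gep, align 4
  %i.next = add nuw nsw i32 %i, 1
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %exit, label %loop

exit:                                             ; preds = %loop
  ret void
}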

9 files changed (+290, -32 lines)

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+1)

@@ -587,6 +587,7 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
 }
 
 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  addPass(createLICMPass());
   addPass(createSeparateConstOffsetFromGEPPass());
   addPass(createSpeculativeExecutionPass());
   // ReassociateGEPs exposes more opportunites for SLSR. See
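Note that createLICMPass() is the IR-level Loop Invariant Code Motion pass (not MachineLICM), and it is inserted at the head of the straight-line scalar optimizations, so it runs before SeparateConstOffsetFromGEP and SpeculativeExecution in the backend's IR pipeline. To confirm the placement, printing the pass pipeline with something like llc -march=amdgcn -O2 -debug-pass=Structure should now list Loop Invariant Code Motion among the backend IR passes (exact output depends on the build).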

llvm/test/CodeGen/AMDGPU/amdpal_scratch_mergedshader.ll (+1, -1)

@@ -24,7 +24,7 @@ define amdgpu_hs void @_amdgpu_hs_main(i32 inreg %arg, i32 inreg %arg1, i32 inre
 
 .endls: ; preds = %.beginls, %.entry
   %.fca.2.gep120.i = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>] addrspace(5)* %__llpc_global_proxy_7.i, i64 0, i64 2
-  store <4 x float> <float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01>, <4 x float> addrspace(5)* %.fca.2.gep120.i, align 16
+  store volatile <4 x float> <float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01>, <4 x float> addrspace(5)* %.fca.2.gep120.i, align 16
   br label %bb
 
 bb: ; preds = %bb, %.endls
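(Here, and in several of the test updates below, a plain store becomes volatile, presumably so that the newly enabled LICM cannot hoist or sink the store and defeat what the test was written to check.)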

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+1, -1)

@@ -210,7 +210,7 @@ bb.end: ; preds = %bb.then, %bb
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
 
 ; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: s_and_b64 exec, exec, vcc
+; GCN: s_and_b64 exec, exec, {{vcc|s\[[0-9:]+\]}}
 
 ; GCN-NOT: s_or_b64 exec, exec
 

llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll (+7, -5)

@@ -7,11 +7,13 @@
 ; only contain the lanes that were active during the last loop iteration.
 ;
 ; SI: ; %for.body
-; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
-; SI-NOT: [[VREG]]
-; SI: ; %for.end
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
+; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]
+; SI: [[ENDIF]]:
+; SI-NOT: [[VREG]]
+; SI: ; %for.end
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
 entry:
   br label %for.body

llvm/test/CodeGen/AMDGPU/idiv-licm.ll (new file, +249)

@@ -0,0 +1,249 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}udiv32_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
+; GCN: v_cvt_u32_f32_e32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_sub_i32_e32
+; GCN-DAG: v_cmp_eq_u32_e64
+; GCN-DAG: v_cndmask_b32_e64
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_add_i32_e32
+; GCN-DAG: v_subrev_i32_e32
+; GCN-DAG: v_cndmask_b32_e64
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  br label %bb3
+
+bb2: ; preds = %bb3
+  ret void
+
+bb3: ; preds = %bb3, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = udiv i32 %tmp, %arg1
+  %tmp5 = zext i32 %tmp to i64
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
+  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = add nuw nsw i32 %tmp, 1
+  %tmp8 = icmp eq i32 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}urem32_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
+; GCN: v_cvt_u32_f32_e32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_sub_i32_e32
+; GCN-DAG: v_cmp_eq_u32_e64
+; GCN-DAG: v_cndmask_b32_e64
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_add_i32_e32
+; GCN-DAG: v_subrev_i32_e32
+; GCN-DAG: v_cndmask_b32_e64
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  br label %bb3
+
+bb2: ; preds = %bb3
+  ret void
+
+bb3: ; preds = %bb3, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = urem i32 %tmp, %arg1
+  %tmp5 = zext i32 %tmp to i64
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
+  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = add nuw nsw i32 %tmp, 1
+  %tmp8 = icmp eq i32 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}sdiv32_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
+; GCN: v_cvt_u32_f32_e32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_sub_i32_e32
+; GCN-DAG: v_cmp_eq_u32_e64
+; GCN-DAG: v_cndmask_b32_e64
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_add_i32_e32
+; GCN-DAG: v_subrev_i32_e32
+; GCN-DAG: v_cndmask_b32_e64
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  br label %bb3
+
+bb2: ; preds = %bb3
+  ret void
+
+bb3: ; preds = %bb3, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = sdiv i32 %tmp, %arg1
+  %tmp5 = zext i32 %tmp to i64
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
+  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = add nuw nsw i32 %tmp, 1
+  %tmp8 = icmp eq i32 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}srem32_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
+; GCN: v_cvt_u32_f32_e32
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_mul_lo_i32
+; GCN-DAG: v_sub_i32_e32
+; GCN-DAG: v_cmp_eq_u32_e64
+; GCN-DAG: v_cndmask_b32_e64
+; GCN-DAG: v_mul_hi_u32
+; GCN-DAG: v_add_i32_e32
+; GCN-DAG: v_subrev_i32_e32
+; GCN-DAG: v_cndmask_b32_e64
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
+bb:
+  br label %bb3
+
+bb2: ; preds = %bb3
+  ret void
+
+bb3: ; preds = %bb3, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = srem i32 %tmp, %arg1
+  %tmp5 = zext i32 %tmp to i64
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp5
+  store i32 %tmp4, i32 addrspace(1)* %tmp6, align 4
+  %tmp7 = add nuw nsw i32 %tmp, 1
+  %tmp8 = icmp eq i32 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}udiv16_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+bb:
+  br label %bb3
+
+bb2: ; preds = %bb3
+  ret void
+
+bb3: ; preds = %bb3, %bb
+  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = udiv i16 %tmp, %arg1
+  %tmp5 = zext i16 %tmp to i64
+  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
+  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp7 = add nuw nsw i16 %tmp, 1
+  %tmp8 = icmp eq i16 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}urem16_invariant_denom:
+; GCN: v_cvt_f32_u32
+; GCN: v_rcp_iflag_f32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+bb:
+  br label %bb3
+
+bb2: ; preds = %bb3
+  ret void
+
+bb3: ; preds = %bb3, %bb
+  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = urem i16 %tmp, %arg1
+  %tmp5 = zext i16 %tmp to i64
+  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
+  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp7 = add nuw nsw i16 %tmp, 1
+  %tmp8 = icmp eq i16 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}sdiv16_invariant_denom:
+; GCN-DAG: s_sext_i32_i16
+; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff
+; GCN-DAG: v_cvt_f32_i32
+; GCN-DAG: v_rcp_iflag_f32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+bb:
+  br label %bb3
+
+bb2: ; preds = %bb3
+  ret void
+
+bb3: ; preds = %bb3, %bb
+  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = sdiv i16 %tmp, %arg1
+  %tmp5 = zext i16 %tmp to i64
+  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
+  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp7 = add nuw nsw i16 %tmp, 1
+  %tmp8 = icmp eq i16 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}
+
+; GCN-LABEL: {{^}}srem16_invariant_denom:
+; GCN-DAG: s_sext_i32_i16
+; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff
+; GCN-DAG: v_cvt_f32_i32
+; GCN-DAG: v_rcp_iflag_f32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_rcp
+; GCN: s_cbranch_scc0 [[LOOP]]
+; GCN: s_endpgm
+define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
+bb:
+  br label %bb3
+
+bb2: ; preds = %bb3
+  ret void
+
+bb3: ; preds = %bb3, %bb
+  %tmp = phi i16 [ 0, %bb ], [ %tmp7, %bb3 ]
+  %tmp4 = srem i16 %tmp, %arg1
+  %tmp5 = zext i16 %tmp to i64
+  %tmp6 = getelementptr inbounds i16, i16 addrspace(1)* %arg, i64 %tmp5
+  store i16 %tmp4, i16 addrspace(1)* %tmp6, align 2
+  %tmp7 = add nuw nsw i16 %tmp, 1
+  %tmp8 = icmp eq i16 %tmp7, 1024
+  br i1 %tmp8, label %bb2, label %bb3
+}

llvm/test/CodeGen/AMDGPU/infinite-loop.ll (+9, -10)

@@ -12,7 +12,7 @@ entry:
   br label %loop
 
 loop:
-  store i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, i32 addrspace(1)* %out, align 4
   br label %loop
 }
 
@@ -21,7 +21,7 @@ loop:
 ; IR: br i1 %cond, label %loop, label %UnifiedReturnBlock
 
 ; IR: loop:
-; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
 ; IR: br i1 true, label %loop, label %UnifiedReturnBlock
 
 ; IR: UnifiedReturnBlock:
@@ -47,7 +47,7 @@ entry:
   br i1 %cond, label %loop, label %return
 
 loop:
-  store i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, i32 addrspace(1)* %out, align 4
   br label %loop
 
 return:
@@ -59,11 +59,11 @@ return:
 ; IR: br i1 undef, label %loop1, label %loop2
 
 ; IR: loop1:
-; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
 ; IR: br i1 true, label %loop1, label %DummyReturnBlock
 
 ; IR: loop2:
-; IR: store i32 888, i32 addrspace(1)* %out, align 4
+; IR: store volatile i32 888, i32 addrspace(1)* %out, align 4
 ; IR: br i1 true, label %loop2, label %DummyReturnBlock
 
 ; IR: DummyReturnBlock:
@@ -96,11 +96,11 @@ entry:
   br i1 undef, label %loop1, label %loop2
 
 loop1:
-  store i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, i32 addrspace(1)* %out, align 4
   br label %loop1
 
 loop2:
-  store i32 888, i32 addrspace(1)* %out, align 4
+  store volatile i32 888, i32 addrspace(1)* %out, align 4
   br label %loop2
 }
 
@@ -113,7 +113,7 @@ loop2:
 ; IR: br label %inner_loop
 
 ; IR: inner_loop:
-; IR: store i32 999, i32 addrspace(1)* %out, align 4
+; IR: store volatile i32 999, i32 addrspace(1)* %out, align 4
 ; IR: %cond3 = icmp eq i32 %tmp, 3
 ; IR: br i1 true, label %TransitionBlock, label %UnifiedReturnBlock
 
@@ -132,7 +132,6 @@ loop2:
 ; SI: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %inner_loop
 ; SI: s_waitcnt expcnt(0)
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
-; SI: v_cmp_ne_u32_e32
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: buffer_store_dword [[REG]]
 
@@ -156,7 +155,7 @@ outer_loop:
   br label %inner_loop
 
 inner_loop: ; preds = %LeafBlock, %LeafBlock1
-  store i32 999, i32 addrspace(1)* %out, align 4
+  store volatile i32 999, i32 addrspace(1)* %out, align 4
   %cond3 = icmp eq i32 %tmp, 3
   br i1 %cond3, label %inner_loop, label %outer_loop
 

llvm/test/CodeGen/AMDGPU/multilevel-break.ll (+1, -1)

@@ -36,7 +36,7 @@
 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
 ; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
 
-; GCN: ; %bb.{{[0-9]+}}: ; %Flow1{{$}}
+; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
 ; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
 
 ; Ensure copy is eliminated

llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll (+20, -14)

@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
 
@@ -89,17 +89,24 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 
 ; This broke the old AMDIL cfg structurizer
 ; FUNC-LABEL: {{^}}loop_land_info_assert:
-; SI: s_cmp_lt_i32
-; SI-NEXT: s_cbranch_scc1 [[ENDPGM:BB[0-9]+_[0-9]+]]
-
-; SI: s_cmpk_lt_i32
-; SI-NEXT: s_cbranch_scc0 [[ENDPGM]]
-
-; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]
-; SI: s_cbranch_vccnz [[INFLOOP]]
-
-; SI: [[ENDPGM]]:
-; SI: s_endpgm
+; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
+; SI: s_and_b64 vcc, exec, [[CMP4]]
+; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]
+; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]
+; SI-NEXT: BB{{[0-9_]+}}:
+; SI-NEXT: buffer_store_dword
+
+; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:
+
+; SI: [[BR1]]:
+; SI-NEXT: s_and_b64 vcc, exec,
+; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
+; SI: s_branch [[INFLOOP]]
+; SI-NEXT: [[BR2]]:
+; SI: s_cbranch_vccz [[ENDPGM]]
+
+; SI: [[ENDPGM]]:
+; SI-NEXT: s_endpgm
 define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
 entry:
   %cmp = icmp sgt i32 %c0, 0
@@ -144,7 +151,6 @@ return:
   ret void
 }
 
-
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
 
 attributes #0 = { nounwind readnone }

llvm/test/CodeGen/AMDGPU/smrd.ll (+1)

@@ -328,6 +328,7 @@ ret_block: ; preds = %.outer, %.label22, %
 .inner_loop_body:
   %descriptor = load <4 x i32>, <4 x i32> addrspace(4)* %descptr, align 16, !invariant.load !0
   %load1result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 0)
+  store float %load1result, float addrspace(1)* undef
   %inner_br2 = icmp uge i32 %1, 10
   br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body
 
