Skip to content

Commit 5fa289f

Browse files
committed May 22, 2017
[AMDGPU] Narrow lshl from 64 to 32 bit if possible
Turn an expensive 64-bit shift into a 32-bit shift if the shift does not overflow int: shl (ext x) => zext (shl x).
Differential Revision: https://reviews.llvm.org/D33367
llvm-svn: 303569
1 parent 80cb549 commit 5fa289f

14 files changed

+105
-37
lines changed
 

‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

+33-11
Original file line numberDiff line numberDiff line change
@@ -2595,27 +2595,49 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
25952595

25962596
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
25972597
DAGCombinerInfo &DCI) const {
2598-
if (N->getValueType(0) != MVT::i64)
2598+
EVT VT = N->getValueType(0);
2599+
if (VT != MVT::i64)
25992600
return SDValue();
26002601

2601-
// i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2602-
2603-
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
2604-
// common case, splitting this into a move and a 32-bit shift is faster and
2605-
// the same code size.
2606-
const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2602+
ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
26072603
if (!RHS)
26082604
return SDValue();
26092605

2610-
unsigned RHSVal = RHS->getZExtValue();
2611-
if (RHSVal < 32)
2612-
return SDValue();
2613-
26142606
SDValue LHS = N->getOperand(0);
2607+
unsigned RHSVal = RHS->getZExtValue();
2608+
if (!RHSVal)
2609+
return LHS;
26152610

26162611
SDLoc SL(N);
26172612
SelectionDAG &DAG = DCI.DAG;
26182613

2614+
switch (LHS->getOpcode()) {
2615+
default:
2616+
break;
2617+
case ISD::ZERO_EXTEND:
2618+
case ISD::SIGN_EXTEND:
2619+
case ISD::ANY_EXTEND: {
2620+
// shl (ext x) => zext (shl x), if shift does not overflow int
2621+
KnownBits Known;
2622+
SDValue X = LHS->getOperand(0);
2623+
DAG.computeKnownBits(X, Known);
2624+
unsigned LZ = Known.countMinLeadingZeros();
2625+
if (LZ < RHSVal)
2626+
break;
2627+
EVT XVT = X.getValueType();
2628+
SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
2629+
return DAG.getZExtOrTrunc(Shl, SL, VT);
2630+
}
2631+
}
2632+
2633+
// i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2634+
2635+
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
2636+
// common case, splitting this into a move and a 32-bit shift is faster and
2637+
// the same code size.
2638+
if (RHSVal < 32)
2639+
return SDValue();
2640+
26192641
SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
26202642

26212643
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);

‎llvm/test/CodeGen/AMDGPU/add.i16.ll

+1-2
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,10 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
8484

8585
; FIXME: Need to handle non-uniform case for function below (load without gep).
8686
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
87-
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
8887
; VI: flat_load_ushort [[A:v[0-9]+]]
8988
; VI: flat_load_ushort [[B:v[0-9]+]]
9089
; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
91-
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
90+
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
9291
define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
9392
%tid = call i32 @llvm.amdgcn.workitem.id.x()
9493
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid

‎llvm/test/CodeGen/AMDGPU/add.v2i16.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -202,10 +202,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
202202
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
203203
; VI: flat_load_ushort v[[B_HI:[0-9]+]]
204204

205-
; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
206-
; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
207-
; VI: v_add_u16_e32
208-
; VI: v_add_u16_e32
205+
; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
206+
; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
207+
; VI-DAG: v_add_u16_e32
208+
; VI-DAG: v_add_u16_e32
209209

210210
; VI: buffer_store_dwordx4
211211
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {

‎llvm/test/CodeGen/AMDGPU/bfe-patterns.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
5050
; GCN-LABEL: {{^}}s_ubfe_sub_i32:
5151
; GCN: s_load_dword [[SRC:s[0-9]+]]
5252
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
53-
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
53+
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
5454
; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
5555
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
5656
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -128,7 +128,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
128128
; GCN-LABEL: {{^}}s_sbfe_sub_i32:
129129
; GCN: s_load_dword [[SRC:s[0-9]+]]
130130
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
131-
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
131+
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
132132
; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
133133
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
134134
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()

‎llvm/test/CodeGen/AMDGPU/ctlz.ll

+1-2
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
135135
}
136136

137137
; FUNC-LABEL: {{^}}v_ctlz_i64:
138-
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
139138
; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
140139
; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]]
141140
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]]
@@ -145,7 +144,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
145144
; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]]
146145
; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
147146
; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
148-
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
147+
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}}
149148
define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
150149
%tid = call i32 @llvm.r600.read.tidig.x()
151150
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid

‎llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll

+1-2
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
121121
; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]]
122122
; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]]
123123
; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
124-
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
125-
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
124+
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}}
126125
define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
127126
%tid = call i32 @llvm.r600.read.tidig.x()
128127
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid

‎llvm/test/CodeGen/AMDGPU/ds_write2.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -266,8 +266,8 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)*
266266
}
267267

268268
; SI-LABEL: @simple_write2_one_val_f64
269-
; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
270-
; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
269+
; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]],
270+
; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
271271
; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
272272
; SI: s_endpgm
273273
define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {

‎llvm/test/CodeGen/AMDGPU/fmed3.ll

+4-4
Original file line numberDiff line numberDiff line change
@@ -845,10 +845,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(
845845
; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
846846
; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
847847
; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
848-
; GCN: v_min_f32
849-
; GCN: v_max_f32
850-
; GCN: v_min_f32
851-
; GCN: v_max_f32
848+
; GCN-DAG: v_min_f32
849+
; GCN-DAG: v_max_f32
850+
; GCN-DAG: v_min_f32
851+
; GCN-DAG: v_max_f32
852852
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
853853
%tid = call i32 @llvm.amdgcn.workitem.id.x()
854854
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll

+2
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)*
356356

357357
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
358358
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
359+
; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
359360
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
360361
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
361362
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
@@ -371,6 +372,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
371372

372373
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
373374
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
375+
; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
374376
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
375377
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
376378
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}

‎llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll

+2
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)*
207207

208208
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
209209
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
210+
; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
210211
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
211212
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
212213
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
@@ -222,6 +223,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
222223

223224
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
224225
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
226+
; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
225227
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
226228
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
227229
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
; RUN: llc -march=amdgcn < %s | FileCheck %s
2+
3+
; CHECK-LABEL: {{^}}zext_shl64_to_32:
4+
; CHECK: s_lshl_b32
5+
; CHECK-NOT: s_lshl_b64
6+
define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
7+
%and = and i32 %x, 1073741823
8+
%ext = zext i32 %and to i64
9+
%shl = shl i64 %ext, 2
10+
store i64 %shl, i64 addrspace(1)* %out, align 4
11+
ret void
12+
}
13+
14+
; CHECK-LABEL: {{^}}sext_shl64_to_32:
15+
; CHECK: s_lshl_b32
16+
; CHECK-NOT: s_lshl_b64
17+
define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) {
18+
%and = and i32 %x, 536870911
19+
%ext = sext i32 %and to i64
20+
%shl = shl i64 %ext, 2
21+
store i64 %shl, i64 addrspace(1)* %out, align 4
22+
ret void
23+
}
24+
25+
; CHECK-LABEL: {{^}}zext_shl64_overflow:
26+
; CHECK: s_lshl_b64
27+
; CHECK-NOT: s_lshl_b32
28+
define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
29+
%and = and i32 %x, 2147483647
30+
%ext = zext i32 %and to i64
31+
%shl = shl i64 %ext, 2
32+
store i64 %shl, i64 addrspace(1)* %out, align 4
33+
ret void
34+
}
35+
36+
; CHECK-LABEL: {{^}}sext_shl64_overflow:
37+
; CHECK: s_lshl_b64
38+
; CHECK-NOT: s_lshl_b32
39+
define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) {
40+
%and = and i32 %x, 2147483647
41+
%ext = sext i32 %and to i64
42+
%shl = shl i64 %ext, 2
43+
store i64 %shl, i64 addrspace(1)* %out, align 4
44+
ret void
45+
}

‎llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll

+5-6
Original file line numberDiff line numberDiff line change
@@ -299,10 +299,10 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)*
299299
}
300300

301301
; GCN-LABEL: {{^}}and_not_mask_i64:
302-
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
303-
; GCN: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
302+
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
303+
; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}}
304304
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
305-
; GCN-DAG: v_and_b32_e32 v[[SHRLO]], 4, [[SHR]]
305+
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
306306
; GCN-NOT: v[[SHRLO]]
307307
; GCN-NOT: v[[SHRHI]]
308308
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
@@ -360,10 +360,9 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspac
360360
}
361361

362362
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
363-
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
363+
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
364364
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
365-
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
366-
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
365+
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
367366
; GCN: buffer_store_dword v[[ZERO]]
368367
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
369368
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()

‎llvm/test/CodeGen/AMDGPU/srl.ll

+2-1
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,8 @@ define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
201201

202202
; GCN-LABEL: {{^}}v_lshr_32_i64:
203203
; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
204-
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
204+
; GCN-DAG: v_mov_b32_e32 v[[VHI1:[0-9]+]], 0{{$}}
205+
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], v[[VHI1]]{{$}}
205206
; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
206207
define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
207208
%tid = call i32 @llvm.r600.read.tidig.x() #0

‎llvm/test/CodeGen/AMDGPU/sub.i16.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,9 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
8585

8686
; FIXME: Need to handle non-uniform case for function below (load without gep).
8787
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
88-
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
8988
; VI: flat_load_ushort [[A:v[0-9]+]]
9089
; VI: flat_load_ushort [[B:v[0-9]+]]
90+
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
9191
; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
9292
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
9393
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {

0 commit comments

Comments
 (0)
Please sign in to comment.