Index: lib/Target/AMDGPU/SISchedule.td =================================================================== --- lib/Target/AMDGPU/SISchedule.td +++ lib/Target/AMDGPU/SISchedule.td @@ -39,24 +39,32 @@ // instructions and have VALU rates, but write to the SALU (i.e. VOPC // instructions) -def SIFullSpeedModel : SchedMachineModel { - let CompleteModel = 0; -} -def SIQuarterSpeedModel : SchedMachineModel { - let CompleteModel = 0; +class SISchedMachineModel : SchedMachineModel { + let CompleteModel = 0; + let IssueWidth = 1; } -// BufferSize = 0 means the processors are in-order. -let BufferSize = 0 in { +def SIFullSpeedModel : SISchedMachineModel; +def SIQuarterSpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? -def HWBranch : ProcResource<1>; -def HWExport : ProcResource<7>; // Taken from S_WAITCNT -def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT -def HWSALU : ProcResource<1>; -def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT -def HWVALU : ProcResource<1>; - +def HWBranch : ProcResource<1> { + let BufferSize = 1; +} +def HWExport : ProcResource<1> { + let BufferSize = 7; // Taken from S_WAITCNT +} +def HWLGKM : ProcResource<1> { + let BufferSize = 31; // Taken from S_WAITCNT +} +def HWSALU : ProcResource<1> { + let BufferSize = 1; +} +def HWVMEM : ProcResource<1> { + let BufferSize = 15; // Taken from S_WAITCNT +} +def HWVALU : ProcResource<1> { + let BufferSize = 1; } class HWWriteRes resources, @@ -74,12 +82,12 @@ // The latency values are 1 / (operations / cycle) / 4. multiclass SICommonWriteRes { - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // 2 - 64 - def : HWWriteRes; - def : HWWriteRes; // XXX: Guessed ??? - def : HWWriteRes; // 300 - 600 + def : HWWriteRes; + def : HWWriteRes; + def : HWWriteRes; // Can be between 2 and 64 + def : HWWriteRes; + def : HWWriteRes; + def : HWWriteRes; def : HWWriteRes; // XXX: Guessed ??? 
def : HWVALUWriteRes; Index: test/CodeGen/AMDGPU/ds_read2_offset_order.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -8,9 +8,9 @@ ; SI-LABEL: {{^}}offset_order: -; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}} -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:2 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14 +; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:4{{$}} +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44 define void @offset_order(float addrspace(1)* %out) { Index: test/CodeGen/AMDGPU/fceil64.ll =================================================================== --- test/CodeGen/AMDGPU/fceil64.ll +++ test/CodeGen/AMDGPU/fceil64.ll @@ -12,11 +12,11 @@ ; FUNC-LABEL: {{^}}fceil_f64: ; CI: v_ceil_f64_e32 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 0xfffffc01 -; SI: s_lshr_b64 -; SI: s_not_b64 -; SI: s_and_b64 +; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI-DAG: s_add_i32 [[A:s[0-9]+]], [[SEXP]], 0xfffffc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[A]] +; SI-DAG: s_not_b64 +; SI-DAG: s_and_b64 ; SI-DAG: cmp_gt_i32 ; SI-DAG: cndmask_b32 ; SI-DAG: cndmask_b32 Index: test/CodeGen/AMDGPU/ftrunc.f64.ll =================================================================== --- test/CodeGen/AMDGPU/ftrunc.f64.ll +++ test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -24,11 +24,11 @@ ; CI: v_trunc_f64_e32 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI: s_add_i32 s{{[0-9]+}}, [[SEXP]], 
0xfffffc01 -; SI: s_lshr_b64 -; SI: s_not_b64 -; SI: s_and_b64 +; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI-DAG: s_add_i32 [[A:s[0-9]+]], [[SEXP]], 0xfffffc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[A]] +; SI-DAG: s_not_b64 +; SI-DAG: s_and_b64 ; SI-DAG: cmp_gt_i32 ; SI-DAG: cndmask_b32 ; SI-DAG: cndmask_b32 Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll @@ -6,13 +6,13 @@ ; FUNC-LABEL: {{^}}rsq_clamped_f64: ; SI: v_rsq_clamp_f64_e32 -; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] +; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}] ; TODO: this constant should be folded: -; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 -; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff -; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] +; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 +; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff +; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff +; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] ; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff ; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] ; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -24,13 +24,13 @@ ; FUNC-LABEL: {{^}}rsq_clamp_f64: ; SI: v_rsq_clamp_f64_e32 -; VI: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[2:3] ; TODO: this constant should be folded: -; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 -; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff -; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] +; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1 +; VI-DAG: s_mov_b32 
s[[HIGH1:[0-9+]]], 0x7fefffff +; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff +; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]] +; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}] ; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff ; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]] ; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]] define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { Index: test/CodeGen/AMDGPU/llvm.memcpy.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.memcpy.ll +++ test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -6,77 +6,77 @@ ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 - -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 -; SI: ds_write_b8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8
+; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 + +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 + +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 + +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 +; SI-DAG: ds_read_u8 + +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 + +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 + +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 + +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 +; SI-DAG: ds_write_b8 ; SI: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { @@ -87,41 +87,41 @@ } ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align2: -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 - -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 -; SI: ds_read_u16 - -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: 
ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 - -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 -; SI: ds_write_b16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 + +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 +; SI-DAG: ds_read_u16 + +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 + +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 +; SI-DAG: ds_write_b16 ; SI: s_endpgm define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { Index: test/CodeGen/AMDGPU/local-memory-two-objects.ll =================================================================== --- test/CodeGen/AMDGPU/local-memory-two-objects.ll +++ test/CodeGen/AMDGPU/local-memory-two-objects.ll @@ -32,8 +32,8 @@ ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]] -; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 -; CI: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] +; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16 +; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR]] define void @local_memory_two_objects(i32 addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- 
test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -156,9 +156,11 @@ } ; FUNC-LABEL: @reorder_local_offsets +; FIXME: The scheduler doesn't think it's profitable to re-order the +; loads and stores, and I'm not sure that it really is. +; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 -; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:404 ; CI: buffer_store_dword Index: test/CodeGen/AMDGPU/udivrem.ll =================================================================== --- test/CodeGen/AMDGPU/udivrem.ll +++ test/CodeGen/AMDGPU/udivrem.ll @@ -107,50 +107,54 @@ ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -; SI-DAG: v_rcp_iflag_f32_e32 [[FIRST_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[FIRST_RCP_HI:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_mul_lo_i32 [[FIRST_RCP_LO:v[0-9]+]], [[FIRST_RCP]] -; SI-DAG: v_sub_i32_e32 [[FIRST_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[FIRST_RCP_LO]] +; For SI, we used to have checks for the input and output registers +; of the instructions, but these are way too fragile. The division for +; the two vector elements can be intermixed which makes it impossible to +; accurately check all the operands.
+; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_E:v[0-9]+]], {{v[0-9]+}}, [[FIRST_RCP]] -; SI-DAG: v_add_i32_e32 [[FIRST_RCP_A_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_RCP_S_E:v[0-9]+]], vcc, [[FIRST_E]], [[FIRST_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[FIRST_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[FIRST_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder:v[0-9]+]], vcc, [[FIRST_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[FIRST_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[FIRST_Quotient_A_One:v[0-9]+]], {{.*}}, [[FIRST_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[FIRST_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[FIRST_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[FIRST_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_rcp_iflag_f32_e32 [[SECOND_RCP:v[0-9]+]] -; SI-DAG: v_mul_hi_u32 [[SECOND_RCP_HI:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_mul_lo_i32 [[SECOND_RCP_LO:v[0-9]+]], [[SECOND_RCP]] -; SI-DAG: v_sub_i32_e32 [[SECOND_NEG_RCP_LO:v[0-9]+]], vcc, 0, [[SECOND_RCP_LO]] +; SI-DAG: v_rcp_iflag_f32_e32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_sub_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_E:v[0-9]+]], {{v[0-9]+}}, [[SECOND_RCP]] -; SI-DAG: v_add_i32_e32 [[SECOND_RCP_A_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]] -; SI-DAG: v_subrev_i32_e32 
[[SECOND_RCP_S_E:v[0-9]+]], vcc, [[SECOND_E]], [[SECOND_RCP]] +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_mul_hi_u32 [[SECOND_Quotient:v[0-9]+]] -; SI-DAG: v_mul_lo_i32 [[SECOND_Num_S_Remainder:v[0-9]+]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder:v[0-9]+]], vcc, [[SECOND_Num_S_Remainder]], v{{[0-9]+}} +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_and_b32_e32 [[SECOND_Tmp1:v[0-9]+]] -; SI-DAG: v_add_i32_e32 [[SECOND_Quotient_A_One:v[0-9]+]], {{.*}}, [[SECOND_Quotient]] -; SI-DAG: v_subrev_i32_e32 [[SECOND_Quotient_S_One:v[0-9]+]], +; SI-DAG: v_and_b32_e32 +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 -; SI-DAG: v_add_i32_e32 [[SECOND_Remainder_A_Den:v[0-9]+]], -; SI-DAG: v_subrev_i32_e32 [[SECOND_Remainder_S_Den:v[0-9]+]], +; SI-DAG: v_add_i32_e32 +; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm