diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -34,6 +34,7 @@ switch (SecondMI.getOpcode()) { case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: + case AMDGPU::V_SUBBREV_U32_e64: case AMDGPU::V_CNDMASK_B32_e64: { // Try to cluster defs of condition registers to their uses. This improves // the chance VCC will be available which will allow shrinking to VOP2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -5741,7 +5741,7 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc @@ -5752,7 +5752,7 @@ ; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] ; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_movk_i32 s5, 0x11e ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -5760,7 +5760,8 @@ ; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 -; GCN-NEXT: s_mov_b32 s9, s5 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 @@ -5796,35 +5797,34 @@ ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 -; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] +; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 +; GCN-NEXT: s_mov_b32 s6, 0x9761f7c8 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: s_movk_i32 s2, 0x11e -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v2 -; GCN-NEXT: s_mov_b32 s3, 0x9761f7c8 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s12, v4 -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s2, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s3, v0 +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm @@ -7018,29 +7018,29 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 -; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] +; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s12, v4 -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_xor_b32_e32 v0, s14, v0 ; GCN-NEXT: v_xor_b32_e32 v1, s14, v1 @@ -7225,39 +7225,39 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0 -; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v2 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v5 -; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v2 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v8, s[0:1], s16, v5 +; GCN-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v5 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GCN-NEXT: s_ashr_i32 s2, s15, 31 -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GCN-NEXT: s_add_u32 s8, s14, s2 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v7, s9 ; GCN-NEXT: s_mov_b32 s3, s2 ; GCN-NEXT: s_addc_u32 s9, s15, s2 ; GCN-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] -; GCN-NEXT: v_cvt_f32_u32_e32 v7, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v8, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v9, s9 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_mac_f32_e32 v7, s18, v9 -; GCN-NEXT: v_rcp_f32_e32 v7, v7 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GCN-NEXT: v_mac_f32_e32 v8, s18, v9 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 +; GCN-NEXT: v_rcp_f32_e32 v8, v8 ; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_mul_f32_e32 v3, s19, v7 +; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] +; GCN-NEXT: v_mul_f32_e32 v3, s19, v8 ; GCN-NEXT: v_mul_f32_e32 v5, s20, v3 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 ; GCN-NEXT: v_mac_f32_e32 v3, s21, v5 @@ -7347,29 +7347,29 @@ ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc ; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 -; GCN-NEXT: v_subb_u32_e64 v5, s[2:3], v4, v5, s[0:1] +; GCN-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 +; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 +; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 -; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v8, s[0:1], s8, v6 -; GCN-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GCN-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 -; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v8, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v2, s14, v2 ; GCN-NEXT: v_xor_b32_e32 v3, s14, v3 diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -434,20 +434,20 @@ ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v9, v5, vcc ; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v4, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v10, s[6:7], v8, v5, s[4:5] -; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v8, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v5 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v5 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] ; GFX9-NEXT: v_sub_co_u32_e64 v12, s[4:5], v9, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3 -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[4:5], 0, v10, s[4:5] +; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc @@ -455,7 +455,7 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v12, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, v3, v7 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v7 @@ -592,22 +592,22 @@ ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v3, vcc ; GFX9-NEXT: v_sub_co_u32_e64 v7, s[4:5], v4, v2 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[6:7], v6, v3, s[4:5] -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[6:7], 0, v6, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[4:5], v6, v3, s[4:5] ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] ; GFX9-NEXT: v_sub_co_u32_e64 v10, s[4:5], v7, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5] +; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc @@ -809,7 +809,7 @@ ; GFX9-NEXT: ; implicit-def: $vgpr5_vgpr6 ; GFX9-NEXT: ; implicit-def: $vgpr3_vgpr4 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GFX9-NEXT: s_xor_b64 s[10:11], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz BB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v7 @@ -903,48 +903,48 @@ ; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, v9, v13 ; GFX9-NEXT: v_subb_co_u32_e64 v11, s[4:5], v11, v5, vcc ; GFX9-NEXT: v_sub_co_u32_e64 v12, s[4:5], v9, v6 -; GFX9-NEXT: v_subb_co_u32_e64 v13, s[6:7], v11, v5, s[4:5] -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[4:5], 0, v11, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v15, s[4:5], 2, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v16, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[6:7], 0, v11, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v15, s[6:7], 2, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[6:7], 0, v7, s[6:7] ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v10, vcc -; GFX9-NEXT: v_add_co_u32_e64 v17, s[4:5], 1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v18, s[4:5], 0, v7, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v17, s[6:7], 1, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v18, s[6:7], 0, v7, s[6:7] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v14 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, v16, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v16, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v17, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX9-NEXT: v_xor_b32_e32 v7, v8, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, v4, v7 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v7 -; GFX9-NEXT: v_sub_co_u32_e64 v3, s[6:7], v3, v7 -; GFX9-NEXT: v_subb_co_u32_e64 v4, s[6:7], v5, v7, s[6:7] -; GFX9-NEXT: v_sub_co_u32_e64 v5, s[6:7], v12, v6 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[6:7], 0, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v5, s[4:5] +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[4:5], v11, v5, s[4:5] +; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v12, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v6, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v15, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v11, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_xor_b32_e32 v10, v8, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v14, vcc +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v10 ; GFX9-NEXT: v_xor_b32_e32 v5, v5, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, v7, v10 +; GFX9-NEXT: v_sub_co_u32_e64 v3, s[8:9], v3, v10 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v8 ; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v8 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[8:9], v7, v10, s[8:9] ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v8, vcc ; GFX9-NEXT: BB8_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[6:7], s[8:9] +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], s[10:11] ; GFX9-NEXT: s_xor_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_cbranch_execz BB8_4 ; GFX9-NEXT: ; %bb.3: @@ -1085,35 +1085,35 @@ ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v9 ; GFX9-NEXT: v_subb_co_u32_e64 v7, s[4:5], v7, v3, vcc ; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v8, v2 -; GFX9-NEXT: v_subb_co_u32_e64 v10, s[6:7], v7, v3, s[4:5] -; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v12, s[4:5], 2, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[6:7], 0, v7, s[4:5] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[6:7], 2, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[6:7], 0, v5, s[6:7] ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GFX9-NEXT: v_add_co_u32_e64 v14, s[4:5], 1, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[4:5], 0, v5, s[4:5] +; GFX9-NEXT: v_add_co_u32_e64 v14, s[6:7], 1, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[6:7], 0, v5, s[6:7] ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v11 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v15, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v15, v13, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v13, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v14, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v3, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v3, s[6:7], v9, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[6:7], 0, v10, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v14, v12, s[6:7] +; GFX9-NEXT: v_subb_co_u32_e64 v3, s[4:5], v7, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_sub_co_u32_e64 v6, s[4:5], v9, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[4:5], 0, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v6, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v1, vcc ; GFX9-NEXT: BB9_2: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -97,29 +97,29 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 -; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] +; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s12, v4 -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -331,20 +331,20 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v3, vcc ; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2 -; GCN-NEXT: v_subb_u32_e64 v8, s[6:7], v4, v3, s[4:5] -; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 +; GCN-NEXT: v_subbrev_u32_e64 v8, s[6:7], 0, v4, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v3 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 -; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v2 +; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v8, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] ; GCN-NEXT: v_sub_i32_e64 v10, s[4:5], v7, v2 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GCN-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] +; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc @@ -352,7 +352,7 @@ ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v10, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v0, v0, v6 ; GCN-NEXT: v_xor_b32_e32 v1, v1, v6 @@ -987,29 +987,29 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s14, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 -; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] +; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s12, v4 -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v5, s15 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 ; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 @@ -1410,28 +1410,28 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 -; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v4 +; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] +; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -1623,27 +1623,27 @@ ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc ; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0 -; GCN-NEXT: v_subb_u32_e64 v6, s[6:7], v4, v1, s[4:5] -; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 +; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v1 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v5, v0 +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v1 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5] ; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7] +; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 +; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 -; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GCN-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v6, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1834,27 +1834,27 @@ ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc ; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0 -; GCN-NEXT: v_subb_u32_e64 v6, s[6:7], v4, v1, s[4:5] -; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 +; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v1 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v5, v0 +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v1 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5] ; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7] +; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 +; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 -; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GCN-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v6, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll --- a/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll @@ -4,9 +4,9 @@ ; ; GCN-LABEL: sub_zext_zext: ; GCN: ds_read_b32 [[VAL:v[0-9]+]], -; GCN-DAG: v_cmp_lt_f32{{.*}} [[CC1:s\[[0-9]+:[0-9]+\]]], 0, [[VAL]] -; GCN-DAG: v_cmp_gt_f32{{.*}} vcc, 0, [[VAL]] -; GCN: v_cndmask_{{.*}} [[ZEXTCC1:v[0-9]+]], 0, 1, [[CC1]] +; GCN: v_cmp_lt_f32{{.*}} vcc, 0, [[VAL]] +; GCN: v_cndmask_{{.*}} [[ZEXTCC1:v[0-9]+]], 0, 1, vcc +; GCN: v_cmp_gt_f32{{.*}} vcc, 0, [[VAL]] ; GCN: v_subbrev{{.*}} {{v[0-9]+}}, vcc, 0, [[ZEXTCC1]], vcc ; ; Before the reversion that this test is attached to, the compiler commuted diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -1585,18 +1585,18 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v10, 24, v8 ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GCN-IR-NEXT: v_add_i32_e64 v8, s[0:1], 1, v4 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e64 v9, s[0:1], 0, v5, s[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[8:9], v[4:5] +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[0:1], 0, v7, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB11_5 ; GCN-IR-NEXT: BB11_6: ; %udiv-loop-exit @@ -1757,25 +1757,25 @@ ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_or_b32_e32 v6, v7, v4 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 23, v6 -; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_add_i32_e64 v9, s[4:5], 1, v0 +; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v2 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v7, 31, v4 ; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3 -; GCN-IR-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v1, s[4:5] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], v[9:10], v[0:1] +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7 ; GCN-IR-NEXT: v_and_b32_e32 v7, 24, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v6, v7 +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 -; GCN-IR-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc -; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -97,29 +97,29 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 -; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] +; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s12, v4 -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v5, s11 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -321,27 +321,27 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v6, v3, vcc ; GCN-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2 -; GCN-NEXT: v_subb_u32_e64 v7, s[6:7], v4, v3, s[4:5] -; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] +; GCN-NEXT: v_subbrev_u32_e64 v7, s[6:7], 0, v4, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 -; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v2 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v3 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v3, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[6:7] +; GCN-NEXT: v_sub_i32_e64 v9, s[4:5], v6, v2 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] -; GCN-NEXT: v_sub_i32_e64 v9, s[4:5], v6, v2 +; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 ; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v8 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_subbrev_u32_e64 v2, s[4:5], 0, v7, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[6:7] -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -832,28 +832,28 @@ ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s6, v0 -; GCN-NEXT: v_subb_u32_e64 v3, s[2:3], v2, v3, s[0:1] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s6, v4 +; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s7, v5 +; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s6, v4 +; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s6, v4 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s7, v5 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] +; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v6, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm @@ -1115,18 +1115,18 @@ ; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_or_b32_e32 v0, v8, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 +; GCN-IR-NEXT: v_and_b32_e32 v10, 24, v8 ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v8, 24, v8 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GCN-IR-NEXT: v_add_i32_e64 v8, s[0:1], 1, v4 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 -; GCN-IR-NEXT: v_addc_u32_e64 v9, s[0:1], 0, v5, s[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[0:1], v[8:9], v[4:5] +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v4, v8 -; GCN-IR-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[0:1], v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v3 -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-IR-NEXT: v_subbrev_u32_e64 v7, s[0:1], 0, v7, s[0:1] +; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v8, v2 ; GCN-IR-NEXT: s_cbranch_vccz BB7_5 ; GCN-IR-NEXT: BB7_6: ; %udiv-loop-exit @@ -1232,27 +1232,27 @@ ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc ; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0 -; GCN-NEXT: v_subb_u32_e64 v6, s[6:7], v4, v1, s[4:5] -; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 +; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v6, v1 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v5, v0 +; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], v6, v1 +; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, s[4:5] ; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[6:7] +; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 +; GCN-NEXT: v_subbrev_u32_e64 v4, s[4:5], 0, v4, s[4:5] ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 -; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 +; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0 -; GCN-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 ; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GCN-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v6, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v4, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -336,8 +336,8 @@ ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo ; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX1032: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo -; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo +; GFX1032: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo +; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}} ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} ; GFX1064: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]] @@ -346,8 +346,8 @@ ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} ; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} -; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}} -; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}} +; GFX1064: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}} +; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}} define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 { bb: %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1