diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -74,9 +74,8 @@ ; FUNC-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] ; FUNC-NEXT: s_waitcnt vmcnt(0) ; FUNC-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 -; FUNC-NEXT: v_mul_lo_u32 v1, v4, v1 -; FUNC-NEXT: v_mul_lo_u32 v0, 0, v0 -; FUNC-NEXT: v_add3_u32 v3, v3, v1, v0 +; FUNC-NEXT: v_mul_lo_u32 v0, v4, v1 +; FUNC-NEXT: v_add_nc_u32_e32 v3, v3, v0 ; FUNC-NEXT: v_mov_b32_e32 v0, 0 ; FUNC-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] ; FUNC-NEXT: s_endpgm @@ -105,8 +104,6 @@ ; FUNC-NEXT: global_load_dword v2, v0, s[2:3] ; FUNC-NEXT: s_waitcnt vmcnt(0) ; FUNC-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, 0 -; FUNC-NEXT: v_mul_lo_u32 v2, 0, v2 -; FUNC-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; FUNC-NEXT: v_mov_b32_e32 v2, 0 ; FUNC-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; FUNC-NEXT: s_endpgm @@ -136,9 +133,8 @@ ; FUNC-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; FUNC-NEXT: s_waitcnt vmcnt(0) ; FUNC-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0 -; FUNC-NEXT: v_mul_lo_u32 v1, v4, v1 -; FUNC-NEXT: v_mul_lo_u32 v0, 0, v0 -; FUNC-NEXT: v_add3_u32 v3, v3, v1, v0 +; FUNC-NEXT: v_mul_lo_u32 v0, v4, v1 +; FUNC-NEXT: v_add_nc_u32_e32 v3, v3, v0 ; FUNC-NEXT: v_mov_b32_e32 v0, 0 ; FUNC-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5] ; FUNC-NEXT: s_endpgm @@ -158,21 +154,18 @@ ; FUNC-LABEL: v_mul_i64_and_a_lo: ; FUNC: ; %bb.0: ; FUNC-NEXT: s_clause 0x1 -; FUNC-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; FUNC-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FUNC-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; FUNC-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; FUNC-NEXT: s_waitcnt lgkmcnt(0) ; FUNC-NEXT: s_clause 0x1 -; FUNC-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] -; FUNC-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; FUNC-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; FUNC-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] ; FUNC-NEXT: s_waitcnt vmcnt(1) -; FUNC-NEXT: v_mad_u64_u32 v[4:5], s0, 0, v0, 0 -; FUNC-NEXT: v_mul_lo_u32 v1, 0, v1 -; FUNC-NEXT: s_waitcnt vmcnt(0) -; FUNC-NEXT: v_mul_lo_u32 v0, v3, v0 -; FUNC-NEXT: v_add3_u32 v5, v5, v1, v0 ; FUNC-NEXT: v_mov_b32_e32 v0, 0 -; FUNC-NEXT: global_store_dwordx2 v0, v[4:5], s[4:5] +; FUNC-NEXT: s_waitcnt vmcnt(0) +; FUNC-NEXT: v_mul_lo_u32 v1, v1, v2 +; FUNC-NEXT: global_store_dwordx2 v0, v[0:1], s[4:5] ; FUNC-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, i64 addrspace(1)* %aptr, i32 %tid @@ -192,19 +185,18 @@ ; FUNC-NEXT: s_clause 0x1 ; FUNC-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; FUNC-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; FUNC-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; FUNC-NEXT: v_lshlrev_b32_e32 v3, 3, v0 +; FUNC-NEXT: ; kill: killed $vgpr3 +; FUNC-NEXT: ; kill: killed $sgpr6_sgpr7 ; FUNC-NEXT: s_waitcnt lgkmcnt(0) ; FUNC-NEXT: s_clause 0x1 -; FUNC-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; FUNC-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] -; FUNC-NEXT: s_waitcnt vmcnt(1) -; FUNC-NEXT: v_mad_u64_u32 v[4:5], s0, v0, 0, 0 +; FUNC-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] +; FUNC-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3] +; FUNC-NEXT: ; kill: killed $sgpr2_sgpr3 ; FUNC-NEXT: s_waitcnt vmcnt(0) -; FUNC-NEXT: v_mul_lo_u32 v0, v0, v3 -; FUNC-NEXT: v_mul_lo_u32 v1, v1, 0 -; FUNC-NEXT: v_add3_u32 v5, v5, v0, v1 +; FUNC-NEXT: v_mul_lo_u32 v1, v0, v2 ; FUNC-NEXT: v_mov_b32_e32 v0, 0 -; FUNC-NEXT: global_store_dwordx2 v0, v[4:5], s[4:5] +; FUNC-NEXT: global_store_dwordx2 v0, v[0:1], s[4:5] ; FUNC-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr inbounds i64, i64 addrspace(1)* %aptr, i32 %tid @@ -282,35 +274,27 @@ ; FUNC-NEXT: s_clause 0x1 ; FUNC-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; FUNC-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; FUNC-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; FUNC-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; FUNC-NEXT: s_waitcnt lgkmcnt(0) ; FUNC-NEXT: s_clause 0x1 -; FUNC-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7] -; FUNC-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3] -; FUNC-NEXT: ; implicit-def: $vgpr0_vgpr1 +; FUNC-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] +; FUNC-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; FUNC-NEXT: s_waitcnt vmcnt(1) ; FUNC-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3] +; FUNC-NEXT: s_waitcnt vmcnt(0) +; FUNC-NEXT: v_mul_lo_u32 v1, v2, v1 ; FUNC-NEXT: s_and_saveexec_b32 s0, vcc_lo ; FUNC-NEXT: s_xor_b32 s0, exec_lo, s0 -; FUNC-NEXT: s_cbranch_execz .LBB9_2 ; FUNC-NEXT: ; %bb.1: ; %else -; FUNC-NEXT: s_waitcnt vmcnt(0) -; FUNC-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0 -; FUNC-NEXT: v_mul_lo_u32 v2, v2, v5 -; FUNC-NEXT: v_mul_lo_u32 v3, 0, v4 -; FUNC-NEXT: ; implicit-def: $vgpr4_vgpr5 -; FUNC-NEXT: v_add3_u32 v1, v1, v2, v3 -; FUNC-NEXT: ; implicit-def: $vgpr2_vgpr3 -; FUNC-NEXT: .LBB9_2: ; %Flow +; FUNC-NEXT: v_mad_u64_u32 v[2:3], s1, v2, v0, 0 +; FUNC-NEXT: v_add_nc_u32_e32 v3, v3, v1 +; FUNC-NEXT: v_mov_b32_e32 v0, v2 +; FUNC-NEXT: v_mov_b32_e32 v1, v3 +; FUNC-NEXT: ; %bb.2: ; %Flow ; FUNC-NEXT: s_andn2_saveexec_b32 s0, s0 -; FUNC-NEXT: s_cbranch_execz .LBB9_4 ; FUNC-NEXT: ; %bb.3: ; %if -; FUNC-NEXT: v_mad_u64_u32 v[0:1], s1, v2, 0, 0 -; FUNC-NEXT: s_waitcnt vmcnt(0) -; FUNC-NEXT: v_mul_lo_u32 v2, v2, v5 -; FUNC-NEXT: v_mul_lo_u32 v3, 0, 0 -; FUNC-NEXT: v_add3_u32 v1, v1, v2, v3 -; FUNC-NEXT: .LBB9_4: ; %endif +; FUNC-NEXT: v_mov_b32_e32 v0, 0 +; FUNC-NEXT: ; %bb.4: ; %endif ; FUNC-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; FUNC-NEXT: v_mov_b32_e32 v2, 0 ; FUNC-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]