diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -92,9 +92,6 @@ def HWVALU : ProcResource<1> { let BufferSize = 1; } -def HWRC : ProcResource<1> { // Register destination cache - let BufferSize = 1; -} class HWWriteRes resources, int latency> : WriteRes { @@ -184,21 +181,21 @@ // The latency values are 1 / (operations / cycle). // Add 1 stall cycle for VGPR read. -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; - -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; -def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -64,15 +64,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 -; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v[6:7], v[4:5], off +; GFX10-NEXT: v_mov_b32_dpp v2, v4 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_dpp v3, v5 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -43,8 +43,8 @@ ; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] ; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] +; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] -; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; PREGFX10-NOT: s_setreg ; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 ; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] @@ -57,7 +57,6 @@ ; GFX10-NOT: s_denorm_mode ; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 ; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] -; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] ; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]] ; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] @@ -315,8 +314,8 @@ ; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] ; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] +; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] -; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; PREGFX10-NOT: s_setreg ; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 ; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] @@ -329,7 +328,6 @@ ; GFX10-NOT: s_denorm_mode ; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 ; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] -; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] ; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]] ; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -102,18 +102,19 @@ ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -246,20 +247,21 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, s4 -; GFX10-DL-NEXT: s_and_b32 s4, s3, s4 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 ; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v0 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s2, s5 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -377,18 +379,19 @@ ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -648,18 +651,19 @@ ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -909,14 +913,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s3, v0 +; GFX10-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s3, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1039,18 +1043,19 @@ ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1171,18 +1176,19 @@ ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x4 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x4 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1312,18 +1318,18 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 -; GFX10-DL-NEXT: s_and_b32 s5, s5, s7 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s8 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s2, s7 -; GFX10-DL-NEXT: s_and_b32 s4, s4, s7 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s8 +; GFX10-DL-NEXT: s_and_b32 s4, s4, s8 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 @@ -1457,15 +1463,15 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 -; GFX10-DL-NEXT: s_and_b32 s5, s5, s7 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s8 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 @@ -1602,18 +1608,18 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-DL-NEXT: s_and_b32 s7, s3, s5 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_and_b32 s2, s2, s5 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: s_and_b32 s7, s4, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-DL-NEXT: s_and_b32 s2, s3, s2 +; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 @@ -1751,20 +1757,20 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 ; GFX10-DL-NEXT: s_lshr_b32 s7, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_and_b32 s2, s2, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s5 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: s_lshr_b32 s5, s4, 16 +; GFX10-DL-NEXT: s_and_b32 s4, s4, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s3, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s4, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -2049,21 +2055,21 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v0 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -2641,10 +2647,10 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -119,18 +119,19 @@ ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -302,10 +303,10 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off @@ -471,10 +472,10 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1037,41 +1038,40 @@ ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0 -; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v3 -; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s5 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v8 +; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 8, s2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s2, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x80000 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s4 +; GFX10-DL-NEXT: s_bfe_i32 s3, s5, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s4, s4, 0x80000 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s6, v0 +; GFX10-DL-NEXT: v_and_b32_e32 v5, s2, v0 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s5 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s3, v0 +; GFX10-DL-NEXT: v_and_b32_e32 v8, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v11, 16, v5 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX10-DL-NEXT: global_load_ushort v4, v[0:1], off ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v4, v2, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -122,18 +122,19 @@ ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -298,10 +299,10 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off @@ -468,10 +469,10 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -608,25 +609,24 @@ ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s1, 0xff -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s2, s1 -; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s1, s3, s2 +; GFX10-DL-NEXT: s_and_b32 s0, s4, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s3, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -772,10 +772,10 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -944,30 +944,29 @@ ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX10-DL-NEXT: s_and_b32 s5, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 +; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s2, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s2, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1140,26 +1139,26 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0 -; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0 +; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0 +; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1347,26 +1346,26 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: s_and_b32 s8, s2, s5 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s8, s4, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s3, s2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 ; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s5, v0 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s2, v0 +; GFX10-DL-NEXT: s_lshr_b32 s2, s4, 24 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s7, v1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 @@ -1544,29 +1543,28 @@ ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80008 +; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2 +; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1745,29 +1743,29 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s3, 0xff +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: s_movk_i32 s5, 0xff -; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_and_b32 s7, s2, s5 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s5, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_and_b32 s7, s4, s3 +; GFX10-DL-NEXT: s_and_b32 s3, s5, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1936,39 +1934,38 @@ ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v0, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, v1, 16, v4 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v3, v2 +; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-DL-NEXT: v_lshl_or_b32 v4, s2, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, s4, 16, v6 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -2157,39 +2154,38 @@ ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 -; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s0 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s5, s3, 24 +; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s7, s3, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v0, v0, v1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v1, s2, s3 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s4, s5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s6, s7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v0, 8, v0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX10-DL-NEXT: global_load_ubyte v5, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -167,18 +167,19 @@ ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s2, s4, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -471,50 +472,49 @@ ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v4, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 -; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s9, s10 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_i32_i24 v4, s5, s6, v4 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v4, s7, s8, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, v2, v3, v4 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -815,50 +815,49 @@ ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v4, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12 +; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2 +; GFX10-DL-NEXT: s_movk_i32 s0, 0xff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 -; GFX10-DL-NEXT: s_movk_i32 s4, 0xff -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s9, s10 +; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_mad_i32_i24 v4, s5, s6, v4 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v4, s7, s8, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, v2, v3, v4 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1832,58 +1831,57 @@ ; ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s1, s2, 28 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s9, s2, 15 +; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s10, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s11, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s13, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s2 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s9, s10, s11 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s8, s13, s14 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x40014 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s9 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s8 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s2 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s5 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40018 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s9, s10 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3 +; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s7, s1 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s12, s4 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 @@ -2277,27 +2275,27 @@ ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_mov_b32 s0, 0xffff ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 4 ; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 4 -; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s8, s2, 4 +; GFX10-DL-NEXT: s_lshr_b32 s9, s2, 12 ; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s15 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s16 -; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8 +; GFX10-DL-NEXT: s_lshr_b32 s10, s2, 8 ; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 @@ -2309,10 +2307,10 @@ ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v12 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 20 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 20 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s7, s2, 24 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 @@ -2335,7 +2333,7 @@ ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 @@ -2357,7 +2355,7 @@ ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -167,18 +167,19 @@ ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -445,43 +446,42 @@ ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s6, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -748,43 +748,42 @@ ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s6, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1061,45 +1060,44 @@ ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x4000c ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s7, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s0, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1361,45 +1359,44 @@ ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s6, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s8 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s5, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s6, s8 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s7, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1924,18 +1921,19 @@ ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2191,50 +2189,49 @@ ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40008 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s8 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s5, s4 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_and_b32 s6, s2, 15 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s7, s4, 15 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX10-DL-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s10 +; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, s6, s7 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s11, s12 +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x40010 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s5, s1 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s6 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s4, s0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s1 +; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s5 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s5 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s1, s0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 @@ -2553,65 +2550,64 @@ ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: ; implicit-def: $vcc_hi -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: global_load_ubyte v5, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s4 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s5, s7 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s9, s1, 28 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s7 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s8 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s9 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s6, s4, 15 +; GFX10-DL-NEXT: s_and_b32 s8, s5, 15 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s6, s8 +; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s7, s9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v2, 8, v2 +; GFX10-DL-NEXT: s_bfe_u32 s7, s5, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s1 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s1, s7 +; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 +; GFX10-DL-NEXT: s_bfe_u32 s5, s5, 0x40018 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v2, v3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s8 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s4, s9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s5 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 -; GFX10-DL-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v5 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2852,45 +2848,44 @@ ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x4000c ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s7, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s0, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -3085,18 +3080,19 @@ ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll @@ -148,6 +148,8 @@ ; Does not imply memory fence on its own ; GCN-LABEL: {{^}}gws_init_wait_before: +; NOLOOP: s_load_dword +; NOLOOP: s_load_dword ; NOLOOP: s_waitcnt lgkmcnt(0) ; NOLOOP-NOT: s_waitcnt ; NOLOOP: ds_gws_init diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -105,16 +105,16 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -189,16 +189,16 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -313,16 +313,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -441,16 +441,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -569,16 +569,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -693,16 +693,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -821,16 +821,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -949,16 +949,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1081,16 +1081,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v9, s8 ; encoding: [0x08,0x02,0x12,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, s9 ; encoding: [0x09,0x02,0x14,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[9:10], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x09,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1205,16 +1205,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1333,16 +1333,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1726,15 +1726,15 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v5, s9 ; encoding: [0x09,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x17,0x01,0xf0,0x04,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; encoding: [0x08,0x02,0x08,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[4:5], v3, off ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x03,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v3, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x03,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1803,14 +1803,14 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v4, s9 ; encoding: [0x09,0x02,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; encoding: [0x08,0x02,0x08,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, s9 ; encoding: [0x09,0x02,0x0a,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x16,0x01,0xf0,0x03,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; encoding: [0x08,0x02,0x06,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[3:4], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x02,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[4:5], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x02,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1876,13 +1876,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; encoding: [0x08,0x02,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, s9 ; encoding: [0x09,0x02,0x08,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[3:4], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1948,13 +1948,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, s9 ; encoding: [0x09,0x02,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; encoding: [0x08,0x02,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v4, s9 ; encoding: [0x09,0x02,0x08,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; encoding: [0x00,0x18,0x01,0xf0,0x02,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[3:4], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x01,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -3291,9 +3291,9 @@ ; GFX10-LABEL: image_load_mmo: ; GFX10: ; %bb.0: ; GFX10-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x11,0x00,0xf0,0x01,0x01,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x02,0x02,0x00] +; GFX10-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x03,0x03,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -676,13 +676,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 ; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 -; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -734,9 +734,9 @@ ; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v7, v5, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v5, v7, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -786,13 +786,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3 -; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5 +; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v3, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -888,13 +888,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 ; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 -; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -946,9 +946,9 @@ ; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v7, v5, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v5, v7, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -998,13 +998,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3 -; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5 +; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v3, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1203,13 +1203,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; GFX10-NEXT: v_and_b32_e32 v6, v9, v6 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 +; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6 -; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: v_lshl_or_b32 v7, v5, 16, v4 +; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v7, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1238,13 +1238,13 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 ; GFX10-NEXT: v_and_b32_e32 v6, v9, v6 -; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 +; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 +; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6 -; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: v_lshl_or_b32 v7, v5, 16, v4 +; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v7, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -1783,13 +1783,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v11, s12 ; encoding: [0x0c,0x02,0x16,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v12, s13 ; encoding: [0x0d,0x02,0x18,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08] -; GFX10-NEXT: v_mov_b32_e32 v2, s12 ; encoding: [0x0c,0x02,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[11:12], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x0b,0x01,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -145,6 +145,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_or: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 6, v0 +;GFX10-NEXT: ; implicit-def: $vcc_hi ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 ;CHECK: s_waitcnt @@ -214,6 +215,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged_or: ;CHECK-NEXT: %bb. ;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 4, v0 +;GFX10-NEXT: ; implicit-def: $vcc_hi ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}fmuladd_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] +; GCN-DAG: buffer_load_ushort v[[C_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] @@ -25,7 +25,7 @@ ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; VI-DENORM: buffer_store_short [[RESULT]] -; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]] +; GFX10-FLUSH-DAG: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]] ; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]] ; GFX10-FLUSH: buffer_store_short [[ADD]] @@ -131,7 +131,7 @@ ; GFX10: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GFX10: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GFX10: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; GFX10-DAG: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] @@ -167,7 +167,7 @@ ; VI-DENORM-NOT: v_and_b32 ; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]] -; GFX10-FLUSH: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] +; GFX10-FLUSH-DAG: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] ; GFX10-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]] ; GFX10-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]] diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -238,12 +238,12 @@ ; GCN-LABEL: {{^}}madak_constant_bus_violation: ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}} -; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] +; GCN-DAG: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]] ; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000 ; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5 -; GFX10: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] -; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 -; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GFX10-DAG: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] +; GFX10-MAD-DAG: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; FMA-DAG: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GFX6: buffer_store_dword [[MUL]] ; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll @@ -3088,9 +3088,8 @@ ; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} ; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU: s_waitcnt vmcnt(0){{$}} ; GFX10CU-NOT: buffer_gl0_inv ; GFX8-NOT: buffer_wbinvl1_vol ; GFX10: .amdhsa_kernel workgroup_acquire_monotonic_ret @@ -3115,8 +3114,7 @@ ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} ; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol ; GFX10WGP-NEXT: buffer_gl0_inv ; GFX10CU-NOT: buffer_gl0_inv @@ -3142,8 +3140,7 @@ ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} ; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol ; GFX10WGP-NEXT: buffer_gl0_inv ; GFX10CU-NOT: buffer_gl0_inv @@ -3166,9 +3163,8 @@ ; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} ; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} ; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU: s_waitcnt vmcnt(0){{$}} ; GFX10CU-NOT: buffer_gl0_inv ; GFX8-NOT: buffer_wbinvl1_vol ; GFX10: .amdhsa_kernel workgroup_acquire_acquire_ret @@ -3194,8 +3190,7 @@ ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} ; GFX10: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} ; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol ; GFX10WGP-NEXT: buffer_gl0_inv ; GFX10CU-NOT: buffer_gl0_inv @@ -3220,8 +3215,7 @@ ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol ; GFX10WGP: buffer_gl0_inv ; GFX10CU-NOT: buffer_gl0_inv @@ -3246,8 +3240,7 @@ ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol ; GFX10WGP-NEXT: buffer_gl0_inv ; GFX10CU-NOT: buffer_gl0_inv @@ -3272,8 +3265,7 @@ ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} ; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} ; GFX8-NOT: buffer_wbinvl1_vol ; GFX10WGP: buffer_gl0_inv ; GFX10CU-NOT: buffer_gl0_inv diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -78,7 +78,6 @@ ; GCN-DAG: s_load_dwordx2 ; GCN-DAG: s_load_dword s ; GCN-DAG: s_load_dword s -; GCN-NOT: _load_ ; SI: s_min_i32 ; SI: s_min_i32 @@ -90,10 +89,10 @@ ; VI: s_min_i32 ; VI: v_min_i32_sdwa -; GFX9_10: v_min_i16 -; GFX9_10: v_min_i16 -; GFX9_10: v_min_i16 -; GFX9_10: v_min_i16 +; GFX9_10-DAG: v_min_i16 +; GFX9_10-DAG: v_min_i16 +; GFX9_10-DAG: v_min_i16 +; GFX9_10-DAG: v_min_i16 ; EG: MIN_INT ; EG: MIN_INT @@ -495,7 +494,9 @@ ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16: ; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}} -; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]] +; GCN-DAG: s_and_b32 [[A2:s[0-9]+]], [[A]] +; GCN-DAG: s_and_b32 [[B2:s[0-9]+]], [[B]] +; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A2]], [[B2]] ; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]] ; GCN: buffer_store_dword [[VMIN]] diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -24,13 +24,13 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: @@ -96,7 +96,7 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -106,9 +106,9 @@ ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -231,15 +231,15 @@ ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 entry: %call = tail call i64 @_Z13get_global_idj(i32 0) @@ -304,10 +304,10 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -350,9 +350,9 @@ ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 entry: %call = tail call i64 @_Z13get_global_idj(i32 0) @@ -401,8 +401,8 @@ ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} i8 addrspace(1)* %buffer2) { diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir @@ -19,10 +19,10 @@ ; GCN-LABEL: name: handleMove_bundle ; GCN: liveins: $sgpr4_sgpr5 ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; GCN: $vcc_hi = IMPLICIT_DEF ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN: $vcc_hi = IMPLICIT_DEF + ; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4) ; GCN: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store 4, addrspace 3) ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN: $m0 = S_MOV_B32 0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -65,13 +65,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -153,17 +153,17 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 ; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v4 -; GFX10-NEXT: global_store_dword v[0:1], v2, off -; GFX10-NEXT: global_store_dword v[0:1], v3, off +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v0 +; GFX10-NEXT: global_store_dword v[2:3], v1, off +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -236,13 +236,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 64, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -312,13 +312,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffbf, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffbf, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -388,13 +388,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x41, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x41, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -464,13 +464,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v2, 16, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -540,13 +540,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v2, -16, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v0, -16, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -616,13 +616,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v2, 17, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, 17, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -692,13 +692,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0xffffffef, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0xffffffef, v0 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -814,13 +814,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_ushort v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v3, 64 -; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, 64 +; GFX10-NEXT: global_store_short v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -985,17 +985,17 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 ; GFX10-NEXT: global_load_ushort v4, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v3, 64 +; GFX10-NEXT: v_sub_nc_u16_e64 v1, v4, 64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16_e64 v3, v4, 64 -; GFX10-NEXT: global_store_short v[0:1], v2, off -; GFX10-NEXT: global_store_short v[0:1], v3, off +; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, 64 +; GFX10-NEXT: global_store_short v[2:3], v1, off +; GFX10-NEXT: global_store_short v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1075,13 +1075,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 64 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1159,13 +1159,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 7 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 7 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1243,13 +1243,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 64 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1325,13 +1325,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 7 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 7 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1404,13 +1404,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1482,13 +1482,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v0, v0, -4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1560,13 +1560,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v0, v0, 4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1644,13 +1644,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1722,13 +1722,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1803,13 +1803,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 32 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1887,13 +1887,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1965,13 +1965,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2046,13 +2046,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 16 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2129,13 +2129,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2212,13 +2212,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, -1.0 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2295,13 +2295,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, -2.0 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2378,13 +2378,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2456,13 +2456,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2533,13 +2533,13 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v0, v0, 32 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64