diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1144,9 +1144,9 @@ unsigned LoadSize = LoadTy.getSizeInBits(); const unsigned MaxNonSmrdLoadSize = 128; - const RegisterBank *PtrBank = - OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; - if (PtrBank == &AMDGPU::SGPRRegBank) { + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + if (DstBank == &AMDGPU::SGPRRegBank) { // There are some special cases that we need to look at for 32 bit and 96 // bit SGPR loads otherwise we have nothing to do. if (LoadSize != 32 && LoadSize != 96) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -446,13 +446,10 @@ ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2 -; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-UNALIGNED-NEXT: s_mov_b32 s2, -1 +; GFX7-UNALIGNED-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-UNALIGNED-NEXT: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 @@ -564,13 +561,10 @@ ; ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX7-UNALIGNED: ; %bb.0: -; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX7-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2 -; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-UNALIGNED-NEXT: s_mov_b32 s2, -1 +; GFX7-UNALIGNED-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-UNALIGNED-NEXT: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -435,3 +435,46 @@ store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void } + +define amdgpu_ps void @test_s_load_constant_v8i32_align1(<8 x i32> addrspace(4)* inreg %ptr, <8 x i32> addrspace(1)* inreg %out) { +; GFX9-LABEL: test_s_load_constant_v8i32_align1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 +; GFX9-NEXT: s_endpgm +; +; GFX7-LABEL: test_s_load_constant_v8i32_align1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s4, s2 +; GFX7-NEXT: s_mov_b32 s5, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-LABEL: test_s_load_constant_v8i32_align1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 +; GFX10-NEXT: s_endpgm + %load = load <8 x i32>, <8 x i32> addrspace(4)* %ptr, align 1 + store <8 x i32> %load, <8 x i32> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir @@ -14,7 +14,17 @@ ; GFX7: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GFX7: %in_addr:sgpr(p1) = COPY $sgpr0_sgpr1 ; GFX7: %out_addr:sgpr(p1) = COPY $sgpr2_sgpr3 - ; GFX7: %load:vgpr(<16 x s32>) = G_LOAD %in_addr(p1) :: (load 64, align 4, addrspace 1) + ; GFX7: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %in_addr(p1) :: (load 16, align 4, addrspace 1) + ; GFX7: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GFX7: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C]](s64) + ; GFX7: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 16 from unknown-address + 16, align 4, addrspace 1) + ; GFX7: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; GFX7: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C1]](s64) + ; GFX7: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 16 from unknown-address + 32, align 4, addrspace 1) + ; GFX7: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; GFX7: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C2]](s64) + ; GFX7: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 16 from unknown-address + 48, align 4, addrspace 1) + ; GFX7: %load:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) ; GFX7: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>), %load8_11:vgpr(<4 x s32>), %load12_15:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>) ; GFX7: G_STORE %load0_3(<4 x s32>), %out_addr(p1) :: (store 16, align 4, addrspace 1) ; GFX7: %cst16:sgpr(s64) = G_CONSTANT i64 16 @@ -88,7 +98,11 @@ ; GFX7: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GFX7: %ptr:sgpr(p4) = COPY $sgpr0_sgpr1 ; GFX7: %out:sgpr(p1) = COPY $sgpr2_sgpr3 - ; GFX7: %load:vgpr(<8 x s32>) = G_LOAD %ptr(p4) :: (load 32, align 1, addrspace 4) + ; GFX7: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %ptr(p4) :: (load 16, align 1, addrspace 4) + ; GFX7: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; GFX7: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD %ptr, [[C]](s64) + ; GFX7: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 from unknown-address + 16, align 1, addrspace 4) + ; GFX7: %load:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) ; GFX7: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>) ; GFX7: G_STORE %load0_3(<4 x s32>), %out(p1) :: (store 16, align 32, addrspace 1) ; GFX7: %cst_16:sgpr(s64) = G_CONSTANT i64 16