Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1144,9 +1144,9 @@
   unsigned LoadSize = LoadTy.getSizeInBits();
   const unsigned MaxNonSmrdLoadSize = 128;
 
-  const RegisterBank *PtrBank =
-      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
-  if (PtrBank == &AMDGPU::SGPRRegBank) {
+  const RegisterBank *DstBank =
+      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+  if (DstBank == &AMDGPU::SGPRRegBank) {
     // There are some special cases that we need to look at for 32 bit and 96
     // bit SGPR loads otherwise we have nothing to do.
     if (LoadSize != 32 && LoadSize != 96)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -446,13 +446,10 @@
 ;
 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
 ; GFX7-UNALIGNED:         ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7-UNALIGNED-NEXT:    s_load_dword s0, s[0:1], 0x2
-; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s8
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 s2, -1
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], off, s[0:3], 0
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
@@ -564,13 +561,10 @@
 ;
 ; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX7-UNALIGNED:         ; %bb.0:
-; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX7-UNALIGNED-NEXT:    s_load_dword s0, s[0:1], 0x2
-; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s6
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s8
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, s0
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s7
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 s2, -1
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-UNALIGNED-NEXT:    buffer_load_dwordx3 v[0:2], off, s[0:3], 0
+; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX7-UNALIGNED-NEXT:    v_readfirstlane_b32 s2, v2
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-uniform-load-noclobber.mir
@@ -0,0 +1,98 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=kaveri -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX7 %s
+# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - | FileCheck -check-prefixes=GFX10 %s
+
+--- |
+  define amdgpu_ps void @test_uniform_load_without_noclobber(<16 x float> addrspace(1)* inreg %inaddr, <16 x float> addrspace(1)* inreg %outaddr) #0 {
+  entry:
+    %load = getelementptr <16 x float>, <16 x float> addrspace(1)* %inaddr, i64 0, !amdgpu.uniform !0
+    %in = load <16 x float>, <16 x float> addrspace(1)* %load, align 4
+    store <16 x float> %in, <16 x float> addrspace(1)* %outaddr, align 4
+    ret void
+  }
+  !0 = !{}
+...
+
+---
+name: test_uniform_load_without_noclobber
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+
+    ; GFX7-LABEL: name: test_uniform_load_without_noclobber
+    ; GFX7: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+    ; GFX7: %in_addr:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; GFX7: %out_addr:sgpr(p1) = COPY $sgpr2_sgpr3
+    ; GFX7: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %in_addr(p1) :: (load 16 from %ir.load, align 4, addrspace 1)
+    ; GFX7: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+    ; GFX7: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C]](s64)
+    ; GFX7: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 16 from %ir.load + 16, align 4, addrspace 1)
+    ; GFX7: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+    ; GFX7: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C1]](s64)
+    ; GFX7: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 16 from %ir.load + 32, align 4, addrspace 1)
+    ; GFX7: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+    ; GFX7: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C2]](s64)
+    ; GFX7: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 16 from %ir.load + 48, align 4, addrspace 1)
+    ; GFX7: %load:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+    ; GFX7: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>), %load8_11:vgpr(<4 x s32>), %load12_15:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>)
+    ; GFX7: G_STORE %load0_3(<4 x s32>), %out_addr(p1) :: (store 16 into %ir.outaddr, align 4, addrspace 1)
+    ; GFX7: %cst16:sgpr(s64) = G_CONSTANT i64 16
+    ; GFX7: %out_addr_plus_16:sgpr(p1) = G_PTR_ADD %out_addr, %cst16(s64)
+    ; GFX7: G_STORE %load4_7(<4 x s32>), %out_addr_plus_16(p1) :: (store 16 into %ir.outaddr + 16, align 4, addrspace 1)
+    ; GFX7: %cst32:sgpr(s64) = G_CONSTANT i64 32
+    ; GFX7: %out_addr_plus_32:sgpr(p1) = G_PTR_ADD %out_addr, %cst32(s64)
+    ; GFX7: G_STORE %load8_11(<4 x s32>), %out_addr_plus_32(p1) :: (store 16 into %ir.outaddr + 32, align 4, addrspace 1)
+    ; GFX7: %cst48:sgpr(s64) = G_CONSTANT i64 48
+    ; GFX7: %out_addr_plus_48:sgpr(p1) = G_PTR_ADD %out_addr, %cst48(s64)
+    ; GFX7: G_STORE %load12_15(<4 x s32>), %out_addr_plus_48(p1) :: (store 16 into %ir.outaddr + 48, align 4, addrspace 1)
+    ; GFX7: S_ENDPGM 0
+    ; GFX10-LABEL: name: test_uniform_load_without_noclobber
+    ; GFX10: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+    ; GFX10: %in_addr:sgpr(p1) = COPY $sgpr0_sgpr1
+    ; GFX10: %out_addr:sgpr(p1) = COPY $sgpr2_sgpr3
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr(p1) = COPY %in_addr(p1)
+    ; GFX10: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %in_addr(p1) :: (load 16 from %ir.load, align 4, addrspace 1)
+    ; GFX10: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
+    ; GFX10: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C]](s64)
+    ; GFX10: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 16 from %ir.load + 16, align 4, addrspace 1)
+    ; GFX10: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
+    ; GFX10: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C1]](s64)
+    ; GFX10: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 16 from %ir.load + 32, align 4, addrspace 1)
+    ; GFX10: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
+    ; GFX10: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C2]](s64)
+    ; GFX10: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 16 from %ir.load + 48, align 4, addrspace 1)
+    ; GFX10: %load:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+    ; GFX10: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>), %load8_11:vgpr(<4 x s32>), %load12_15:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>)
+    ; GFX10: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY %out_addr(p1)
+    ; GFX10: G_STORE %load0_3(<4 x s32>), [[COPY1]](p1) :: (store 16 into %ir.outaddr, align 4, addrspace 1)
+    ; GFX10: %cst16:sgpr(s64) = G_CONSTANT i64 16
+    ; GFX10: %out_addr_plus_16:sgpr(p1) = G_PTR_ADD %out_addr, %cst16(s64)
+    ; GFX10: [[COPY2:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_16(p1)
+    ; GFX10: G_STORE %load4_7(<4 x s32>), [[COPY2]](p1) :: (store 16 into %ir.outaddr + 16, align 4, addrspace 1)
+    ; GFX10: %cst32:sgpr(s64) = G_CONSTANT i64 32
+    ; GFX10: %out_addr_plus_32:sgpr(p1) = G_PTR_ADD %out_addr, %cst32(s64)
+    ; GFX10: [[COPY3:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_32(p1)
+    ; GFX10: G_STORE %load8_11(<4 x s32>), [[COPY3]](p1) :: (store 16 into %ir.outaddr + 32, align 4, addrspace 1)
+    ; GFX10: %cst48:sgpr(s64) = G_CONSTANT i64 48
+    ; GFX10: %out_addr_plus_48:sgpr(p1) = G_PTR_ADD %out_addr, %cst48(s64)
+    ; GFX10: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY %out_addr_plus_48(p1)
+    ; GFX10: G_STORE %load12_15(<4 x s32>), [[COPY4]](p1) :: (store 16 into %ir.outaddr + 48, align 4, addrspace 1)
+    ; GFX10: S_ENDPGM 0
+    %in_addr:_(p1) = COPY $sgpr0_sgpr1
+    %out_addr:_(p1) = COPY $sgpr2_sgpr3
+    %load:_(<16 x s32>) = G_LOAD %in_addr(p1) :: (load 64 from %ir.load, align 4, addrspace 1)
+    %load0_3:_(<4 x s32>), %load4_7:_(<4 x s32>), %load8_11:_(<4 x s32>), %load12_15:_(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>)
+    G_STORE %load0_3(<4 x s32>), %out_addr(p1) :: (store 16 into %ir.outaddr, align 4, addrspace 1)
+    %cst16:_(s64) = G_CONSTANT i64 16
+    %out_addr_plus_16:_(p1) = G_PTR_ADD %out_addr, %cst16(s64)
+    G_STORE %load4_7(<4 x s32>), %out_addr_plus_16(p1) :: (store 16 into %ir.outaddr + 16, align 4, addrspace 1)
+    %cst32:_(s64) = G_CONSTANT i64 32
+    %out_addr_plus_32:_(p1) = G_PTR_ADD %out_addr, %cst32(s64)
+    G_STORE %load8_11(<4 x s32>), %out_addr_plus_32(p1) :: (store 16 into %ir.outaddr + 32, align 4, addrspace 1)
+    %cst48:_(s64) = G_CONSTANT i64 48
+    %out_addr_plus_48:_(p1) = G_PTR_ADD %out_addr, %cst48(s64)
+    G_STORE %load12_15(<4 x s32>), %out_addr_plus_48(p1) :: (store 16 into %ir.outaddr + 48, align 4, addrspace 1)
+    S_ENDPGM 0
+...
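
Note (reviewer sketch, not part of the patch): the first hunk switches the bank test from operand 1 (the pointer) to operand 0 (the destination). As the new MIR test exercises, a load can have a uniform SGPR pointer while its result is still mapped to the VGPR bank, e.g. when the pointer carries !amdgpu.uniform but the access could not be proven noclobber. A minimal C++ sketch of that disagreement follows; it reuses the names from the hunk above, but the surrounding context and comments are illustrative assumptions, not code from this patch:

  // Both banks come from the same instruction mapping.
  const RegisterBankInfo::InstructionMapping &Mapping =
      OpdMapper.getInstrMapping();
  // Operand 1 is the pointer. In the new test it is uniform, so its bank
  // is SGPR even though the load itself must produce VGPR results.
  const RegisterBank *PtrBank =
      Mapping.getOperandMapping(1).BreakDown[0].RegBank;
  // Operand 0 is the result. Without a noclobber guarantee the mapping
  // assigns it to the VGPR bank, since the load cannot be selected as a
  // scalar (SMRD) load.
  const RegisterBank *DstBank =
      Mapping.getOperandMapping(0).BreakDown[0].RegBank;
  // Branching on PtrBank wrongly took the SGPR path for such loads;
  // branching on DstBank takes the VGPR path, which splits the 512-bit
  // load into MaxNonSmrdLoadSize (128-bit) pieces, matching the four
  // <4 x s32> G_LOADs in the GFX7 and GFX10 check lines.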