# Reviewer notes (text outside the hunks; not part of any hunk body).
#
# This patch touches two files:
#   1. llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
#      - Adds the field `Constant32Bit = 6` next to the existing
#        Local/Constant/Private address-space fields (presumably the body of
#        AddressSpacesImpl; the class header is outside the hunk -- confirm).
#      - Appends AddrSpaces.Constant32Bit to the LoadAddress_constant,
#        LoadAddress_global and LoadAddress_flat address-space lists.
#        The StoreAddress_* lists and LoadAddress_private are left unchanged.
#   2. llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll (new file)
#      - See the notes ahead of that file's section.
#
# NOTE(review): the line breaks and indentation here were reconstructed from
# the hunk headers' line counts. Whitespace inside context/removed lines may
# not match the target file byte-for-byte -- verify before applying, or apply
# with whitespace-insensitive matching.
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -18,6 +18,7 @@
   int Local = 3;
   int Constant = 4;
   int Private = 5;
+  int Constant32Bit = 6;
 }
 
 def AddrSpaces : AddressSpacesImpl;
@@ -410,13 +411,17 @@
   let IsStore = 1;
 }
 
-def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>;
-def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>;
+def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant,
+                                              AddrSpaces.Constant32Bit ]>;
+def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global,
+                                            AddrSpaces.Constant,
+                                            AddrSpaces.Constant32Bit ]>;
 def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>;
 
-def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
-                                          AddrSpaces.Global,
-                                          AddrSpaces.Constant ]>;
+def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
+                                          AddrSpaces.Global,
+                                          AddrSpaces.Constant,
+                                          AddrSpaces.Constant32Bit ]>;
 def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>;
 
 def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>;
# Reviewer notes for the new test file below.
#
# The test runs llc with -global-isel for the amdgcn-amd-amdpal triple on three
# CPUs (tahiti, gfx803, gfx900), checked under the GFX6, GFX8 and GFX9 prefixes
# plus the shared GCN prefix. It has three functions that load through
# addrspace(6) pointers:
#   - load_constant32bit_vgpr_offset: float load at a VGPR-derived address.
#     Expected: buffer_load_dword ... addr64 (GFX6), flat_load_dword (GFX8),
#     global_load_dword (GFX9).
#   - load_constant32bit_sgpr_offset: i32 load at an `inreg` (SGPR) offset.
#     Expected: s_load_dword on every target (single GCN prefix).
#   - load_constant32bit_vgpr_v8f32: <8 x float> load, expected as two dwordx4
#     loads. GFX6/GFX9 use an `offset:16` immediate; GFX8 computes the second
#     address with v_add_u32/v_addc_u32.
# In all three, the upper 32 bits of the 64-bit address are materialized as
# zero (v_mov_b32 ..., 0 / s_mov_b32 s1, 0).
# The CHECK lines are autogenerated (see the NOTE line); regenerate them with
# utils/update_llc_test_checks.py instead of editing by hand.
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; FIXME: Test should be redundant with constant-address-space-32bit.ll
+
+; It's important to check with gfx8 and gfx9 to check access through global and flat.
+
+; Custom lowering needs to swap out the MMO address space
+define amdgpu_ps float @load_constant32bit_vgpr_offset(i32 %arg) {
+; GFX6-LABEL: load_constant32bit_vgpr_offset:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b64 s[0:1], 0
+; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: load_constant32bit_vgpr_offset:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_constant32bit_vgpr_offset:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+entry:
+  %gep = getelementptr <{ [4294967295 x float] }>, <{ [4294967295 x float] }> addrspace(6)* null, i32 0, i32 0, i32 %arg
+  %load = load float, float addrspace(6)* %gep, align 4
+  ret float %load
+}
+
+define amdgpu_ps i32 @load_constant32bit_sgpr_offset(i32 inreg %arg) {
+; GCN-LABEL: load_constant32bit_sgpr_offset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_lshl_b32 s0, s0, 2
+; GCN-NEXT: s_mov_b32 s1, 0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+entry:
+  %gep = getelementptr <{ [4294967295 x i32] }>, <{ [4294967295 x i32] }> addrspace(6)* null, i32 0, i32 0, i32 %arg
+  %load = load i32, i32 addrspace(6)* %gep, align 4
+  ret i32 %load
+}
+
+; This gets split during regbankselect
+define amdgpu_ps <8 x float> @load_constant32bit_vgpr_v8f32(<8 x float> addrspace(6)* %arg) {
+; GFX6-LABEL: load_constant32bit_vgpr_v8f32:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_mov_b32_e32 v5, 0
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b64 s[0:1], 0
+; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[0:3], 0 addr64 offset:16
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: load_constant32bit_vgpr_v8f32:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, 0
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4
+; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: load_constant32bit_vgpr_v8f32:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+entry:
+  %load = load <8 x float>, <8 x float> addrspace(6)* %arg, align 32
+  ret <8 x float> %load
+}