Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -73,7 +73,7 @@ // Set defaults if needed. if (MaxPrivateElementSize == 0) - MaxPrivateElementSize = 16; + MaxPrivateElementSize = 4; return *this; } Index: test/CodeGen/AMDGPU/indirect-private-64.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-private-64.ll +++ test/CodeGen/AMDGPU/indirect-private-64.ll @@ -1,14 +1,20 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s +; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s declare void @llvm.amdgcn.s.barrier() #1 ; SI-LABEL: {{^}}private_access_f64_alloca: -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 +; SI-ALLOCA16: buffer_store_dwordx2 +; SI-ALLOCA16: 
buffer_load_dwordx2 + +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 @@ -25,8 +31,17 @@ ; SI-LABEL: {{^}}private_access_v2f64_alloca: -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 +; SI-ALLOCA16: buffer_store_dwordx4 +; SI-ALLOCA16: buffer_load_dwordx4 + +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_write_b64 @@ -45,8 +60,14 @@ ; SI-LABEL: {{^}}private_access_i64_alloca: -; SI-ALLOCA: buffer_store_dwordx2 -; SI-ALLOCA: buffer_load_dwordx2 +; SI-ALLOCA16: buffer_store_dwordx2 +; SI-ALLOCA16: buffer_load_dwordx2 + +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v + ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_read_b64 @@ -63,8 +84,18 @@ ; SI-LABEL: {{^}}private_access_v2i64_alloca: -; SI-ALLOCA: buffer_store_dwordx4 -; SI-ALLOCA: buffer_load_dwordx4 +; SI-ALLOCA16: buffer_store_dwordx4 +; SI-ALLOCA16: buffer_load_dwordx4 + +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v +; SI-ALLOCA4: buffer_store_dword v + +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v +; SI-ALLOCA4: buffer_load_dword v ; SI-PROMOTE: ds_write_b64 ; SI-PROMOTE: ds_write_b64 Index: test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < 
%s | FileCheck -check-prefix=SI %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=SI %s ; FIXME: Broken on evergreen ; FIXME: For some reason the 8 and 16 vectors are being stored as Index: test/CodeGen/AMDGPU/large-alloca-compute.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-compute.ll +++ test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -7,11 +7,11 @@ ; ALL-LABEL: {{^}}large_alloca_compute_shader: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x98f000 -; VI: s_mov_b32 s11, 0x980000 +; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s{{[0-9]+}}, -1 +; CI-NEXT: s_mov_b32 s{{[0-9]+}}, 0x88f000 +; VI-NEXT: s_mov_b32 s{{[0-9]+}}, 0x880000 ; GCNHSA: .amd_kernel_code_t Index: test/CodeGen/AMDGPU/large-alloca-graphics.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -3,10 +3,10 @@ ; ALL-LABEL: {{^}}large_alloca_pixel_shader: ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x98f000 -; VI: s_mov_b32 s11, 0x980000 +; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s11, 0x88f000 +; VI-NEXT: s_mov_b32 s11, 0x880000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen @@ -24,10 +24,10 @@ ; ALL-LABEL: 
{{^}}large_alloca_pixel_shader_inreg: ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN: s_mov_b32 s10, -1 -; CI: s_mov_b32 s11, 0x98f000 -; VI: s_mov_b32 s11, 0x980000 +; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s10, -1 +; CI-NEXT: s_mov_b32 s11, 0x88f000 +; VI-NEXT: s_mov_b32 s11, 0x880000 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -23,14 +23,21 @@ ; GCNMESA: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCNMESA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCNMESA-NEXT: s_mov_b32 s14, -1 -; SIMESA-NEXT: s_mov_b32 s15, 0x98f000 -; VIMESA-NEXT: s_mov_b32 s15, 0x980000 +; SIMESA-NEXT: s_mov_b32 s15, 0x88f000 +; VIMESA-NEXT: s_mov_b32 s15, 0x880000 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} -; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} + +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen 
offset:{{[0-9]+}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}} ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -15,8 +15,8 @@ ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s15, 0x98f000 -; VI-NEXT: s_mov_b32 s15, 0x980000 +; SI-NEXT: s_mov_b32 s15, 0x88f000 +; VI-NEXT: s_mov_b32 s15, 0x880000 ; s12 is offset user SGPR ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill