Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3517,8 +3517,8 @@ // we would need to do a wave reduction to get the maximum size to know how // much to increment the uniform stack pointer. SDValue Size = Op.getOperand(1); - if (isa<ConstantSDNode>(Size)) - return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. + if (!Size->isDivergent()) + return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); } Index: llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -1,12 +1,953 @@ -; RUN: not llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -mattr=+promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s -; RUN: not llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -mattr=-promote-alloca -verify-machineinstrs < %s 2>&1 | FileCheck %s -; RUN: not llc -march=r600 -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s -target datalayout = "A5" +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s -; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform(i32 %n) { +; GFX9-LABEL: test_dynamic_stackalloc_kernel_uniform: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_movk_i32 s32, 0x400 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; 
GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_i32 s0, s0, 15 +; GFX9-NEXT: s_and_b32 s0, s0, 0x3fffff0 +; GFX9-NEXT: s_lshl_b32 s0, s0, 6 +; GFX9-NEXT: s_add_i32 s0, s32, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mov_b32 s32, s0 +; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_dynamic_stackalloc_kernel_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s32, 16 +; GFX11-NEXT: s_mov_b32 s33, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm + %dyn.alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 0, ptr addrspace(5) %dyn.alloca + ret void +} + +define amdgpu_kernel void @test_dynamic_stackalloc_kernel_uniform_other_object(i32 %n) { +; GFX9-LABEL: test_dynamic_stackalloc_kernel_uniform_other_object: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: s_movk_i32 s32, 0x1400 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s4, s4, s3 +; GFX9-NEXT: s_addc_u32 
s5, s5, 0 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_i32 s0, s0, 15 +; GFX9-NEXT: s_and_b32 s0, s0, 0x3fffff0 +; GFX9-NEXT: s_lshl_b32 s0, s0, 6 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: s_add_i32 s0, s32, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], s33 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], s33 offset:64 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_mov_b32 s32, s0 +; GFX9-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_dynamic_stackalloc_kernel_uniform_other_object: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_mov_b32_e32 v2, 3 +; GFX11-NEXT: s_movk_i32 s32, 0x50 +; GFX11-NEXT: s_mov_b32 s33, 0 +; GFX11-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:64 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v2, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm + %regular.object = alloca [16 x i32], addrspace(5) + store volatile i32 1, ptr addrspace(5) %regular.object + %regular.object.last = getelementptr inbounds [16 x i32], ptr addrspace(5) %regular.object, i32 0, i32 15 + store volatile i32 2, ptr addrspace(5) 
%regular.object.last + %dynamic.alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 3, ptr addrspace(5) %dynamic.alloca + ret void +} + +define amdgpu_gfx void @test_dynamic_stackalloc_function_uniform(i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_function_uniform: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_mov_b32 s35, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s35 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_function_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: s_mov_b32 s33, s1 +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %dyn.alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 0, ptr addrspace(5) %dyn.alloca + ret void +} + +define amdgpu_gfx void @test_dynamic_stackalloc_function_uniform_other_object(i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_function_uniform_other_object: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_mov_b32 s35, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x1400 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_addk_i32 s32, 0xec00 +; GFX9-NEXT: s_mov_b32 s33, s35 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_function_uniform_other_object: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_addk_i32 s32, 0x50 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: scratch_store_b32 off, v0, s33 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 3 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:60 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: s_mov_b32 s33, s1 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_addk_i32 s32, 0xffb0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %regular.object = alloca [16 x i32], addrspace(5) + store volatile i32 1, ptr addrspace(5) %regular.object + %regular.object.last = getelementptr inbounds [16 x i32], ptr addrspace(5) 
%regular.object, i32 0, i32 15 + store volatile i32 2, ptr addrspace(5) %regular.object.last + %dynamic.alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 3, ptr addrspace(5) %dynamic.alloca + ret void +} + +define amdgpu_gfx void @test_dynamic_stackalloc_function_uniform_realign32(i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_function_uniform_realign32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s35, s33 +; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 +; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_addk_i32 s32, 0x1000 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: s_and_b32 s34, s34, 0xfffff800 +; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_addk_i32 s32, 0xf000 +; GFX9-NEXT: s_mov_b32 s33, s35 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_function_uniform_realign32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_add_i32 s33, s32, 31 +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: s_add_i32 s32, s32, 64 +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_mov_b32 s33, s1 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_addk_i32 s32, 0xffc0 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] + %alloca = alloca i32, i32 %n, align 32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %alloca + ret void +} -define amdgpu_kernel void @test_dynamic_stackalloc(ptr addrspace(1) %out, i32 %n) { +define amdgpu_gfx void @test_dynamic_stackalloc_function_uniform_realign32_other_object(i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_function_uniform_realign32_other_object: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s35, s33 +; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 +; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_addk_i32 s32, 0x2000 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: s_and_b32 s34, s34, 0xfffff800 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_addk_i32 s32, 0xe000 +; GFX9-NEXT: s_mov_b32 s33, s35 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_function_uniform_realign32_other_object: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_add_i32 s33, s32, 31 +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_addk_i32 s32, 0x80 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: scratch_store_b32 off, v0, s33 
dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 +; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:60 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: s_mov_b32 s33, s1 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_addk_i32 s32, 0xff80 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %regular.object = alloca [16 x i32], addrspace(5) + store volatile i32 1, ptr addrspace(5) %regular.object + %regular.object.last = getelementptr inbounds [16 x i32], ptr addrspace(5) %regular.object, i32 0, i32 15 + store volatile i32 2, ptr addrspace(5) %regular.object.last + %dyn.alloca = alloca i32, i32 %n, align 32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %dyn.alloca + ret void +} + +define amdgpu_gfx void @test_dynamic_stackalloc_stack_arg_usage_function_uniform([40 x i32] %stack.args, i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_stack_arg_usage_function_uniform: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_mov_b32 s35, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0xc00 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_addk_i32 s32, 0xf400 +; GFX9-NEXT: s_mov_b32 s33, s35 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_stack_arg_usage_function_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: 
s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_add_i32 s32, s32, 48 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: s_mov_b32 s33, s1 +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_addk_i32 s32, 0xffd0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca ret void } + +define amdgpu_gfx void @test_dynamic_stackalloc_byval_stack_arg_usage_function_uniform(ptr addrspace(5) byval([40 x i32]) %byval.arg, i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_byval_stack_arg_usage_function_uniform: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen +; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_mov_b32 s35, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_byval_stack_arg_usage_function_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_b32 v0, v0, off +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: 
s_mov_b32 s33, s1 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %alloca = alloca i32, i32 %n, addrspace(5) + %byval.0 = load i32, ptr addrspace(5) %byval.arg + store volatile i32 %byval.0, ptr addrspace(5) %alloca + %byval.39.ptr = getelementptr inbounds [40 x i32], ptr addrspace(5) %byval.arg, i32 0, i32 39 + %byval.39 = load i32, ptr addrspace(5) %byval.39.ptr + ret void +} + +define amdgpu_gfx void @test_dynamic_stackalloc_stack_arg_usage_function_uniform_realign32([40 x i32] %stack.args, i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_stack_arg_usage_function_uniform_realign32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, s33 +; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 +; GFX9-NEXT: s_lshl_b32 s35, s4, 2 +; GFX9-NEXT: s_add_i32 s35, s35, 15 +; GFX9-NEXT: s_and_b32 s35, s35, 0x3fffff0 +; GFX9-NEXT: s_mov_b32 s37, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x1800 +; GFX9-NEXT: s_lshl_b32 s35, s35, 6 +; GFX9-NEXT: s_add_i32 s35, s32, s35 +; GFX9-NEXT: s_and_b32 s35, s35, 0xfffff800 +; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: s_mov_b32 s32, s35 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s34, s37 +; GFX9-NEXT: s_addk_i32 s32, 0xe800 +; GFX9-NEXT: s_mov_b32 s33, s36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_stack_arg_usage_function_uniform_realign32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_add_i32 s33, 
s32, 31 +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: s_mov_b32 s2, s34 +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_mov_b32 s34, s32 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_addk_i32 s32, 0x60 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 +; GFX11-NEXT: s_mov_b32 s34, s2 +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_addk_i32 s32, 0xffa0 +; GFX11-NEXT: s_mov_b32 s33, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %alloca = alloca i32, i32 %n, align 32, addrspace(5) + store volatile i32 0, ptr addrspace(5) %alloca + ret void +} + +define amdgpu_gfx void @test_dynamic_stackalloc_stack_arg_usage_function_uniform_realign32_other_object([40 x i32] %stack.args, i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_stack_arg_usage_function_uniform_realign32_other_object: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, s33 +; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 +; GFX9-NEXT: s_lshl_b32 s35, s4, 2 +; GFX9-NEXT: s_add_i32 s35, s35, 15 +; GFX9-NEXT: s_and_b32 s35, s35, 0x3fffff0 +; GFX9-NEXT: s_mov_b32 s37, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 +; GFX9-NEXT: s_addk_i32 s32, 0x4800 +; GFX9-NEXT: s_lshl_b32 s35, s35, 6 +; GFX9-NEXT: s_add_i32 s35, s32, s35 +; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: s_and_b32 s35, s35, 0xfffff800 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s32, s35 +; GFX9-NEXT: v_mov_b32_e32 v0, 10 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s34, s37 +; GFX9-NEXT: s_addk_i32 s32, 0xb800 +; GFX9-NEXT: s_mov_b32 s33, 
s36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_stack_arg_usage_function_uniform_realign32_other_object: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_add_i32 s33, s32, 31 +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 9 +; GFX11-NEXT: s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_mov_b32 s2, s34 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_mov_b32 s34, s32 +; GFX11-NEXT: s_addk_i32 s32, 0x120 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, s33 offset:32 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 10 +; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 +; GFX11-NEXT: s_mov_b32 s34, s2 +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: s_mov_b32 s33, s1 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_addk_i32 s32, 0xfee0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %fixed.realign = alloca [42 x i32], align 32, addrspace(5) + store volatile i32 9, ptr addrspace(5) %fixed.realign + %dyn.alloca = alloca i32, i32 %n, align 32, addrspace(5) + store volatile i32 10, ptr addrspace(5) %dyn.alloca + ret void +} + +declare hidden amdgpu_gfx void @uses_stack_args([40 x i32]) + +define amdgpu_gfx void @test_dynamic_stackalloc_function_uniform_outgoing_stack_args(i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_function_uniform_outgoing_stack_args: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 
s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, s34 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: buffer_store_dword v0, v41, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-NEXT: v_mov_b32_e32 v19, 0 +; GFX9-NEXT: v_mov_b32_e32 v20, 0 +; GFX9-NEXT: v_mov_b32_e32 v21, 0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0 +; GFX9-NEXT: v_mov_b32_e32 v23, 0 +; GFX9-NEXT: v_mov_b32_e32 v24, 0 +; GFX9-NEXT: v_mov_b32_e32 v25, 0 +; GFX9-NEXT: v_mov_b32_e32 v26, 0 +; GFX9-NEXT: v_mov_b32_e32 v27, 0 +; GFX9-NEXT: 
v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v29, 0 +; GFX9-NEXT: v_mov_b32_e32 v30, 0 +; GFX9-NEXT: v_mov_b32_e32 v31, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[34:35] +; GFX9-NEXT: s_add_u32 s34, s34, uses_stack_args@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s35, s35, uses_stack_args@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: buffer_store_dword v0, v41, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s34 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_function_uniform_outgoing_stack_args: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_add_i32 s1, s0, 15 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s2, s1, 0x7fffff0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_lshl_b32 s34, s2, 5 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_add_i32 s4, s32, s34 +; GFX11-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_mov_b32 s32, s4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: scratch_store_b32 off, v4, s4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, 0 +; GFX11-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v12, 0 +; GFX11-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v14, 0 +; GFX11-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v16, 0 +; GFX11-NEXT: v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v18, 0 +; GFX11-NEXT: v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v20, 0 +; GFX11-NEXT: v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v22, 0 +; GFX11-NEXT: v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v24, 0 +; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0 +; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0 +; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, uses_stack_args@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, uses_stack_args@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: scratch_store_b32 off, v0, s4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload 
+; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %dyn.alloca = alloca i32, i32 %n, addrspace(5) + store volatile i32 1, ptr addrspace(5) %dyn.alloca + call amdgpu_gfx void @uses_stack_args([40 x i32] zeroinitializer) + store volatile i32 2, ptr addrspace(5) %dyn.alloca + ret void +} + +define amdgpu_gfx void @test_dynamic_stackalloc_function_uniform_outgoing_stack_args_realign32(i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_function_uniform_outgoing_stack_args_realign32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s34, s33 +; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 +; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_addk_i32 s32, 0x4800 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: s_and_b32 s34, s34, 0xfffff800 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, s34 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: buffer_store_dword v0, v41, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-NEXT: v_mov_b32_e32 v14, 0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0 +; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: v_mov_b32_e32 v17, 0 +; GFX9-NEXT: v_mov_b32_e32 v18, 0 +; GFX9-NEXT: v_mov_b32_e32 v19, 0 +; GFX9-NEXT: v_mov_b32_e32 v20, 0 +; GFX9-NEXT: v_mov_b32_e32 v21, 0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0 +; GFX9-NEXT: v_mov_b32_e32 v23, 0 +; GFX9-NEXT: v_mov_b32_e32 v24, 0 +; GFX9-NEXT: v_mov_b32_e32 v25, 0 +; GFX9-NEXT: v_mov_b32_e32 v26, 0 +; GFX9-NEXT: v_mov_b32_e32 v27, 0 +; GFX9-NEXT: v_mov_b32_e32 v28, 0 +; GFX9-NEXT: v_mov_b32_e32 v29, 0 +; GFX9-NEXT: v_mov_b32_e32 v30, 0 +; GFX9-NEXT: v_mov_b32_e32 v31, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[34:35] +; GFX9-NEXT: s_add_u32 s34, s34, uses_stack_args@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s35, s35, uses_stack_args@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: buffer_store_dword v0, v41, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: s_or_saveexec_b64 
s[36:37], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_addk_i32 s32, 0xb800 +; GFX9-NEXT: s_mov_b32 s33, s34 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_function_uniform_outgoing_stack_args_realign32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_add_i32 s33, s32, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:192 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: s_addk_i32 s32, 0x100 +; GFX11-NEXT: s_add_i32 s1, s0, 15 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_and_b32 s1, s1, 0x7fffff0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_add_i32 s34, s32, s1 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, 9 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, 1 +; GFX11-NEXT: s_and_b32 s4, s34, 0xfffffc00 +; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v11, 0 +; GFX11-NEXT: scratch_store_b32 off, v0, s33 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s32, s4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: scratch_store_b32 off, v4, s4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; 
GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v4, 0 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v10, 0 +; GFX11-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v12, 0 +; GFX11-NEXT: v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v14, 0 +; GFX11-NEXT: v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v16, 0 +; GFX11-NEXT: v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v18, 0 +; GFX11-NEXT: v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v20, 0 +; GFX11-NEXT: v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v22, 0 +; GFX11-NEXT: v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v24, 0 +; GFX11-NEXT: v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v26, 0 +; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v28, 0 +; GFX11-NEXT: v_mov_b32_e32 v30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, uses_stack_args@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, uses_stack_args@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: scratch_store_b32 off, v0, s4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:192 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_addk_i32 s32, 0xff00 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %fixed.realign = alloca [42 x i32], align 32, addrspace(5) + store volatile i32 9, ptr addrspace(5) %fixed.realign + %dyn.alloca = alloca i32, i32 %n, align 32, addrspace(5) + store volatile i32 1, ptr addrspace(5) %dyn.alloca + call amdgpu_gfx void @uses_stack_args([40 x i32] 
zeroinitializer) + store volatile i32 2, ptr addrspace(5) %dyn.alloca + ret void +} + +define amdgpu_gfx void @test_dynamic_stackalloc_function_uniform_realign32_csr_spilling(i32 inreg %n) { +; GFX9-LABEL: test_dynamic_stackalloc_function_uniform_realign32_csr_spilling: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s35, s33 +; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 +; GFX9-NEXT: s_lshl_b32 s34, s4, 2 +; GFX9-NEXT: s_add_i32 s34, s34, 15 +; GFX9-NEXT: s_and_b32 s34, s34, 0x3fffff0 +; GFX9-NEXT: s_addk_i32 s32, 0x4800 +; GFX9-NEXT: s_lshl_b32 s34, s34, 6 +; GFX9-NEXT: s_add_i32 s34, s32, s34 +; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: s_and_b32 s34, s34, 0xfffff800 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; clobber v40 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; clobber v41 +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_addk_i32 s32, 0xb800 +; GFX9-NEXT: s_mov_b32 s33, s35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_dynamic_stackalloc_function_uniform_realign32_csr_spilling: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s1, s33 +; GFX11-NEXT: s_add_i32 s33, s32, 31 +; GFX11-NEXT: s_lshl_b32 s0, s4, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 9 +; GFX11-NEXT: 
s_add_i32 s0, s0, 15 +; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_and_b32 s0, s0, 0x7fffff0 +; GFX11-NEXT: s_addk_i32 s32, 0x120 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: scratch_store_b32 off, v0, s33 offset:32 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-NEXT: s_add_i32 s0, s32, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; clobber v40 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; clobber v41 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 s32, s0 +; GFX11-NEXT: s_mov_b32 s33, s1 +; GFX11-NEXT: s_addk_i32 s32, 0xfee0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %fixed.realign = alloca [42 x i32], align 32, addrspace(5) + store volatile i32 9, ptr addrspace(5) %fixed.realign + %dyn.alloca = alloca i32, i32 %n, align 32, addrspace(5) + store volatile i32 2, ptr addrspace(5) %dyn.alloca + call void asm sideeffect "; clobber v40", "~{v40}" () + call void asm sideeffect "; clobber v41", "~{v41}" () + ret void +} Index: llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.r600.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.r600.ll @@ -0,0 +1,10 @@ +; RUN: not llc -march=r600 -mtriple=r600-- -mcpu=cypress < %s 2>&1 | FileCheck %s +target datalayout = "A5" + +; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca + +define amdgpu_kernel void @test_dynamic_stackalloc(ptr addrspace(1) %out, i32 %n) { + %alloca = alloca 
i32, i32 %n, addrspace(5) + store volatile i32 0, ptr addrspace(5) %alloca + ret void +}