Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -478,6 +478,12 @@
     .clampMaxNumElements(0, S32, 16);
 
   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
+
+  // If the amount is divergent, we have to do a wave reduction to get the
+  // maximum value, so this is expanded during RegBankSelect.
+  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
+    .legalFor({{PrivatePtr, S32}});
+
   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
     .unsupportedFor({PrivatePtr})
     .custom();
@@ -1420,7 +1426,7 @@
   }).lower();
 
   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
-        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
+        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
     .unsupported();
 
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -70,11 +70,15 @@
   void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
                                     unsigned OpIdx) const;
   bool applyMappingWideLoad(MachineInstr &MI,
-                            const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+                            const OperandsMapper &OpdMapper,
                             MachineRegisterInfo &MRI) const;
+
+  bool applyMappingDynStackAlloc(MachineInstr &MI,
+                                 const OperandsMapper &OpdMapper,
+                                 MachineRegisterInfo &MRI) const;
   bool applyMappingImage(MachineInstr &MI,
-                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+                         const OperandsMapper &OpdMapper,
                          MachineRegisterInfo &MRI, int RSrcIdx) const;
   bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1154,6 +1154,56 @@
   return true;
 }
 
+bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
+  MachineInstr &MI,
+  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+  MachineRegisterInfo &MRI) const {
+  const MachineFunction &MF = *MI.getMF();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const auto &TFI = *ST.getFrameLowering();
+
+  // Guard in case the stack growth direction ever changes with scratch
+  // instructions.
+  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
+    return false;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register AllocSize = MI.getOperand(1).getReg();
+  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
+
+  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
+
+  // TODO: Need to emit a wave reduction to get the maximum size.
+  if (SizeBank != &AMDGPU::SGPRRegBank)
+    return false;
+
+  LLT PtrTy = MRI.getType(Dst);
+  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  Register SPReg = Info->getStackPtrOffsetReg();
+  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
+  GISelObserverWrapper Observer(&ApplyBank);
+
+  MachineIRBuilder B(MI);
+  B.setChangeObserver(Observer);
+
+  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
+  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
+
+  auto SPCopy = B.buildCopy(PtrTy, SPReg);
+  if (Alignment > TFI.getStackAlign()) {
+    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
+    B.buildMaskLowPtrBits(Dst, PtrAdd,
+                          Log2(Alignment) + ST.getWavefrontSizeLog2());
+  } else {
+    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPURegisterBankInfo::applyMappingImage(
     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
     MachineRegisterInfo &MRI, int RsrcIdx) const {
@@ -2811,6 +2861,9 @@
       return;
     break;
   }
+  case AMDGPU::G_DYN_STACKALLOC:
+    applyMappingDynStackAlloc(MI, OpdMapper, MRI);
+    return;
   default:
     break;
   }
@@ -3315,6 +3368,13 @@
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
     break;
   }
+  case AMDGPU::G_DYN_STACKALLOC: {
+    // Result is always uniform, and a wave reduction is needed for the source.
+    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+    unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+    OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
+    break;
+  }
   case AMDGPU::G_INSERT: {
     unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
                                           AMDGPU::VGPRRegBankID;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-divergent.ll
@@ -0,0 +1,28 @@
+; RUN: not llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel-abort=2 -pass-remarks-missed="gisel.*" -o /dev/null 2>&1 %s | FileCheck -check-prefix=ERR %s
+
+; ERR: remark: <unknown>:0:0: cannot select: %24:sreg_32(p5) = G_DYN_STACKALLOC %23:vgpr(s32), 1 (in function: kernel_dynamic_stackalloc_vgpr_align4)
+; ERR-NEXT: warning: Instruction selection used fallback path for kernel_dynamic_stackalloc_vgpr_align4
+; ERR-NEXT: error: <unknown>:0:0: in function kernel_dynamic_stackalloc_vgpr_align4 void (i32 addrspace(1)*): unsupported dynamic alloca
+
+; ERR: remark: <unknown>:0:0: cannot select: %8:sreg_32(p5) = G_DYN_STACKALLOC %7:vgpr(s32), 1 (in function: func_dynamic_stackalloc_vgpr_align4)
+; ERR-NEXT: warning: Instruction selection used fallback path for func_dynamic_stackalloc_vgpr_align4
+; ERR-NEXT: error: <unknown>:0:0: in function func_dynamic_stackalloc_vgpr_align4 void (i32): unsupported dynamic alloca
+
+define amdgpu_kernel void @kernel_dynamic_stackalloc_vgpr_align4(i32 addrspace(1)* %ptr) {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+  %n = load i32, i32 addrspace(1)* %gep
+  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %alloca
+  ret void
+}
+
+define void @func_dynamic_stackalloc_vgpr_align4(i32 %n) {
+  %alloca = alloca i32, i32 %n, align 4, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %alloca
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }
Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -0,0 +1,317 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +@gv = external addrspace(4) constant i32 + +define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align4(i32 %n) { +; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s6, s6, s9 +; GFX10-NEXT: s_mov_b32 s33, 0 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX10-NEXT: s_and_b32 s4, s4, -16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_endpgm + %alloca = alloca i32, i32 %n, align 4, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} + +define void @func_dynamic_stackalloc_sgpr_align4() { +; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; 
GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX10-NEXT: s_and_b32 s4, s4, -16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %n = load i32, i32 addrspace(4)* @gv, align 4 + %alloca = alloca i32, i32 %n, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} + +define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align16(i32 %n) { +; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s6, s6, s9 +; GFX10-NEXT: s_mov_b32 s33, 0 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX10-NEXT: s_and_b32 s4, s4, -16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_endpgm + %alloca = alloca i32, i32 %n, align 16, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} + +define void @func_dynamic_stackalloc_sgpr_align16() { +; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: 
func_dynamic_stackalloc_sgpr_align16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_add_u32 s32, s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX10-NEXT: s_and_b32 s4, s4, -16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %n = load i32, i32 addrspace(4)* @gv, align 16 + %alloca = alloca i32, i32 %n, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} + +define amdgpu_kernel void @kernel_dynamic_stackalloc_sgpr_align32(i32 %n) { +; GFX9-LABEL: kernel_dynamic_stackalloc_sgpr_align32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s9 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_mov_b32 s33, 0 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: kernel_dynamic_stackalloc_sgpr_align32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s6, s6, s9 +; GFX10-NEXT: s_mov_b32 s33, 0 +; GFX10-NEXT: s_addc_u32 s7, s7, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_add_u32 s0, s0, s9 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX10-NEXT: s_and_b32 s4, s4, -16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: s_and_b32 s4, s4, 0xfffffc00 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_endpgm + %alloca = alloca i32, i32 %n, align 32, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} + +define void @func_dynamic_stackalloc_sgpr_align32(i32 addrspace(1)* %out) { +; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s4, s32, 0x7c0 +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_and_b32 s33, s4, 0xfffff800 +; GFX9-NEXT: s_add_u32 s32, s32, 0x1000 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX9-NEXT: s_and_b32 s4, s4, -16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 6 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff800 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_sub_u32 s32, s32, 0x1000 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_add_u32 s4, s32, 0x3e0 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_and_b32 s33, s4, 0xfffffc00 +; GFX10-NEXT: s_add_u32 s32, s32, 0x800 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+4 +; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl2_add_u32 s4, s4, 15 +; GFX10-NEXT: s_and_b32 s4, s4, -16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 5 +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: s_and_b32 s4, s4, 0xfffffc00 +; GFX10-NEXT: s_sub_u32 s32, s32, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %n = load i32, i32 addrspace(4)* @gv + %alloca = alloca i32, i32 %n, align 32, addrspace(5) + store i32 0, i32 addrspace(5)* %alloca + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -0,0 +1,290 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s +; RUN: llc -global-isel -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s + +; FIXME: Generated test checks do not check metadata at the end of the +; function, so this also includes manually added checks. + +; Test that we can select a statically sized alloca outside of the +; entry block. + +; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an +; alignment less than the stack alignment. 
+define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { +; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s33, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s6, 1, 0 +; GCN-NEXT: s_and_b32 s6, s6, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cbranch_scc1 BB0_3 +; GCN-NEXT: ; %bb.1: ; %bb.0 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xc +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s6, 1, 0 +; GCN-NEXT: s_and_b32 s6, s6, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cbranch_scc1 BB0_3 +; GCN-NEXT: ; %bb.2: ; %bb.1 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x10 +; GCN-NEXT: s_add_u32 s5, s32, 0x1000 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: s_add_u32 s8, s5, 4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_u32_e32 v2, v1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: global_store_dword v[0:1], v2, off +; GCN-NEXT: BB0_3: ; %bb.2 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_endpgm + +entry: + %cond0 = icmp eq i32 %arg.cond0, 0 + br i1 %cond0, label %bb.0, label %bb.2 + +bb.0: + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 + %cond1 = icmp eq i32 %arg.cond1, 0 + br i1 %cond1, label %bb.1, label %bb.2 + +bb.1: + ; Use the alloca outside of the defining block. 
+ store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in + %load = load i32, i32 addrspace(5)* %gep2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %load, %tid + store i32 %add, i32 addrspace(1)* %out + br label %bb.2 + +bb.2: + store volatile i32 0, i32 addrspace(1)* undef + ret void +} +; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112 +; DEFAULTSIZE: ; ScratchSize: 4112 + +; ASSUME1024: .amdhsa_private_segment_fixed_size 1040 +; ASSUME1024: ; ScratchSize: 1040 + +define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { +; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x8 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: s_mov_b32 s33, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s6, 1, 0 +; GCN-NEXT: s_and_b32 s6, s6, 1 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cbranch_scc1 BB1_2 +; GCN-NEXT: ; %bb.1: ; %bb.0 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s5, s32, 0x1000 +; GCN-NEXT: s_and_b32 s5, s5, 0xfffff000 +; GCN-NEXT: s_add_u32 s8, s5, 4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_u32_e32 v2, v1, v0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: global_store_dword v[0:1], v2, off +; GCN-NEXT: BB1_2: ; %bb.1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_endpgm +entry: + %cond = icmp eq i32 %arg.cond, 0 + br i1 %cond, label %bb.0, label %bb.1 + +bb.0: + %alloca = alloca [16 x i32], align 64, addrspace(5) + %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in + %load = load i32, i32 addrspace(5)* %gep2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %load, %tid + store i32 %add, i32 addrspace(1)* %out + br label %bb.1 + +bb.1: + store volatile i32 0, i32 addrspace(1)* undef + ret void +} + +; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160 +; DEFAULTSIZE: ; ScratchSize: 4160 + +; ASSUME1024: .amdhsa_private_segment_fixed_size 1088 +; ASSUME1024: ; ScratchSize: 1088 + + +define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { +; GCN-LABEL: func_non_entry_block_static_alloca_align4: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_mov_b32 s33, s32 +; 
GCN-NEXT: s_add_u32 s32, s32, 0x400 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz BB2_3 +; GCN-NEXT: ; %bb.1: ; %bb.0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN-NEXT: s_and_b64 exec, exec, vcc +; GCN-NEXT: s_cbranch_execz BB2_3 +; GCN-NEXT: ; %bb.2: ; %bb.1 +; GCN-NEXT: s_add_u32 s6, s32, 0x1000 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: s_add_u32 s7, s6, 4 +; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v4 +; GCN-NEXT: v_add_u32_e32 v2, s6, v2 +; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_u32_e32 v2, v2, v3 +; GCN-NEXT: global_store_dword v[0:1], v2, off +; GCN-NEXT: BB2_3: ; %bb.2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_sub_u32 s32, s32, 0x400 +; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] + +entry: + %cond0 = icmp eq i32 %arg.cond0, 0 + br i1 %cond0, label %bb.0, label %bb.2 + +bb.0: + %alloca = alloca [16 x i32], align 4, addrspace(5) + %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 + %cond1 = icmp eq i32 %arg.cond1, 0 + br i1 %cond1, label %bb.1, label %bb.2 + +bb.1: + ; Use the alloca outside of the defining block. + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in + %load = load i32, i32 addrspace(5)* %gep2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %load, %tid + store i32 %add, i32 addrspace(1)* %out + br label %bb.2 + +bb.2: + store volatile i32 0, i32 addrspace(1)* undef + ret void +} + +define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { +; GCN-LABEL: func_non_entry_block_static_alloca_align64: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_add_u32 s4, s32, 0xfc0 +; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_and_b32 s33, s4, 0xfffff000 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_add_u32 s32, s32, 0x2000 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz BB3_2 +; GCN-NEXT: ; %bb.1: ; %bb.0 +; GCN-NEXT: s_add_u32 s6, s32, 0x1000 +; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GCN-NEXT: s_add_u32 s7, s6, 4 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v3 +; GCN-NEXT: v_add_u32_e32 v2, s6, v2 +; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_u32_e32 v2, v2, v3 +; GCN-NEXT: global_store_dword v[0:1], v2, off +; GCN-NEXT: BB3_2: ; %bb.1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_sub_u32 s32, s32, 0x2000 +; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +entry: + 
%cond = icmp eq i32 %arg.cond, 0 + br i1 %cond, label %bb.0, label %bb.1 + +bb.0: + %alloca = alloca [16 x i32], align 64, addrspace(5) + %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0 + %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1 + store i32 0, i32 addrspace(5)* %gep0 + store i32 1, i32 addrspace(5)* %gep1 + %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in + %load = load i32, i32 addrspace(5)* %gep2 + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %load, %tid + store i32 %add, i32 addrspace(1)* %out + br label %bb.1 + +bb.1: + store volatile i32 0, i32 addrspace(1)* undef + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone speculatable } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-dyn-stackalloc.mir @@ -0,0 +1,381 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck -check-prefix=WAVE64 %s +# XUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck -check-prefix=WAVE64 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck -check-prefix=WAVE32 %s +# XUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck -check-prefix=WAVE32 %s + + +--- +name: test_dyn_stackalloc_sgpr_align1 +legalized: true +frameInfo: + maxAlignment: 2 +stack: + - { id: 0, type: variable-sized, alignment: 1 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_align1 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align1 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + %0:_(s32) = COPY $sgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 1 + S_ENDPGM 0, implicit %1 +... 
+ +--- +name: test_dyn_stackalloc_sgpr_align2 +legalized: true +frameInfo: + maxAlignment: 2 +stack: + - { id: 0, type: variable-sized, alignment: 2 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_align2 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align2 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + %0:_(s32) = COPY $sgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 2 + S_ENDPGM 0, implicit %1 +... + +--- +name: test_dyn_stackalloc_sgpr_align4 +legalized: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: variable-sized, alignment: 4 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_align4 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align4 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + %0:_(s32) = COPY $sgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 4 + S_ENDPGM 0, implicit %1 +... + +--- +name: test_dyn_stackalloc_sgpr_align8 +legalized: true +frameInfo: + maxAlignment: 8 +stack: + - { id: 0, type: variable-sized, alignment: 8 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_align8 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align8 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + %0:_(s32) = COPY $sgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 8 + S_ENDPGM 0, implicit %1 +... 
+ +--- +name: test_dyn_stackalloc_sgpr_align16 +legalized: true +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, type: variable-sized, alignment: 16 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_align16 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align16 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + %0:_(s32) = COPY $sgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 16 + S_ENDPGM 0, implicit %1 +... + +--- +name: test_dyn_stackalloc_sgpr_align32 +legalized: true +frameInfo: + maxAlignment: 32 +stack: + - { id: 0, type: variable-sized, alignment: 32 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_align32 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 + ; WAVE64: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align32 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024 + ; WAVE32: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTRMASK]](p5) + %0:_(s32) = COPY $sgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 32 + S_ENDPGM 0, implicit %1 +... 
+ +--- +name: test_dyn_stackalloc_sgpr_align64 +legalized: true +frameInfo: + maxAlignment: 64 +stack: + - { id: 0, type: variable-sized, alignment: 64 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_align64 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096 + ; WAVE64: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align64 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 + ; WAVE32: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTRMASK]](p5) + %0:_(s32) = COPY $sgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 64 + S_ENDPGM 0, implicit %1 +... + +--- +name: test_dyn_stackalloc_sgpr_align128 +legalized: true +frameInfo: + maxAlignment: 64 +stack: + - { id: 0, type: variable-sized, alignment: 128 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_align128 + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE64: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -8192 + ; WAVE64: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_align128 + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY]], [[C]](s32) + ; WAVE32: [[COPY1:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY1]], [[SHL]](s32) + ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -4096 + ; WAVE32: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C1]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTRMASK]](p5) + %0:_(s32) = COPY $sgpr0 + %1:_(p5) = G_DYN_STACKALLOC %0, 128 + S_ENDPGM 0, implicit %1 +... 
+ +--- +name: test_dyn_stackalloc_sgpr_constant_align4 +legalized: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: variable-sized, alignment: 4 } +body: | + bb.0: + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_constant_align4 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 + ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align4 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 + ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + %0:_(s32) = G_CONSTANT i32 32 + %1:_(p5) = G_DYN_STACKALLOC %0, 4 + S_ENDPGM 0, implicit %1 +... + +--- +name: test_dyn_stackalloc_sgpr_constant_align8 +legalized: true +frameInfo: + maxAlignment: 8 +stack: + - { id: 0, type: variable-sized, alignment: 8 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_constant_align8 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 + ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align8 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 + ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + %0:_(s32) = G_CONSTANT i32 32 + %1:_(p5) = G_DYN_STACKALLOC %0, 8 + S_ENDPGM 0, implicit %1 +... + +--- +name: test_dyn_stackalloc_sgpr_constant_align16 +legalized: true +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, type: variable-sized, alignment: 16 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_constant_align16 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 + ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align16 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 + ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTR_ADD]](p5) + %0:_(s32) = G_CONSTANT i32 32 + %1:_(p5) = G_DYN_STACKALLOC %0, 16 + S_ENDPGM 0, implicit %1 +... 
+ +--- +name: test_dyn_stackalloc_sgpr_constant_align32 +legalized: true +frameInfo: + maxAlignment: 32 +stack: + - { id: 0, type: variable-sized, alignment: 32 } +body: | + bb.0: + liveins: $sgpr0 + + ; WAVE64-LABEL: name: test_dyn_stackalloc_sgpr_constant_align32 + ; WAVE64: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 + ; WAVE64: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; WAVE64: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) + ; WAVE64: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE64: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) + ; WAVE64: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -2048 + ; WAVE64: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE64: S_ENDPGM 0, implicit [[PTRMASK]](p5) + ; WAVE32-LABEL: name: test_dyn_stackalloc_sgpr_constant_align32 + ; WAVE32: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 32 + ; WAVE32: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; WAVE32: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[C]], [[C1]](s32) + ; WAVE32: [[COPY:%[0-9]+]]:sgpr(p5) = COPY $sp_reg + ; WAVE32: [[PTR_ADD:%[0-9]+]]:sgpr(p5) = G_PTR_ADD [[COPY]], [[SHL]](s32) + ; WAVE32: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -1024 + ; WAVE32: [[PTRMASK:%[0-9]+]]:sgpr(p5) = G_PTRMASK [[PTR_ADD]], [[C2]](s32) + ; WAVE32: S_ENDPGM 0, implicit [[PTRMASK]](p5) + %0:_(s32) = G_CONSTANT i32 32 + %1:_(p5) = G_DYN_STACKALLOC %0, 32 + S_ENDPGM 0, implicit %1 +...