Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -468,6 +468,7 @@
 }
 
 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+  const Function &ContainingFunction = *I.getParent()->getParent();
   if (!I.isStaticAlloca())
     return;
 
@@ -486,7 +487,7 @@
   // FIXME: This is the maximum work group size. We should try to get
   // value from the reqd_work_group_size function attribute if it is
   // available.
-  unsigned WorkGroupSize = 256;
+  unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
   int AllocaSize =
       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
 
@@ -507,7 +508,7 @@
 
   Function *F = I.getParent()->getParent();
 
-  Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
+  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
   GlobalVariable *GV = new GlobalVariable(
       *Mod, GVTy, false, GlobalValue::InternalLinkage,
       UndefValue::get(GVTy),
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -60,6 +60,8 @@
   unsigned PSInputAddr;
   bool ReturnsVoid;
 
+  unsigned MaximumWorkGroupSize;
+
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,6 +48,7 @@
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     PSInputAddr(0),
     ReturnsVoid(true),
+    MaximumWorkGroupSize(256),
     LDSWaveSpillSize(0),
     PSInputEna(0),
     NumUserSGPRs(0),
@@ -120,6 +121,11 @@
   if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
       ST.isAmdHsaOS())
     FlatScratchInit = true;
+
+  if (getShaderType() == ShaderType::COMPUTE)
+    MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
+  else
+    MaximumWorkGroupSize = ST.getWavefrontSize();
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -199,8 +205,5 @@
 
 unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  // FIXME: We should get this information from kernel attributes if it
-  // is available.
-  return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize();
+  return MaximumWorkGroupSize;
 }
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -23,6 +23,44 @@
 
 using namespace llvm;
 
+namespace {
+
+unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
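+  // A compute unit has 4 SIMDs and a wavefront covers 64 work items, so a
+  // work group of N items needs ceil(N / 256) wave slots on every SIMD for
+  // the whole group to be resident on one compute unit at once.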
+  return (MFI.getMaximumWorkGroupSize(MF) + 255) / 256;
+}
+
+unsigned getAllowedSGPRCount(const MachineFunction &MF) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+
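+  // Share the per-SIMD SGPR file (800 registers on VI, 512 on SI/CI) among
+  // the waves that must be resident, rounded down to the allocation
+  // granularity (16 on VI, 8 on SI/CI). Registers for VCC, FLAT_SCR and
+  // XNACK_MASK are then carved out, and the result is clamped to the
+  // addressable limit (102 on VI, 104 on SI/CI), or to the SGPR init bug
+  // limit on Tonga/Iceland.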
+  unsigned AllowedSGPRCount;
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    AllowedSGPRCount = 800;
+    AllowedSGPRCount = (AllowedSGPRCount / MaxWaveCountPerSIMD) & ~15;
+
+    if (ST.hasSGPRInitBug())
+      AllowedSGPRCount = std::min(AllowedSGPRCount,
+          unsigned(AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG)) - 6;
+    else
+      AllowedSGPRCount = std::min(AllowedSGPRCount - 6, 102U);
+  } else {
+    AllowedSGPRCount = 512;
+    AllowedSGPRCount = (AllowedSGPRCount / MaxWaveCountPerSIMD) & ~7;
+    AllowedSGPRCount = std::min(AllowedSGPRCount - 2, 104U);
+  }
+
+  return AllowedSGPRCount;
+}
+
+unsigned getAllowedVGPRCount(const MachineFunction &MF) {
+  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+
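+  // Each SIMD has 256 VGPRs; divide them among the resident waves and round
+  // down to the VGPR allocation granularity of 4.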
+  return (256 / MaxWaveCountPerSIMD) & ~3;
+}
+
+} // end anonymous namespace
+
 SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() {
   unsigned NumRegPressureSets = getNumRegPressureSets();
@@ -47,38 +85,20 @@
 
 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    // Leave space for flat_scr, xnack_mask, vcc, and alignment
-    unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
-    unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
-    return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
-    // 100/101 for vcc. This is the next sgpr128 down.
-    return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
-  }
-
-  return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+  unsigned BaseIdx = (getAllowedSGPRCount(MF) & ~3) - 4;
+  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
 }
 
 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
-    return AMDGPU::SGPR_32RegClass.getRegister(Idx);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // Next register before reservations for flat_scr, xnack_mask, vcc,
-    // and scratch resource.
-    return AMDGPU::SGPR91;
-  }
-
-  return AMDGPU::SGPR95;
+  unsigned Idx = getAllowedSGPRCount(MF);
+  // Try to place it in a hole after PrivateSegmentBufferReg.
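+  // reservedPrivateSegmentBufferReg places the 4-aligned SReg_128 resource
+  // descriptor at (AllowedSGPRCount & ~3) - 4, so when AllowedSGPRCount is
+  // not a multiple of 4 the SGPRs directly above the descriptor are unused;
+  // otherwise fall back to the register just below the descriptor.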
+  if (Idx & 3)
+    Idx -= 1;
+  else
+    Idx -= 5;
+  return AMDGPU::SGPR_32RegClass.getRegister(Idx);
 }
 
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -90,35 +110,21 @@
   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
 
-  // Reserve the last 2 registers so we will always have at least 2 more that
-  // will physically contain VCC.
-  reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
-
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned AllowedSGPRCount = getAllowedSGPRCount(MF);
+  unsigned AllowedVGPRCount = getAllowedVGPRCount(MF);
 
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
-    // for VCC/XNACK_MASK/FLAT_SCR.
-    //
-    // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
-    // SGPRs when the XNACK feature is not used. This is currently not done
-    // because the code that counts SGPRs cannot account for such holes.
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+  for (unsigned i = AllowedSGPRCount; i < NumSGPRs; ++i) {
+    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
-  // Tonga and Iceland can only allocate a fixed number of SGPRs due
-  // to a hw bug.
-  if (ST.hasSGPRInitBug()) {
-    unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
-    // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
-    unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
-    for (unsigned i = Limit; i < NumSGPRs; ++i) {
-      unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
-      reserveRegisterTuples(Reserved, Reg);
-    }
+  for (unsigned i = AllowedVGPRCount; i < NumVGPRs; ++i) {
+    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -45,6 +45,7 @@
 bool isReadOnlySegment(const GlobalValue *GV);
 
 unsigned getShaderType(const Function &F);
+unsigned getMaximumWorkGroupSize(const Function &F);
 
 unsigned getInitialPSInputAddr(const Function &F);
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -128,6 +128,10 @@
   return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE);
 }
 
+unsigned getMaximumWorkGroupSize(const Function &F) {
+  return getIntegerAttribute(F, "maximum-work-group-size", 256);
+}
+
 unsigned getInitialPSInputAddr(const Function &F) {
   return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
Index: test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -0,0 +1,73 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+
+; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #63 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+attributes #63 = { nounwind "maximum-work-group-size"="63" }
+
+; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #256 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+attributes #256 = { nounwind "maximum-work-group-size"="256" }
+
+; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+
+define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1600 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+attributes #1600 = { nounwind "maximum-work-group-size"="1600" }
Index: test/CodeGen/AMDGPU/large-work-group-registers.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/large-work-group-registers.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s
+
+; CHECK: NumVgprs: 63
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
+main_body:
+  %8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
+  %9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
+  %10 = extractelement <3 x i32> %7, i32 0
+  %11 = extractelement <3 x i32> %7, i32 1
+  %12 = mul i32 %10, %11
+  %bc = bitcast <3 x i32> %7 to <3 x float>
+  %13 = extractelement <3 x float> %bc, i32 1
+  %14 = insertelement <512 x float> undef, float %13, i32 %12
+  call void @llvm.amdgcn.s.barrier()
+  %15 = extractelement <3 x i32> %6, i32 0
+  %16 = extractelement <3 x i32> %7, i32 0
+  %17 = shl i32 %15, 5
+  %18 = add i32 %17, %16
+  %19 = shl i32 %18, 4
+  %20 = extractelement <3 x i32> %7, i32 1
+  %21 = shl i32 %20, 2
+  %22 = sext i32 %21 to i64
+  %23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
+  %24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
+  %25 = load i32, i32 addrspace(3)* %24, align 4
+  %26 = extractelement <512 x float> %14, i32 %25
+  %27 = insertelement <4 x float> undef, float %26, i32 0
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+declare void @llvm.amdgcn.s.barrier() #1
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
+
+attributes #0 = { "maximum-work-group-size"="1024" "ShaderType"="3" }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind }
+
+!0 = !{!"const", null, i32 1}
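+
+; A work group of 1024 items needs ceil(1024 / 256) = 4 waves per SIMD, which
+; leaves (256 / 4) & ~3 = 64 VGPRs per wave, so the <512 x float> shuffle
+; above must be colored into at most 64 VGPRs (63 are used here).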