Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -507,6 +507,10 @@ Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); OutStreamer->emitRawComment( + " Occupancy: " + + Twine(CurrentProgramInfo.Occupancy), false); + + OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); OutStreamer->emitRawComment( @@ -1057,6 +1061,10 @@ // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); + + ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize, + ProgInfo.NumSGPRsForWavesPerEU, + ProgInfo.NumVGPRsForWavesPerEU); } static unsigned getRsrcReg(CallingConv::ID CallConv) { Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -75,6 +75,7 @@ bool HasFminFmaxLegacy; bool EnablePromoteAlloca; bool HasTrigReducedRange; + unsigned MaxWavesPerEU; int LocalMemorySize; unsigned WavefrontSize; @@ -223,7 +224,9 @@ /// subtarget. virtual unsigned getMinWavesPerEU() const = 0; - unsigned getMaxWavesPerEU() const { return 10; } + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget without any kind of limitation. + unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; } /// Creates value range metadata on an workitemid.* inrinsic call or load. 
bool makeLIDRangeMetadata(Instruction *I) const; @@ -245,6 +248,9 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo, public AMDGPUSubtarget { + + using AMDGPUSubtarget::getMaxWavesPerEU; + public: enum TrapHandlerAbi { TrapHandlerAbiNone = 0, @@ -881,12 +887,6 @@ return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); } - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget without any kind of limitation. - unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(this); - } - /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { @@ -1036,6 +1036,13 @@ /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; + /// Return occupancy for the given function. Used LDS and a number of + /// registers if provided. + /// Note, occupancy can be affected by the scratch allocation as well, but + /// we do not have enough information to compute it. + unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0, + unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; + /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. 
bool flatScratchIsPointer() const { Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -175,6 +175,7 @@ HasFminFmaxLegacy(true), EnablePromoteAlloca(false), HasTrigReducedRange(false), + MaxWavesPerEU(10), LocalMemorySize(0), WavefrontSize(0) { } @@ -278,6 +279,7 @@ InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); @@ -566,7 +568,7 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::GFX10) - return 10; + return getMaxWavesPerEU(); if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) @@ -616,6 +618,20 @@ return 2; // VCC. 
} +unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF, + unsigned LDSSize, + unsigned NumSGPRs, + unsigned NumVGPRs) const { + unsigned Occupancy = + std::min(getMaxWavesPerEU(), + getOccupancyWithLocalMemSize(LDSSize, MF.getFunction())); + if (NumSGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); + if (NumVGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); + return Occupancy; +} + unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); Index: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -53,8 +53,7 @@ FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); - Occupancy = getMaxWavesPerEU(); - limitOccupancy(MF); + Occupancy = ST.computeOccupancy(MF, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { Index: llvm/trunk/lib/Target/AMDGPU/SIProgramInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIProgramInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIProgramInfo.h @@ -51,6 +51,9 @@ // Number of VGPRs that meets number of waves per execution unit request. uint32_t NumVGPRsForWavesPerEU = 0; + // Final occupancy. + uint32_t Occupancy = 0; + // Whether there is recursion, dynamic allocas, indirect calls or some other // reason there may be statically unknown stack usage. 
bool DynamicCallStack = false; Index: llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -74,10 +74,7 @@ ; CHECK: .name: num_spilled_vgprs ; CHECK: .symbol: num_spilled_vgprs.kd -; GFX700: .vgpr_spill_count: 14 -; GFX803: .vgpr_spill_count: 14 -; GFX900: .vgpr_spill_count: 14 -; GFX1010: .vgpr_spill_count: 0 +; CHECK: .vgpr_spill_count: 14 define amdgpu_kernel void @num_spilled_vgprs() #1 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var Index: llvm/trunk/test/CodeGen/AMDGPU/nsa-reassign.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/nsa-reassign.ll +++ llvm/trunk/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -21,8 +21,8 @@ } ; GCN-LABEL: {{^}}sample_contig_nsa_10vgprs: -; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9:]+}}], -; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9:]+}}], +; GCN-DAG: image_sample_c_l v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}], define amdgpu_ps <2 x float> @sample_contig_nsa_10vgprs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) #0 { main_body: %zcompare.1 = fadd float %zcompare, 1.0 Index: llvm/trunk/test/CodeGen/AMDGPU/occupancy-levels.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/occupancy-levels.ll +++ llvm/trunk/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -0,0 +1,288 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GCN,GFX1010,GFX1010W32 %s +; RUN: 
llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1010,GFX1010W64 %s + +; GCN-LABEL: {{^}}max_occupancy: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @max_occupancy() { + ret void +} + +; GCN-LABEL: {{^}}limited_occupancy_3: +; GFX9: ; Occupancy: 3 +; GFX1010W64: ; Occupancy: 3 +; GFX1010W32: ; Occupancy: 4 +define amdgpu_kernel void @limited_occupancy_3() #0 { + ret void +} + +; GCN-LABEL: {{^}}limited_occupancy_18: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 18 +define amdgpu_kernel void @limited_occupancy_18() #1 { + ret void +} + +; GCN-LABEL: {{^}}limited_occupancy_19: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 18 +define amdgpu_kernel void @limited_occupancy_19() #2 { + ret void +} + +; GCN-LABEL: {{^}}used_24_vgprs: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_24_vgprs() { + call void asm sideeffect "", "~{v23}" () + ret void +} + +; GCN-LABEL: {{^}}used_28_vgprs: +; GFX9: ; Occupancy: 9 +; GFX1010W64: ; Occupancy: 18 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_28_vgprs() { + call void asm sideeffect "", "~{v27}" () + ret void +} + +; GCN-LABEL: {{^}}used_32_vgprs: +; GFX9: ; Occupancy: 8 +; GFX1010W64: ; Occupancy: 16 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_32_vgprs() { + call void asm sideeffect "", "~{v31}" () + ret void +} + +; GCN-LABEL: {{^}}used_36_vgprs: +; GFX9: ; Occupancy: 7 +; GFX1010W64: ; Occupancy: 14 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_36_vgprs() { + call void asm sideeffect "", "~{v35}" () + ret void +} + +; GCN-LABEL: {{^}}used_40_vgprs: +; GFX9: ; Occupancy: 6 +; GFX1010W64: ; Occupancy: 12 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_40_vgprs() { + call void asm sideeffect "", "~{v39}" () + ret void +} + +; GCN-LABEL: {{^}}used_44_vgprs: +; GFX9: ; Occupancy: 5 +; GFX1010W64: ; Occupancy: 11 +; GFX1010W32: ; 
Occupancy: 20 +define amdgpu_kernel void @used_44_vgprs() { + call void asm sideeffect "", "~{v43}" () + ret void +} + +; GCN-LABEL: {{^}}used_48_vgprs: +; GFX9: ; Occupancy: 5 +; GFX1010W64: ; Occupancy: 10 +; GFX1010W32: ; Occupancy: 20 +define amdgpu_kernel void @used_48_vgprs() { + call void asm sideeffect "", "~{v47}" () + ret void +} + +; GCN-LABEL: {{^}}used_56_vgprs: +; GFX9: ; Occupancy: 4 +; GFX1010W64: ; Occupancy: 9 +; GFX1010W32: ; Occupancy: 18 +define amdgpu_kernel void @used_56_vgprs() { + call void asm sideeffect "", "~{v55}" () + ret void +} + +; GCN-LABEL: {{^}}used_64_vgprs: +; GFX9: ; Occupancy: 4 +; GFX1010W64: ; Occupancy: 8 +; GFX1010W32: ; Occupancy: 16 +define amdgpu_kernel void @used_64_vgprs() { + call void asm sideeffect "", "~{v63}" () + ret void +} + +; GCN-LABEL: {{^}}used_72_vgprs: +; GFX9: ; Occupancy: 3 +; GFX1010W64: ; Occupancy: 7 +; GFX1010W32: ; Occupancy: 14 +define amdgpu_kernel void @used_72_vgprs() { + call void asm sideeffect "", "~{v71}" () + ret void +} + +; GCN-LABEL: {{^}}used_80_vgprs: +; GFX9: ; Occupancy: 3 +; GFX1010W64: ; Occupancy: 6 +; GFX1010W32: ; Occupancy: 12 +define amdgpu_kernel void @used_80_vgprs() { + call void asm sideeffect "", "~{v79}" () + ret void +} + +; GCN-LABEL: {{^}}used_84_vgprs: +; GFX9: ; Occupancy: 3 +; GFX1010W64: ; Occupancy: 6 +; GFX1010W32: ; Occupancy: 11 +define amdgpu_kernel void @used_84_vgprs() { + call void asm sideeffect "", "~{v83}" () + ret void +} + +; GCN-LABEL: {{^}}used_88_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 5 +; GFX1010W32: ; Occupancy: 11 +define amdgpu_kernel void @used_88_vgprs() { + call void asm sideeffect "", "~{v87}" () + ret void +} + +; GCN-LABEL: {{^}}used_96_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 5 +; GFX1010W32: ; Occupancy: 10 +define amdgpu_kernel void @used_96_vgprs() { + call void asm sideeffect "", "~{v95}" () + ret void +} + +; GCN-LABEL: {{^}}used_100_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 
5 +; GFX1010W32: ; Occupancy: 9 +define amdgpu_kernel void @used_100_vgprs() { + call void asm sideeffect "", "~{v99}" () + ret void +} + +; GCN-LABEL: {{^}}used_112_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 4 +; GFX1010W32: ; Occupancy: 9 +define amdgpu_kernel void @used_112_vgprs() { + call void asm sideeffect "", "~{v111}" () + ret void +} + +; GCN-LABEL: {{^}}used_128_vgprs: +; GFX9: ; Occupancy: 2 +; GFX1010W64: ; Occupancy: 4 +; GFX1010W32: ; Occupancy: 8 +define amdgpu_kernel void @used_128_vgprs() { + call void asm sideeffect "", "~{v127}" () + ret void +} + +; GCN-LABEL: {{^}}used_144_vgprs: +; GFX9: ; Occupancy: 1 +; GFX1010W64: ; Occupancy: 3 +; GFX1010W32: ; Occupancy: 7 +define amdgpu_kernel void @used_144_vgprs() { + call void asm sideeffect "", "~{v143}" () + ret void +} + +; GCN-LABEL: {{^}}used_168_vgprs: +; GFX9: ; Occupancy: 1 +; GFX1010W64: ; Occupancy: 3 +; GFX1010W32: ; Occupancy: 6 +define amdgpu_kernel void @used_168_vgprs() { + call void asm sideeffect "", "~{v167}" () + ret void +} + +; GCN-LABEL: {{^}}used_200_vgprs: +; GFX9: ; Occupancy: 1 +; GFX1010W64: ; Occupancy: 2 +; GFX1010W32: ; Occupancy: 5 +define amdgpu_kernel void @used_200_vgprs() { + call void asm sideeffect "", "~{v199}" () + ret void +} + +; GCN-LABEL: {{^}}used_256_vgprs: +; GFX9: ; Occupancy: 1 +; GFX1010W64: ; Occupancy: 2 +; GFX1010W32: ; Occupancy: 4 +define amdgpu_kernel void @used_256_vgprs() { + call void asm sideeffect "", "~{v255}" () + ret void +} + +; GCN-LABEL: {{^}}used_80_sgprs: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_80_sgprs() { + call void asm sideeffect "", "~{s79}" () + ret void +} + +; GCN-LABEL: {{^}}used_88_sgprs: +; GFX9: ; Occupancy: 9 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_88_sgprs() { + call void asm sideeffect "", "~{s87}" () + ret void +} + +; GCN-LABEL: {{^}}used_100_sgprs: +; GFX9: ; Occupancy: 8 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void 
@used_100_sgprs() { + call void asm sideeffect "", "~{s99}" () + ret void +} + +; GCN-LABEL: {{^}}used_101_sgprs: +; GFX9: ; Occupancy: 7 +; GFX1010: ; Occupancy: 20 +define amdgpu_kernel void @used_101_sgprs() { + call void asm sideeffect "", "~{s100}" () + ret void +} + +; GCN-LABEL: {{^}}used_lds_6552: +; GFX9: ; Occupancy: 10 +; GFX1010: ; Occupancy: 20 +@lds6552 = internal addrspace(3) global [6552 x i8] undef, align 4 +define amdgpu_kernel void @used_lds_6552() { + %p = bitcast [6552 x i8] addrspace(3)* @lds6552 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_6556: +; GFX9: ; Occupancy: 9 +; GFX1010W64: ; Occupancy: 19 +; GFX1010W32: ; Occupancy: 20 +@lds6556 = internal addrspace(3) global [6556 x i8] undef, align 4 +define amdgpu_kernel void @used_lds_6556() { + %p = bitcast [6556 x i8] addrspace(3)* @lds6556 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +; GCN-LABEL: {{^}}used_lds_13112: +; GFX9: ; Occupancy: 4 +; GFX1010W64: ; Occupancy: 9 +; GFX1010W32: ; Occupancy: 19 +@lds13112 = internal addrspace(3) global [13112 x i8] undef, align 4 +define amdgpu_kernel void @used_lds_13112() { + %p = bitcast [13112 x i8] addrspace(3)* @lds13112 to i8 addrspace(3)* + store volatile i8 1, i8 addrspace(3)* %p + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="2,3" } +attributes #1 = { "amdgpu-waves-per-eu"="18,18" } +attributes #2 = { "amdgpu-waves-per-eu"="19,19" } Index: llvm/trunk/test/CodeGen/AMDGPU/wave32.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/wave32.ll +++ llvm/trunk/test/CodeGen/AMDGPU/wave32.ll @@ -920,7 +920,7 @@ ; GCN-LABEL: {{^}}test_vgprblocks_w64_attr: ; Test that the wave size can be overridden in function attributes and that the block size is correct as a result -; GFX10DEFWAVE: ; VGPRBlocks: 11 +; GFX10DEFWAVE: ; VGPRBlocks: 2 define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, 
float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 { main_body: