Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -204,7 +204,8 @@
     }
   }
 
-  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+                                                          F);
 
   // Restrict local memory usage so that we don't drastically reduce occupancy,
   // unless it is already significantly reduced.
@@ -225,7 +226,7 @@
 
   // Round up to the next tier of usage.
   unsigned MaxSizeWithWaveCount
-    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
 
   // Program is possibly broken by using more local mem than available.
   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -274,11 +274,12 @@
 
   /// Return the amount of LDS that can be used that will not restrict the
   /// occupancy lower than WaveCount.
-  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+                                           const Function &) const;
 
   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
   /// the given LDS memory size is the only constraint.
-  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
 
   bool hasFP16Denormals() const {
     return FP64FP16Denormals;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -132,62 +132,26 @@
   initializeSubtargetDependencies(TT, GPU, FS);
 }
 
-// FIXME: These limits are for SI. Did they change with the larger maximum LDS
-// size?
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
-  switch (NWaves) {
-  case 10:
-    return 1638;
-  case 9:
-    return 1820;
-  case 8:
-    return 2048;
-  case 7:
-    return 2340;
-  case 6:
-    return 2730;
-  case 5:
-    return 3276;
-  case 4:
-    return 4096;
-  case 3:
-    return 5461;
-  case 2:
-    return 8192;
-  default:
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+  const Function &F) const {
+  if (NWaves == 1)
     return getLocalMemorySize();
-  }
+  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+  unsigned MaxWaves = getMaxWavesPerEU();
+  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
 }
 
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
-  if (Bytes <= 1638)
-    return 10;
-
-  if (Bytes <= 1820)
-    return 9;
-
-  if (Bytes <= 2048)
-    return 8;
-
-  if (Bytes <= 2340)
-    return 7;
-
-  if (Bytes <= 2730)
-    return 6;
-
-  if (Bytes <= 3276)
-    return 5;
-
-  if (Bytes <= 4096)
-    return 4;
-
-  if (Bytes <= 5461)
-    return 3;
-
-  if (Bytes <= 8192)
-    return 2;
-
-  return 1;
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+  const Function &F) const {
+  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+  unsigned MaxWaves = getMaxWavesPerEU();
+  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
+  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
+  NumWaves = std::min(NumWaves, MaxWaves);
+  NumWaves = std::max(NumWaves, 1u);
+  return NumWaves;
 }
 
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -35,7 +35,8 @@
   unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
                                       ST.getOccupancyWithNumVGPRs(VGPRs));
   return std::min(MinRegOccupancy,
-                  ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+                  ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+                                                  *MF.getFunction()));
 }
 
 void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
Index: test/CodeGen/AMDGPU/indirect-private-64.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-private-64.ll
+++ test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -22,8 +22,8 @@
 ; CI-PROMOTE: ds_read_b64
 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load double, double addrspace(1)* %in, align 8
-  %array = alloca [16 x double], align 8
-  %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
+  %array = alloca [8 x double], align 8
+  %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b
   store double %val, double* %ptr, align 8
   call void @llvm.amdgcn.s.barrier()
   %result = load double, double* %ptr, align 8
@@ -53,8 +53,8 @@
 ; CI-PROMOTE: ds_read2_b64
 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
-  %array = alloca [8 x <2 x double>], align 16
-  %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
+  %array = alloca [4 x <2 x double>], align 16
+  %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b
   store <2 x double> %val, <2 x double>* %ptr, align 16
   call void @llvm.amdgcn.s.barrier()
   %result = load <2 x double>, <2 x double>* %ptr, align 16
@@ -111,8 +111,8 @@
 ; CI-PROMOTE: ds_read2_b64
 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
-  %array = alloca [8 x <2 x i64>], align 16
-  %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
+  %array = alloca [4 x <2 x i64>], align 16
+  %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b
   store <2 x i64> %val, <2 x i64>* %ptr, align 16
   call void @llvm.amdgcn.s.barrier()
   %result = load <2 x i64>, <2 x i64>* %ptr, align 16
Index: test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
===================================================================
--- test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -1,6 +1,8 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
+; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
 
-; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
 
 define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
@@ -22,7 +24,7 @@
   ret void
 }
 
-; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
 
 define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
 entry:
@@ -44,7 +46,7 @@
   ret void
 }
 
-; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
 
 define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
 entry:
@@ -66,8 +68,8 @@
   ret void
 }
 
-; CHECK-LABEL: @occupancy_0(
-; CHECK: alloca [5 x i32]
+; ALL-LABEL: @occupancy_0(
+; ALL: alloca [5 x i32]
 define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
 entry:
   %stack = alloca [5 x i32], align 4
@@ -88,8 +90,8 @@
   ret void
 }
 
-; CHECK-LABEL: @occupancy_max(
-; CHECK: alloca [5 x i32]
+; ALL-LABEL: @occupancy_max(
+; ALL: alloca [5 x i32]
 define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
 entry:
   %stack = alloca [5 x i32], align 4
@@ -110,8 +112,10 @@
   ret void
 }
 
-; CHECK-LABEL: @occupancy_6(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_6(
+; CI-LABEL: @occupancy_6(
+; SI: alloca
+; CI-NOT: alloca
 define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [42 x i8], align 4
@@ -134,8 +138,8 @@
   ret void
 }
 
-; CHECK-LABEL: @occupancy_6_over(
-; CHECK: alloca [43 x i8]
+; ALL-LABEL: @occupancy_6_over(
+; ALL: alloca [43 x i8]
 define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [43 x i8], align 4
@@ -158,8 +162,10 @@
   ret void
 }
 
-; CHECK-LABEL: @occupancy_8(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_8(
+; CI-LABEL: @occupancy_8(
+; SI: alloca
+; CI-NOT: alloca
 define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [32 x i8], align 4
@@ -182,8 +188,8 @@
   ret void
 }
 
-; CHECK-LABEL: @occupancy_8_over(
-; CHECK: alloca [33 x i8]
+; ALL-LABEL: @occupancy_8_over(
+; ALL: alloca [33 x i8]
 define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [33 x i8], align 4
@@ -206,8 +212,10 @@
   ret void
 }
 
-; CHECK-LABEL: @occupancy_9(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_9(
+; CI-LABEL: @occupancy_9(
+; SI: alloca
+; CI-NOT: alloca
 define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 entry:
   %stack = alloca [28 x i8], align 4
@@ -230,8 +238,8 @@
   ret void
 }
 
-; CHECK-LABEL: @occupancy_9_over(
-; CHECK: alloca [29 x i8]
+; ALL-LABEL: @occupancy_9_over(
+; ALL: alloca [29 x i8]
 define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 entry:
   %stack = alloca [29 x i8], align 4
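
The arithmetic behind the new getOccupancyWithLocalMemSize can be sanity-checked in isolation. The sketch below is illustrative only and is not part of the patch: the 64 KiB LDS size and the 10-waves-per-EU cap are assumed SI/CI-like values, and WorkGroupsPerCU is passed in directly because getMaxWorkGroupsPerCU and getMaxWavesPerEU are subtarget queries not shown in this diff.

// Standalone sketch of the occupancy-from-LDS computation introduced above.
// Assumptions: 64 KiB of LDS per CU and at most 10 waves per EU;
// WorkGroupsPerCU stands in for getMaxWorkGroupsPerCU(WorkGroupSize).
#include <algorithm>
#include <cstdint>
#include <iostream>

static unsigned occupancyWithLocalMemSize(uint32_t Bytes,
                                          unsigned WorkGroupsPerCU,
                                          unsigned LocalMemSize = 65536,
                                          unsigned MaxWaves = 10) {
  // LDS is shared by every work group resident on the CU, so the usable
  // budget scales with MaxWaves and shrinks with WorkGroupsPerCU.
  unsigned Limit = LocalMemSize * MaxWaves / WorkGroupsPerCU;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); // guard Bytes == 0
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

int main() {
  // With 4 work groups per CU: 32 KiB of per-group LDS caps occupancy at
  // 5 waves, 20 KiB allows 8, and 16 KiB or less leaves the full 10.
  std::cout << occupancyWithLocalMemSize(32768, 4) << '\n'; // 5
  std::cout << occupancyWithLocalMemSize(20480, 4) << '\n'; // 8
  std::cout << occupancyWithLocalMemSize(16384, 4) << '\n'; // 10
}

In short, the per-CU LDS budget is scaled by the maximum waves per EU, divided among the work groups resident on a CU, then divided by the requested per-group LDS to get a wave count, clamped to the range [1, MaxWaves]; this replaces the hard-coded SI byte thresholds the patch removes.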