Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -182,6 +182,22 @@ "Enable scratch buffer sizes greater than 128 GB" >; +class MaxWorkGroupSizeFeature<int size> : SubtargetFeature< + "max-workgroup-size="#size, + "MaxWorkGroupSize", + !cast<string>(size), + "Set max workgroup size to "#size#" ("# + !cast<string>(!srl(size, 6))#" wavefronts)" +>; + +// Theoretically every multiple of 64 up to the maximum 2048 can be +// used as a workgroup size. +// XXX: The maximum workgroup size for R600 has not been confirmed. +foreach NumWaves = 1-32 in { + def FeatureMaxWorkGroupSize#!shl(NumWaves, 6) : + MaxWorkGroupSizeFeature<!shl(NumWaves, 6)>; +} + def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -35,10 +35,10 @@ Module *Mod; const DataLayout *DL; MDNode *MaxWorkGroupSizeRange; - int MaxWorkGroupSize; + int32_t MaxWorkGroupSize; // FIXME: This should be per-kernel. - int LocalMemAvailable; + int32_t LocalMemAvailable; bool IsAMDGCN; bool IsAMDHSA; @@ -489,7 +489,7 @@ // available. 
uint64_t AllocaSize = MaxWorkGroupSize * DL->getTypeAllocSize(AllocaTy); - if (AllocaSize > LocalMemAvailable) { + if (AllocaSize > static_cast<uint64_t>(LocalMemAvailable)) { DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); return; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -80,7 +80,8 @@ bool EnableXNACK; unsigned WavefrontSize; bool CFALUBug; - uint32_t LocalMemorySize; + int32_t LocalMemorySize; + int32_t MaxWorkGroupSize; bool EnableVGPRSpilling; bool SGPRInitBug; bool IsGCN; @@ -249,14 +250,12 @@ return CFALUBug; } - uint32_t getLocalMemorySize() const { + int32_t getLocalMemorySize() const { return LocalMemorySize; } - int getMaxWorkGroupSize() const { - // FIXME: The hardware maximum is 2048. The runtime usually enforces a 256 - // limit though. - return 256; + int32_t getMaxWorkGroupSize() const { + return MaxWorkGroupSize; } int getHardwareMaxWorkGroupSize() const { Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -51,6 +51,10 @@ ParseSubtargetFeatures(GPU, FullFS); + // If no specific size was requested, fall back to the default of 256. + if (MaxWorkGroupSize == 0) + MaxWorkGroupSize = 256; + // FIXME: I don't think think Evergreen has any useful support for // denormals, but should be checked. Should we issue a warning somewhere // if someone tries to enable these? 
@@ -74,7 +78,7 @@ EnableUnsafeDSOffsetFolding(false), EnableXNACK(false), WavefrontSize(0), CFALUBug(false), - LocalMemorySize(0), + LocalMemorySize(0), MaxWorkGroupSize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), Index: test/CodeGen/AMDGPU/max-workgroup-size-attr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/max-workgroup-size-attr.ll @@ -0,0 +1,50 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=64" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE64 %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=128" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE128 %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=192" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE192 %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=256" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE256 %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=640" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE640 %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=512" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE512 %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=1024" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE1024 %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=1600" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE1600 %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=1664" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=TOOBIG %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=1984" 
-amdgpu-promote-alloca < %s | FileCheck -check-prefix=TOOBIG %s +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=2048" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=TOOBIG %s + +; If the attribute is repeated with different sizes, the largest one should be used. +; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mattr="+max-workgroup-size=128,+max-workgroup-size=192,+max-workgroup-size=64" -amdgpu-promote-alloca < %s | FileCheck -check-prefix=MAXSIZE192 %s + +; MAXSIZE64: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [64 x [5 x i32]] undef, align 4 +; MAXSIZE128: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [128 x [5 x i32]] undef, align 4 +; MAXSIZE192: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [192 x [5 x i32]] undef, align 4 +; MAXSIZE256: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4 +; MAXSIZE512: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [512 x [5 x i32]] undef, align 4 +; MAXSIZE640: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [640 x [5 x i32]] undef, align 4 +; MAXSIZE1024: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] undef, align 4 +; MAXSIZE1600: @promote_alloca_size.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4 + + +; These sizes exceed the maximum LDS size, so the alloca is expected to remain. 
+; TOOBIG: alloca [5 x i32] + +; FIXME: We should use subtarget features for each function instead of many opt -S run lines +define void @promote_alloca_size(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +entry: + %stack = alloca [5 x i32], align 4 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0 + store i32 4, i32* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 + %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1 + store i32 5, i32* %arrayidx3, align 4 + %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0 + %2 = load i32, i32* %arrayidx10, align 4 + store i32 %2, i32 addrspace(1)* %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1 + %3 = load i32, i32* %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 + store i32 %3, i32 addrspace(1)* %arrayidx13 + ret void +} + +attributes #0 = { nounwind }