Index: lib/Target/R600/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -92,6 +92,18 @@
                                  false);
       OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
                                  false);
+
+      // Query the SI function info only in this branch; the else branch below
+      // treats the same object as an R600MachineFunctionInfo.
+      const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+      if (MFI->hasReqdWorkGroupSize()) {
+        OutStreamer.emitRawComment(" NumThreadX: " +
+          Twine(MFI->getReqdWorkGroupSize(0)), false);
+        OutStreamer.emitRawComment(" NumThreadY: " +
+          Twine(MFI->getReqdWorkGroupSize(1)), false);
+        OutStreamer.emitRawComment(" NumThreadZ: " +
+          Twine(MFI->getReqdWorkGroupSize(2)), false);
+      }
     } else {
       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
       OutStreamer.emitRawComment(
@@ -288,7 +300,7 @@
                                          const SIProgramInfo &KernelInfo) {
   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
 
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned RsrcReg;
   switch (MFI->getShaderType()) {
   default: // Fall through
@@ -316,7 +328,24 @@
   if (MFI->getShaderType() == ShaderType::COMPUTE) {
     OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
     OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
+
+    // Emit register/value pairs for COMPUTE_NUM_THREAD_{X,Y,Z} when the
+    // kernel declares a required work-group size.
+    if (MFI->hasReqdWorkGroupSize()) {
+      OutStreamer.EmitIntValue(R_00B81C_COMPUTE_NUM_THREAD_X, 4);
+      OutStreamer.EmitIntValue(
+        S_00B81C_NUM_THREAD_FULL(MFI->getReqdWorkGroupSize(0)), 4);
+
+      OutStreamer.EmitIntValue(R_00B820_COMPUTE_NUM_THREAD_Y, 4);
+      OutStreamer.EmitIntValue(
+        S_00B820_NUM_THREAD_FULL(MFI->getReqdWorkGroupSize(1)), 4);
+
+      OutStreamer.EmitIntValue(R_00B824_COMPUTE_NUM_THREAD_Z, 4);
+      OutStreamer.EmitIntValue(
+        S_00B824_NUM_THREAD_FULL(MFI->getReqdWorkGroupSize(2)), 4);
+    }
   }
+
   if (MFI->getShaderType() == ShaderType::PIXEL) {
     OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
     OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
Index: lib/Target/R600/AMDGPUMachineFunction.h
===================================================================
--- lib/Target/R600/AMDGPUMachineFunction.h
+++ lib/Target/R600/AMDGPUMachineFunction.h
@@ -19,8 +19,18 @@
 namespace llvm {
 
 class AMDGPUMachineFunction : public MachineFunctionInfo {
-  virtual void anchor();
   unsigned ShaderType;
+  /// True if this function is listed in the module's !opencl.kernels.
+  bool IsKernel;
+  /// reqd_work_group_size metadata values (X, Y, Z); all zero if absent.
+  uint32_t ReqdWorkGroupSize[3];
+  /// work_group_size_hint metadata values (X, Y, Z); all zero if absent.
+  uint32_t WorkGroupSizeHint[3];
+
+  virtual void anchor();
+  /// Scan the operands of an !opencl.kernels entry for work-group size
+  /// attributes.
+  void findOpenCLKernelAttributes(const MDNode *);
 
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
@@ -33,6 +43,31 @@
   unsigned getShaderType() const {
     return ShaderType;
   }
+
+  /// \returns true if this function is listed in !opencl.kernels.
+  bool isKernel() const {
+    return IsKernel;
+  }
+
+  /// \returns dimension \p I (0 = X, 1 = Y, 2 = Z) of reqd_work_group_size.
+  uint32_t getReqdWorkGroupSize(unsigned I) const {
+    return ReqdWorkGroupSize[I];
+  }
+
+  /// \returns dimension \p I (0 = X, 1 = Y, 2 = Z) of work_group_size_hint.
+  uint32_t getWorkGroupSizeHint(unsigned I) const {
+    return WorkGroupSizeHint[I];
+  }
+
+  /// \returns true if dimension 0 of reqd_work_group_size is non-zero.
+  bool hasReqdWorkGroupSize() const {
+    return ReqdWorkGroupSize[0] != 0;
+  }
+
+  /// \returns the product of the three reqd_work_group_size dimensions.
+  uint32_t getReqdWorkGroupSizeFlat() const {
+    return ReqdWorkGroupSize[0] * ReqdWorkGroupSize[1] * ReqdWorkGroupSize[2];
+  }
 };
 
 }
Index: lib/Target/R600/AMDGPUMachineFunction.cpp
===================================================================
--- lib/Target/R600/AMDGPUMachineFunction.cpp
+++ lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -1,7 +1,12 @@
 #include "AMDGPUMachineFunction.h"
 #include "AMDGPU.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include <algorithm>
+
 using namespace llvm;
 
 static const char *const ShaderTypeAttribute = "ShaderType";
@@ -12,8 +17,14 @@
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
     MachineFunctionInfo(),
     ShaderType(ShaderType::COMPUTE),
+    IsKernel(false),
+    ReqdWorkGroupSize{0},
+    WorkGroupSizeHint{0},
+    LocalMemoryObjects(),
     LDSSize(0) {
-  AttributeSet Set = MF.getFunction()->getAttributes();
+  const Function *F = MF.getFunction();
+
+  AttributeSet Set = F->getAttributes();
   Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
                                  ShaderTypeAttribute);
 
@@ -22,4 +33,73 @@
     if (Str.getAsInteger(0, ShaderType))
       llvm_unreachable("Can't parse shader type!");
   }
+
+  const MachineModuleInfo &MMI = MF.getMMI();
+  const Module *M = MMI.getModule();
+
+  // Look for this function among the entries of !opencl.kernels.
+  const NamedMDNode *Kernels = M->getNamedMetadata("opencl.kernels");
+  if (!Kernels)
+    return;
+
+  for (const MDNode *K : Kernels->operands()) {
+    unsigned N = K->getNumOperands();
+    if (N == 0)
+      continue;
+
+    // We expect the first operand to be the function.
+    const Value *First = K->getOperand(0);
+    if (First == F) {
+      IsKernel = true;
+      findOpenCLKernelAttributes(K);
+      break;
+    }
+  }
+}
+
+/// Copy up to three integer operands of \p Node, starting at operand 1, into
+/// \p Size. If any of them is not a ConstantInt, zero all of \p Size instead.
+static void parseWorkgroupSize(uint32_t Size[3], const MDNode *Node) {
+  unsigned N = Node->getNumOperands();
+
+  for (unsigned I = 0; I < std::min(N - 1, 3u); ++I) {
+    const ConstantInt *C = dyn_cast<ConstantInt>(Node->getOperand(I + 1));
+    if (!C) {
+      // This is malformed, just give up.
+      Size[0] = 0;
+      Size[1] = 0;
+      Size[2] = 0;
+      return;
+    }
+
+    Size[I] = C->getZExtValue();
+  }
+}
+
+/// Record the reqd_work_group_size and work_group_size_hint attributes found
+/// among the operands after the first of the !opencl.kernels entry \p Node.
+void AMDGPUMachineFunction::findOpenCLKernelAttributes(const MDNode *Node) {
+  for (unsigned I = 1, E = Node->getNumOperands(); I != E; ++I) {
+    const MDNode *Op = dyn_cast<MDNode>(Node->getOperand(I));
+    if (!Op)
+      continue;
+
+    unsigned N = Op->getNumOperands();
+    if (N == 0)
+      continue;
+
+    const MDString *NameNode = dyn_cast<MDString>(Op->getOperand(0));
+    if (!NameNode)
+      continue;
+
+    StringRef Name = NameNode->getName();
+
+    if (N == 4 && Name == "reqd_work_group_size")
+      parseWorkgroupSize(ReqdWorkGroupSize, Op);
+    else if (N == 4 && Name == "work_group_size_hint")
+      parseWorkgroupSize(WorkGroupSizeHint, Op);
+    else if (Name == "vec_type_hint") {
+      // TODO: Do we care about this at all?
+    }
+  }
 }
Index: lib/Target/R600/SIDefines.h
===================================================================
--- lib/Target/R600/SIDefines.h
+++ lib/Target/R600/SIDefines.h
@@ -35,4 +35,33 @@
 #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15)
 #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
 
+
+#define R_00B804_COMPUTE_DIM_X 0x00B804
+#define R_00B808_COMPUTE_DIM_Y 0x00B808
+#define R_00B80C_COMPUTE_DIM_Z 0x00B80C
+#define R_00B810_COMPUTE_START_X 0x00B810
+#define R_00B814_COMPUTE_START_Y 0x00B814
+#define R_00B818_COMPUTE_START_Z 0x00B818
+#define R_00B81C_COMPUTE_NUM_THREAD_X 0x00B81C
+#define S_00B81C_NUM_THREAD_FULL(x) (((x) & 0xFFFF) << 0)
+#define G_00B81C_NUM_THREAD_FULL(x) (((x) >> 0) & 0xFFFF)
+#define C_00B81C_NUM_THREAD_FULL 0xFFFF0000
+#define S_00B81C_NUM_THREAD_PARTIAL(x) (((x) & 0xFFFF) << 16)
+#define G_00B81C_NUM_THREAD_PARTIAL(x) (((x) >> 16) & 0xFFFF)
+#define C_00B81C_NUM_THREAD_PARTIAL 0x0000FFFF
+#define R_00B820_COMPUTE_NUM_THREAD_Y 0x00B820
+#define S_00B820_NUM_THREAD_FULL(x) (((x) & 0xFFFF) << 0)
+#define G_00B820_NUM_THREAD_FULL(x) (((x) >> 0) & 0xFFFF)
+#define C_00B820_NUM_THREAD_FULL 0xFFFF0000
+#define S_00B820_NUM_THREAD_PARTIAL(x) (((x) & 0xFFFF) << 16)
+#define G_00B820_NUM_THREAD_PARTIAL(x) (((x) >> 16) & 0xFFFF)
+#define C_00B820_NUM_THREAD_PARTIAL 0x0000FFFF
+#define R_00B824_COMPUTE_NUM_THREAD_Z 0x00B824
+#define S_00B824_NUM_THREAD_FULL(x) (((x) & 0xFFFF) << 0)
+#define G_00B824_NUM_THREAD_FULL(x) (((x) >> 0) & 0xFFFF)
+#define C_00B824_NUM_THREAD_FULL 0xFFFF0000
+#define S_00B824_NUM_THREAD_PARTIAL(x) (((x) & 0xFFFF) << 16)
+#define G_00B824_NUM_THREAD_PARTIAL(x) (((x) >> 16) & 0xFFFF)
+#define C_00B824_NUM_THREAD_PARTIAL 0x0000FFFF
+
 #endif // SIDEFINES_H_
Index: test/CodeGen/R600/reqd_work_group_size.ll
===================================================================
--- /dev/null
+++ test/CodeGen/R600/reqd_work_group_size.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; Verify that reqd_work_group_size metadata is reported in the kernel info
+; comments.
+
+; SI: NumThreadX: 32
+; SI: NumThreadY: 2
+; SI: NumThreadZ: 4
+define void @has_reqd_work_group_size(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+  store i32 0, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*)* @has_reqd_work_group_size, metadata !1}
+!1 = metadata !{metadata !"reqd_work_group_size", i32 32, i32 2, i32 4}