diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2614,33 +2614,39 @@ return Plugin::success(); // General Info - auto ConstWGSize = getDefaultNumThreads(GenericDevice); auto NumGroups = NumBlocks; - auto ThreadsPerGroup = getDefaultNumThreads(GenericDevice); - auto NumTeams = KernelArgs.NumTeams[0]; // Only first dimension - auto ThreadLimit = KernelArgs.ThreadLimit[0]; // Only first dimension + auto ThreadsPerGroup = NumThreads; // Kernel Arguments Info auto ArgNum = KernelArgs.NumArgs; auto LoopTripCount = KernelArgs.Tripcount; - // Details for AMDGPU kernels + // Details for AMDGPU kernels (read from image) + // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; auto SGPRCount = (*KernelInfo).SGPRCount; auto VGPRCount = (*KernelInfo).VGPRCount; auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount; auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount; - - // TODO set correctly once host services available - auto HostCallRequired = false; + auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize; + + // Prints additional launch info that contains the following. + // Num Args: The number of kernel arguments + // Teams x Thrds: The number of teams and the number of threads actually + // running. + // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the + // kernel in work-items + // LDS Usage: Amount of bytes used in LDS storage + // S/VGPR Count: the number of S/V GPRs occupied by the kernel + // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel + // Tripcount: loop tripcount for the kernel INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), - "SGN:%s ConstWGSize:%d args:%d teamsXthrds:(%4luX%4d) " - "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " - "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d n:%s\n", - getExecutionModeName(), ConstWGSize, ArgNum, NumGroups, ThreadsPerGroup, - NumTeams, ThreadLimit, GroupSegmentSize, SGPRCount, VGPRCount, - SGPRSpillCount, VGPRSpillCount, LoopTripCount, HostCallRequired, - getName()); + "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS " + "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: " + "%lu\n", + ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize, + GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount, + LoopTripCount); return Plugin::success(); } diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h --- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -146,6 +146,10 @@ uint32_t KernelSegmentSize; uint32_t ExplicitArgumentCount; uint32_t ImplicitArgumentCount; + uint32_t RequestedWorkgroupSize[3]; + uint32_t WorkgroupSizeHint[3]; + uint32_t WavefronSize; + uint32_t MaxFlatWorkgroupSize; }; namespace { @@ -194,6 +198,19 @@ return DK.getString() == SK; }; + const auto getSequenceOfThreeInts = [](msgpack::DocNode &DN, + uint32_t *Vals) { + assert(DN.isArray() && "MsgPack DocNode is an array node"); + auto DNA = DN.getArray(); + assert(DNA.size() == 3 && "ArrayNode has at most three elements"); + + int i = 0; + for (auto DNABegin = DNA.begin(), DNAEnd = DNA.end(); DNABegin != DNAEnd; + ++DNABegin) { + Vals[i++] = DNABegin->getUInt(); + } + }; + if (isKey(V.first, ".name")) { KernelName = V.second.toString(); } else if (isKey(V.first, ".sgpr_count")) { @@ -208,6 +225,14 @@ KernelData.PrivateSegmentSize = V.second.getUInt(); } else if (isKey(V.first, ".group_segement_fixed_size")) { KernelData.GroupSegmentList = V.second.getUInt(); + } else if (isKey(V.first, ".reqd_workgroup_size")) { + getSequenceOfThreeInts(V.second, KernelData.RequestedWorkgroupSize); + } else if (isKey(V.first, ".workgroup_size_hint")) { + getSequenceOfThreeInts(V.second, KernelData.WorkgroupSizeHint); + } else if (isKey(V.first, ".wavefront_size")) { + KernelData.WavefronSize = V.second.getUInt(); + } else if (isKey(V.first, ".max_flat_workgroup_size")) { + KernelData.MaxFlatWorkgroupSize = V.second.getUInt(); } return Error::success(); @@ -295,6 +320,7 @@ return Error::success(); } + } // namespace utils } // namespace plugin } // namespace target diff --git a/openmp/libomptarget/test/offloading/info.c b/openmp/libomptarget/test/offloading/info.c --- a/openmp/libomptarget/test/offloading/info.c +++ b/openmp/libomptarget/test/offloading/info.c @@ -39,7 +39,7 @@ // INFO: info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments: // INFO: info: firstprivate(val)[4] // INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode -// AMDGPU: AMDGPU device {{[0-9]}} info: SGN:Generic ConstWGSize:{{[0-9]+}} args:{{[0-9]}} teamsXthrds:({{ [0-9]+}}X {{[0-9]+}}) reqd:( {{[0-9]+}}X {{[0-9]+}}) lds_usage:{{[0-9]+}}B sgpr_count:{{[0-9]+}} vgpr_count:{{[0-9]+}} sgpr_spill_count:{{[0-9]+}} vgpr_spill_count:{{[0-9]+}} tripcount:{{[0-9]+}} rpc:0 n:__omp_offloading_{{.*}}main{{.*}} +// AMDGPU: AMDGPU device {{[0-9]}} info: #Args: {{[0-9]}} Teams x Thrds: {{[0-9]+}}x {{[0-9]+}} (MaxFlatWorkGroupSize: {{[0-9]+}}) LDS Usage: {{[0-9]+}}B #SGPRs/VGPRs: {{[0-9]+}}/{{[0-9]+}} #SGPR/VGPR Spills: {{[0-9]+}}/{{[0-9]+}} Tripcount: {{[0-9]+}} // INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}: // INFO: info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration // INFO: info: {{.*}} {{.*}} 256 1 0 C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}