Diff 461769

llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//		//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
/// \file This pass does attempts to make use of reqd_work_group_size metadata		/// \file This pass does attempts to make use of reqd_work_group_size metadata
/// to eliminate loads from the dispatch packet and to constant fold OpenCL		/// to eliminate loads from the dispatch packet and to constant fold OpenCL
/// get_local_size-like functions.		/// get_local_size-like functions.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
		#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"		#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"		#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"		#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"		#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"		#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"		#include "llvm/IR/IntrinsicsAMDGPU.h"
Show All 12 Lines	enum DispatchPackedOffsets {
WORKGROUP_SIZE_Y = 6,		WORKGROUP_SIZE_Y = 6,
WORKGROUP_SIZE_Z = 8,		WORKGROUP_SIZE_Z = 8,

GRID_SIZE_X = 12,		GRID_SIZE_X = 12,
GRID_SIZE_Y = 16,		GRID_SIZE_Y = 16,
GRID_SIZE_Z = 20		GRID_SIZE_Z = 20
};		};

		// Field offsets to implicit kernel argument pointer.
		enum ImplicitArgOffsets {
		HIDDEN_BLOCK_COUNT_X = 0,
		HIDDEN_BLOCK_COUNT_Y = 4,
		HIDDEN_BLOCK_COUNT_Z = 8,
		arsenm (Unsubmitted, Done): Should be named HIDDEN_BLOCK_*?
		cfang (Author, Unsubmitted, Done): OK, named with the hidden_ prefix to reflect the field names. Thanks.

		HIDDEN_GROUP_SIZE_X = 12,
		HIDDEN_GROUP_SIZE_Y = 14,
		HIDDEN_GROUP_SIZE_Z = 16,

		HIDDEN_REMAINDER_X = 18,
		HIDDEN_REMAINDER_Y = 20,
		HIDDEN_REMAINDER_Z = 22,
		};

class AMDGPULowerKernelAttributes : public ModulePass {		class AMDGPULowerKernelAttributes : public ModulePass {
public:		public:
static char ID;		static char ID;

AMDGPULowerKernelAttributes() : ModulePass(ID) {}		AMDGPULowerKernelAttributes() : ModulePass(ID) {}

bool runOnModule(Module &M) override;		bool runOnModule(Module &M) override;

StringRef getPassName() const override {		StringRef getPassName() const override {
return "AMDGPU Kernel Attributes";		return "AMDGPU Kernel Attributes";
}		}

void getAnalysisUsage(AnalysisUsage &AU) const override {		void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();		AU.setPreservesAll();
}		}
};		};

} // end anonymous namespace		} // end anonymous namespace

static bool processUse(CallInst *CI) {		static bool processUse(CallInst *CI, bool IsV5OrAbove) {
		arsenm (Unsubmitted, Done): You've repeated most of the body of the existing function when only the core piece of the match differs.
		cfang (Author, Unsubmitted, Done): Actually, that is not a pure repeat even though the logic is the same. The fields, offsets and even the base pointers are different.
		arsenm (Unsubmitted, Done): The entire core logic is the same. You can select between the pointers, fields and offsets within the same function.
Function *F = CI->getParent()->getParent();		Function *F = CI->getParent()->getParent();

auto MD = F->getMetadata("reqd_work_group_size");		auto MD = F->getMetadata("reqd_work_group_size");
const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;		const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

const bool HasUniformWorkGroupSize =		const bool HasUniformWorkGroupSize =
F->getFnAttribute("uniform-work-group-size").getValueAsBool();		F->getFnAttribute("uniform-work-group-size").getValueAsBool();

if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)		if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
return false;		return false;

Value *WorkGroupSizeX = nullptr;		Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
Value *WorkGroupSizeY = nullptr;		Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
Value *WorkGroupSizeZ = nullptr;		Value *Remainders[3] = {nullptr, nullptr, nullptr};
		Value *GridSizes[3] = {nullptr, nullptr, nullptr};
		arsenm (Unsubmitted, Done): Hidden?
		cfang (Author, Unsubmitted, Done): These are just temporary variable names, and some of them are shared with the pre-v5 implementation. I think it is better not to add the hidden_ prefix.
Value *GridSizeX = nullptr;
Value *GridSizeY = nullptr;
Value *GridSizeZ = nullptr;

const DataLayout &DL = F->getParent()->getDataLayout();		const DataLayout &DL = F->getParent()->getDataLayout();

// We expect to see several GEP users, casted to the appropriate type and		// We expect to see several GEP users, casted to the appropriate type and
// loaded.		// loaded.
for (User *U : CI->users()) {		for (User *U : CI->users()) {
if (!U->hasOneUse())		if (!U->hasOneUse())
continue;		continue;

int64_t Offset = 0;		int64_t Offset = 0;
		BitCastInst *BCI = dyn_cast<BitCastInst>(U);
		if (!BCI) {
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)		if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
continue;		continue;
		BCI = dyn_cast<BitCastInst>(*U->user_begin());
		}

auto BCI = dyn_cast<BitCastInst>(U->user_begin());
if (!BCI \|\| !BCI->hasOneUse())		if (!BCI \|\| !BCI->hasOneUse())
continue;		continue;

auto Load = dyn_cast<LoadInst>(BCI->user_begin());		auto Load = dyn_cast<LoadInst>(BCI->user_begin());
if (!Load \|\| !Load->isSimple())		if (!Load \|\| !Load->isSimple())
continue;		continue;

unsigned LoadSize = DL.getTypeStoreSize(Load->getType());		unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

// TODO: Handle merged loads.		// TODO: Handle merged loads.
		if (IsV5OrAbove) { // Base is ImplicitArgPtr.
		switch (Offset) {
		case HIDDEN_BLOCK_COUNT_X:
		if (LoadSize == 4)
		BlockCounts[0] = Load;
		break;
		case HIDDEN_BLOCK_COUNT_Y:
		if (LoadSize == 4)
		BlockCounts[1] = Load;
		break;
		case HIDDEN_BLOCK_COUNT_Z:
		if (LoadSize == 4)
		BlockCounts[2] = Load;
		break;
		case HIDDEN_GROUP_SIZE_X:
		if (LoadSize == 2)
		GroupSizes[0] = Load;
		break;
		case HIDDEN_GROUP_SIZE_Y:
		if (LoadSize == 2)
		GroupSizes[1] = Load;
		break;
		case HIDDEN_GROUP_SIZE_Z:
		if (LoadSize == 2)
		GroupSizes[2] = Load;
		break;
		case HIDDEN_REMAINDER_X:
		if (LoadSize == 2)
		Remainders[0] = Load;
		break;
		case HIDDEN_REMAINDER_Y:
		if (LoadSize == 2)
		Remainders[1] = Load;
		break;
		case HIDDEN_REMAINDER_Z:
		if (LoadSize == 2)
		Remainders[2] = Load;
		break;
		default:
		break;
		}
		} else { // Base is DispatchPtr.
switch (Offset) {		switch (Offset) {
case WORKGROUP_SIZE_X:		case WORKGROUP_SIZE_X:
if (LoadSize == 2)		if (LoadSize == 2)
WorkGroupSizeX = Load;		GroupSizes[0] = Load;
break;		break;
case WORKGROUP_SIZE_Y:		case WORKGROUP_SIZE_Y:
if (LoadSize == 2)		if (LoadSize == 2)
WorkGroupSizeY = Load;		GroupSizes[1] = Load;
break;		break;
case WORKGROUP_SIZE_Z:		case WORKGROUP_SIZE_Z:
if (LoadSize == 2)		if (LoadSize == 2)
WorkGroupSizeZ = Load;		GroupSizes[2] = Load;
break;		break;
case GRID_SIZE_X:		case GRID_SIZE_X:
if (LoadSize == 4)		if (LoadSize == 4)
GridSizeX = Load;		GridSizes[0] = Load;
break;		break;
case GRID_SIZE_Y:		case GRID_SIZE_Y:
if (LoadSize == 4)		if (LoadSize == 4)
GridSizeY = Load;		GridSizes[1] = Load;
break;		break;
case GRID_SIZE_Z:		case GRID_SIZE_Z:
if (LoadSize == 4)		if (LoadSize == 4)
GridSizeZ = Load;		GridSizes[2] = Load;
break;		break;
default:		default:
break;		break;
}		}
}		}
		}

		bool MadeChange = false;
		if (IsV5OrAbove && HasUniformWorkGroupSize) {
		// Under v5 __ockl_get_local_size returns the value computed by the expression:
		//
		// workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
		//
		// For functions with the attribute uniform-work-group-size=true. we can evaluate
		// workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
		// for __ockl_get_local_size.
		for (int I = 0; I < 3; ++I) {
		Value *BlockCount = BlockCounts[I];
		if (!BlockCount)
		continue;

		using namespace llvm::PatternMatch;
		auto GroupIDIntrin =
		I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
		: (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
		: m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

		for (User *ICmp : BlockCount->users()) {
		ICmpInst::Predicate Pred;
		if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
		if (Pred != ICmpInst::ICMP_ULT)
		continue;
		ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
		MadeChange = true;
		}
		}
		}

		// All remainders should be 0 with uniform work group size.
		for (Value *Remainder : Remainders) {
		if (!Remainder)
		continue;
		Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
		MadeChange = true;
		}
		} else if (HasUniformWorkGroupSize) { // Pre-V5.
// Pattern match the code used to handle partial workgroup dispatches in the		// Pattern match the code used to handle partial workgroup dispatches in the
// library implementation of get_local_size, so the entire function can be		// library implementation of get_local_size, so the entire function can be
// constant folded with a known group size.		// constant folded with a known group size.
//		//
// uint r = grid_size - group_id * group_size;		// uint r = grid_size - group_id * group_size;
// get_local_size = (r < group_size) ? r : group_size;		// get_local_size = (r < group_size) ? r : group_size;
//		//
// If we have uniform-work-group-size (which is the default in OpenCL 1.2),		// If we have uniform-work-group-size (which is the default in OpenCL 1.2),
// the grid_size is required to be a multiple of group_size). In this case:		// the grid_size is required to be a multiple of group_size). In this case:
//		//
// grid_size - (group_id * group_size) < group_size		// grid_size - (group_id * group_size) < group_size
// ->		// ->
// grid_size < group_size + (group_id * group_size)		// grid_size < group_size + (group_id * group_size)
//		//
// (grid_size / group_size) < 1 + group_id		// (grid_size / group_size) < 1 + group_id
//		//
// grid_size / group_size is at least 1, so we can conclude the select		// grid_size / group_size is at least 1, so we can conclude the select
// condition is false (except for group_id == 0, where the select result is		// condition is false (except for group_id == 0, where the select result is
// the same).		// the same).
		for (int I = 0; I < 3; ++I) {
bool MadeChange = false;		Value *GroupSize = GroupSizes[I];
Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ };
Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ };

for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) {
Value *GroupSize = WorkGroupSizes[I];
Value *GridSize = GridSizes[I];		Value *GridSize = GridSizes[I];
if (!GroupSize \|\| !GridSize)		if (!GroupSize \|\| !GridSize)
continue;		continue;

using namespace llvm::PatternMatch;		using namespace llvm::PatternMatch;
auto GroupIDIntrin =		auto GroupIDIntrin =
I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()		I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
: (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()		: (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
: m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());		: m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

for (User *U : GroupSize->users()) {		for (User *U : GroupSize->users()) {
auto *ZextGroupSize = dyn_cast<ZExtInst>(U);		auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
if (!ZextGroupSize)		if (!ZextGroupSize)
continue;		continue;

for (User *UMin : ZextGroupSize->users()) {		for (User *UMin : ZextGroupSize->users()) {
if (match(UMin,		if (match(UMin,
m_UMin(m_Sub(m_Specific(GridSize),		m_UMin(m_Sub(m_Specific(GridSize),
m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),		m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
m_Specific(ZextGroupSize)))) {		m_Specific(ZextGroupSize)))) {
if (HasReqdWorkGroupSize) {		if (HasReqdWorkGroupSize) {
ConstantInt *KnownSize		ConstantInt *KnownSize
= mdconst::extract<ConstantInt>(MD->getOperand(I));		= mdconst::extract<ConstantInt>(MD->getOperand(I));
UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(		UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
KnownSize, UMin->getType(), false));		KnownSize, UMin->getType(), false));
} else {		} else {
UMin->replaceAllUsesWith(ZextGroupSize);		UMin->replaceAllUsesWith(ZextGroupSize);
}		}

MadeChange = true;		MadeChange = true;
}		}
}		}
}		}
}		}
		}

		// If reqd_work_group_size is set, we can replace work group size with it.
if (!HasReqdWorkGroupSize)		if (!HasReqdWorkGroupSize)
return MadeChange;		return MadeChange;

// Eliminate any other loads we can from the dispatch packet.		for (int I = 0; I < 3; I++) {
for (int I = 0; I < 3; ++I) {		Value *GroupSize = GroupSizes[I];
Value *GroupSize = WorkGroupSizes[I];
if (!GroupSize)		if (!GroupSize)
continue;		continue;

ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));		ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
GroupSize->replaceAllUsesWith(		GroupSize->replaceAllUsesWith(
ConstantExpr::getIntegerCast(KnownSize,		ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
GroupSize->getType(),
false));
MadeChange = true;		MadeChange = true;
}		}

return MadeChange;		return MadeChange;
}		}


// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get		// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
// TargetPassConfig for subtarget.		// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {		bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
StringRef DispatchPtrName		bool MadeChange = false;
= Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);		Function *BasePtr = nullptr;
		bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
		if (IsV5OrAbove) {
		StringRef ImplicitArgPtrName =
		Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
		BasePtr = M.getFunction(ImplicitArgPtrName);
		} else { // Pre-V5.
		StringRef DispatchPtrName =
		Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
		BasePtr = M.getFunction(DispatchPtrName);
		}

Function *DispatchPtr = M.getFunction(DispatchPtrName);		if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
if (!DispatchPtr) // Dispatch ptr not used.
return false;		return false;
		arsenm (Unsubmitted, Done): Looks like something weird is happening with the return indentation.
		cfang (Author, Unsubmitted, Done): Fixed.

bool MadeChange = false;

SmallPtrSet<Instruction *, 4> HandledUses;		SmallPtrSet<Instruction *, 4> HandledUses;
for (auto *U : DispatchPtr->users()) {		for (auto *U : BasePtr->users()) {
CallInst *CI = cast<CallInst>(U);		CallInst *CI = cast<CallInst>(U);
if (HandledUses.insert(CI).second) {		if (HandledUses.insert(CI).second) {
if (processUse(CI))		if (processUse(CI, IsV5OrAbove))
MadeChange = true;		MadeChange = true;
}		}
}		}

return MadeChange;		return MadeChange;
}		}

		arsenm (Unsubmitted, Done): Typo: ImplicitArf
		cfang (Author, Unsubmitted, Done): Thanks.

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,		INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
"AMDGPU Kernel Attributes", false, false)		"AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,		INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
"AMDGPU Kernel Attributes", false, false)		"AMDGPU Kernel Attributes", false, false)

char AMDGPULowerKernelAttributes::ID = 0;		char AMDGPULowerKernelAttributes::ID = 0;

ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {		ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
return new AMDGPULowerKernelAttributes();		return new AMDGPULowerKernelAttributes();
		arsenm (Unsubmitted, Done): These can reuse the same set and user loop.
		cfang (Author, Unsubmitted, Done): Done. Thanks for the suggestions.
}		}

PreservedAnalyses		PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {		AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
		Function *BasePtr = nullptr;
		bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
		if (IsV5OrAbove) {
		StringRef ImplicitArgPtrName =
		Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);
		BasePtr = F.getParent()->getFunction(ImplicitArgPtrName);
		} else { // Pre_V5.
StringRef DispatchPtrName =		StringRef DispatchPtrName =
Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);		Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
		BasePtr = F.getParent()->getFunction(DispatchPtrName);
		}

Function *DispatchPtr = F.getParent()->getFunction(DispatchPtrName);		if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
if (!DispatchPtr) // Dispatch ptr not used.
return PreservedAnalyses::all();		return PreservedAnalyses::all();

for (Instruction &I : instructions(F)) {		for (Instruction &I : instructions(F)) {
		arsenm (Unsubmitted, Done): This isn't reading the IR/module flag.
		cfang (Author, Unsubmitted, Done): Waiting for clang to generate a complete set of code object versions (only v5 at this moment). This will be updated in a separate task (patch).
if (CallInst *CI = dyn_cast<CallInst>(&I)) {		if (CallInst *CI = dyn_cast<CallInst>(&I)) {
if (CI->getCalledFunction() == DispatchPtr)		if (CI->getCalledFunction() == BasePtr)
processUse(CI);		processUse(CI, IsV5OrAbove);
}		}
		arsenm (Unsubmitted, Done): else if
		cfang (Author, Unsubmitted, Done): Done. Thanks.
}		}

return PreservedAnalyses::all();		return PreservedAnalyses::all();
}		}
		arsenm (Unsubmitted, Not Done): You're missing a code object version check, since these inputs don't exist for < v5.

llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope -check-prefix=GCN %s

				; get_local_size(0) sequence for code object v5: select between
				; hidden_group_size_x (offset 12) and hidden_remainder_x (offset 18) based on
				; workgroup_id_x < hidden_block_count_x (offset 0). With
				; uniform-work-group-size=true the checks expect only the offset-12 load to
				; remain.
				define amdgpu_kernel void @get_local_size_x(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_local_size_x(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12
				; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
				; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4
				; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%bc.block.count.x = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
				%block.count.x = load i32, i32 addrspace(4)* %bc.block.count.x, align 4
				%cmp.id.count = icmp ult i32 %group.id, %block.count.x
				%local.size.offset = select i1 %cmp.id.count, i64 12, i64 18
				%gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
				%bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
				%local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
				store i16 %local.size, i16 addrspace(1)* %out
				ret void
				}

				; get_local_size(1) sequence for code object v5: select between
				; hidden_group_size_y (offset 14) and hidden_remainder_y (offset 20) based on
				; workgroup_id_y < hidden_block_count_y (offset 4). With
				; uniform-work-group-size=true the checks expect only the offset-14 load to
				; remain.
				define amdgpu_kernel void @get_local_size_y(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_local_size_y(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14
				; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
				; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 2
				; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.block.count.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 4
				%bc.block.count.y = bitcast i8 addrspace(4)* %gep.block.count.y to i32 addrspace(4)*
				%block.count.y = load i32, i32 addrspace(4)* %bc.block.count.y, align 4
				%cmp.id.count = icmp ult i32 %group.id, %block.count.y
				%local.size.offset = select i1 %cmp.id.count, i64 14, i64 20
				%gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
				%bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
				%local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
				store i16 %local.size, i16 addrspace(1)* %out
				ret void
				}

				; get_local_size(2) sequence for code object v5: select between
				; hidden_group_size_z (offset 16) and hidden_remainder_z (offset 22) based on
				; workgroup_id_z < hidden_block_count_z (offset 8). With
				; uniform-work-group-size=true the checks expect only the offset-16 load to
				; remain.
				define amdgpu_kernel void @get_local_size_z(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_local_size_z(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16
				; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
				; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4
				; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.block.count.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 8
				%bc.block.count.z = bitcast i8 addrspace(4)* %gep.block.count.z to i32 addrspace(4)*
				%block.count.z = load i32, i32 addrspace(4)* %bc.block.count.z, align 4
				%cmp.id.count = icmp ult i32 %group.id, %block.count.z
				%local.size.offset = select i1 %cmp.id.count, i64 16, i64 22
				%gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
				%bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
				%local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
				store i16 %local.size, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_remainder_x (implicit-arg offset 18, i16). The checks expect the
				; load to be replaced by the constant 0 for a uniform-work-group-size kernel.
				define amdgpu_kernel void @get_remainder_x(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_remainder_x(
				; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%args = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%remainder.addr = getelementptr inbounds i8, i8 addrspace(4)* %args, i64 18
				%remainder.ptr = bitcast i8 addrspace(4)* %remainder.addr to i16 addrspace(4)*
				%remainder = load i16, i16 addrspace(4)* %remainder.ptr, align 2
				store i16 %remainder, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_remainder_y (implicit-arg offset 20, i16). The checks expect the
				; load to be replaced by the constant 0 for a uniform-work-group-size kernel.
				; The offset must be 20 (not 18, which is the X remainder) so that the Y field
				; is actually the one being exercised.
				define amdgpu_kernel void @get_remainder_y(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_remainder_y(
				; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 20
				%bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
				%remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2
				store i16 %remainder.y, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_remainder_z (implicit-arg offset 22, i16). The checks expect the
				; load to be replaced by the constant 0 for a uniform-work-group-size kernel.
				; The offset must be 22 (not 18, which is the X remainder) so that the Z field
				; is actually the one being exercised.
				define amdgpu_kernel void @get_remainder_z(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_remainder_z(
				; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 22
				%bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
				%remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2
				store i16 %remainder.z, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_group_size_x (implicit-arg offset 12, i16) in a kernel without
				; reqd_work_group_size metadata. The checks expect the load to be kept.
				define amdgpu_kernel void @get_work_group_size_x(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_work_group_size_x(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12
				; GCN-NEXT: [[BC_X:%.*]] = bitcast i8 addrspace(4)* [[GEP_X]] to i16 addrspace(4)*
				; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, i16 addrspace(4)* [[BC_X]], align 4
				; GCN-NEXT: store i16 [[GROUP_SIZE_X]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
				%bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
				store i16 %group.size.x, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_group_size_y (implicit-arg offset 14, i16) in a kernel without
				; reqd_work_group_size metadata. The checks expect the load to be kept.
				define amdgpu_kernel void @get_work_group_size_y(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_work_group_size_y(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14
				; GCN-NEXT: [[BC_Y:%.*]] = bitcast i8 addrspace(4)* [[GEP_Y]] to i16 addrspace(4)*
				; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, i16 addrspace(4)* [[BC_Y]], align 2
				; GCN-NEXT: store i16 [[GROUP_SIZE_Y]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
				%bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
				%group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
				store i16 %group.size.y, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_group_size_z (implicit-arg offset 16, i16) in a kernel without
				; reqd_work_group_size metadata. The checks expect the load to be kept.
				define amdgpu_kernel void @get_work_group_size_z(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_work_group_size_z(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16
				; GCN-NEXT: [[BC_Z:%.*]] = bitcast i8 addrspace(4)* [[GEP_Z]] to i16 addrspace(4)*
				; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, i16 addrspace(4)* [[BC_Z]], align 4
				; GCN-NEXT: store i16 [[GROUP_SIZE_Z]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
				%bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
				%group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2
				store i16 %group.size.z, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_group_size_x (implicit-arg offset 12, i16) in a kernel carrying
				; !reqd_work_group_size !0. The checks expect the load to be replaced by the
				; X operand of that metadata (8).
				define amdgpu_kernel void @get_work_group_size_x_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				; GCN-LABEL: @get_work_group_size_x_reqd(
				; GCN-NEXT: store i16 8, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%args = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%size.addr = getelementptr inbounds i8, i8 addrspace(4)* %args, i64 12
				%size.ptr = bitcast i8 addrspace(4)* %size.addr to i16 addrspace(4)*
				%size = load i16, i16 addrspace(4)* %size.ptr, align 2
				store i16 %size, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_group_size_y (implicit-arg offset 14, i16) in a kernel carrying
				; !reqd_work_group_size !0. The checks expect the load to be replaced by the
				; Y operand of that metadata (16).
				define amdgpu_kernel void @get_work_group_size_y_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				; GCN-LABEL: @get_work_group_size_y_reqd(
				; GCN-NEXT: store i16 16, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%args = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%size.addr = getelementptr inbounds i8, i8 addrspace(4)* %args, i64 14
				%size.ptr = bitcast i8 addrspace(4)* %size.addr to i16 addrspace(4)*
				%size = load i16, i16 addrspace(4)* %size.ptr, align 2
				store i16 %size, i16 addrspace(1)* %out
				ret void
				}

				; Loads hidden_group_size_z (implicit-arg offset 16, i16) in a kernel carrying
				; !reqd_work_group_size !0. The checks expect the load to be replaced by the
				; Z operand of that metadata (2).
				define amdgpu_kernel void @get_work_group_size_z_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				; GCN-LABEL: @get_work_group_size_z_reqd(
				; GCN-NEXT: store i16 2, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%args = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%size.addr = getelementptr inbounds i8, i8 addrspace(4)* %args, i64 16
				%size.ptr = bitcast i8 addrspace(4)* %size.addr to i16 addrspace(4)*
				%size = load i16, i16 addrspace(4)* %size.ptr, align 2
				store i16 %size, i16 addrspace(1)* %out
				ret void
				}


				; Intrinsics called by the kernels in this file.
				declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
				declare i32 @llvm.amdgcn.workgroup.id.x() #1
				declare i32 @llvm.amdgcn.workgroup.id.y() #1
				declare i32 @llvm.amdgcn.workgroup.id.z() #1

				; Module flags; the only entry is !1 below.
				!llvm.module.flags = !{!1}

				; #0 is attached to every kernel in this file and sets
				; uniform-work-group-size=true.
				attributes #0 = { nounwind "uniform-work-group-size"="true" }
				; #1 is attached to the intrinsic declarations.
				attributes #1 = { nounwind readnone speculatable }
				; !0 is the reqd_work_group_size of the *_reqd kernels: x=8, y=16, z=2.
				!0 = !{i32 8, i32 16, i32 2}
				; !1 is the amdgpu_code_object_version module flag with value 500
				; (presumably code object v5, matching the RUN line -- confirm).
				!1 = !{i32 1, !"amdgpu_code_object_version", i32 500}

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Implicit kernel arguments related optimization when uniform-workgroup-size=true
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 461769

llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Implicit kernel arguments related optimization when uniform-workgroup-size=true
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 461769

llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll

AMDGPU: Implicit kernel arguments related optimization when uniform-workgroup-size=true
ClosedPublic