Diff 450333

llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

	Show All 22 Lines
	WORKGROUP_SIZE_Y = 6,	WORKGROUP_SIZE_Y = 6,
	WORKGROUP_SIZE_Z = 8,	WORKGROUP_SIZE_Z = 8,

	GRID_SIZE_X = 12,	GRID_SIZE_X = 12,
	GRID_SIZE_Y = 16,	GRID_SIZE_Y = 16,
	GRID_SIZE_Z = 20	GRID_SIZE_Z = 20
	};	};

		// Field offsets to implicit kernel argument pointer.
		//
		// Each value is compared against the constant offset of an access relative
		// to the pointer returned by llvm.amdgcn.implicitarg.ptr. The BLOCK_COUNT_*
		// fields are matched as 4-byte loads; the GROUP_SIZE_* and REMAINDER_* fields
		// are matched as 2-byte loads.
		//
		// Review thread:
		//   arsenm: Should be named HIDDEN_BLOCK_*?
		//   cfang:  Ok, named with hidden_ prefix to reflect the field names. Thanks.
		enum ImplicitArgOffsets {
		  BLOCK_COUNT_X = 0,
		  BLOCK_COUNT_Y = 4,
		  BLOCK_COUNT_Z = 8,

		  GROUP_SIZE_X = 12,
		  GROUP_SIZE_Y = 14,
		  GROUP_SIZE_Z = 16,

		  REMAINDER_X = 18,
		  REMAINDER_Y = 20,
		  REMAINDER_Z = 22,
		};


	// Legacy pass-manager ModulePass. runOnModule() is defined out of line; the
	// pass reports the name "AMDGPU Kernel Attributes" and declares that it
	// preserves all analyses.
	class AMDGPULowerKernelAttributes : public ModulePass {	class AMDGPULowerKernelAttributes : public ModulePass {
	public:	public:
	static char ID;	static char ID;

	AMDGPULowerKernelAttributes() : ModulePass(ID) {}	AMDGPULowerKernelAttributes() : ModulePass(ID) {}

	bool runOnModule(Module &M) override;	bool runOnModule(Module &M) override;

	StringRef getPassName() const override {	StringRef getPassName() const override {
	return "AMDGPU Kernel Attributes";	return "AMDGPU Kernel Attributes";
	}	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesAll();	AU.setPreservesAll();
	}	}
	};	};

	} // end anonymous namespace	} // end anonymous namespace

		/// Simplify uses of the implicit kernel arguments reached through \p CI, a
		/// call whose result is used as the implicit-argument base pointer.
		///
		/// Nothing is done unless the enclosing function carries
		/// "uniform-work-group-size"="true". When it does:
		///  - `icmp ult <workgroup id>, <block count load>` is replaced by `true`
		///    (only this operand order and predicate are matched);
		///  - loads of the REMAINDER_* fields are replaced by zero;
		///  - if the function has a three-operand !reqd_work_group_size, loads of
		///    the GROUP_SIZE_* fields are replaced by the corresponding constant.
		///
		/// \returns true if any value was replaced.
		static bool processImplicitArgUse(CallInst *CI) {
		  // Review thread:
		  //   arsenm: You've repeated most of the body of the existing function when
		  //           only the core piece of the match differs.
		  //   cfang:  Actually that is not pure repeat even though the logic is the
		  //           same. The fields, offsets and even the base pointers are
		  //           different.
		  //   arsenm: The entire core logic is the same. You can select between the
		  //           pointers, fields and offsets within the same function.
		  Function *F = CI->getParent()->getParent();
		  const bool HasUniformWorkGroupSize =
		      F->getFnAttribute("uniform-work-group-size").getValueAsBool();

		  if (!HasUniformWorkGroupSize)
		    return false;

		  // Review thread:
		  //   arsenm: Hidden?
		  //   cfang:  These are just temporary variable names, and some of them are
		  //           shared with pre-v5 implementation. I am thinking it is better
		  //           not to add hidden_ prefix.
		  Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
		  Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
		  Value *Remainders[3] = {nullptr, nullptr, nullptr};

		  const DataLayout &DL = F->getParent()->getDataLayout();

		  // We expect to see several GEP users, casted to the appropriate type and
		  // loaded.
		  for (User *U : CI->users()) {
		    if (!U->hasOneUse())
		      continue;

		    // A bitcast applied directly to the call addresses offset 0. Anything
		    // else must be a constant-offset access based on CI whose single user is
		    // the bitcast.
		    int64_t Offset = 0;
		    BitCastInst *BCI = dyn_cast<BitCastInst>(U);
		    if (!BCI) {
		      if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
		        continue;
		      BCI = dyn_cast<BitCastInst>(*U->user_begin());
		    }

		    if (!BCI || !BCI->hasOneUse())
		      continue;

		    // user_begin() is an iterator; it has to be dereferenced to get the
		    // single user of the bitcast.
		    auto *Load = dyn_cast<LoadInst>(*BCI->user_begin());
		    if (!Load || !Load->isSimple())
		      continue;

		    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

		    // TODO: Handle merged loads.
		    switch (Offset) {
		    case BLOCK_COUNT_X:
		      if (LoadSize == 4)
		        BlockCounts[0] = Load;
		      break;
		    case BLOCK_COUNT_Y:
		      if (LoadSize == 4)
		        BlockCounts[1] = Load;
		      break;
		    case BLOCK_COUNT_Z:
		      if (LoadSize == 4)
		        BlockCounts[2] = Load;
		      break;
		    case GROUP_SIZE_X:
		      if (LoadSize == 2)
		        GroupSizes[0] = Load;
		      break;
		    case GROUP_SIZE_Y:
		      if (LoadSize == 2)
		        GroupSizes[1] = Load;
		      break;
		    case GROUP_SIZE_Z:
		      if (LoadSize == 2)
		        GroupSizes[2] = Load;
		      break;
		    case REMAINDER_X:
		      if (LoadSize == 2)
		        Remainders[0] = Load;
		      break;
		    case REMAINDER_Y:
		      if (LoadSize == 2)
		        Remainders[1] = Load;
		      break;
		    case REMAINDER_Z:
		      if (LoadSize == 2)
		        Remainders[2] = Load;
		      break;
		    default:
		      break;
		    }
		  }

		  // Under v5, __ockl_get_local_size returns the value computed by the
		  // expression:
		  //
		  //   workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
		  //
		  // For functions with the attribute uniform-work-group-size=true, we can
		  // evaluate workgroup_id < hidden_block_count as true, and thus
		  // hidden_group_size is what __ockl_get_local_size returns.
		  bool MadeChange = false;
		  for (int I = 0; I < 3; ++I) {
		    Value *BlockCount = BlockCounts[I];
		    if (!BlockCount)
		      continue;

		    using namespace llvm::PatternMatch;
		    auto GroupIDIntrin =
		        I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
		               : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
		                         : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

		    for (User *ICmp : BlockCount->users()) {
		      ICmpInst::Predicate Pred;
		      if (match(ICmp, m_ICmp(Pred, GroupIDIntrin, m_Specific(BlockCount)))) {
		        if (Pred != ICmpInst::ICMP_ULT)
		          continue;
		        ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
		        MadeChange = true;
		      }
		    }
		  }

		  // All remainders should be 0 with uniform work group size.
		  for (Value *Remainder : Remainders) {
		    if (!Remainder)
		      continue;
		    Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
		    MadeChange = true;
		  }

		  // If reqd_work_group_size is set, we can replace work group size with it.
		  auto MD = F->getMetadata("reqd_work_group_size");
		  const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
		  if (!HasReqdWorkGroupSize)
		    return MadeChange;

		  for (int I = 0; I < 3; I++) {
		    Value *GroupSize = GroupSizes[I];
		    if (!GroupSize)
		      continue;

		    // The metadata operand is cast (zero-extended or truncated) to the type
		    // of the load being replaced.
		    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
		    GroupSize->replaceAllUsesWith(
		        ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
		    MadeChange = true;
		  }

		  return MadeChange;
		}

	static bool processUse(CallInst *CI) {	static bool processUse(CallInst *CI) {
	Function *F = CI->getParent()->getParent();	Function *F = CI->getParent()->getParent();

	auto MD = F->getMetadata("reqd_work_group_size");	auto MD = F->getMetadata("reqd_work_group_size");
	const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;	const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;

	const bool HasUniformWorkGroupSize =	const bool HasUniformWorkGroupSize =
	F->getFnAttribute("uniform-work-group-size").getValueAsBool();	F->getFnAttribute("uniform-work-group-size").getValueAsBool();
	Show All 13 Lines

	// We expect to see several GEP users, casted to the appropriate type and	// We expect to see several GEP users, casted to the appropriate type and
	// loaded.	// loaded.
	for (User *U : CI->users()) {	for (User *U : CI->users()) {
	if (!U->hasOneUse())	if (!U->hasOneUse())
	continue;	continue;

	int64_t Offset = 0;	int64_t Offset = 0;
	if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)	if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
		arsenmUnsubmitted Done Reply Inline Actions Looks like something weird is happening with the return indentation arsenm: Looks like something weird is happening with the return indentation
		cfangAuthorUnsubmitted Done Reply Inline Actions Fixed. cfang: Fixed.
		arsenmUnsubmitted Done Reply Inline Actions Typo ImplicitArf arsenm: Typo ImplicitArf
		cfangAuthorUnsubmitted Done Reply Inline Actions Thanks. cfang: Thanks.
		arsenmUnsubmitted Done Reply Inline Actions These can reuse the same set and user loop arsenm: These can reuse the same set and user loop
		cfangAuthorUnsubmitted Done Reply Inline Actions Done. Thanks for the suggestions. cfang: Done. Thanks for the suggestions.
	Show All 22 Lines
	}	}

	return MadeChange;	return MadeChange;
	}	}

	// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
	// TargetPassConfig for subtarget.
	//
	// Legacy pass-manager entry point. Looks up the declarations of
	// llvm.amdgcn.dispatch.ptr and llvm.amdgcn.implicitarg.ptr in M and runs the
	// matching rewrite over every call to whichever of them is declared. Returns
	// true if any rewrite changed the module.
	bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
	  StringRef DispatchPtrName =
	      Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
	  // Review thread:
	  //   arsenm: This isn't reading the IR/module flag
	  //   cfang:  Waiting for clang to generate a complete code object versions
	  //           (only v5 at this moment). This is be updated with a separate
	  //           task (patch).
	  StringRef ImplicitArgPtrName =
	      Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);

	  Function *DispatchPtr = M.getFunction(DispatchPtrName);
	  Function *ImplicitArgPtr = M.getFunction(ImplicitArgPtrName);
	  // Review thread:
	  //   arsenm: else if
	  //   cfang:  Done. Thanks.
	  if (!DispatchPtr && !ImplicitArgPtr) // Dispatch/ImplicitArg ptr not used.
	    return false;

	  bool MadeChange = false;

	  // Either declaration may be absent on its own, so each loop must be guarded
	  // separately; the early exit above only covers the case where both are
	  // missing.
	  if (ImplicitArgPtr) {
	    // TODO (review, arsenm, not done): You're missing a code object version
	    // check since these inputs don't exist for < v5.
	    SmallPtrSet<Instruction *, 4> HandledImplicitArgUses;
	    for (auto *U : ImplicitArgPtr->users()) {
	      CallInst *CI = cast<CallInst>(U);
	      if (HandledImplicitArgUses.insert(CI).second) {
	        if (processImplicitArgUse(CI))
	          MadeChange = true;
	      }
	    }
	  }

	  if (DispatchPtr) {
	    SmallPtrSet<Instruction *, 4> HandledUses;
	    for (auto *U : DispatchPtr->users()) {
	      CallInst *CI = cast<CallInst>(U);
	      if (HandledUses.insert(CI).second) {
	        if (processUse(CI))
	          MadeChange = true;
	      }
	    }
	  }

	  return MadeChange;
	}


	INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,	INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
	"AMDGPU Kernel Attributes", false, false)	"AMDGPU Kernel Attributes", false, false)
	INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,	INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
	"AMDGPU Kernel Attributes", false, false)	"AMDGPU Kernel Attributes", false, false)

	char AMDGPULowerKernelAttributes::ID = 0;	char AMDGPULowerKernelAttributes::ID = 0;

	// Factory for the legacy pass-manager pass: returns a newly allocated
	// AMDGPULowerKernelAttributes instance.
	ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {	ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
	return new AMDGPULowerKernelAttributes();	return new AMDGPULowerKernelAttributes();
	}	}

	// New pass-manager entry point. Scans every call instruction in F and runs
	// the matching rewrite on direct calls to llvm.amdgcn.implicitarg.ptr or
	// llvm.amdgcn.dispatch.ptr.
	//
	// NOTE(review): the results of processImplicitArgUse/processUse are ignored
	// and all analyses are reported as preserved even when values were replaced;
	// confirm this is intended.
	PreservedAnalyses
	AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
	  StringRef DispatchPtrName =
	      Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
	  StringRef ImplicitArgPtrName =
	      Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr);

	  Function *DispatchPtr = F.getParent()->getFunction(DispatchPtrName);
	  Function *ImplicitArgPtr = F.getParent()->getFunction(ImplicitArgPtrName);
	  if (!DispatchPtr && !ImplicitArgPtr) // Dispatch/ImplicitArg ptr not used.
	    return PreservedAnalyses::all();

	  for (Instruction &I : instructions(F)) {
	    CallInst *CI = dyn_cast<CallInst>(&I);
	    if (!CI)
	      continue;

	    // getCalledFunction() is null for indirect calls. Without this guard a
	    // null callee would compare equal to whichever of the two intrinsic
	    // declarations is absent from the module, and the indirect call would be
	    // rewritten as if it were that intrinsic.
	    Function *Callee = CI->getCalledFunction();
	    if (!Callee)
	      continue;

	    if (Callee == ImplicitArgPtr)
	      processImplicitArgUse(CI);
	    else if (Callee == DispatchPtr)
	      processUse(CI);
	  }

	  return PreservedAnalyses::all();
	}
Context not available.

llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck -enable-var-scope -check-prefix=GCN %s

				; The compare of workgroup.id.x against the block count at offset 0 is expected
				; to fold to true, leaving only the 2-byte load at offset 12.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_local_size_x(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_local_size_x(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12
				; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
				; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4
				; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%bc.block.count.x = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
				%block.count.x = load i32, i32 addrspace(4)* %bc.block.count.x, align 4
				%cmp.id.count = icmp ult i32 %group.id, %block.count.x
				%local.size.offset = select i1 %cmp.id.count, i64 12, i64 18
				%gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
				%bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
				%local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
				store i16 %local.size, i16 addrspace(1)* %out
				ret void
				}

				; The compare of workgroup.id.y against the block count at offset 4 is expected
				; to fold to true, leaving only the 2-byte load at offset 14.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_local_size_y(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_local_size_y(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14
				; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
				; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 2
				; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.block.count.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 4
				%bc.block.count.y = bitcast i8 addrspace(4)* %gep.block.count.y to i32 addrspace(4)*
				%block.count.y = load i32, i32 addrspace(4)* %bc.block.count.y, align 4
				%cmp.id.count = icmp ult i32 %group.id, %block.count.y
				%local.size.offset = select i1 %cmp.id.count, i64 14, i64 20
				%gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
				%bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
				%local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
				store i16 %local.size, i16 addrspace(1)* %out
				ret void
				}

				; The compare of workgroup.id.z against the block count at offset 8 is expected
				; to fold to true, leaving only the 2-byte load at offset 16.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_local_size_z(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_local_size_z(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_LOCAL_SIZE:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16
				; GCN-NEXT: [[BC_GEP_LOCAL_SIZE:%.*]] = bitcast i8 addrspace(4)* [[GEP_LOCAL_SIZE]] to i16 addrspace(4)*
				; GCN-NEXT: [[LOCAL_SIZE:%.*]] = load i16, i16 addrspace(4)* [[BC_GEP_LOCAL_SIZE]], align 4
				; GCN-NEXT: store i16 [[LOCAL_SIZE]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.block.count.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 8
				%bc.block.count.z = bitcast i8 addrspace(4)* %gep.block.count.z to i32 addrspace(4)*
				%block.count.z = load i32, i32 addrspace(4)* %bc.block.count.z, align 4
				%cmp.id.count = icmp ult i32 %group.id, %block.count.z
				%local.size.offset = select i1 %cmp.id.count, i64 16, i64 22
				%gep.local.size = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 %local.size.offset
				%bc.gep.local.size = bitcast i8 addrspace(4)* %gep.local.size to i16 addrspace(4)*
				%local.size = load i16, i16 addrspace(4)* %bc.gep.local.size, align 2
				store i16 %local.size, i16 addrspace(1)* %out
				ret void
				}

				; Loads the 2-byte field at offset 18 of the implicit argument pointer; the
				; CHECK lines expect the stored value to become the constant 0.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_remainder_x(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_remainder_x(
				; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 18
				%bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
				%remainder.x = load i16, i16 addrspace(4)* %bc.x, align 2
				store i16 %remainder.x, i16 addrspace(1)* %out
				ret void
				}

				; Loads the 2-byte Y remainder field at offset 20 of the implicit argument
				; pointer; the CHECK lines expect the stored value to become the constant 0.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_remainder_y(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_remainder_y(
				; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 20
				%bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
				%remainder.y = load i16, i16 addrspace(4)* %bc.y, align 2
				store i16 %remainder.y, i16 addrspace(1)* %out
				ret void
				}

				; Loads the 2-byte Z remainder field at offset 22 of the implicit argument
				; pointer; the CHECK lines expect the stored value to become the constant 0.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_remainder_z(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_remainder_z(
				; GCN-NEXT: store i16 0, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 22
				%bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
				%remainder.z = load i16, i16 addrspace(4)* %bc.z, align 2
				store i16 %remainder.z, i16 addrspace(1)* %out
				ret void
				}

				; No !reqd_work_group_size here, so the 2-byte load at offset 12 is expected to
				; remain in the output.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_work_group_size_x(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_work_group_size_x(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_X:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 12
				; GCN-NEXT: [[BC_X:%.*]] = bitcast i8 addrspace(4)* [[GEP_X]] to i16 addrspace(4)*
				; GCN-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, i16 addrspace(4)* [[BC_X]], align 4
				; GCN-NEXT: store i16 [[GROUP_SIZE_X]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
				%bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
				store i16 %group.size.x, i16 addrspace(1)* %out
				ret void
				}

				; No !reqd_work_group_size here, so the 2-byte load at offset 14 is expected to
				; remain in the output.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_work_group_size_y(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_work_group_size_y(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 14
				; GCN-NEXT: [[BC_Y:%.*]] = bitcast i8 addrspace(4)* [[GEP_Y]] to i16 addrspace(4)*
				; GCN-NEXT: [[GROUP_SIZE_Y:%.*]] = load i16, i16 addrspace(4)* [[BC_Y]], align 2
				; GCN-NEXT: store i16 [[GROUP_SIZE_Y]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
				%bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
				%group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
				store i16 %group.size.y, i16 addrspace(1)* %out
				ret void
				}

				; No !reqd_work_group_size here, so the 2-byte load at offset 16 is expected to
				; remain in the output.
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_work_group_size_z(i16 addrspace(1)* %out) #0 {
				; GCN-LABEL: @get_work_group_size_z(
				; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				; GCN-NEXT: [[GEP_Z:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[IMPLICITARG_PTR]], i64 16
				; GCN-NEXT: [[BC_Z:%.*]] = bitcast i8 addrspace(4)* [[GEP_Z]] to i16 addrspace(4)*
				; GCN-NEXT: [[GROUP_SIZE_Z:%.*]] = load i16, i16 addrspace(4)* [[BC_Z]], align 4
				; GCN-NEXT: store i16 [[GROUP_SIZE_Z]], i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
				%bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
				%group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2
				store i16 %group.size.z, i16 addrspace(1)* %out
				ret void
				}

				; With !reqd_work_group_size !0 attached, the CHECK lines expect the load at
				; offset 12 to be replaced by the first metadata operand (8).
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_work_group_size_x_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				; GCN-LABEL: @get_work_group_size_x_reqd(
				; GCN-NEXT: store i16 8, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.x = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 12
				%bc.x = bitcast i8 addrspace(4)* %gep.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %bc.x, align 2
				store i16 %group.size.x, i16 addrspace(1)* %out
				ret void
				}

				; With !reqd_work_group_size !0 attached, the CHECK lines expect the load at
				; offset 14 to be replaced by the second metadata operand (16).
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_work_group_size_y_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				; GCN-LABEL: @get_work_group_size_y_reqd(
				; GCN-NEXT: store i16 16, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.y = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 14
				%bc.y = bitcast i8 addrspace(4)* %gep.y to i16 addrspace(4)*
				%group.size.y = load i16, i16 addrspace(4)* %bc.y, align 2
				store i16 %group.size.y, i16 addrspace(1)* %out
				ret void
				}

				; With !reqd_work_group_size !0 attached, the CHECK lines expect the load at
				; offset 16 to be replaced by the third metadata operand (2).
				; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
				define amdgpu_kernel void @get_work_group_size_z_reqd(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				; GCN-LABEL: @get_work_group_size_z_reqd(
				; GCN-NEXT: store i16 2, i16 addrspace(1)* [[OUT:%.*]], align 2
				; GCN-NEXT: ret void
				;
				%implicitarg.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
				%gep.z = getelementptr inbounds i8, i8 addrspace(4)* %implicitarg.ptr, i64 16
				%bc.z = bitcast i8 addrspace(4)* %gep.z to i16 addrspace(4)*
				%group.size.z = load i16, i16 addrspace(4)* %bc.z, align 2
				store i16 %group.size.z, i16 addrspace(1)* %out
				ret void
				}


				declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1
				declare i32 @llvm.amdgcn.workgroup.id.x() #1
				declare i32 @llvm.amdgcn.workgroup.id.y() #1
				declare i32 @llvm.amdgcn.workgroup.id.z() #1


				attributes #0 = { nounwind "uniform-work-group-size"="true" }
				attributes #1 = { nounwind readnone speculatable }
				!0 = !{i32 8, i32 16, i32 2}

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Implicit kernel arguments related optimization when uniform-workgroup-size=true
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 450333

llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Implicit kernel arguments related optimization when uniform-workgroup-size=true
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 450333

llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll

AMDGPU: Implicit kernel arguments related optimization when uniform-workgroup-size=true
ClosedPublic