This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Add pass to optimize reqd_work_group_size
ClosedPublic

Authored by arsenm on May 17 2018, 3:45 AM.

Download Raw Diff

Details

Reviewers

rampitec
kzhuravl
cfang
b-sumner
yaxunl

Summary

Eliminate loads from the dispatch packet when they will have
a known value.

Also pattern match the code used by the library to handle partial
workgroup dispatches, which isn't necessary if reqd_work_group_size
is used.

Diff Detail

Event Timeline

arsenm created this revision.May 17 2018, 3:45 AM

Herald added subscribers: t-tye, tpr, dstuttard and 3 others. · View Herald TranscriptMay 17 2018, 3:45 AM

Also handle -cl-uniform-work-group-size attribute

As far as I understand it is only applicable if:

both reqd_work_group_size is used and the program is compiled with -cl-uniform-work-group-size
reqd_work_group_size is used and the program is compiled with -cl-std less than 2.0.

Potentially other languages can benefit from it as well, depending on their language standard.

This may be easier for an FE to handle by calling a simplified function, but an FE cannot solve the issue of a call from a non-kernel function. Since you are writing a whole pass for it, it makes sense to address this as well.

Account for difference between 1.2 and 2.0 wrt uniform-work-group-size

LGTM

This revision is now accepted and ready to land.May 18 2018, 1:20 PM

r332771

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPU.h

4 lines

AMDGPULowerKernelAttributes.cpp

257 lines

AMDGPUTargetMachine.cpp

5 lines

CMakeLists.txt

1 line

test/

CodeGen/

AMDGPU/

reqd-work-group-size.ll

426 lines

Diff 147284

lib/Target/AMDGPU/AMDGPU.h

	Show First 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
	Pass *createAMDGPUAnnotateKernelFeaturesPass();			Pass *createAMDGPUAnnotateKernelFeaturesPass();
	void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);			void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
	extern char &AMDGPUAnnotateKernelFeaturesID;			extern char &AMDGPUAnnotateKernelFeaturesID;

	ModulePass *createAMDGPULowerIntrinsicsPass();			ModulePass *createAMDGPULowerIntrinsicsPass();
	void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);			void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
	extern char &AMDGPULowerIntrinsicsID;			extern char &AMDGPULowerIntrinsicsID;

				ModulePass *createAMDGPULowerKernelAttributesPass();
				void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
				extern char &AMDGPULowerKernelAttributesID;

	void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);			void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
	extern char &AMDGPURewriteOutArgumentsID;			extern char &AMDGPURewriteOutArgumentsID;

	void initializeR600ClauseMergePassPass(PassRegistry &);			void initializeR600ClauseMergePassPass(PassRegistry &);
	extern char &R600ClauseMergePassID;			extern char &R600ClauseMergePassID;

	void initializeR600ControlFlowFinalizerPass(PassRegistry &);			void initializeR600ControlFlowFinalizerPass(PassRegistry &);
	extern char &R600ControlFlowFinalizerID;			extern char &R600ControlFlowFinalizerID;
	▲ Show 20 Lines • Show All 184 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

This file was added.

				//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				//
				/// \file This pass attempts to make use of reqd_work_group_size metadata
				/// to eliminate loads from the dispatch packet and to constant fold OpenCL
				/// get_local_size-like functions.
				//
				//===----------------------------------------------------------------------===//

				#include "AMDGPU.h"
				#include "AMDGPUTargetMachine.h"
				#include "llvm/Analysis/ValueTracking.h"
				#include "llvm/CodeGen/Passes.h"
				#include "llvm/CodeGen/TargetPassConfig.h"
				#include "llvm/IR/Constants.h"
				#include "llvm/IR/Function.h"
				#include "llvm/IR/Instructions.h"
				#include "llvm/IR/PatternMatch.h"
				#include "llvm/Pass.h"

				#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"

				using namespace llvm;

				namespace {

				// Byte offsets, within hsa_kernel_dispatch_packet_t, of the fields this pass
				// looks for. Each later offset is written relative to the previous field.
				enum DispatchPackedOffsets {
				  // Work-group sizes: consecutive fields two bytes apart.
				  WORKGROUP_SIZE_X = 4,
				  WORKGROUP_SIZE_Y = WORKGROUP_SIZE_X + 2,
				  WORKGROUP_SIZE_Z = WORKGROUP_SIZE_Y + 2,

				  // Grid sizes: consecutive fields four bytes apart.
				  GRID_SIZE_X = 12,
				  GRID_SIZE_Y = GRID_SIZE_X + 4,
				  GRID_SIZE_Z = GRID_SIZE_Y + 4
				};

				/// Module pass that uses reqd_work_group_size metadata to replace loads from
				/// the dispatch packet with constants (see processUse).
				class AMDGPULowerKernelAttributes : public ModulePass {
				  // Module recorded by doInitialization; read by the other members.
				  Module *Mod = nullptr;

				public:
				  static char ID;

				  AMDGPULowerKernelAttributes() : ModulePass(ID) {}

				  StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }

				  // The pass declares that it preserves every analysis.
				  void getAnalysisUsage(AnalysisUsage &AU) const override {
				    AU.setPreservesAll();
				  }

				  bool doInitialization(Module &M) override;
				  bool runOnModule(Module &M) override;

				  /// Handle a single call to llvm.amdgcn.dispatch.ptr; returns true if any
				  /// use was rewritten.
				  bool processUse(CallInst *CI);
				};

				} // end anonymous namespace

				// Record the module so that other members of this pass can reach it later
				// (for example, processUse reads the data layout through Mod).
				// Returns false because nothing in the module is changed here.
				bool AMDGPULowerKernelAttributes::doInitialization(Module &M) {
				Mod = &M;
				return false;
				}

				/// Fold dispatch-packet loads that hang off one llvm.amdgcn.dispatch.ptr call.
				///
				/// Uses the reqd_work_group_size metadata of the function containing \p CI to
				///  - replace the library's get_local_size select idiom (described below) with
				///    the known group size, and
				///  - replace every recognized simple 2-byte load of workgroup_size_{x,y,z}
				///    with the matching metadata operand.
				///
				/// Only the shape "GEP with constant offset -> single bitcast -> single simple
				/// load" is recognized; anything else is left untouched.
				///
				/// \param CI Call whose users are inspected.
				/// \returns true if at least one value was replaced.
				bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) {
				  Function *F = CI->getParent()->getParent();

				  // Nothing can be folded without at least three metadata operands.
				  auto *MD = F->getMetadata("reqd_work_group_size");
				  if (!MD || MD->getNumOperands() < 3)
				    return false;

				  // Recognized loads, indexed by dimension (0 = x, 1 = y, 2 = z).
				  Value *WorkGroupSizes[3] = { nullptr, nullptr, nullptr };
				  Value *GridSizes[3] = { nullptr, nullptr, nullptr };

				  const DataLayout &DL = Mod->getDataLayout();

				  // We expect to see several GEP users, casted to the appropriate type and
				  // loaded.
				  for (User *U : CI->users()) {
				    if (!U->hasOneUse())
				      continue;

				    int64_t Offset = 0;
				    if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
				      continue;

				    // user_begin() yields an iterator; dereference it to reach the one user.
				    auto *BCI = dyn_cast<BitCastInst>(*U->user_begin());
				    if (!BCI || !BCI->hasOneUse())
				      continue;

				    auto *Load = dyn_cast<LoadInst>(*BCI->user_begin());
				    if (!Load || !Load->isSimple())
				      continue;

				    unsigned LoadSize = DL.getTypeStoreSize(Load->getType());

				    // Only accept a load whose width matches the field at that offset.
				    // TODO: Handle merged loads.
				    switch (Offset) {
				    case WORKGROUP_SIZE_X:
				      if (LoadSize == 2)
				        WorkGroupSizes[0] = Load;
				      break;
				    case WORKGROUP_SIZE_Y:
				      if (LoadSize == 2)
				        WorkGroupSizes[1] = Load;
				      break;
				    case WORKGROUP_SIZE_Z:
				      if (LoadSize == 2)
				        WorkGroupSizes[2] = Load;
				      break;
				    case GRID_SIZE_X:
				      if (LoadSize == 4)
				        GridSizes[0] = Load;
				      break;
				    case GRID_SIZE_Y:
				      if (LoadSize == 4)
				        GridSizes[1] = Load;
				      break;
				    case GRID_SIZE_Z:
				      if (LoadSize == 4)
				        GridSizes[2] = Load;
				      break;
				    default:
				      break;
				    }
				  }

				  // Pattern match the code used to handle partial workgroup dispatches in the
				  // library implementation of get_local_size, so the entire function can be
				  // constant folded with a known group size.
				  //
				  // uint r = grid_size - group_id * group_size;
				  // get_local_size = (r < group_size) ? r : group_size;
				  //
				  // If we have reqd_work_group_size, the grid_size is required to be a multiple
				  // of group_size. In this case:
				  //
				  // grid_size - (group_id * group_size) < group_size
				  // ->
				  // grid_size < group_size + (group_id * group_size)
				  //
				  // (grid_size / group_size) < 1 + group_id
				  //
				  // grid_size / group_size is at least 1, so we can conclude the select
				  // condition is false (except for group_id == 0, where the select result is
				  // the same).

				  using namespace llvm::PatternMatch;

				  bool MadeChange = false;

				  for (int I = 0; I < 3; ++I) {
				    Value *GroupSize = WorkGroupSizes[I];
				    Value *GridSize = GridSizes[I];
				    if (!GroupSize || !GridSize)
				      continue;

				    // The workgroup-id intrinsic must be the one for this dimension.
				    auto GroupIDIntrin = I == 0 ?
				      m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() :
				        (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() :
				                  m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());

				    for (User *U : GroupSize->users()) {
				      auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
				      if (!ZextGroupSize)
				        continue;

				      for (User *ZextUser : ZextGroupSize->users()) {
				        auto *SI = dyn_cast<SelectInst>(ZextUser);
				        if (!SI)
				          continue;

				        auto SubExpr = m_Sub(m_Specific(GridSize),
				                             m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize)));

				        // Only the unsigned less-than form of the comparison is accepted.
				        ICmpInst::Predicate Pred;
				        if (match(SI,
				                  m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)),
				                           SubExpr,
				                           m_Specific(ZextGroupSize))) &&
				            Pred == ICmpInst::ICMP_ULT) {
				          ConstantInt *KnownSize
				            = mdconst::extract<ConstantInt>(MD->getOperand(I));
				          // Zero-extend or truncate the metadata value to the select's type.
				          SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize,
				                                                              SI->getType(),
				                                                              false));
				          MadeChange = true;
				        }
				      }
				    }
				  }

				  // Eliminate any other loads we can from the dispatch packet.
				  for (int I = 0; I < 3; ++I) {
				    Value *GroupSize = WorkGroupSizes[I];
				    if (!GroupSize)
				      continue;

				    ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
				    GroupSize->replaceAllUsesWith(
				      ConstantExpr::getIntegerCast(KnownSize,
				                                   GroupSize->getType(),
				                                   false));
				    MadeChange = true;
				  }

				  return MadeChange;
				}

				// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
				// TargetPassConfig for subtarget.

				/// Visit every call to llvm.amdgcn.dispatch.ptr in \p M and try to fold the
				/// dispatch-packet loads derived from it.
				///
				/// \returns true if any call site was changed.
				bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
				  // Work on the module we were handed instead of depending on a pointer
				  // cached by an earlier doInitialization call; processUse reads Mod.
				  Mod = &M;

				  StringRef DispatchPtrName
				    = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);

				  Function *DispatchPtr = M.getFunction(DispatchPtrName);
				  if (!DispatchPtr) // Dispatch ptr not used.
				    return false;

				  bool MadeChange = false;

				  // Guard against processing the same call instruction more than once.
				  SmallPtrSet<Instruction *, 4> HandledUses;
				  for (auto *U : DispatchPtr->users()) {
				    CallInst *CI = cast<CallInst>(U);
				    if (HandledUses.insert(CI).second)
				      MadeChange |= processUse(CI);
				  }

				  return MadeChange;
				}

				// Register the pass under DEBUG_TYPE ("amdgpu-lower-kernel-attributes").
				// NOTE(review): the description string used here differs from the
				// "AMDGPU Kernel Attributes" returned by getPassName(); confirm which one is
				// intended.
				INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
				"AMDGPU IR optimizations", false, false)
				INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations",
				false, false)

				// Storage for the static ID declared in the class; it is what the constructor
				// passes to ModulePass.
				char AMDGPULowerKernelAttributes::ID = 0;

				// Factory for the pass. The instance is allocated with new and handed to the
				// caller; presumably the pass manager it is added to takes ownership.
				ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
				return new AMDGPULowerKernelAttributes();
				}

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 175 Lines • ▼ Show 20 Lines	extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIDebuggerInsertNopsPass(*PR);		initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);		initializeSIOptimizeExecMaskingPass(*PR);
initializeSIFixWWMLivenessPass(*PR);		initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);		initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);		initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);		initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);		initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUInlinerPass(*PR);		initializeAMDGPUInlinerPass(*PR);
		initializeAMDGPULowerKernelAttributesPass(*PR);
}		}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {		static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return llvm::make_unique<AMDGPUTargetObjectFile>();		return llvm::make_unique<AMDGPUTargetObjectFile>();
}		}

static ScheduleDAGInstrs createR600MachineScheduler(MachineSchedContext C) {		static ScheduleDAGInstrs createR600MachineScheduler(MachineSchedContext C) {
return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());		return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
▲ Show 20 Lines • Show All 206 Lines • ▼ Show 20 Lines	void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
});		});

Builder.addExtension(		Builder.addExtension(
PassManagerBuilder::EP_CGSCCOptimizerLate,		PassManagerBuilder::EP_CGSCCOptimizerLate,
[](const PassManagerBuilder &, legacy::PassManagerBase &PM) {		[](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
// Add infer address spaces pass to the opt pipeline after inlining		// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.		// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());		PM.add(createInferAddressSpacesPass());

		// This should run after inlining to have any chance of doing anything,
		// and before other cleanup optimizations.
		PM.add(createAMDGPULowerKernelAttributesPass());
});		});
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)		// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,		R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
▲ Show 20 Lines • Show All 475 Lines • Show Last 20 Lines

lib/Target/AMDGPU/CMakeLists.txt

Show All 33 Lines	add_llvm_target(AMDGPUCodeGen
AMDGPUInstructionSelector.cpp		AMDGPUInstructionSelector.cpp
AMDGPUIntrinsicInfo.cpp		AMDGPUIntrinsicInfo.cpp
AMDGPUISelDAGToDAG.cpp		AMDGPUISelDAGToDAG.cpp
AMDGPUISelLowering.cpp		AMDGPUISelLowering.cpp
AMDGPULegalizerInfo.cpp		AMDGPULegalizerInfo.cpp
AMDGPULibCalls.cpp		AMDGPULibCalls.cpp
AMDGPULibFunc.cpp		AMDGPULibFunc.cpp
AMDGPULowerIntrinsics.cpp		AMDGPULowerIntrinsics.cpp
		AMDGPULowerKernelAttributes.cpp
AMDGPUMachineCFGStructurizer.cpp		AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp		AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp		AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp		AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp		AMDGPUMCInstLower.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp		AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPromoteAlloca.cpp		AMDGPUPromoteAlloca.cpp
AMDGPURegAsmNames.inc.cpp		AMDGPURegAsmNames.inc.cpp
▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/reqd-work-group-size.ll

This file was added.

				; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-lower-kernel-attributes -instcombine %s | FileCheck -enable-var-scope %s

				; Negative test: the 2-byte load of workgroup_size_x (packet offset 4) is
				; expected to survive.
				; NOTE(review): presumably !1 is not a usable three-operand size; its
				; definition is outside this view.
				; CHECK-LABEL: @invalid_reqd_work_group_size(
				; CHECK: load i16,
				define amdgpu_kernel void @invalid_reqd_work_group_size(i16 addrspace(1)* %out) #0 !reqd_work_group_size !1 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				store i16 %group.size.x, i16 addrspace(1)* %out
				ret void
				}

				; Negative test: the pass only replaces simple loads, so a volatile load of
				; workgroup_size_x must be left in place.
				; CHECK-LABEL: @volatile_load_group_size_x(
				; CHECK: load volatile i16,
				define amdgpu_kernel void @volatile_load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load volatile i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				store i16 %group.size.x, i16 addrspace(1)* %out
				ret void
				}

				; The 2-byte load at packet offset 4 (workgroup_size_x) is replaced by the
				; first reqd_work_group_size operand; the expected constant is 8.
				; CHECK-LABEL: @load_group_size_x(
				; CHECK-NEXT: store i16 8,
				define amdgpu_kernel void @load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				store i16 %group.size.x, i16 addrspace(1)* %out
				ret void
				}

				; The 2-byte load at packet offset 6 (workgroup_size_y) is replaced by the
				; second reqd_work_group_size operand; the expected constant is 16.
				; Offset 6 is only 2-byte aligned, so the load uses align 2, matching the
				; other offset-6 loads in this file.
				; CHECK-LABEL: @load_group_size_y(
				; CHECK-NEXT: store i16 16,
				define amdgpu_kernel void @load_group_size_y(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
				%gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
				%group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 2
				store i16 %group.size.y, i16 addrspace(1)* %out
				ret void
				}

				; The 2-byte load at packet offset 8 (workgroup_size_z) is replaced by the
				; third reqd_work_group_size operand; the expected constant is 2.
				; CHECK-LABEL: @load_group_size_z(
				; CHECK-NEXT: store i16 2,
				define amdgpu_kernel void @load_group_size_z(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
				%gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
				%group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
				store i16 %group.size.z, i16 addrspace(1)* %out
				ret void
				}

				; Metadata uses i64 instead of i32
				; The pass casts the metadata constant to the type of the load, so the
				; stored value is still the i16 constant 8.
				; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64(
				; CHECK-NEXT: store i16 8,
				define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(i16 addrspace(1)* %out) #0 !reqd_work_group_size !2 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				store i16 %group.size.x, i16 addrspace(1)* %out
				ret void
				}

				; Metadata uses i16 instead of i32
				; Same fold as the i32 case: the load is replaced by the i16 constant 8.
				; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16(
				; CHECK-NEXT: store i16 8,
				define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(i16 addrspace(1)* %out) #0 !reqd_work_group_size !3 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				store i16 %group.size.x, i16 addrspace(1)* %out
				ret void
				}

				; Full get_local_size idiom for x: select(ult(grid - id * size, size), ...).
				; The select is replaced by the known x size, so the store sees constant 8.
				; CHECK-LABEL: @use_local_size_x_8_16_2(
				; CHECK-NEXT: store i64 8,
				define amdgpu_kernel void @use_local_size_x_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				%gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
				%gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
				%grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				%group.size.x.zext = zext i16 %group.size.x to i32
				%group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
				%sub = sub i32 %grid.size.x, %group.id_x_group.size.x
				%cmp = icmp ult i32 %sub, %group.size.x.zext
				%select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
				%zext = zext i32 %select to i64
				store i64 %zext, i64 addrspace(1)* %out
				ret void
				}

				; Full get_local_size idiom for y. The select is replaced by the known y
				; size, so the store sees constant 16.
				; Offset 6 is only 2-byte aligned, so the group-size load uses align 2,
				; matching the other offset-6 loads in this file.
				; CHECK-LABEL: @use_local_size_y_8_16_2(
				; CHECK-NEXT: store i64 16,
				define amdgpu_kernel void @use_local_size_y_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
				%gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
				%group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 2
				%gep.grid.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
				%gep.grid.size.y.bc = bitcast i8 addrspace(4)* %gep.grid.size.y to i32 addrspace(4)*
				%grid.size.y = load i32, i32 addrspace(4)* %gep.grid.size.y.bc, align 4
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
				%group.size.y.zext = zext i16 %group.size.y to i32
				%group.id_x_group.size.y = mul i32 %group.id, %group.size.y.zext
				%sub = sub i32 %grid.size.y, %group.id_x_group.size.y
				%cmp = icmp ult i32 %sub, %group.size.y.zext
				%select = select i1 %cmp, i32 %sub, i32 %group.size.y.zext
				%zext = zext i32 %select to i64
				store i64 %zext, i64 addrspace(1)* %out
				ret void
				}

				; Full get_local_size idiom for z (group size at offset 8, grid size at
				; offset 20). The select is replaced, so the store sees constant 2.
				; CHECK-LABEL: @use_local_size_z_8_16_2(
				; CHECK-NEXT: store i64 2,
				define amdgpu_kernel void @use_local_size_z_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
				%gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
				%group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
				%gep.grid.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 20
				%gep.grid.size.z.bc = bitcast i8 addrspace(4)* %gep.grid.size.z to i32 addrspace(4)*
				%grid.size.z = load i32, i32 addrspace(4)* %gep.grid.size.z.bc, align 4
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
				%group.size.z.zext = zext i16 %group.size.z to i32
				%group.id_x_group.size.z = mul i32 %group.id, %group.size.z.zext
				%sub = sub i32 %grid.size.z, %group.id_x_group.size.z
				%cmp = icmp ult i32 %sub, %group.size.z.zext
				%select = select i1 %cmp, i32 %sub, i32 %group.size.z.zext
				%zext = zext i32 %select to i64
				store i64 %zext, i64 addrspace(1)* %out
				ret void
				}

				; Simplification on select is invalid, but we can still eliminate the
				; load of the group size.
				; The multiply uses workgroup.id.y while the loaded sizes are the x fields,
				; so the select idiom does not match. The group-size load still becomes the
				; constant 8, which shows up as a shift left by 3.

				; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id(
				; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
				; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
				define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				%gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
				%gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
				%grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
				%group.size.x.zext = zext i16 %group.size.x to i32
				%group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
				%sub = sub i32 %grid.size.x, %group.id_x_group.size.x
				%cmp = icmp ult i32 %sub, %group.size.x.zext
				%select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
				%zext = zext i32 %select to i64
				store i64 %zext, i64 addrspace(1)* %out
				ret void
				}

				; The grid size is loaded from offset 16 (the y field) but paired with the x
				; group size, so no x grid-size load is recorded and the select is kept.
				; The group-size load is still folded to 8 (shift left by 3).
				; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size(
				; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
				; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
				; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
				define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				%gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
				%gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
				%grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				%group.size.x.zext = zext i16 %group.size.x to i32
				%group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
				%sub = sub i32 %grid.size.x, %group.id_x_group.size.x
				%cmp = icmp ult i32 %sub, %group.size.x.zext
				%select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
				%zext = zext i32 %select to i64
				store i64 %zext, i64 addrspace(1)* %out
				ret void
				}

				; The comparison is signed (slt) rather than unsigned (ult), which the pass
				; does not accept, so the select stays. Its group-size operands are still
				; folded to the constant 8.
				; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type(
				; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
				; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
				; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
				; CHECK: %cmp = icmp slt i32 %sub, 8
				; CHECK: %select = select i1 %cmp, i32 %sub, i32 8
				define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				%gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
				%gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
				%grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				%group.size.x.zext = zext i16 %group.size.x to i32
				%group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
				%sub = sub i32 %grid.size.x, %group.id_x_group.size.x
				%cmp = icmp slt i32 %sub, %group.size.x.zext
				%select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
				%zext = zext i32 %select to i64
				store i64 %zext, i64 addrspace(1)* %out
				ret void
				}

				; The grid size at offset 12 is loaded as i16 (2 bytes) instead of the
				; 4 bytes the pass requires, so it is not recorded and the select is kept.
				; The group-size load is still folded to 8 (shift left by 3).
				; CHECK-LABEL: @use_local_size_x_8_16_2_wrong_grid_load_size(
				; CHECK: %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
				; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32
				; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
				; CHECK: %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
				define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				%gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
				%gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i16 addrspace(4)*
				%grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
				%grid.size.x.zext = zext i16 %grid.size.x to i32
				%group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
				%group.size.x.zext = zext i16 %group.size.x to i32
				%group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
				%sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
				%cmp = icmp ult i32 %sub, %group.size.x.zext
				%select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
				%zext = zext i32 %select to i64
				store i64 %zext, i64 addrspace(1)* %out
				ret void
				}

				; Same fold in a function that is not declared amdgpu_kernel but still
				; carries reqd_work_group_size: the result becomes the constant 8.
				; CHECK-LABEL: @func_group_size_x(
				; CHECK-NEXT: ret i32 8
				define i32 @func_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				%zext = zext i16 %group.size.x to i32
				ret i32 %zext
				}

				; Un-inlined shape of the library helper: the dimension is chosen by a switch
				; and the loaded sizes reach the select through phi nodes. Each group-size
				; load is still replaced by its constant, which is what the phi below shows.
				; The select is not matched here because the zext operand is a phi, not a
				; load.
				; CHECK-LABEL: @__ockl_get_local_size_reqd_size(
				; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ]
				define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 {
				bb:
				%tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
				switch i32 %arg, label %bb25 [
				i32 0, label %bb1
				i32 1, label %bb9
				i32 2, label %bb17
				]

				bb1: ; preds = %bb
				%tmp2 = tail call i32 @llvm.amdgcn.workgroup.id.x()
				%tmp3 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 12
				%tmp4 = bitcast i8 addrspace(4)* %tmp3 to i32 addrspace(4)*
				%tmp5 = load i32, i32 addrspace(4)* %tmp4, align 4
				%tmp6 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
				%tmp7 = bitcast i8 addrspace(4)* %tmp6 to i16 addrspace(4)*
				%tmp8 = load i16, i16 addrspace(4)* %tmp7, align 4
				br label %bb25

				bb9: ; preds = %bb
				%tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.y()
				%tmp11 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 16
				%tmp12 = bitcast i8 addrspace(4)* %tmp11 to i32 addrspace(4)*
				%tmp13 = load i32, i32 addrspace(4)* %tmp12, align 8
				%tmp14 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 6
				%tmp15 = bitcast i8 addrspace(4)* %tmp14 to i16 addrspace(4)*
				%tmp16 = load i16, i16 addrspace(4)* %tmp15, align 2
				br label %bb25

				bb17: ; preds = %bb
				%tmp18 = tail call i32 @llvm.amdgcn.workgroup.id.z()
				%tmp19 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 20
				%tmp20 = bitcast i8 addrspace(4)* %tmp19 to i32 addrspace(4)*
				%tmp21 = load i32, i32 addrspace(4)* %tmp20, align 4
				%tmp22 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 8
				%tmp23 = bitcast i8 addrspace(4)* %tmp22 to i16 addrspace(4)*
				%tmp24 = load i16, i16 addrspace(4)* %tmp23, align 8
				br label %bb25

				bb25: ; preds = %bb17, %bb9, %bb1, %bb
				%tmp26 = phi i32 [ %tmp21, %bb17 ], [ %tmp13, %bb9 ], [ %tmp5, %bb1 ], [ 0, %bb ]
				%group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, %bb1 ], [ 1, %bb ]
				%tmp28 = phi i32 [ %tmp18, %bb17 ], [ %tmp10, %bb9 ], [ %tmp2, %bb1 ], [ 0, %bb ]
				%tmp29 = zext i16 %group.size to i32
				%tmp30 = mul i32 %tmp28, %tmp29
				%tmp31 = sub i32 %tmp26, %tmp30
				%tmp32 = icmp ult i32 %tmp31, %tmp29
				%tmp33 = select i1 %tmp32, i32 %tmp31, i32 %tmp29
				%tmp34 = zext i32 %tmp33 to i64
				ret i64 %tmp34
				}

				; Kernel with !reqd_work_group_size !0 (8, 16, 2) that computes, for each of
				; the three dimensions, min(<i32 from packet> - workgroup.id * <i16 from packet>,
				; <i16 from packet>) from values loaded off the dispatch pointer.
				; The expected output stores the constants 8, 16 and 2 directly, i.e. every
				; dispatch-packet load and the min computation fold away.
				; %out is written by the volatile stores below, so it must not be marked
				; readnone (that attribute promises the pointer is never dereferenced).
				; CHECK-LABEL: @all_local_size(
				; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
				; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
				; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
				define amdgpu_kernel void @all_local_size(i64 addrspace(1)* nocapture %out) #0 !reqd_work_group_size !0 {
				%tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
				; x dimension: i32 at byte offset 12 and i16 at byte offset 4 of the packet.
				%tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
				%tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
				%tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
				%tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
				%tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
				%tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
				%tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
				%tmp29.i = zext i16 %tmp8.i to i32
				%tmp30.i = mul i32 %tmp2.i, %tmp29.i
				%tmp31.i = sub i32 %tmp5.i, %tmp30.i
				; Unsigned min of the remainder and the 16-bit size.
				%tmp32.i = icmp ult i32 %tmp31.i, %tmp29.i
				%tmp33.i = select i1 %tmp32.i, i32 %tmp31.i, i32 %tmp29.i
				%tmp34.i = zext i32 %tmp33.i to i64
				; y dimension: i32 at byte offset 16 and i16 at byte offset 6 of the packet.
				%tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
				%tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
				%tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
				%tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
				%tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
				%tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
				%tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
				%tmp29.i9 = zext i16 %tmp16.i to i32
				%tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
				%tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
				%tmp32.i12 = icmp ult i32 %tmp31.i11, %tmp29.i9
				%tmp33.i13 = select i1 %tmp32.i12, i32 %tmp31.i11, i32 %tmp29.i9
				%tmp34.i14 = zext i32 %tmp33.i13 to i64
				; z dimension: i32 at byte offset 20 and i16 at byte offset 8 of the packet.
				%tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
				%tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
				%tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
				%tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
				%tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
				%tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
				%tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
				%tmp29.i2 = zext i16 %tmp24.i to i32
				%tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
				%tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
				%tmp32.i5 = icmp ult i32 %tmp31.i4, %tmp29.i2
				%tmp33.i6 = select i1 %tmp32.i5, i32 %tmp31.i4, i32 %tmp29.i2
				%tmp34.i7 = zext i32 %tmp33.i6 to i64
				; Volatile stores keep all three results observable, in x, y, z order.
				store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
				store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
				store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
				ret void
				}

				; Negative test: the kernel has !reqd_work_group_size !0 but reads only a
				; single byte (i8) at byte offset 4 of the dispatch packet instead of the
				; full 16-bit value. The expected output keeps the dispatch.ptr call, the
				; getelementptr, the load and the store unchanged.
				; TODO: Should be able to handle this, but not much reason to.
				; CHECK-LABEL: @partial_load_group_size_x(
				; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
				; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
				define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				; i8 load covers only part of the value that starts at offset 4.
				%group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
				store i8 %group.size.x.lo, i8 addrspace(1)* %out
				ret void
				}

				; Negative test: the kernel has !reqd_work_group_size !0 but issues one
				; 32-bit load at byte offset 4, which spans bytes 4..7 and therefore covers
				; both 16-bit values that other tests in this file load separately at
				; offsets 4 and 6. The expected output still contains the i32 load and a
				; store of its result.
				; TODO: Should be able to handle this
				; CHECK-LABEL: @load_group_size_xy_i32(
				; CHECK: %group.size.xy = load i32,
				; CHECK: store i32 %group.size.xy
				define amdgpu_kernel void @load_group_size_xy_i32(i32 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i32 addrspace(4)*
				; Single wide load instead of two i16 loads.
				%group.size.xy = load i32, i32 addrspace(4)* %gep.group.size.x.bc, align 4
				store i32 %group.size.xy, i32 addrspace(1)* %out
				ret void
				}

				; Kernel with !reqd_work_group_size !0 (8, 16, 2) that obtains the dispatch
				; pointer through two separate intrinsic calls, loading the i16 at byte
				; offset 4 through the first and the i16 at byte offset 6 through the second.
				; The expected output stores the constants 8 and 16 directly, so loads
				; reached through either call are folded.
				; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr(
				; CHECK-NEXT: store volatile i16 8, i16 addrspace(1)* %out, align 2
				; CHECK-NEXT: store volatile i16 16, i16 addrspace(1)* %out, align 2
				define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
				; First dispatch.ptr call, used for the value at offset 4.
				%dispatch.ptr0 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr0, i64 4
				%gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
				%group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
				store volatile i16 %group.size.x, i16 addrspace(1)* %out

				; Second, independent dispatch.ptr call, used for the value at offset 6.
				%dispatch.ptr1 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				%gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr1, i64 6
				%gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
				; align 2 matches a 16-bit value at byte offset 6; a 4-byte alignment claim
				; cannot hold at offset 6 if the packet base is at least 4-byte aligned.
				%group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 2
				store volatile i16 %group.size.y, i16 addrspace(1)* %out

				ret void
				}

				; Kernel without !reqd_work_group_size metadata that calls the dispatch
				; pointer intrinsic and never uses the result. The expected output body is
				; just `ret void`, i.e. the unused call does not survive.
				; CHECK-LABEL: @no_use_dispatch_ptr(
				; CHECK-NEXT: ret void
				define amdgpu_kernel void @no_use_dispatch_ptr() {
				%dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				ret void
				}

				; Intrinsics used by the tests in this file. dispatch.ptr returns the
				; addrspace(4) pointer the tests index into; workgroup.id.{x,y,z} return
				; the i32 values multiplied into the size computations.
				declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
				declare i32 @llvm.amdgcn.workgroup.id.x() #1
				declare i32 @llvm.amdgcn.workgroup.id.y() #1
				declare i32 @llvm.amdgcn.workgroup.id.z() #1

				; #0 is attached to the kernels and some call sites; #1 to the intrinsic
				; declarations above.
				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone speculatable }

				; !0 is the well-formed three-element i32 node referenced as
				; !reqd_work_group_size by the kernels above (8, 16, 2).
				; !1 (two elements), !2 (i64 elements) and !3 (i16 elements) are not
				; referenced in this part of the file; presumably they feed malformed-metadata
				; tests earlier in the file -- confirm against those tests.
				!0 = !{i32 8, i32 16, i32 2}
				!1 = !{i32 8, i32 16}
				!2 = !{i64 8, i64 16, i64 2}
				!3 = !{i16 8, i16 16, i16 2}