This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add __builtin_amdgcn_workgroup_size_x/y/z
ClosedPublic

Authored by yaxunl on Mar 25 2020, 6:57 AM.

Download Raw Diff

Details

Reviewers

arsenm
b-sumner
cfang
rjmccall
Anastasia

Commits

rG369e26ca9e0d: [AMDGPU] Add __builtin_amdgcn_workgroup_size_x/y/z

Summary

The main purpose of introducing these builtins is to attach range metadata [1, 1025) to the workgroup size loaded from the dispatch pointer, which cannot be expressed in source code.

Diff Detail

Event Timeline

yaxunl created this revision.Mar 25 2020, 6:57 AM

Herald added subscribers: kerbowa, t-tye, tpr and 5 others. · View Herald TranscriptMar 25 2020, 6:57 AM

arsenm added inline comments.Mar 25 2020, 8:40 AM

clang/lib/CodeGen/CGBuiltin.cpp
13428	Why is this necessary? The builtin always has the same return type?
13435	Comment that this is indexing the hsa_kernel_dispatch_packet struct?
13442	I thought I had a patch to include the maximum group size in AMDGPUTargetInfo to avoid hardcoding it, but I guess it was never committed
13443	Also set it's invariant
clang/test/CodeGenOpenCL/builtins-amdgcn.cl
539	Also run in a hip test, or some case where the addrspacecast is needed?

Revised by Matt's comments

clang/lib/CodeGen/CGBuiltin.cpp
13428	due to https://github.com/llvm/llvm-project/commit/c65f966d76aa5412920b3f14d199e764135bd5ec pointers returned by builtin functions are in default address space for HIP.
13435	done
13442	Added getMaxOpenCLWorkGroupSize() to TargetInfo
13443	done

yaxunl added reviewers: rjmccall, Anastasia.Mar 25 2020, 5:01 PM

arsenm accepted this revision.Mar 26 2020, 6:50 AM

arsenm added inline comments.

clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
2	I assume the addrspacecast got optimized out? Should this disable llvm passes?

This revision is now accepted and ready to land.Mar 26 2020, 6:50 AM

yaxunl marked 2 inline comments as done.Mar 26 2020, 10:21 AM

yaxunl added inline comments.

clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
2	We did not emit addrspacecast here since we only need to return the loaded value. HIP by default uses -O0, therefore no need to disable llvm passes.

Closed by commit rG369e26ca9e0d: [AMDGPU] Add __builtin_amdgcn_workgroup_size_x/y/z (authored by yaxunl). · Explain WhyMar 27 2020, 10:32 PM

This revision was automatically updated to reflect the committed changes.

yaxunl marked an inline comment as done.

Herald added a project: Restricted Project. · View Herald TranscriptMar 27 2020, 10:32 PM

JonChesterfield mentioned this in D90251: [AMDGPU] Add __builtin_amdgcn_grid_size.Oct 27 2020, 10:25 AM

JonChesterfield mentioned this in rGdee7704829bd: [AMDGPU] Add __builtin_amdgcn_grid_size.Oct 29 2020, 9:25 AM

Revision Contents

Path

Size

clang/

include/

clang/

Basic/

BuiltinsAMDGPU.def

4 lines

TargetInfo.h

4 lines

lib/

Basic/

TargetInfo.cpp

2 lines

CodeGen/

CGBuiltin.cpp

67 lines

test/

CodeGenCUDA/

amdgpu-workgroup-size.cu

25 lines

CodeGenOpenCL/

builtins-amdgcn.cl

19 lines

Diff 252621

clang/include/clang/Basic/BuiltinsAMDGPU.def

	Show All 27 Lines
	BUILTIN(__builtin_amdgcn_workgroup_id_x, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workgroup_id_x, "Ui", "nc")
	BUILTIN(__builtin_amdgcn_workgroup_id_y, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workgroup_id_y, "Ui", "nc")
	BUILTIN(__builtin_amdgcn_workgroup_id_z, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workgroup_id_z, "Ui", "nc")

	BUILTIN(__builtin_amdgcn_workitem_id_x, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workitem_id_x, "Ui", "nc")
	BUILTIN(__builtin_amdgcn_workitem_id_y, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workitem_id_y, "Ui", "nc")
	BUILTIN(__builtin_amdgcn_workitem_id_z, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workitem_id_z, "Ui", "nc")

				BUILTIN(__builtin_amdgcn_workgroup_size_x, "Ui", "nc")
				BUILTIN(__builtin_amdgcn_workgroup_size_y, "Ui", "nc")
				BUILTIN(__builtin_amdgcn_workgroup_size_z, "Ui", "nc")

	BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")			BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
	BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")			BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// Instruction builtins.			// Instruction builtins.
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	BUILTIN(__builtin_amdgcn_s_getreg, "UiIi", "n")			BUILTIN(__builtin_amdgcn_s_getreg, "UiIi", "n")
	BUILTIN(__builtin_amdgcn_s_getpc, "LUi", "n")			BUILTIN(__builtin_amdgcn_s_getpc, "LUi", "n")
	▲ Show 20 Lines • Show All 198 Lines • Show Last 20 Lines

clang/include/clang/Basic/TargetInfo.h

Show First 20 Lines • Show All 206 Lines • ▼ Show 20 Lines	protected:
unsigned HasBuiltinMSVaList : 1;		unsigned HasBuiltinMSVaList : 1;

unsigned IsRenderScriptTarget : 1;		unsigned IsRenderScriptTarget : 1;

unsigned HasAArch64SVETypes : 1;		unsigned HasAArch64SVETypes : 1;

unsigned ARMCDECoprocMask : 8;		unsigned ARMCDECoprocMask : 8;

		unsigned MaxOpenCLWorkGroupSize;

// TargetInfo Constructor. Default initializes all fields.		// TargetInfo Constructor. Default initializes all fields.
TargetInfo(const llvm::Triple &T);		TargetInfo(const llvm::Triple &T);

void resetDataLayout(StringRef DL);		void resetDataLayout(StringRef DL);

public:		public:
/// Construct a target for the given options.		/// Construct a target for the given options.
///		///
▲ Show 20 Lines • Show All 428 Lines • ▼ Show 20 Lines	public:

/// Return the maximum vector alignment supported for the given target.		/// Return the maximum vector alignment supported for the given target.
unsigned getMaxVectorAlign() const { return MaxVectorAlign; }		unsigned getMaxVectorAlign() const { return MaxVectorAlign; }
/// Return default simd alignment for the given target. Generally, this		/// Return default simd alignment for the given target. Generally, this
/// value is type-specific, but this alignment can be used for most of the		/// value is type-specific, but this alignment can be used for most of the
/// types for the given target.		/// types for the given target.
unsigned getSimdDefaultAlign() const { return SimdDefaultAlign; }		unsigned getSimdDefaultAlign() const { return SimdDefaultAlign; }

		unsigned getMaxOpenCLWorkGroupSize() const { return MaxOpenCLWorkGroupSize; }

/// Return the alignment (in bits) of the thrown exception object. This is		/// Return the alignment (in bits) of the thrown exception object. This is
/// only meaningful for targets that allocate C++ exceptions in a system		/// only meaningful for targets that allocate C++ exceptions in a system
/// runtime, such as those using the Itanium C++ ABI.		/// runtime, such as those using the Itanium C++ ABI.
virtual unsigned getExnObjectAlignment() const {		virtual unsigned getExnObjectAlignment() const {
// Itanium says that an _Unwind_Exception has to be "double-word"		// Itanium says that an _Unwind_Exception has to be "double-word"
// aligned (and thus the end of it is also so-aligned), meaning 16		// aligned (and thus the end of it is also so-aligned), meaning 16
// bytes. Of course, that was written for the actual Itanium,		// bytes. Of course, that was written for the actual Itanium,
// which is a 64-bit platform. Classically, the ABI doesn't really		// which is a 64-bit platform. Classically, the ABI doesn't really
▲ Show 20 Lines • Show All 762 Lines • Show Last 20 Lines

clang/lib/Basic/TargetInfo.cpp

Show First 20 Lines • Show All 127 Lines • ▼ Show 20 Lines	TargetInfo::TargetInfo(const llvm::Triple &T) : TargetOpts(), Triple(T) {

// Default to an empty address space map.		// Default to an empty address space map.
AddrSpaceMap = &DefaultAddrSpaceMap;		AddrSpaceMap = &DefaultAddrSpaceMap;
UseAddrSpaceMapMangling = false;		UseAddrSpaceMapMangling = false;

// Default to an unknown platform name.		// Default to an unknown platform name.
PlatformName = "unknown";		PlatformName = "unknown";
PlatformMinVersion = VersionTuple();		PlatformMinVersion = VersionTuple();

		MaxOpenCLWorkGroupSize = 1024;
}		}

// Out of line virtual dtor for TargetInfo.		// Out of line virtual dtor for TargetInfo.
TargetInfo::~TargetInfo() {}		TargetInfo::~TargetInfo() {}

void TargetInfo::resetDataLayout(StringRef DL) {		void TargetInfo::resetDataLayout(StringRef DL) {
DataLayout.reset(new llvm::DataLayout(DL));		DataLayout.reset(new llvm::DataLayout(DL));
}		}
▲ Show 20 Lines • Show All 693 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 9,991 Lines • ▼ Show 20 Lines
	if (getTarget().isLittleEndian())			if (getTarget().isLittleEndian())
	Index = ConstantInt::get(Index->getType(), 1 - Index->getZExtValue());			Index = ConstantInt::get(Index->getType(), 1 - Index->getZExtValue());

	return Builder.CreateExtractElement(Unpacked, Index);			return Builder.CreateExtractElement(Unpacked, Index);
	}			}
	}			}
	}			}

				namespace {
				// If \p E is not null pointer, insert address space cast to match return
				// type of \p E if necessary.
				Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
				const CallExpr *E = nullptr) {
				auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
				auto *Call = CGF.Builder.CreateCall(F);
				Call->addAttribute(
				AttributeList::ReturnIndex,
				Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
				Call->addAttribute(AttributeList::ReturnIndex,
				Attribute::getWithAlignment(Call->getContext(), Align(4)));
				if (!E)
				return Call;
				QualType BuiltinRetType = E->getType();
				auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
				if (RetTy == Call->getType())
				return Call;
				return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
				arsenmUnsubmitted Done Reply Inline Actions Why is this necessary? The builtin always has the same return type? arsenm: Why is this necessary? The builtin always has the same return type?
				yaxunlAuthorUnsubmitted Done Reply Inline Actions due to https://github.com/llvm/llvm-project/commit/c65f966d76aa5412920b3f14d199e764135bd5ec pointers returned by builtin functions are in default address space for HIP. yaxunl: due to https://github.com/llvm/llvm-project/commit/c65f966d76aa5412920b3f14d199e764135bd5ec…
				}

				// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
				Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
				const unsigned XOffset = 4;
				auto *DP = EmitAMDGPUDispatchPtr(CGF);
				// Indexing the HSA kernel_dispatch_packet struct.
				arsenmUnsubmitted Done Reply Inline Actions Comment that this is indexing the hsa_kernel_dispatch_packet struct? arsenm: Comment that this is indexing the hsa_kernel_dispatch_packet struct?
				yaxunlAuthorUnsubmitted Done Reply Inline Actions done yaxunl: done
				auto Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 2);
				auto *GEP = CGF.Builder.CreateGEP(DP, Offset);
				auto *DstTy =
				CGF.Int16Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
				auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy);
				auto *LD = CGF.Builder.CreateLoad(Address(Cast, CharUnits::fromQuantity(2)));
				llvm::MDBuilder MDHelper(CGF.getLLVMContext());
				arsenmUnsubmitted Done Reply Inline Actions I thought I had a patch to include the maximum group size in AMDGPUTargetInfo to avoid hardcoding it, but I guess it was never committed arsenm: I thought I had a patch to include the maximum group size in AMDGPUTargetInfo to avoid…
				yaxunlAuthorUnsubmitted Done Reply Inline Actions Added getMaxOpenCLWorkGroupSize() to TargetInfo yaxunl: Added getMaxOpenCLWorkGroupSize() to TargetInfo
				llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
				arsenmUnsubmitted Done Reply Inline Actions Also set it's invariant arsenm: Also set it's invariant
				yaxunlAuthorUnsubmitted Done Reply Inline Actions done yaxunl: done
				APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
				LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
				LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
				llvm::MDNode::get(CGF.getLLVMContext(), None));
				return LD;
				}
				} // namespace

	Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,			Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
	const CallExpr *E) {			const CallExpr *E) {
	switch (BuiltinID) {			switch (BuiltinID) {
	case AMDGPU::BI__builtin_amdgcn_div_scale:			case AMDGPU::BI__builtin_amdgcn_div_scale:
	case AMDGPU::BI__builtin_amdgcn_div_scalef: {			case AMDGPU::BI__builtin_amdgcn_div_scalef: {
	// Translate from the intrinsics's struct return to the builtin's out			// Translate from the intrinsics's struct return to the builtin's out
	// argument.			// argument.

	▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines
	case AMDGPU::BI__builtin_amdgcn_rsq_clampf:			case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
	return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);			return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);
	case AMDGPU::BI__builtin_amdgcn_sinf:			case AMDGPU::BI__builtin_amdgcn_sinf:
	case AMDGPU::BI__builtin_amdgcn_sinh:			case AMDGPU::BI__builtin_amdgcn_sinh:
	return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);			return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);
	case AMDGPU::BI__builtin_amdgcn_cosf:			case AMDGPU::BI__builtin_amdgcn_cosf:
	case AMDGPU::BI__builtin_amdgcn_cosh:			case AMDGPU::BI__builtin_amdgcn_cosh:
	return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);			return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
	case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: {			case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
	auto *F = CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);			return EmitAMDGPUDispatchPtr(*this, E);
	auto *Call = Builder.CreateCall(F);
	Call->addAttribute(
	AttributeList::ReturnIndex,
	Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
	Call->addAttribute(
	AttributeList::ReturnIndex,
	Attribute::getWithAlignment(Call->getContext(), Align(4)));
	QualType BuiltinRetType = E->getType();
	auto *RetTy = cast<llvm::PointerType>(ConvertType(BuiltinRetType));
	if (RetTy == Call->getType())
	return Call;
	return Builder.CreateAddrSpaceCast(Call, RetTy);
	}
	case AMDGPU::BI__builtin_amdgcn_log_clampf:			case AMDGPU::BI__builtin_amdgcn_log_clampf:
	return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);			return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
	case AMDGPU::BI__builtin_amdgcn_ldexp:			case AMDGPU::BI__builtin_amdgcn_ldexp:
	case AMDGPU::BI__builtin_amdgcn_ldexpf:			case AMDGPU::BI__builtin_amdgcn_ldexpf:
	case AMDGPU::BI__builtin_amdgcn_ldexph:			case AMDGPU::BI__builtin_amdgcn_ldexph:
	return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_ldexp);			return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_ldexp);
	case AMDGPU::BI__builtin_amdgcn_frexp_mant:			case AMDGPU::BI__builtin_amdgcn_frexp_mant:
	case AMDGPU::BI__builtin_amdgcn_frexp_mantf:			case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
	▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines
	// amdgcn workitem			// amdgcn workitem
	case AMDGPU::BI__builtin_amdgcn_workitem_id_x:			case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
	return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);			return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
	case AMDGPU::BI__builtin_amdgcn_workitem_id_y:			case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
	return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);			return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
	case AMDGPU::BI__builtin_amdgcn_workitem_id_z:			case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
	return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);			return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);

				// amdgcn workgroup size
				case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
				return EmitAMDGPUWorkGroupSize(*this, 0);
				case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
				return EmitAMDGPUWorkGroupSize(*this, 1);
				case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
				return EmitAMDGPUWorkGroupSize(*this, 2);

	// r600 intrinsics			// r600 intrinsics
	case AMDGPU::BI__builtin_r600_recipsqrt_ieee:			case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
	case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:			case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
	return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);			return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
	case AMDGPU::BI__builtin_r600_read_tidig_x:			case AMDGPU::BI__builtin_r600_read_tidig_x:
	return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);			return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
	case AMDGPU::BI__builtin_r600_read_tidig_y:			case AMDGPU::BI__builtin_r600_read_tidig_y:
	return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);			return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
	▲ Show 20 Lines • Show All 1,898 Lines • Show Last 20 Lines

clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu

This file was added.

				// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
				// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
				arsenmUnsubmitted Done Reply Inline Actions I assume the addrspacecast got optimized out? Should this disable llvm passes? arsenm: I assume the addrspacecast got optimized out? Should this disable llvm passes?
				yaxunlAuthorUnsubmitted Done Reply Inline Actions We did not emit addrspacecast here since we only need to return the loaded value. HIP by default uses -O0, therefore no need to disable llvm passes. yaxunl: We did not emit addrspacecast here since we only need to return the loaded value. HIP by default…
				// RUN: | FileCheck %s

				#include "Inputs/cuda.h"

				// CHECK-LABEL: test_get_workgroup_size
				// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
				// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4
				// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
				// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6
				// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
				// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8
				// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
				__device__ void test_get_workgroup_size(int d, int *out)
				{
				switch (d) {
				case 0: *out = __builtin_amdgcn_workgroup_size_x(); break;
				case 1: *out = __builtin_amdgcn_workgroup_size_y(); break;
				case 2: *out = __builtin_amdgcn_workgroup_size_z(); break;
				default: *out = 0;
				}
				}

				// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Show First 20 Lines • Show All 521 Lines • ▼ Show 20 Lines	void test_get_local_id(int d, global int *out)
switch (d) {		switch (d) {
case 0: *out = __builtin_amdgcn_workitem_id_x(); break;		case 0: *out = __builtin_amdgcn_workitem_id_x(); break;
case 1: *out = __builtin_amdgcn_workitem_id_y(); break;		case 1: *out = __builtin_amdgcn_workitem_id_y(); break;
case 2: *out = __builtin_amdgcn_workitem_id_z(); break;		case 2: *out = __builtin_amdgcn_workitem_id_z(); break;
default: *out = 0;		default: *out = 0;
}		}
}		}

		// CHECK-LABEL: @test_get_workgroup_size(
		// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
		// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 4
		// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load
		// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 6
		// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load
		// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 8
		// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load
		void test_get_workgroup_size(int d, global int *out)
		{
		arsenmUnsubmitted Done Reply Inline Actions Also run in a hip test, or some case where the addrspacecast is needed? arsenm: Also run in a hip test, or some case where the addrspacecast is needed?
		switch (d) {
		case 0: *out = __builtin_amdgcn_workgroup_size_x(); break;
		case 1: *out = __builtin_amdgcn_workgroup_size_y(); break;
		case 2: *out = __builtin_amdgcn_workgroup_size_z(); break;
		default: *out = 0;
		}
		}

// CHECK-LABEL: @test_fmed3_f32		// CHECK-LABEL: @test_fmed3_f32
// CHECK: call float @llvm.amdgcn.fmed3.f32(		// CHECK: call float @llvm.amdgcn.fmed3.f32(
void test_fmed3_f32(global float* out, float a, float b, float c)		void test_fmed3_f32(global float* out, float a, float b, float c)
{		{
*out = __builtin_amdgcn_fmed3f(a, b, c);		*out = __builtin_amdgcn_fmed3f(a, b, c);
}		}

// CHECK-LABEL: @test_s_getpc		// CHECK-LABEL: @test_s_getpc
▲ Show 20 Lines • Show All 155 Lines • ▼ Show 20 Lines

// CHECK-LABEL: test_mqsad_u32_u8(		// CHECK-LABEL: test_mqsad_u32_u8(
// CHECK: call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src0, i32 %src1, <4 x i32> %src2)		// CHECK: call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src0, i32 %src1, <4 x i32> %src2)
kernel void test_mqsad_u32_u8(global uint4* out, ulong src0, uint src1, uint4 src2) {		kernel void test_mqsad_u32_u8(global uint4* out, ulong src0, uint src1, uint4 src2) {
*out = __builtin_amdgcn_mqsad_u32_u8(src0, src1, src2);		*out = __builtin_amdgcn_mqsad_u32_u8(src0, src1, src2);
}		}

// CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024}		// CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024}
		// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025}
// CHECK-DAG: attributes #[[$NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly }		// CHECK-DAG: attributes #[[$NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly }
// CHECK-DAG: attributes #[[$READ_EXEC_ATTRS]] = { convergent }		// CHECK-DAG: attributes #[[$READ_EXEC_ATTRS]] = { convergent }
// CHECK-DAG: ![[$EXEC]] = !{!"exec"}		// CHECK-DAG: ![[$EXEC]] = !{!"exec"}
// CHECK-DAG: ![[$EXEC_LO]] = !{!"exec_lo"}		// CHECK-DAG: ![[$EXEC_LO]] = !{!"exec_lo"}
// CHECK-DAG: ![[$EXEC_HI]] = !{!"exec_hi"}		// CHECK-DAG: ![[$EXEC_HI]] = !{!"exec_hi"}