This is an archive of the discontinued LLVM Phabricator instance.

What's the point of this? The other case was only needed because there was no other way to attach the range metadata. The invariant load here is redundant, since with AMDGPU AA the load from the constant address space will be treated as invariant anyway.

Consistency really. It seemed strange to have a builtin for reading the workgroup size and not one for the grid size.

There's probably a range limit that can be set on this one too, I'm just not sure what it is. Happy to leave the invariant annotation for a pass to insert if preferred.

I think there is value in this change. It may help simplify device library and hide some compiler details.

Harbormaster completed remote builds in B76603: Diff 301052. (Oct 27 2020, 12:01 PM)

LGTM. Thanks.

This revision is now accepted and ready to land. (Oct 28 2020, 6:52 PM)

This revision was landed with ongoing or failed builds. (Oct 29 2020, 9:25 AM)

Closed by commit rGdee7704829bd: [AMDGPU] Add __builtin_amdgcn_grid_size (authored by JonChesterfield). · Explain Why

This revision was automatically updated to reflect the committed changes.

JonChesterfield added a commit: rGdee7704829bd: [AMDGPU] Add __builtin_amdgcn_grid_size.

Revision Contents

Path

Size

clang/

include/

clang/

Basic/

BuiltinsAMDGPU.def

4 lines

lib/

CodeGen/

CGBuiltin.cpp

24 lines

test/

CodeGenOpenCL/

builtins-amdgcn.cl

18 lines

openmp/

libomptarget/

deviceRTLs/

amdgcn/

src/

target_impl.hip

10 lines

Diff 301653

clang/include/clang/Basic/BuiltinsAMDGPU.def

	Show All 31 Lines
	BUILTIN(__builtin_amdgcn_workitem_id_x, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workitem_id_x, "Ui", "nc")
	BUILTIN(__builtin_amdgcn_workitem_id_y, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workitem_id_y, "Ui", "nc")
	BUILTIN(__builtin_amdgcn_workitem_id_z, "Ui", "nc")			BUILTIN(__builtin_amdgcn_workitem_id_z, "Ui", "nc")

	BUILTIN(__builtin_amdgcn_workgroup_size_x, "Us", "nc")			BUILTIN(__builtin_amdgcn_workgroup_size_x, "Us", "nc")
	BUILTIN(__builtin_amdgcn_workgroup_size_y, "Us", "nc")			BUILTIN(__builtin_amdgcn_workgroup_size_y, "Us", "nc")
	BUILTIN(__builtin_amdgcn_workgroup_size_z, "Us", "nc")			BUILTIN(__builtin_amdgcn_workgroup_size_z, "Us", "nc")

				BUILTIN(__builtin_amdgcn_grid_size_x, "Ui", "nc")
				BUILTIN(__builtin_amdgcn_grid_size_y, "Ui", "nc")
				BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc")

	BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")			BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc")
	BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")			BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc")

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// Instruction builtins.			// Instruction builtins.
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	BUILTIN(__builtin_amdgcn_s_getreg, "UiIi", "n")			BUILTIN(__builtin_amdgcn_s_getreg, "UiIi", "n")
	BUILTIN(__builtin_amdgcn_s_setreg, "vIiUi", "n")			BUILTIN(__builtin_amdgcn_s_setreg, "vIiUi", "n")
	▲ Show 20 Lines • Show All 209 Lines • Show Last 20 Lines

clang/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 14,744 Lines • ▼ Show 20 Lines	Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
llvm::MDBuilder MDHelper(CGF.getLLVMContext());		llvm::MDBuilder MDHelper(CGF.getLLVMContext());
llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),		llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));		APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
LD->setMetadata(llvm::LLVMContext::MD_range, RNode);		LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
LD->setMetadata(llvm::LLVMContext::MD_invariant_load,		LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
llvm::MDNode::get(CGF.getLLVMContext(), None));		llvm::MDNode::get(CGF.getLLVMContext(), None));
return LD;		return LD;
}		}

		// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
		Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
		const unsigned XOffset = 12;
		auto *DP = EmitAMDGPUDispatchPtr(CGF);
		// Indexing the HSA kernel_dispatch_packet struct.
		auto Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
		auto *GEP = CGF.Builder.CreateGEP(DP, Offset);
		auto *DstTy =
		CGF.Int32Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());
		auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy);
		auto *LD = CGF.Builder.CreateLoad(Address(Cast, CharUnits::fromQuantity(4)));
		LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
		llvm::MDNode::get(CGF.getLLVMContext(), None));
		return LD;
		}
} // namespace		} // namespace

// For processing memory ordering and memory scope arguments of various		// For processing memory ordering and memory scope arguments of various
// amdgcn builtins.		// amdgcn builtins.
// \p Order takes a C++11 compatible memory-ordering specifier and converts		// \p Order takes a C++11 compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using atomic C ABI, and writes		// it into LLVM's memory ordering specifier using atomic C ABI, and writes
// to \p AO. \p Scope takes a const char * and converts it into AMDGCN		// to \p AO. \p Scope takes a const char * and converts it into AMDGCN
// specific SyncScopeID and writes it to \p SSID.		// specific SyncScopeID and writes it to \p SSID.
▲ Show 20 Lines • Show All 244 Lines • ▼ Show 20 Lines	Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
// amdgcn workgroup size		// amdgcn workgroup size
case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:		case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
return EmitAMDGPUWorkGroupSize(*this, 0);		return EmitAMDGPUWorkGroupSize(*this, 0);
case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:		case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
return EmitAMDGPUWorkGroupSize(*this, 1);		return EmitAMDGPUWorkGroupSize(*this, 1);
case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:		case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
return EmitAMDGPUWorkGroupSize(*this, 2);		return EmitAMDGPUWorkGroupSize(*this, 2);

		// amdgcn grid size
		case AMDGPU::BI__builtin_amdgcn_grid_size_x:
		return EmitAMDGPUGridSize(*this, 0);
		case AMDGPU::BI__builtin_amdgcn_grid_size_y:
		return EmitAMDGPUGridSize(*this, 1);
		case AMDGPU::BI__builtin_amdgcn_grid_size_z:
		return EmitAMDGPUGridSize(*this, 2);

// r600 intrinsics		// r600 intrinsics
case AMDGPU::BI__builtin_r600_recipsqrt_ieee:		case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:		case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);		return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
case AMDGPU::BI__builtin_r600_read_tidig_x:		case AMDGPU::BI__builtin_r600_read_tidig_x:
return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);		return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
case AMDGPU::BI__builtin_r600_read_tidig_y:		case AMDGPU::BI__builtin_r600_read_tidig_y:
return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);		return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
▲ Show 20 Lines • Show All 2,057 Lines • Show Last 20 Lines

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Show First 20 Lines • Show All 553 Lines • ▼ Show 20 Lines	void test_get_workgroup_size(int d, global int *out)
switch (d) {		switch (d) {
case 0: *out = __builtin_amdgcn_workgroup_size_x() + 1; break;		case 0: *out = __builtin_amdgcn_workgroup_size_x() + 1; break;
case 1: *out = __builtin_amdgcn_workgroup_size_y(); break;		case 1: *out = __builtin_amdgcn_workgroup_size_y(); break;
case 2: *out = __builtin_amdgcn_workgroup_size_z(); break;		case 2: *out = __builtin_amdgcn_workgroup_size_z(); break;
default: *out = 0;		default: *out = 0;
}		}
}		}

		// CHECK-LABEL: @test_get_grid_size(
		// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
		// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 12
		// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
		// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 16
		// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
		// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 20
		// CHECK: load i32, i32 addrspace(4)* %{{.*}}, align 4, !invariant.load
		void test_get_grid_size(int d, global int *out)
		{
		switch (d) {
		case 0: *out = __builtin_amdgcn_grid_size_x(); break;
		case 1: *out = __builtin_amdgcn_grid_size_y(); break;
		case 2: *out = __builtin_amdgcn_grid_size_z(); break;
		default: *out = 0;
		}
		}

// CHECK-LABEL: @test_fmed3_f32		// CHECK-LABEL: @test_fmed3_f32
// CHECK: call float @llvm.amdgcn.fmed3.f32(		// CHECK: call float @llvm.amdgcn.fmed3.f32(
void test_fmed3_f32(global float* out, float a, float b, float c)		void test_fmed3_f32(global float* out, float a, float b, float c)
{		{
*out = __builtin_amdgcn_fmed3f(a, b, c);		*out = __builtin_amdgcn_fmed3f(a, b, c);
}		}

// CHECK-LABEL: @test_s_getpc		// CHECK-LABEL: @test_s_getpc
▲ Show 20 Lines • Show All 175 Lines • Show Last 20 Lines

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Show First 20 Lines • Show All 113 Lines • ▼ Show 20 Lines	if ((load & 0x0000ffffu) == (num_waves - 1)) {
load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);		load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
} while ((load & 0xffff0000u) == generation);		} while ((load & 0xffff0000u) == generation);
}		}
}		}
__atomic_thread_fence(__ATOMIC_RELEASE);		__atomic_thread_fence(__ATOMIC_RELEASE);
}		}

namespace {		namespace {
DEVICE uint32_t grid_size_x() {
size_t grid_size_x_offset = 96; // In bits, from AQL kernel dispatch format
return *(uint32_t *)((char *)__builtin_amdgcn_dispatch_ptr() +
grid_size_x_offset / 8);
}

DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {		DEVICE uint32_t get_grid_dim(uint32_t n, uint16_t d) {
uint32_t q = n / d;		uint32_t q = n / d;
return q + (n > q * d);		return q + (n > q * d);
}		}
DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,		DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
uint16_t group_size) {		uint16_t group_size) {
uint32_t r = grid_size - group_id * group_size;		uint32_t r = grid_size - group_id * group_size;
return (r < group_size) ? r : group_size;		return (r < group_size) ? r : group_size;
}		}
} // namespace		} // namespace

DEVICE int GetNumberOfBlocksInKernel() {		DEVICE int GetNumberOfBlocksInKernel() {
return get_grid_dim(grid_size_x(), __builtin_amdgcn_workgroup_size_x());		return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());
}		}

DEVICE int GetNumberOfThreadsInBlock() {		DEVICE int GetNumberOfThreadsInBlock() {
return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), grid_size_x(),		return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),
__builtin_amdgcn_workgroup_size_x());		__builtin_amdgcn_workgroup_size_x());
}		}

DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }		DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
DEVICE unsigned GetLaneId() {		DEVICE unsigned GetLaneId() {
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));		return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
}		}

// Stub implementations		// Stub implementations
DEVICE void *__kmpc_impl_malloc(size_t ) { return nullptr; }		DEVICE void *__kmpc_impl_malloc(size_t ) { return nullptr; }
DEVICE void __kmpc_impl_free(void *) {}		DEVICE void __kmpc_impl_free(void *) {}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add __builtin_amdgcn_grid_size (Closed, Public)

Details

Diff Detail