diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -33,6 +33,10 @@ BUILTIN(__builtin_amdgcn_workitem_id_y, "Ui", "nc") BUILTIN(__builtin_amdgcn_workitem_id_z, "Ui", "nc") +BUILTIN(__builtin_amdgcn_workgroup_size_x, "Ui", "nc") +BUILTIN(__builtin_amdgcn_workgroup_size_y, "Ui", "nc") +BUILTIN(__builtin_amdgcn_workgroup_size_z, "Ui", "nc") + BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -212,6 +212,8 @@ unsigned ARMCDECoprocMask : 8; + unsigned MaxOpenCLWorkGroupSize; + // TargetInfo Constructor. Default initializes all fields. TargetInfo(const llvm::Triple &T); @@ -663,6 +665,8 @@ /// types for the given target. unsigned getSimdDefaultAlign() const { return SimdDefaultAlign; } + unsigned getMaxOpenCLWorkGroupSize() const { return MaxOpenCLWorkGroupSize; } + /// Return the alignment (in bits) of the thrown exception object. This is /// only meaningful for targets that allocate C++ exceptions in a system /// runtime, such as those using the Itanium C++ ABI. diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -133,6 +133,8 @@ // Default to an unknown platform name. PlatformName = "unknown"; PlatformMinVersion = VersionTuple(); + + MaxOpenCLWorkGroupSize = 1024; } // Out of line virtual dtor for TargetInfo. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13407,6 +13407,48 @@ } } +namespace { +// If \p E is not null pointer, insert address space cast to match return +// type of \p E if necessary. +Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF, + const CallExpr *E = nullptr) { + auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr); + auto *Call = CGF.Builder.CreateCall(F); + Call->addAttribute( + AttributeList::ReturnIndex, + Attribute::getWithDereferenceableBytes(Call->getContext(), 64)); + Call->addAttribute(AttributeList::ReturnIndex, + Attribute::getWithAlignment(Call->getContext(), Align(4))); + if (!E) + return Call; + QualType BuiltinRetType = E->getType(); + auto *RetTy = cast(CGF.ConvertType(BuiltinRetType)); + if (RetTy == Call->getType()) + return Call; + return CGF.Builder.CreateAddrSpaceCast(Call, RetTy); +} + +// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively. +Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) { + const unsigned XOffset = 4; + auto *DP = EmitAMDGPUDispatchPtr(CGF); + // Indexing the HSA kernel_dispatch_packet struct. + auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 2); + auto *GEP = CGF.Builder.CreateGEP(DP, Offset); + auto *DstTy = + CGF.Int16Ty->getPointerTo(GEP->getType()->getPointerAddressSpace()); + auto *Cast = CGF.Builder.CreateBitCast(GEP, DstTy); + auto *LD = CGF.Builder.CreateLoad(Address(Cast, CharUnits::fromQuantity(2))); + llvm::MDBuilder MDHelper(CGF.getLLVMContext()); + llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1), + APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1)); + LD->setMetadata(llvm::LLVMContext::MD_range, RNode); + LD->setMetadata(llvm::LLVMContext::MD_invariant_load, + llvm::MDNode::get(CGF.getLLVMContext(), None)); + return LD; +} +} // namespace + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { switch (BuiltinID) { @@ -13489,21 +13531,8 @@ case AMDGPU::BI__builtin_amdgcn_cosf: case AMDGPU::BI__builtin_amdgcn_cosh: return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos); - case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: { - auto *F = CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr); - auto *Call = Builder.CreateCall(F); - Call->addAttribute( - AttributeList::ReturnIndex, - Attribute::getWithDereferenceableBytes(Call->getContext(), 64)); - Call->addAttribute( - AttributeList::ReturnIndex, - Attribute::getWithAlignment(Call->getContext(), Align(4))); - QualType BuiltinRetType = E->getType(); - auto *RetTy = cast(ConvertType(BuiltinRetType)); - if (RetTy == Call->getType()) - return Call; - return Builder.CreateAddrSpaceCast(Call, RetTy); - } + case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: + return EmitAMDGPUDispatchPtr(*this, E); case AMDGPU::BI__builtin_amdgcn_log_clampf: return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp); case AMDGPU::BI__builtin_amdgcn_ldexp: @@ -13599,6 +13628,14 @@ case AMDGPU::BI__builtin_amdgcn_workitem_id_z: return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024); + // amdgcn workgroup size + case AMDGPU::BI__builtin_amdgcn_workgroup_size_x: + return EmitAMDGPUWorkGroupSize(*this, 0); + case AMDGPU::BI__builtin_amdgcn_workgroup_size_y: + return EmitAMDGPUWorkGroupSize(*this, 1); + case AMDGPU::BI__builtin_amdgcn_workgroup_size_z: + return EmitAMDGPUWorkGroupSize(*this, 2); + // r600 intrinsics case AMDGPU::BI__builtin_r600_recipsqrt_ieee: case AMDGPU::BI__builtin_r600_recipsqrt_ieeef: diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck %s + +#include "Inputs/cuda.h" + +// CHECK-LABEL: test_get_workgroup_size +// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +__device__ void test_get_workgroup_size(int d, int *out) +{ + switch (d) { + case 0: *out = __builtin_amdgcn_workgroup_size_x(); break; + case 1: *out = __builtin_amdgcn_workgroup_size_y(); break; + case 2: *out = __builtin_amdgcn_workgroup_size_z(); break; + default: *out = 0; + } +} + +// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -527,6 +527,24 @@ } } +// CHECK-LABEL: @test_get_workgroup_size( +// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 4 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 6 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 8 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load +void test_get_workgroup_size(int d, global int *out) +{ + switch (d) { + case 0: *out = __builtin_amdgcn_workgroup_size_x(); break; + case 1: *out = __builtin_amdgcn_workgroup_size_y(); break; + case 2: *out = __builtin_amdgcn_workgroup_size_z(); break; + default: *out = 0; + } +} + // CHECK-LABEL: @test_fmed3_f32 // CHECK: call float @llvm.amdgcn.fmed3.f32( void test_fmed3_f32(global float* out, float a, float b, float c) @@ -698,6 +716,7 @@ } // CHECK-DAG: [[$WI_RANGE]] = !{i32 0, i32 1024} +// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} // CHECK-DAG: attributes #[[$NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly } // CHECK-DAG: attributes #[[$READ_EXEC_ATTRS]] = { convergent } // CHECK-DAG: ![[$EXEC]] = !{!"exec"}