diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -196,6 +196,19 @@
 TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "hhhh", "nc", "gfx9-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "t", "gfx90a-insts") // FP atomics declared on global (addrspace 1) pointers; all require gfx90a-insts.
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1d", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmax_f64, "dd*1d", "t", "gfx90a-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f64, "dd*0d", "t", "gfx90a-insts") // f64 atomics declared on flat/generic (addrspace 0) pointers.
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmin_f64, "dd*0d", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmax_f64, "dd*0d", "t", "gfx90a-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "t", "gfx90a-insts") // Atomic add declared on local (addrspace 3) pointers.
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts") // The f32 form only requires gfx8-insts.
+
 //===----------------------------------------------------------------------===//
 // Deep learning builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16201,6 +16201,74 @@
       Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
     return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+    // Pick the overloaded intrinsic and its first overload type. ArgTy stays
+    // double unless a case below overrides it. The inner switch repeats every
+    // label of this case, so IID is always assigned before it is read.
+    Intrinsic::ID IID;
+    llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
+      IID = Intrinsic::amdgcn_global_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+      ArgTy = llvm::FixedVectorType::get(
+          llvm::Type::getHalfTy(getLLVMContext()), 2);
+      IID = Intrinsic::amdgcn_global_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fmin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fmax;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fmin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fmax;
+      break;
+    }
+    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
+    // The overload list is {ArgTy, pointer type, value type}; the pointer and
+    // value types are taken from the arguments exactly as they were emitted.
+    llvm::Function *F =
+        CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
+    return Builder.CreateCall(F, {Addr, Val});
+  }
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: {
+    // Both widths lower to the same overloaded intrinsic and differ only in
+    // the overload type, so select that type directly instead of leaving it
+    // and the intrinsic ID uninitialized ahead of an inner switch.
+    llvm::Type *ArgTy =
+        BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32
+            ? llvm::Type::getFloatTy(getLLVMContext())
+            : llvm::Type::getDoubleTy(getLLVMContext());
+    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
+    // The builtin exposes only (pointer, value); the intrinsic's remaining
+    // i32, i32 and i1 operands are always passed as zero.
+    llvm::Constant *ZeroI32 = Builder.getInt32(0);
+    llvm::Constant *ZeroI1 = Builder.getFalse();
+    llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_fadd, {ArgTy});
+    return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
+  }
   case AMDGPU::BI__builtin_amdgcn_read_exec: {
     CallInst *CI = cast<CallInst>(
       EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec"));
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl
@@ -0,0 +1,14 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 \
+// RUN:   -S -emit-llvm -o - %s | FileCheck -check-prefix=CHECK %s
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX1030 %s
+
+// CHECK-LABEL: test_ds_addf_local
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %{{.*}}, float %{{.*}},
+// GFX1030-LABEL: test_ds_addf_local$local
+// GFX1030: ds_add_rtn_f32
+void test_ds_addf_local(__local float *addr, float x){
+  float *rtn;
+  *rtn = __builtin_amdgcn_ds_atomic_fadd_f32(addr, x);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -0,0 +1,116 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_add$local:
+// GFX90A: global_atomic_add_f64
+void test_global_add(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_add_half2
+// GFX90A: global_atomic_pk_add_f16 v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc
+void test_global_add_half2(__global half2 *addr, half2 x) {
+  half2 *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_global_min
+// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_global_min$local
+// GFX90A: global_atomic_min_f64
+void test_global_global_min(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_max$local
+// GFX90A: global_atomic_max_f64
+void test_global_max(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_add_local$local
+// GFX90A: ds_add_rtn_f64
+void test_flat_add_local(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_add
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_add$local
+// GFX90A: global_atomic_add_f64
+void test_flat_global_add(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_min_constant$local
+// GFX90A: flat_atomic_min_f64
+void test_flat_min_constant(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_min
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_min$local
+// GFX90A: global_atomic_min_f64
+void test_flat_global_min(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_max_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_max_constant$local
+// GFX90A: flat_atomic_max_f64
+void test_flat_max_constant(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_max
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_max$local
+// GFX90A: global_atomic_max_f64
+void test_flat_global_max(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_ds_add_local
+// CHECK: call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %{{.*}}, double %{{.*}},
+// GFX90A-LABEL: test_ds_add_local$local
+// GFX90A: ds_add_rtn_f64
+void test_ds_add_local(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_ds_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_ds_addf_local
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %{{.*}}, float %{{.*}},
+// GFX90A-LABEL: test_ds_addf_local$local
+// GFX90A: ds_add_rtn_f32
+void test_ds_addf_local(__local float *addr, float x){
+  float *rtn;
+  *rtn = __builtin_amdgcn_ds_atomic_fadd_f32(addr, x);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx908.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN:   -verify -S -o - %s
+
+void test_global_add(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_f64' needs target feature gfx90a-insts}}
+}