diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -196,6 +196,20 @@
 
 TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "hhhh", "nc", "gfx9-insts")
 
+// Floating-point atomic builtins; the last field names the required feature.
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1d", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1f", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2f16, "V2hV2h*1V2h", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1d", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmax_f64, "dd*1d", "t", "gfx90a-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f64, "dd*0d", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmin_f64, "dd*0d", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmax_f64, "dd*0d", "t", "gfx90a-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts")
+
 //===----------------------------------------------------------------------===//
 // Deep learning builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16201,6 +16201,80 @@
     Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
     return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
+  // FP atomic builtins on global/flat pointers. Each one is lowered to an
+  // amdgcn intrinsic overloaded on {result type, pointer type, value type}.
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+    Intrinsic::ID IID;
+    // double is the common result type; the f32 and v2f16 cases override it.
+    llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
+      IID = Intrinsic::amdgcn_global_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
+      ArgTy = llvm::FixedVectorType::get(
+          llvm::Type::getHalfTy(getLLVMContext()), 2);
+      IID = Intrinsic::amdgcn_global_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fmin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fmax;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fmin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fmax;
+      break;
+    }
+    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
+    llvm::Function *F =
+        CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
+    return Builder.CreateCall(F, {Addr, Val});
+  }
+  // LDS fadd builtins. The intrinsic is overloaded on the value type only.
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: {
+    Intrinsic::ID IID;
+    llvm::Type *ArgTy;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
+      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
+      IID = Intrinsic::amdgcn_ds_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+      ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
+      IID = Intrinsic::amdgcn_ds_fadd;
+      break;
+    }
+    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
+    // NOTE(review): the three trailing i32/i32/i1 operands are presumably
+    // ordering, scope and isVolatile - confirm against IntrinsicsAMDGPU.td.
+    llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
+        llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
+    llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
+        llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
+    llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+    return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
+  }
   case AMDGPU::BI__builtin_amdgcn_read_exec: {
     CallInst *CI = cast<CallInst>(
       EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec"));
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx1030.cl
@@ -0,0 +1,14 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 \
+// RUN: %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 \
+// RUN: -S -o - %s | FileCheck -check-prefix=GFX1030 %s
+
+// CHECK-LABEL: test_ds_addf_local
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %{{.*}}, float %{{.*}},
+// GFX1030-LABEL: test_ds_addf_local$local
+// GFX1030: ds_add_rtn_f32
+void test_ds_addf_local(__local float *addr, float x){
+  float *rtn;
+  *rtn = __builtin_amdgcn_ds_atomic_fadd_f32(addr, x);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -0,0 +1,15 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx810 \
+// RUN: %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx810 \
+// RUN: -S -o - %s | FileCheck -check-prefix=GFX8 %s
+
+// CHECK-LABEL: test_fadd_local
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %{{.*}}, float %{{.*}}, i32 0, i32 0, i1 false)
+// GFX8-LABEL: test_fadd_local$local:
+// GFX8: ds_add_rtn_f32 v2, v0, v1
+// GFX8: s_endpgm
+kernel void test_fadd_local(__local float *ptr, float val){
+  float *res;
+  *res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -0,0 +1,116 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN: %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN: -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_add_f64$local:
+// GFX90A: global_atomic_add_f64
+void test_global_add_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_add_half2
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_add_half2
+// GFX90A: global_atomic_pk_add_f16 v2, v[0:1], v2, off glc
+void test_global_add_half2(__global half2 *addr, half2 x) {
+  half2 *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x);
+}
+
+// CHECK-LABEL: test_global_global_min_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_global_min_f64$local
+// GFX90A: global_atomic_min_f64
+void test_global_global_min_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_global_max_f64
+// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_max_f64$local
+// GFX90A: global_atomic_max_f64
+void test_global_max_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_add_local_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_add_local_f64$local
+// GFX90A: ds_add_rtn_f64
+void test_flat_add_local_f64(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_add_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_add_f64$local
+// GFX90A: global_atomic_add_f64
+void test_flat_global_add_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_min_flat_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_min_flat_f64$local
+// GFX90A: flat_atomic_min_f64
+void test_flat_min_flat_f64(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_min_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_min_f64$local
+// GFX90A: global_atomic_min_f64
+void test_flat_global_min_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_max_flat_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_max_flat_f64$local
+// GFX90A: flat_atomic_max_f64
+void test_flat_max_flat_f64(__generic double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_flat_global_max_f64
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_max_f64$local
+// GFX90A: global_atomic_max_f64
+void test_flat_global_max_f64(__global double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmax_f64(addr, x);
+}
+
+// CHECK-LABEL: test_ds_add_local_f64
+// CHECK: call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %{{.*}}, double %{{.*}},
+// GFX90A-LABEL: test_ds_add_local_f64$local
+// GFX90A: ds_add_rtn_f64
+void test_ds_add_local_f64(__local double *addr, double x){
+  double *rtn;
+  *rtn = __builtin_amdgcn_ds_atomic_fadd_f64(addr, x);
+}
+
+// CHECK-LABEL: test_ds_addf_local_f32
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %{{.*}}, float %{{.*}},
+// GFX90A-LABEL: test_ds_addf_local_f32$local
+// GFX90A: ds_add_rtn_f32
+void test_ds_addf_local_f32(__local float *addr, float x){
+  float *rtn;
+  *rtn = __builtin_amdgcn_ds_atomic_fadd_f32(addr, x);
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx7.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx7.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-unsupported-gfx7.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx700 \
+// RUN: %s -verify -S -o -
+
+kernel void test_fadd_local(__local float *ptr, float val){
+  float *res;
+  *res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_f32' needs target feature gfx8-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/unsupported-fadd2f16-gfx908.cl b/clang/test/CodeGenOpenCL/unsupported-fadd2f16-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/unsupported-fadd2f16-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN: -verify -S -o - %s
+typedef half __attribute__((ext_vector_type(2))) half2;
+void test_global_add_2f16(__global half2 *addr, half2 x) {
+  half2 *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2f16' needs target feature gfx90a-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/unsupported-fadd32-gfx908.cl b/clang/test/CodeGenOpenCL/unsupported-fadd32-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/unsupported-fadd32-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN: -verify -S -o - %s
+
+void test_global_add_f32(__global float *addr, float x) {
+  float *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_f32' needs target feature gfx90a-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/unsupported-fadd64-flat-gfx908.cl b/clang/test/CodeGenOpenCL/unsupported-fadd64-flat-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/unsupported-fadd64-flat-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN: -verify -S -o - %s
+
+void test_flat_add_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fadd_f64(addr, x); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_f64' needs target feature gfx90a-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/unsupported-fadd64-gfx908.cl b/clang/test/CodeGenOpenCL/unsupported-fadd64-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/unsupported-fadd64-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN: -verify -S -o - %s
+
+void test_global_add_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fadd_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_f64' needs target feature gfx90a-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/unsupported-fmax64-flat-gfx908.cl b/clang/test/CodeGenOpenCL/unsupported-fmax64-flat-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/unsupported-fmax64-flat-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN: -verify -S -o - %s
+
+void test_flat_max_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmax_f64(addr, x); // expected-error{{'__builtin_amdgcn_flat_atomic_fmax_f64' needs target feature gfx90a-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/unsupported-fmax64-gfx908.cl b/clang/test/CodeGenOpenCL/unsupported-fmax64-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/unsupported-fmax64-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN: -verify -S -o - %s
+
+void test_global_max_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmax_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fmax_f64' needs target feature gfx90a-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/unsupported-fmin64-flat-gfx908.cl b/clang/test/CodeGenOpenCL/unsupported-fmin64-flat-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/unsupported-fmin64-flat-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN: -verify -S -o - %s
+
+void test_flat_min_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_flat_atomic_fmin_f64(addr, x); // expected-error{{'__builtin_amdgcn_flat_atomic_fmin_f64' needs target feature gfx90a-insts}}
+}
diff --git a/clang/test/CodeGenOpenCL/unsupported-fmin64-gfx908.cl b/clang/test/CodeGenOpenCL/unsupported-fmin64-gfx908.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/unsupported-fmin64-gfx908.cl
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx908 \
+// RUN: -verify -S -o - %s
+
+void test_global_min_f64(__global double *addr, double x) {
+  double *rtn;
+  *rtn = __builtin_amdgcn_global_atomic_fmin_f64(addr, x); // expected-error{{'__builtin_amdgcn_global_atomic_fmin_f64' needs target feature gfx90a-insts}}
+}