diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -196,6 +196,19 @@
 TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "hhhh", "nc", "gfx9-insts")
 
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f64, "dd*1dicC*", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_f32, "ff*1ficC*", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_2f16, "V2hV2h*1V2hicC*", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmin_f64, "dd*1dicC*", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fmax_f64, "dd*1dicC*", "t", "gfx90a-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f64, "dd*0dicC*", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmin_f64, "dd*0dicC*", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fmax_f64, "dd*0dicC*", "t", "gfx90a-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3dicC*", "t", "gfx90a-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3ficC*", "t", "gfx90a-insts")
 
 //===----------------------------------------------------------------------===//
 // Deep learning builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -15826,6 +15826,71 @@
     Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
     return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
   }
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_2f16:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: {
+    Intrinsic::ID IID;
+    llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
+      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
+      IID = Intrinsic::amdgcn_global_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_2f16:
+      ArgTy = llvm::FixedVectorType::get(
+          llvm::Type::getHalfTy(getLLVMContext()), 2);
+      LLVM_FALLTHROUGH;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fmin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
+      IID = Intrinsic::amdgcn_global_atomic_fmax;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fmin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
+      IID = Intrinsic::amdgcn_flat_atomic_fmax;
+      break;
+    }
+    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
+    llvm::Function *F =
+        CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
+    return Builder.CreateCall(F, {Addr, Val});
+  }
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: {
+    Intrinsic::ID IID;
+    llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
+      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
+      LLVM_FALLTHROUGH;
+    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
+      IID = Intrinsic::amdgcn_ds_fadd;
+      break;
+    }
+    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
+    llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
+        llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
+    llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
+        llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
+    llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+    return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
+  }
   case AMDGPU::BI__builtin_amdgcn_read_exec: {
     CallInst *CI = cast<CallInst>(
       EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec"));
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics.cl
@@ -0,0 +1,134 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   %s -S -emit-llvm -o - | FileCheck %s -check-prefix=CHECK
+
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -S -o - %s | FileCheck -check-prefix=GFX90A %s
+
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef half __attribute__((ext_vector_type(2))) half2;
+
+// CHECK-LABEL: test_global_add
+// CHECK: call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_add
+// GFX90A: global_atomic_add_f64 v2, v[0:1], s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_add(__global double *addr, double x) {
+  __builtin_amdgcn_global_atomic_fadd_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_global_addf
+// CHECK: call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %{{.*}}, float %{{.*}})
+// GFX90A-LABEL: test_global_addf
+// GFX90A: global_atomic_add_f32 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_addf(__global float *addr, float x) {
+  __builtin_amdgcn_global_atomic_fadd_f32(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_global_add2h
+// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %{{.*}}, <2 x half> %{{.*}})
+// GFX90A-LABEL: test_global_add2h
+// GFX90A: global_atomic_pk_add_f16 v0, v1, s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_add2h(__global half2 *addr, half2 x){
+  __builtin_amdgcn_global_atomic_fadd_2f16(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_global_min
+// CHECK: call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_min
+// GFX90A: global_atomic_min_f64 v2, v[0:1], s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_min(__global double *addr, double x){
+  __builtin_amdgcn_global_atomic_fmin_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_global_max
+// CHECK: call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_global_max
+// GFX90A: global_atomic_max_f64 v2, v[0:1], s[0:1]
+// GFX90A: s_endpgm
+kernel void test_global_max(__global double *addr, double x){
+  __builtin_amdgcn_global_atomic_fmax_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_add_local
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p3f64.f64(double addrspace(3)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_add_local
+// GFX90A: ds_add_f64 v2, v[0:1]
+// GFX90A: s_endpgm
+kernel void test_flat_add_local(__local double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_global_add
+// CHECK: call double @llvm.amdgcn.flat.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_add
+// GFX90A: global_atomic_add_f64
+// GFX90A: s_endpgm
+kernel void test_flat_global_add(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fadd_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_min_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p4f64.f64(double addrspace(4)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_min_constant
+// GFX90A: global_atomic_min_f64
+// GFX90A: s_endpgm
+kernel void test_flat_min_constant(__constant double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_global_min
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_min
+// GFX90A: global_atomic_min_f64
+// GFX90A: s_endpgm
+kernel void test_flat_global_min(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmin_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_max_constant
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p4f64.f64(double addrspace(4)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_max_constant
+// GFX90A: global_atomic_max_f64 v2, v[0:1], s[0:1]
+// GFX90A: s_endpgm
+kernel void test_flat_max_constant(__constant double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmax_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_flat_global_max
+// CHECK: call double @llvm.amdgcn.flat.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %{{.*}}, double %{{.*}})
+// GFX90A-LABEL: test_flat_global_max
+// GFX90A: global_atomic_max_f64 v2, v[0:1], s[0:1]
+// GFX90A: s_endpgm
+kernel void test_flat_global_max(__global double *addr, double x){
+  __builtin_amdgcn_flat_atomic_fmax_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_ds_add_local
+// CHECK: call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %{{.*}}, double %{{.*}},
+// GFX90A-LABEL: test_ds_add_local
+// GFX90A: ds_add_f64 v2, v[0:1]
+// GFX90A: s_endpgm
+kernel void test_ds_add_local(__local double *addr, double x){
+  __builtin_amdgcn_ds_atomic_fadd_f64(addr, x, memory_order_relaxed, "workgroup");
+}
+
+// CHECK-LABEL: test_ds_addf_local
+// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %{{.*}}, float %{{.*}},
+// GFX90A-LABEL: test_ds_addf_local
+// GFX90A: ds_add_f32 v0, v1
+// GFX90A: s_endpgm
+kernel void test_ds_addf_local(__local float *addr, float x){
+  __builtin_amdgcn_ds_atomic_fadd_f32(addr, x, memory_order_relaxed, "workgroup");
+}