diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -8529,6 +8529,9 @@ def err_atomic_op_needs_atomic_int_or_ptr : Error< "address argument to atomic operation must be a pointer to %select{|atomic }0" "integer or pointer (%1 invalid)">; +def err_atomic_op_needs_atomic_int_or_fp : Error< + "address argument to atomic operation must be a pointer to %select{|atomic }0" + "integer or supported floating point type (%1 invalid)">; def err_atomic_op_needs_atomic_int : Error< "address argument to atomic operation must be a pointer to " "%select{|atomic }0integer (%1 invalid)">; diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -637,8 +637,11 @@ case AtomicExpr::AO__hip_atomic_fetch_min: case AtomicExpr::AO__opencl_atomic_fetch_min: case AtomicExpr::AO__atomic_fetch_min: - Op = E->getValueType()->isSignedIntegerType() ? llvm::AtomicRMWInst::Min - : llvm::AtomicRMWInst::UMin; + Op = E->getValueType()->isFloatingType() + ? llvm::AtomicRMWInst::FMin + : (E->getValueType()->isSignedIntegerType() + ? llvm::AtomicRMWInst::Min + : llvm::AtomicRMWInst::UMin); break; case AtomicExpr::AO__atomic_max_fetch: @@ -648,8 +651,11 @@ case AtomicExpr::AO__hip_atomic_fetch_max: case AtomicExpr::AO__opencl_atomic_fetch_max: case AtomicExpr::AO__atomic_fetch_max: - Op = E->getValueType()->isSignedIntegerType() ? llvm::AtomicRMWInst::Max - : llvm::AtomicRMWInst::UMax; + Op = E->getValueType()->isFloatingType() + ? llvm::AtomicRMWInst::FMax + : (E->getValueType()->isSignedIntegerType() + ? llvm::AtomicRMWInst::Max + : llvm::AtomicRMWInst::UMax); break; case AtomicExpr::AO__atomic_and_fetch: @@ -918,9 +924,19 @@ } [[fallthrough]]; case AtomicExpr::AO__atomic_fetch_add: + case AtomicExpr::AO__atomic_fetch_max: + case AtomicExpr::AO__atomic_fetch_min: case AtomicExpr::AO__atomic_fetch_sub: case AtomicExpr::AO__atomic_add_fetch: + case AtomicExpr::AO__atomic_max_fetch: + case AtomicExpr::AO__atomic_min_fetch: case AtomicExpr::AO__atomic_sub_fetch: + case AtomicExpr::AO__c11_atomic_fetch_max: + case AtomicExpr::AO__c11_atomic_fetch_min: + case AtomicExpr::AO__opencl_atomic_fetch_max: + case AtomicExpr::AO__opencl_atomic_fetch_min: + case AtomicExpr::AO__hip_atomic_fetch_max: + case AtomicExpr::AO__hip_atomic_fetch_min: ShouldCastToIntPtrTy = !MemTy->isFloatingType(); [[fallthrough]]; @@ -936,13 +952,9 @@ case AtomicExpr::AO__c11_atomic_fetch_or: case AtomicExpr::AO__c11_atomic_fetch_xor: case AtomicExpr::AO__c11_atomic_fetch_nand: - case AtomicExpr::AO__c11_atomic_fetch_max: - case AtomicExpr::AO__c11_atomic_fetch_min: case AtomicExpr::AO__opencl_atomic_fetch_and: case AtomicExpr::AO__opencl_atomic_fetch_or: case AtomicExpr::AO__opencl_atomic_fetch_xor: - case AtomicExpr::AO__opencl_atomic_fetch_min: - case AtomicExpr::AO__opencl_atomic_fetch_max: case AtomicExpr::AO__atomic_fetch_and: case AtomicExpr::AO__hip_atomic_fetch_and: case AtomicExpr::AO__atomic_fetch_or: @@ -954,12 +966,6 @@ case AtomicExpr::AO__atomic_or_fetch: case AtomicExpr::AO__atomic_xor_fetch: case AtomicExpr::AO__atomic_nand_fetch: - case AtomicExpr::AO__atomic_max_fetch: - case AtomicExpr::AO__atomic_min_fetch: - case AtomicExpr::AO__atomic_fetch_max: - case AtomicExpr::AO__hip_atomic_fetch_max: - case AtomicExpr::AO__atomic_fetch_min: - case AtomicExpr::AO__hip_atomic_fetch_min: Val1 = EmitValToTemp(*this, E->getVal1()); break; } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -6411,7 +6411,15 @@ Op == AtomicExpr::AO__atomic_store_n || Op == AtomicExpr::AO__atomic_exchange_n || Op == AtomicExpr::AO__atomic_compare_exchange_n; - bool IsAddSub = false; + // Bit mask for extra allowed value types other than integers for atomic + // arithmetic operations. Add/sub allow pointer and floating point. Min/max + // allow floating point. + enum ArithOpExtraValueType { + AOEVT_None = 0, + AOEVT_Pointer = 1, + AOEVT_FP = 2, + }; + unsigned ArithAllows = AOEVT_None; switch (Op) { case AtomicExpr::AO__c11_atomic_init: @@ -6437,19 +6445,30 @@ case AtomicExpr::AO__atomic_store_n: Form = Copy; break; - case AtomicExpr::AO__hip_atomic_fetch_add: - case AtomicExpr::AO__hip_atomic_fetch_sub: - case AtomicExpr::AO__hip_atomic_fetch_min: - case AtomicExpr::AO__hip_atomic_fetch_max: - case AtomicExpr::AO__c11_atomic_fetch_add: - case AtomicExpr::AO__c11_atomic_fetch_sub: - case AtomicExpr::AO__opencl_atomic_fetch_add: - case AtomicExpr::AO__opencl_atomic_fetch_sub: case AtomicExpr::AO__atomic_fetch_add: case AtomicExpr::AO__atomic_fetch_sub: case AtomicExpr::AO__atomic_add_fetch: case AtomicExpr::AO__atomic_sub_fetch: - IsAddSub = true; + case AtomicExpr::AO__c11_atomic_fetch_add: + case AtomicExpr::AO__c11_atomic_fetch_sub: + case AtomicExpr::AO__opencl_atomic_fetch_add: + case AtomicExpr::AO__opencl_atomic_fetch_sub: + case AtomicExpr::AO__hip_atomic_fetch_add: + case AtomicExpr::AO__hip_atomic_fetch_sub: + ArithAllows = AOEVT_Pointer | AOEVT_FP; + Form = Arithmetic; + break; + case AtomicExpr::AO__atomic_fetch_max: + case AtomicExpr::AO__atomic_fetch_min: + case AtomicExpr::AO__atomic_max_fetch: + case AtomicExpr::AO__atomic_min_fetch: + case AtomicExpr::AO__c11_atomic_fetch_max: + case AtomicExpr::AO__c11_atomic_fetch_min: + case AtomicExpr::AO__opencl_atomic_fetch_max: + case AtomicExpr::AO__opencl_atomic_fetch_min: + case AtomicExpr::AO__hip_atomic_fetch_max: + case AtomicExpr::AO__hip_atomic_fetch_min: + ArithAllows = AOEVT_FP; Form = Arithmetic; break; case AtomicExpr::AO__c11_atomic_fetch_and: @@ -6472,16 +6491,6 @@ case AtomicExpr::AO__atomic_nand_fetch: Form = Arithmetic; break; - case AtomicExpr::AO__c11_atomic_fetch_min: - case AtomicExpr::AO__c11_atomic_fetch_max: - case AtomicExpr::AO__opencl_atomic_fetch_min: - case AtomicExpr::AO__opencl_atomic_fetch_max: - case AtomicExpr::AO__atomic_min_fetch: - case AtomicExpr::AO__atomic_max_fetch: - case AtomicExpr::AO__atomic_fetch_min: - case AtomicExpr::AO__atomic_fetch_max: - Form = Arithmetic; - break; case AtomicExpr::AO__c11_atomic_exchange: case AtomicExpr::AO__hip_atomic_exchange: @@ -6569,12 +6578,13 @@ if (Form == Arithmetic) { // GCC does not enforce these rules for GNU atomics, but we do to help catch // trivial type errors. - auto IsAllowedValueType = [&](QualType ValType) { + auto IsAllowedValueType = [&](QualType ValType, + unsigned AllowedType) -> bool { if (ValType->isIntegerType()) return true; if (ValType->isPointerType()) - return true; - if (!ValType->isFloatingType()) + return AllowedType & AOEVT_Pointer; + if (!(ValType->isFloatingType() && (AllowedType & AOEVT_FP))) return false; // LLVM Parser does not allow atomicrmw with x86_fp80 type. if (ValType->isSpecificBuiltinType(BuiltinType::LongDouble) && @@ -6583,13 +6593,13 @@ return false; return true; }; - if (IsAddSub && !IsAllowedValueType(ValType)) { - Diag(ExprRange.getBegin(), diag::err_atomic_op_needs_atomic_int_ptr_or_fp) - << IsC11 << Ptr->getType() << Ptr->getSourceRange(); - return ExprError(); - } - if (!IsAddSub && !ValType->isIntegerType()) { - Diag(ExprRange.getBegin(), diag::err_atomic_op_needs_atomic_int) + if (!IsAllowedValueType(ValType, ArithAllows)) { + auto DID = ArithAllows & AOEVT_FP + ? (ArithAllows & AOEVT_Pointer + ? diag::err_atomic_op_needs_atomic_int_ptr_or_fp + : diag::err_atomic_op_needs_atomic_int_or_fp) + : diag::err_atomic_op_needs_atomic_int; + Diag(ExprRange.getBegin(), DID) << IsC11 << Ptr->getType() << Ptr->getSourceRange(); return ExprError(); } diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -1,29 +1,98 @@ -// RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \ +// RUN: %clang_cc1 -x hip %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa \ // RUN: -fcuda-is-device -target-cpu gfx906 -fnative-half-type \ // RUN: -fnative-half-arguments-and-returns | FileCheck %s +// RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ +// RUN: -fcuda-is-device -target-cpu gfx1100 -fnative-half-type \ +// RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefix=SAFE %s + +// RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ +// RUN: -fcuda-is-device -target-cpu gfx940 -fnative-half-type \ +// RUN: -fnative-half-arguments-and-returns -munsafe-fp-atomics \ +// RUN: | FileCheck -check-prefix=UNSAFE %s + // REQUIRES: amdgpu-registered-target #include "Inputs/cuda.h" #include -__device__ float ffp1(float *p) { +__global__ void ffp1(float *p) { // CHECK-LABEL: @_Z4ffp1Pf // CHECK: atomicrmw fadd ptr {{.*}} monotonic - return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); + // CHECK: atomicrmw fmax ptr {{.*}} monotonic + // CHECK: atomicrmw fmin ptr {{.*}} monotonic + // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic + // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic + // SAFE: _Z4ffp1Pf + // SAFE: global_atomic_cmpswap + // SAFE: global_atomic_cmpswap + // SAFE: global_atomic_cmpswap + // SAFE: global_atomic_cmpswap + // SAFE: global_atomic_cmpswap + // UNSAFE: _Z4ffp1Pf + // UNSAFE: global_atomic_add_f32 + // UNSAFE: global_atomic_cmpswap + // UNSAFE: global_atomic_cmpswap + // UNSAFE: global_atomic_cmpswap + // UNSAFE: global_atomic_cmpswap + __atomic_fetch_add(p, 1.0f, memory_order_relaxed); + __atomic_fetch_max(p, 1.0f, memory_order_relaxed); + __atomic_fetch_min(p, 1.0f, memory_order_relaxed); + __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } -__device__ double ffp2(double *p) { +__global__ void ffp2(double *p) { // CHECK-LABEL: @_Z4ffp2Pd // CHECK: atomicrmw fsub ptr {{.*}} monotonic - return __atomic_fetch_sub(p, 1.0, memory_order_relaxed); + // CHECK: atomicrmw fmax ptr {{.*}} monotonic + // CHECK: atomicrmw fmin ptr {{.*}} monotonic + // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic + // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic + // SAFE: _Z4ffp2Pd + // SAFE: global_atomic_cmpswap_b64 + // SAFE: global_atomic_cmpswap_b64 + // SAFE: global_atomic_cmpswap_b64 + // SAFE: global_atomic_cmpswap_b64 + // SAFE: global_atomic_cmpswap_b64 + // UNSAFE: _Z4ffp2Pd + // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_cmpswap_x2 + __atomic_fetch_sub(p, 1.0, memory_order_relaxed); + __atomic_fetch_max(p, 1.0, memory_order_relaxed); + __atomic_fetch_min(p, 1.0, memory_order_relaxed); + __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } // long double is the same as double for amdgcn. -__device__ long double ffp3(long double *p) { +__global__ void ffp3(long double *p) { // CHECK-LABEL: @_Z4ffp3Pe // CHECK: atomicrmw fsub ptr {{.*}} monotonic - return __atomic_fetch_sub(p, 1.0L, memory_order_relaxed); + // CHECK: atomicrmw fmax ptr {{.*}} monotonic + // CHECK: atomicrmw fmin ptr {{.*}} monotonic + // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic + // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic + // SAFE: _Z4ffp3Pe + // SAFE: global_atomic_cmpswap_b64 + // SAFE: global_atomic_cmpswap_b64 + // SAFE: global_atomic_cmpswap_b64 + // SAFE: global_atomic_cmpswap_b64 + // SAFE: global_atomic_cmpswap_b64 + // UNSAFE: _Z4ffp3Pe + // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_cmpswap_x2 + // UNSAFE: global_atomic_cmpswap_x2 + __atomic_fetch_sub(p, 1.0L, memory_order_relaxed); + __atomic_fetch_max(p, 1.0L, memory_order_relaxed); + __atomic_fetch_min(p, 1.0L, memory_order_relaxed); + __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } __device__ double ffp4(double *p, float f) { @@ -39,3 +108,29 @@ // CHECK: atomicrmw fsub ptr {{.*}} monotonic return __atomic_fetch_sub(p, i, memory_order_relaxed); } + +__global__ void ffp6(_Float16 *p) { + // CHECK-LABEL: @_Z4ffp6PDF16 + // CHECK: atomicrmw fadd ptr {{.*}} monotonic + // CHECK: atomicrmw fmax ptr {{.*}} monotonic + // CHECK: atomicrmw fmin ptr {{.*}} monotonic + // CHECK: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic + // CHECK: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic + // SAFE: _Z4ffp6PDF16 + // SAFE: global_atomic_cmpswap + // SAFE: global_atomic_cmpswap + // SAFE: global_atomic_cmpswap + // SAFE: global_atomic_cmpswap + // SAFE: global_atomic_cmpswap + // UNSAFE: _Z4ffp6PDF16 + // UNSAFE: global_atomic_cmpswap + // UNSAFE: global_atomic_cmpswap + // UNSAFE: global_atomic_cmpswap + // UNSAFE: global_atomic_cmpswap + // UNSAFE: global_atomic_cmpswap + __atomic_fetch_add(p, 1.0, memory_order_relaxed); + __atomic_fetch_max(p, 1.0, memory_order_relaxed); + __atomic_fetch_min(p, 1.0, memory_order_relaxed); + __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); +} diff --git a/clang/test/Sema/atomic-ops.c b/clang/test/Sema/atomic-ops.c --- a/clang/test/Sema/atomic-ops.c +++ b/clang/test/Sema/atomic-ops.c @@ -131,7 +131,7 @@ _Atomic(int*) *p, _Atomic(float) *f, _Atomic(double) *d, _Atomic(long double) *ld, int *I, const int *CI, - int **P, float *D, struct S *s1, struct S *s2) { + int **P, float *F, double *D, struct S *s1, struct S *s2) { __c11_atomic_init(I, 5); // expected-error {{pointer to _Atomic}} __c11_atomic_init(ci, 5); // expected-error {{address argument to atomic operation must be a pointer to non-const _Atomic type ('const _Atomic(int) *' invalid)}} @@ -199,14 +199,27 @@ __c11_atomic_fetch_add(f, 1.0f, memory_order_seq_cst); __c11_atomic_fetch_add(d, 1.0, memory_order_seq_cst); __c11_atomic_fetch_add(ld, 1.0, memory_order_seq_cst); // fp80-error {{must be a pointer to atomic integer, pointer or supported floating point type}} + __c11_atomic_fetch_min(i, 1, memory_order_seq_cst); + __c11_atomic_fetch_min(p, 1, memory_order_seq_cst); // expected-error {{must be a pointer to atomic integer or supported floating point type}} + __c11_atomic_fetch_min(f, 1.0f, memory_order_seq_cst); + __c11_atomic_fetch_min(d, 1.0, memory_order_seq_cst); + __c11_atomic_fetch_min(ld, 1.0, memory_order_seq_cst); // fp80-error {{must be a pointer to atomic integer or supported floating point type}} + __c11_atomic_fetch_max(i, 1, memory_order_seq_cst); + __c11_atomic_fetch_max(p, 1, memory_order_seq_cst); // expected-error {{must be a pointer to atomic integer or supported floating point type}} + __c11_atomic_fetch_max(f, 1.0f, memory_order_seq_cst); + __c11_atomic_fetch_max(d, 1.0, memory_order_seq_cst); + __c11_atomic_fetch_max(ld, 1.0, memory_order_seq_cst); // fp80-error {{must be a pointer to atomic integer or supported floating point type}} __atomic_fetch_add(i, 3, memory_order_seq_cst); // expected-error {{pointer to integer, pointer or supported floating point type}} __atomic_fetch_sub(I, 3, memory_order_seq_cst); __atomic_fetch_sub(P, 3, memory_order_seq_cst); - __atomic_fetch_sub(D, 3, memory_order_seq_cst); + __atomic_fetch_sub(F, 3, memory_order_seq_cst); __atomic_fetch_sub(s1, 3, memory_order_seq_cst); // expected-error {{must be a pointer to integer, pointer or supported floating point type}} - __atomic_fetch_min(D, 3, memory_order_seq_cst); // expected-error {{must be a pointer to integer}} - __atomic_fetch_max(P, 3, memory_order_seq_cst); // expected-error {{must be a pointer to integer}} + __atomic_fetch_min(F, 3, memory_order_seq_cst); + __atomic_fetch_min(D, 3, memory_order_seq_cst); + __atomic_fetch_max(F, 3, memory_order_seq_cst); + __atomic_fetch_max(D, 3, memory_order_seq_cst); + __atomic_fetch_max(P, 3, memory_order_seq_cst); // expected-error {{must be a pointer to integer or supported floating point type}} __atomic_fetch_max(p, 3); // expected-error {{too few arguments to function call, expected 3, have 2}} __c11_atomic_fetch_and(i, 1, memory_order_seq_cst); @@ -216,7 +229,7 @@ __atomic_fetch_and(i, 3, memory_order_seq_cst); // expected-error {{pointer to integer}} __atomic_fetch_or(I, 3, memory_order_seq_cst); __atomic_fetch_xor(P, 3, memory_order_seq_cst); // expected-error {{must be a pointer to integer}} - __atomic_fetch_or(D, 3, memory_order_seq_cst); // expected-error {{must be a pointer to integer}} + __atomic_fetch_or(F, 3, memory_order_seq_cst); // expected-error {{must be a pointer to integer}} __atomic_fetch_and(s1, 3, memory_order_seq_cst); // expected-error {{must be a pointer to integer}} _Bool cmpexch_1 = __c11_atomic_compare_exchange_strong(i, I, 1, memory_order_seq_cst, memory_order_seq_cst); diff --git a/clang/test/SemaOpenCL/atomic-ops.cl b/clang/test/SemaOpenCL/atomic-ops.cl --- a/clang/test/SemaOpenCL/atomic-ops.cl +++ b/clang/test/SemaOpenCL/atomic-ops.cl @@ -61,8 +61,10 @@ __opencl_atomic_fetch_min(i, 1, memory_order_seq_cst, memory_scope_work_group); __opencl_atomic_fetch_max(i, 1, memory_order_seq_cst, memory_scope_work_group); - __opencl_atomic_fetch_min(f, 1, memory_order_seq_cst, memory_scope_work_group); // expected-error {{address argument to atomic operation must be a pointer to atomic integer ('__generic atomic_float *' (aka '__generic _Atomic(float) *') invalid)}} - __opencl_atomic_fetch_max(f, 1, memory_order_seq_cst, memory_scope_work_group); // expected-error {{address argument to atomic operation must be a pointer to atomic integer ('__generic atomic_float *' (aka '__generic _Atomic(float) *') invalid)}} + __opencl_atomic_fetch_min(f, 1, memory_order_seq_cst, memory_scope_work_group); + __opencl_atomic_fetch_max(f, 1, memory_order_seq_cst, memory_scope_work_group); + __opencl_atomic_fetch_min(d, 1, memory_order_seq_cst, memory_scope_work_group); + __opencl_atomic_fetch_max(d, 1, memory_order_seq_cst, memory_scope_work_group); bool cmpexch_1 = __opencl_atomic_compare_exchange_strong(i, I, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group); bool cmpexch_2 = __opencl_atomic_compare_exchange_strong(p, P, 1, memory_order_seq_cst, memory_order_seq_cst, memory_scope_work_group);