diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -59,6 +59,8 @@ BUILTIN(__builtin_amdgcn_ds_gws_sema_br, "vUiUi", "n") BUILTIN(__builtin_amdgcn_ds_gws_sema_p, "vUi", "n") BUILTIN(__builtin_amdgcn_fence, "vUicC*", "n") +BUILTIN(__builtin_amdgcn_atomic_inc, "ii*iUicC*b", "n") +BUILTIN(__builtin_amdgcn_atomic_dec, "ii*iUicC*b", "n") // FIXME: Need to disallow constant address space. BUILTIN(__builtin_amdgcn_div_scale, "dddbb*", "n") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14251,8 +14251,49 @@ } } // namespace +// For processing memory ordering and memory scope arguments of various +// amdgcn builtins. +// \p Order takes a C++11 compatible memory-ordering specifier and converts +// it into LLVM's memory ordering specifier using atomic C ABI, and writes +// to \p AO. \p Scope takes a const char * and converts it into AMDGCN +// specific SyncScopeID and writes it to \p SSID. 
+bool CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope, + llvm::AtomicOrdering &AO, + llvm::SyncScope::ID &SSID) { + if (isa<llvm::ConstantInt>(Order)) { + int ord = cast<llvm::ConstantInt>(Order)->getZExtValue(); + + // Map C11/C++11 memory ordering to LLVM memory ordering + switch (static_cast<llvm::AtomicOrderingCABI>(ord)) { + case llvm::AtomicOrderingCABI::acquire: + AO = llvm::AtomicOrdering::Acquire; + break; + case llvm::AtomicOrderingCABI::release: + AO = llvm::AtomicOrdering::Release; + break; + case llvm::AtomicOrderingCABI::acq_rel: + AO = llvm::AtomicOrdering::AcquireRelease; + break; + case llvm::AtomicOrderingCABI::seq_cst: + AO = llvm::AtomicOrdering::SequentiallyConsistent; + break; + case llvm::AtomicOrderingCABI::consume: + case llvm::AtomicOrderingCABI::relaxed: + break; + } + + StringRef scp; + llvm::getConstantStringInfo(Scope, scp); + SSID = getLLVMContext().getOrInsertSyncScopeID(scp); + return true; + } + return false; +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { + llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; + llvm::SyncScope::ID SSID; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_div_scale: case AMDGPU::BI__builtin_amdgcn_div_scalef: { @@ -14457,38 +14498,42 @@ } case AMDGPU::BI__builtin_amdgcn_fence: { - llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; - llvm::SyncScope::ID SSID; - Value *Order = EmitScalarExpr(E->getArg(0)); - Value *Scope = EmitScalarExpr(E->getArg(1)); + if (ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)), + EmitScalarExpr(E->getArg(1)), AO, SSID)) + return Builder.CreateFence(AO, SSID); + LLVM_FALLTHROUGH; + } + case AMDGPU::BI__builtin_amdgcn_atomic_inc: + case AMDGPU::BI__builtin_amdgcn_atomic_dec: { + unsigned BuiltinAtomicOp; - if (isa<llvm::ConstantInt>(Order)) { - int ord = cast<llvm::ConstantInt>(Order)->getZExtValue(); + Value *Ptr = EmitScalarExpr(E->getArg(0)); + Value *Val = EmitScalarExpr(E->getArg(1)); - // Map C11/C++11 memory ordering to LLVM memory ordering - switch 
(static_cast<llvm::AtomicOrderingCABI>(ord)) { - case llvm::AtomicOrderingCABI::acquire: - AO = llvm::AtomicOrdering::Acquire; - break; - case llvm::AtomicOrderingCABI::release: - AO = llvm::AtomicOrdering::Release; - break; - case llvm::AtomicOrderingCABI::acq_rel: - AO = llvm::AtomicOrdering::AcquireRelease; - break; - case llvm::AtomicOrderingCABI::seq_cst: - AO = llvm::AtomicOrdering::SequentiallyConsistent; - break; - case llvm::AtomicOrderingCABI::consume: // not supported by LLVM fence - case llvm::AtomicOrderingCABI::relaxed: // not supported by LLVM fence - break; - } + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_atomic_inc: + BuiltinAtomicOp = Intrinsic::amdgcn_atomic_inc; + break; + case AMDGPU::BI__builtin_amdgcn_atomic_dec: + BuiltinAtomicOp = Intrinsic::amdgcn_atomic_dec; + break; + } - StringRef scp; - llvm::getConstantStringInfo(Scope, scp); - SSID = getLLVMContext().getOrInsertSyncScopeID(scp); + llvm::Function *F = CGM.getIntrinsic( + BuiltinAtomicOp, + {Ptr->getType()->getPointerElementType(), Ptr->getType()}); - return Builder.CreateFence(AO, SSID); + if (ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)), + EmitScalarExpr(E->getArg(3)), AO, SSID)) { + + // llvm.amdgcn.atomic.inc and llvm.amdgcn.atomic.dec expects ordering and + // scope as unsigned values + Value *MemOrder = Builder.getInt32(static_cast<int>(AO)); + Value *MemScope = Builder.getInt32(static_cast<int>(SSID)); + + Value *IsVolatile = EmitScalarExpr(E->getArg(4)); + + return Builder.CreateCall(F, {Ptr, Val, MemOrder, MemScope, IsVolatile}); } LLVM_FALLTHROUGH; } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3959,6 +3959,9 @@ llvm::Value *EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitHexagonBuiltinExpr(unsigned BuiltinID, const CallExpr *E); + bool ProcessOrderScopeAMDGCN(llvm::Value *Order, llvm::Value *Scope, + llvm::AtomicOrdering &AO, + 
llvm::SyncScope::ID &SSID); private: enum class MSVCIntrin; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3061,41 +3061,54 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + // position of memory order and scope arguments in the builtin + unsigned OrderIndex, ScopeIndex; switch (BuiltinID) { - case AMDGPU::BI__builtin_amdgcn_fence: { - ExprResult Arg = TheCall->getArg(0); - auto ArgExpr = Arg.get(); - Expr::EvalResult ArgResult; - - if (!ArgExpr->EvaluateAsInt(ArgResult, Context)) - return Diag(ArgExpr->getExprLoc(), diag::err_typecheck_expect_int) - << ArgExpr->getType(); - int ord = ArgResult.Val.getInt().getZExtValue(); - - // Check valididty of memory ordering as per C11 / C++11's memody model. - switch (static_cast<llvm::AtomicOrderingCABI>(ord)) { - case llvm::AtomicOrderingCABI::acquire: - case llvm::AtomicOrderingCABI::release: - case llvm::AtomicOrderingCABI::acq_rel: - case llvm::AtomicOrderingCABI::seq_cst: - break; - default: { - return Diag(ArgExpr->getBeginLoc(), - diag::warn_atomic_op_has_invalid_memory_order) - << ArgExpr->getSourceRange(); - } - } + case AMDGPU::BI__builtin_amdgcn_atomic_inc: + case AMDGPU::BI__builtin_amdgcn_atomic_dec: + OrderIndex = 2; + ScopeIndex = 3; + break; + case AMDGPU::BI__builtin_amdgcn_fence: + OrderIndex = 0; + ScopeIndex = 1; + break; + default: + return false; + } - Arg = TheCall->getArg(1); - ArgExpr = Arg.get(); - Expr::EvalResult ArgResult1; - // Check that sync scope is a constant literal - if (!ArgExpr->EvaluateAsConstantExpr(ArgResult1, Expr::EvaluateForCodeGen, - Context)) - return Diag(ArgExpr->getExprLoc(), diag::err_expr_not_string_literal) - << ArgExpr->getType(); - } break; + ExprResult Arg = TheCall->getArg(OrderIndex); + auto ArgExpr = Arg.get(); + Expr::EvalResult ArgResult; + + if (!ArgExpr->EvaluateAsInt(ArgResult, Context)) + return Diag(ArgExpr->getExprLoc(), 
diag::err_typecheck_expect_int) + << ArgExpr->getType(); + int ord = ArgResult.Val.getInt().getZExtValue(); + + // Check validity of memory ordering as per C11 / C++11's memory model. + switch (static_cast<llvm::AtomicOrderingCABI>(ord)) { + case llvm::AtomicOrderingCABI::acquire: + case llvm::AtomicOrderingCABI::release: + case llvm::AtomicOrderingCABI::acq_rel: + case llvm::AtomicOrderingCABI::seq_cst: + break; + default: { + return Diag(ArgExpr->getBeginLoc(), + diag::warn_atomic_op_has_invalid_memory_order) + << ArgExpr->getSourceRange(); + } } + + Arg = TheCall->getArg(ScopeIndex); + ArgExpr = Arg.get(); + Expr::EvalResult ArgResult1; + // Check that sync scope is a constant literal + if (!ArgExpr->EvaluateAsConstantExpr(ArgResult1, Expr::EvaluateForCodeGen, + Context)) + return Diag(ArgExpr->getExprLoc(), diag::err_expr_not_string_literal) + << ArgExpr->getType(); + return false; } diff --git a/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp @@ -0,0 +1,105 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 %s -x hip -fcuda-is-device -emit-llvm -O0 -o - \ +// RUN: -triple=amdgcn-amd-amdhsa | opt -S | FileCheck %s + +__attribute__((device)) void test_parameter(int *ptr) { + // CHECK-LABEL: test_parameter + + // CHECK: %ptr.addr = alloca i32*, align 8, addrspace(5) + // CHECK-NEXT: %ptr.addr.ascast = addrspacecast i32* addrspace(5)* %ptr.addr to i32** + // CHECK-NEXT: store i32* %ptr, i32** %ptr.addr.ascast, align 8 + // CHECK-NEXT: %0 = load i32*, i32** %ptr.addr.ascast, align 8 + // CHECK-NEXT: %1 = load i32*, i32** %ptr.addr.ascast, align 8 + // CHECK-NEXT: %2 = load i32, i32* %1, align 4 + // CHECK-NEXT: %3 = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %0, i32 %2, i32 7, i32 2, i1 true) + // CHECK-NEXT: %4 = load i32*, i32** %ptr.addr.ascast, align 8 + // CHECK-NEXT: store i32 %3, i32* %4, align 4 + *ptr = 
__builtin_amdgcn_atomic_inc(ptr, *ptr, __ATOMIC_SEQ_CST, "workgroup", true); + + // CHECK: %5 = load i32*, i32** %ptr.addr.ascast, align 8 + // CHECK-NEXT: %6 = load i32*, i32** %ptr.addr.ascast, align 8 + // CHECK-NEXT: %7 = load i32, i32* %6, align 4 + // CHECK-NEXT: %8 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %5, i32 %7, i32 7, i32 2, i1 true) + // CHECK-NEXT: %9 = load i32*, i32** %ptr.addr.ascast, align 8 + // CHECK-NEXT: store i32 %8, i32* %9, align 4 + *ptr = __builtin_amdgcn_atomic_dec(ptr, *ptr, __ATOMIC_SEQ_CST, "workgroup", true); +} + +__attribute__((device)) void test_shared() { + // CHECK-LABEL: test_shared + __attribute__((shared)) int val; + + // CHECK: %0 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZ11test_sharedvE3val to i32*), align 4 + // CHECK-NEXT: %1 = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ11test_sharedvE3val to i32*), i32 %0, i32 7, i32 2, i1 true) + // CHECK-NEXT: store i32 %1, i32* addrspacecast (i32 addrspace(3)* @_ZZ11test_sharedvE3val to i32*), align 4 + val = __builtin_amdgcn_atomic_inc(&val, val, __ATOMIC_SEQ_CST, "workgroup", true); + + // CHECK: %2 = load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZ11test_sharedvE3val to i32*), align 4 + // CHECK-NEXT: %3 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ11test_sharedvE3val to i32*), i32 %2, i32 7, i32 2, i1 true) + // CHECK-NEXT: store i32 %3, i32* addrspacecast (i32 addrspace(3)* @_ZZ11test_sharedvE3val to i32*), align 4 + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_SEQ_CST, "workgroup", true); +} + +int global_val; +__attribute__((device)) void test_global() { + // CHECK-LABEL: test_global + // CHECK: %0 = load i32, i32* addrspacecast (i32 addrspace(1)* @global_val to i32*), align 4 + // CHECK-NEXT: %1 = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* addrspacecast (i32 addrspace(1)* @global_val to i32*), i32 %0, i32 7, i32 2, i1 true) + // CHECK-NEXT: store i32 %1, 
i32* addrspacecast (i32 addrspace(1)* @global_val to i32*), align 4 + global_val = __builtin_amdgcn_atomic_inc(&global_val, global_val, __ATOMIC_SEQ_CST, "workgroup", true); + + // CHECK: %2 = load i32, i32* addrspacecast (i32 addrspace(1)* @global_val to i32*), align 4 + // CHECK-NEXT: %3 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* addrspacecast (i32 addrspace(1)* @global_val to i32*), i32 %2, i32 7, i32 2, i1 true) + // CHECK-NEXT: store i32 %3, i32* addrspacecast (i32 addrspace(1)* @global_val to i32*), align 4 + global_val = __builtin_amdgcn_atomic_dec(&global_val, global_val, __ATOMIC_SEQ_CST, "workgroup", true); +} + +__attribute__((constant)) int cval; +__attribute__((device)) void test_constant() { + // CHECK-LABEL: test_constant + int local_val; + + // CHECK: %0 = load i32, i32* addrspacecast (i32 addrspace(4)* @cval to i32*), align 4 + // CHECK-NEXT: %1 = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* addrspacecast (i32 addrspace(4)* @cval to i32*), i32 %0, i32 7, i32 2, i1 true) + // CHECK-NEXT: store i32 %1, i32* %local_val.ascast, align 4 + local_val = __builtin_amdgcn_atomic_inc(&cval, cval, __ATOMIC_SEQ_CST, "workgroup", true); + + // CHECK: %2 = load i32, i32* addrspacecast (i32 addrspace(4)* @cval to i32*), align 4 + // CHECK-NEXT: %3 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* addrspacecast (i32 addrspace(4)* @cval to i32*), i32 %2, i32 7, i32 2, i1 true) + // CHECK-NEXT: store i32 %3, i32* %local_val.ascast, align 4 + local_val = __builtin_amdgcn_atomic_dec(&cval, cval, __ATOMIC_SEQ_CST, "workgroup", true); +} + +__attribute__((device)) void test_order() { + // CHECK-LABEL: test_order + __attribute__((shared)) int val; + + // CHECK: %1 = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ10test_ordervE3val to i32*), i32 %0, i32 4, i32 2, i1 true) + val = __builtin_amdgcn_atomic_inc(&val, val, __ATOMIC_ACQUIRE, "workgroup", true); + + // CHECK: %3 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* 
addrspacecast (i32 addrspace(3)* @_ZZ10test_ordervE3val to i32*), i32 %2, i32 5, i32 2, i1 true) + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_RELEASE, "workgroup", true); + + // CHECK: %5 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ10test_ordervE3val to i32*), i32 %4, i32 6, i32 2, i1 true) + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_ACQ_REL, "workgroup", true); + + // CHECK: %7 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ10test_ordervE3val to i32*), i32 %6, i32 7, i32 2, i1 true) + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_SEQ_CST, "workgroup", true); +} + +__attribute__((device)) void test_scope() { + // CHECK-LABEL: test_scope + __attribute__((shared)) int val; + + // CHECK: %1 = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ10test_scopevE3val to i32*), i32 %0, i32 7, i32 1, i1 true) + val = __builtin_amdgcn_atomic_inc(&val, val, __ATOMIC_SEQ_CST, "", true); + + // CHECK: %3 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ10test_scopevE3val to i32*), i32 %2, i32 7, i32 2, i1 true) + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_SEQ_CST, "workgroup", true); + + // CHECK: %5 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ10test_scopevE3val to i32*), i32 %4, i32 7, i32 3, i1 true) + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_SEQ_CST, "agent", true); + + // CHECK: %7 = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* addrspacecast (i32 addrspace(3)* @_ZZ10test_scopevE3val to i32*), i32 %6, i32 7, i32 4, i1 true) + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_SEQ_CST, "wavefront", true); +} diff --git a/clang/test/Sema/builtin-amdgcn-atomic-inc-dec-failure.cpp b/clang/test/Sema/builtin-amdgcn-atomic-inc-dec-failure.cpp new file mode 100644 --- /dev/null +++ 
b/clang/test/Sema/builtin-amdgcn-atomic-inc-dec-failure.cpp @@ -0,0 +1,12 @@ +// REQUIRES: amdgpu-registered-target +// RUN: not %clang_cc1 %s -x hip -fcuda-is-device -o - -emit-llvm -triple=amdgcn-amd-amdhsa 2>&1 | FileCheck %s + +void test_host() { + int val; + + // CHECK: error: reference to __device__ function '__builtin_amdgcn_atomic_inc' in __host__ function + val = __builtin_amdgcn_atomic_inc(&val, val, __ATOMIC_SEQ_CST, "", true); + + // CHECK: error: reference to __device__ function '__builtin_amdgcn_atomic_dec' in __host__ function + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_SEQ_CST, "", true); +} \ No newline at end of file diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl --- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl @@ -144,3 +144,27 @@ __builtin_amdgcn_s_setreg(x, 0); // expected-error {{argument to '__builtin_amdgcn_s_setreg' must be a constant integer}} __builtin_amdgcn_s_setreg(x, y); // expected-error {{argument to '__builtin_amdgcn_s_setreg' must be a constant integer}} } + +void test_atomic_inc() { + int val = 17; + val = __builtin_amdgcn_atomic_inc(&val, val, __ATOMIC_SEQ_CST + 1, "workgroup", true); // expected-warning {{memory order argument to atomic operation is invalid}} + val = __builtin_amdgcn_atomic_inc(&val, val, __ATOMIC_ACQUIRE - 1, "workgroup", true); // expected-warning {{memory order argument to atomic operation is invalid}} + val = __builtin_amdgcn_atomic_inc(4); // expected-error {{too few arguments to function call, expected 5}} + val = __builtin_amdgcn_atomic_inc(&val, val, 4, 4, 4, 4); // expected-error {{too many arguments to function call, expected 5}} + val = __builtin_amdgcn_atomic_inc(&val, val, 3.14, "", true); // expected-warning {{implicit conversion from 'double' to 'unsigned int' changes value from 3.14 to 3}} + val = __builtin_amdgcn_atomic_inc(&val, val, __ATOMIC_ACQUIRE, 5, true); // 
expected-warning {{incompatible integer to pointer conversion passing 'int' to parameter of type 'const char *'}} + const char ptr[] = "workgroup"; + val = __builtin_amdgcn_atomic_inc(&val, val, __ATOMIC_ACQUIRE, ptr, true); // expected-error {{expression is not a string literal}} +} + +void test_atomic_dec() { + int val = 17; + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_SEQ_CST + 1, "workgroup", true); // expected-warning {{memory order argument to atomic operation is invalid}} + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_ACQUIRE - 1, "workgroup", true); // expected-warning {{memory order argument to atomic operation is invalid}} + val = __builtin_amdgcn_atomic_dec(4); // expected-error {{too few arguments to function call, expected 5}} + val = __builtin_amdgcn_atomic_dec(&val, val, 4, 4, 4, 4); // expected-error {{too many arguments to function call, expected 5}} + val = __builtin_amdgcn_atomic_dec(&val, val, 3.14, "", true); // expected-warning {{implicit conversion from 'double' to 'unsigned int' changes value from 3.14 to 3}} + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_ACQUIRE, 5, true); // expected-warning {{incompatible integer to pointer conversion passing 'int' to parameter of type 'const char *'}} + const char ptr[] = "workgroup"; + val = __builtin_amdgcn_atomic_dec(&val, val, __ATOMIC_ACQUIRE, ptr, true); // expected-error {{expression is not a string literal}} +}