Index: clang/include/clang/Basic/DiagnosticParseKinds.td =================================================================== --- clang/include/clang/Basic/DiagnosticParseKinds.td +++ clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1445,12 +1445,12 @@ def warn_cuda_attr_lambda_position : Warning< "nvcc does not allow '__%0__' to appear after '()' in lambdas">, InGroup; -def warn_pragma_force_cuda_host_device_bad_arg : Warning< - "incorrect use of #pragma clang force_cuda_host_device begin|end">, +def warn_pragma_force_cuda_bad_arg : Warning< + "incorrect use of #pragma clang force_cuda_%select{host_device|device_globals}0 begin|end">, InGroup; -def err_pragma_cannot_end_force_cuda_host_device : Error< - "force_cuda_host_device end pragma without matching " - "force_cuda_host_device begin">; +def err_pragma_cannot_end_force_cuda : Error< + "force_cuda_%select{host_device|device_globals}0 end pragma without matching " + "force_cuda_%select{host_device|device_globals}0 begin">; } // end of Parse Issue category. let CategoryName = "Modules Issue" in { Index: clang/include/clang/Parse/Parser.h =================================================================== --- clang/include/clang/Parse/Parser.h +++ clang/include/clang/Parse/Parser.h @@ -194,6 +194,7 @@ std::unique_ptr MSIntrinsic; std::unique_ptr MSOptimize; std::unique_ptr CUDAForceHostDeviceHandler; + std::unique_ptr CUDAForceDeviceGlobalsHandler; std::unique_ptr OptimizeHandler; std::unique_ptr LoopHintHandler; std::unique_ptr UnrollHintHandler; Index: clang/include/clang/Sema/Sema.h =================================================================== --- clang/include/clang/Sema/Sema.h +++ clang/include/clang/Sema/Sema.h @@ -11934,6 +11934,7 @@ private: unsigned ForceCUDAHostDeviceDepth = 0; + unsigned ForceCUDADeviceGlobalsDepth = 0; public: /// Increments our count of the number of times we've seen a pragma forcing @@ -11946,6 +11947,16 @@ /// before incrementing, so you can emit an error. bool PopForceCUDAHostDevice(); + /// Increments our count of the number of times we've seen a pragma forcing + /// global variables to be __device__. So long as this count is greater than + /// zero, all global variables encounters will be __device__. + void PushForceCUDADeviceGlobals(); + + /// Decrements our count of the number of times we've seen a pragma forcing + /// global variables to be __device__. Returns false if the count is 0 before + /// decrementing, so we can emit an error. + bool PopForceCUDADeviceGlobals(); + /// Diagnostics that are emitted only if we discover that the given function /// must be codegen'ed. Because handling these correctly adds overhead to /// compilation, this is currently only enabled for CUDA compilations. @@ -12112,6 +12123,10 @@ /// and current compilation settings. void MaybeAddCUDAConstantAttr(VarDecl *VD); + /// May add CUDADeviceAttr attribute to VD depending on pragma pair + /// force_cuda_device_globals depth. + void MaybeAddCUDADeviceAttr(VarDecl *VD); + public: /// Check whether we're allowed to call Callee from the current context. /// Index: clang/include/clang/Serialization/ASTBitCodes.h =================================================================== --- clang/include/clang/Serialization/ASTBitCodes.h +++ clang/include/clang/Serialization/ASTBitCodes.h @@ -692,6 +692,10 @@ /// Record code for \#pragma float_control options. FLOAT_CONTROL_PRAGMA_OPTIONS = 65, + + /// Number of unmatched #pragma clang cuda_force_device_globals + /// begin directives we're seen. + CUDA_PRAGMA_FORCE_DEVICE_GLOBALS_DEPTH = 66, }; /// Record types used within a source manager block. Index: clang/include/clang/Serialization/ASTReader.h =================================================================== --- clang/include/clang/Serialization/ASTReader.h +++ clang/include/clang/Serialization/ASTReader.h @@ -833,6 +833,10 @@ /// macros. unsigned ForceCUDAHostDeviceDepth = 0; + /// Our current depth in #pragma clang force_cuda_device_globals + /// begin/end macros. + unsigned ForceCUDADeviceGlobalsDepth = 0; + /// The IDs of the declarations Sema stores directly. /// /// Sema tracks a few important decls, such as namespace std, directly. Index: clang/lib/Parse/ParsePragma.cpp =================================================================== --- clang/lib/Parse/ParsePragma.cpp +++ clang/lib/Parse/ParsePragma.cpp @@ -271,6 +271,16 @@ Sema &Actions; }; +struct PragmaForceCUDADeviceGlobalsHandler : public PragmaHandler { + PragmaForceCUDADeviceGlobalsHandler(Sema &Actions) + : PragmaHandler("force_cuda_device_globals"), Actions(Actions) {} + void HandlePragma(Preprocessor &PP, PragmaIntroducer Introducer, + Token &FirstToken) override; + +private: + Sema &Actions; +}; + /// PragmaAttributeHandler - "\#pragma clang attribute ...". struct PragmaAttributeHandler : public PragmaHandler { PragmaAttributeHandler(AttributeFactory &AttrFactory) @@ -395,6 +405,9 @@ CUDAForceHostDeviceHandler = std::make_unique(Actions); PP.AddPragmaHandler("clang", CUDAForceHostDeviceHandler.get()); + CUDAForceDeviceGlobalsHandler = + std::make_unique(Actions); + PP.AddPragmaHandler("clang", CUDAForceDeviceGlobalsHandler.get()); } OptimizeHandler = std::make_unique(Actions); @@ -499,6 +512,8 @@ if (getLangOpts().CUDA) { PP.RemovePragmaHandler("clang", CUDAForceHostDeviceHandler.get()); CUDAForceHostDeviceHandler.reset(); + PP.RemovePragmaHandler("clang", CUDAForceDeviceGlobalsHandler.get()); + CUDAForceDeviceGlobalsHandler.reset(); } PP.RemovePragmaHandler("STDC", FPContractHandler.get()); @@ -3498,8 +3513,7 @@ PP.Lex(Tok); IdentifierInfo *Info = Tok.getIdentifierInfo(); if (!Info || (!Info->isStr("begin") && !Info->isStr("end"))) { - PP.Diag(FirstTok.getLocation(), - diag::warn_pragma_force_cuda_host_device_bad_arg); + PP.Diag(FirstTok.getLocation(), diag::warn_pragma_force_cuda_bad_arg) << 0; return; } @@ -3507,12 +3521,33 @@ Actions.PushForceCUDAHostDevice(); else if (!Actions.PopForceCUDAHostDevice()) PP.Diag(FirstTok.getLocation(), - diag::err_pragma_cannot_end_force_cuda_host_device); + diag::err_pragma_cannot_end_force_cuda) << 0; PP.Lex(Tok); if (!Tok.is(tok::eod)) + PP.Diag(FirstTok.getLocation(), diag::warn_pragma_force_cuda_bad_arg) << 0; +} + +void PragmaForceCUDADeviceGlobalsHandler::HandlePragma( + Preprocessor &PP, PragmaIntroducer Introducer, Token &Tok) { + Token FirstTok = Tok; + + PP.Lex(Tok); + IdentifierInfo *Info = Tok.getIdentifierInfo(); + if (!Info || (!Info->isStr("begin") && !Info->isStr("end"))) { + PP.Diag(FirstTok.getLocation(), diag::warn_pragma_force_cuda_bad_arg) << 1; + return; + } + + if (Info->isStr("begin")) + Actions.PushForceCUDADeviceGlobals(); + else if (!Actions.PopForceCUDADeviceGlobals()) PP.Diag(FirstTok.getLocation(), - diag::warn_pragma_force_cuda_host_device_bad_arg); + diag::err_pragma_cannot_end_force_cuda) << 1; + + PP.Lex(Tok); + if (Tok.isNot(tok::eod)) + PP.Diag(FirstTok.getLocation(), diag::warn_pragma_force_cuda_bad_arg) << 1; } /// Handle the #pragma clang attribute directive. Index: clang/lib/Sema/SemaCUDA.cpp =================================================================== --- clang/lib/Sema/SemaCUDA.cpp +++ clang/lib/Sema/SemaCUDA.cpp @@ -39,6 +39,19 @@ return true; } +void Sema::PushForceCUDADeviceGlobals() { + assert(getLangOpts().CUDA && "Should only be called during CUDA compilation"); + ForceCUDADeviceGlobalsDepth++; +} + +bool Sema::PopForceCUDADeviceGlobals() { + assert(getLangOpts().CUDA && "Should only be called during CUDA compilation"); + if (ForceCUDADeviceGlobalsDepth == 0) + return false; + ForceCUDADeviceGlobalsDepth--; + return true; +} + ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc, MultiExprArg ExecConfig, SourceLocation GGGLoc) { @@ -642,6 +655,26 @@ } } +/// All global variables are added with __device__ attribute when +/// ForceCUDADeviceGlobalsDepth > 0 (corresponding to code within a +/// #pragma clang force_cuda_device_globals begin/end pair. +void Sema::MaybeAddCUDADeviceAttr(VarDecl *VD) { + assert(getLangOpts().CUDA && "Should only be called during CUDA compilation"); + // Avoid non-global variables. + if (!VD->hasGlobalStorage()) + return; + + // Avoid system globals. + ASTContext &Context = getASTContext(); + const SourceManager &SM = Context.getSourceManager(); + FullSourceLoc Loc = Context.getFullLoc(VD->getBeginLoc()); + if (SM.isInSystemHeader(Loc)) + return; + + if (ForceCUDADeviceGlobalsDepth > 0) + VD->addAttr(CUDADeviceAttr::CreateImplicit(VD->getASTContext())); +} + Sema::SemaDiagnosticBuilder Sema::CUDADiagIfDeviceCode(SourceLocation Loc, unsigned DiagID) { assert(getLangOpts().CUDA && "Should only be called during CUDA compilation"); Index: clang/lib/Sema/SemaDecl.cpp =================================================================== --- clang/lib/Sema/SemaDecl.cpp +++ clang/lib/Sema/SemaDecl.cpp @@ -7250,6 +7250,10 @@ // Handle attributes prior to checking for duplicates in MergeVarDecl ProcessDeclAttributes(S, NewVD, D); + if (getLangOpts().CUDA) { + MaybeAddCUDADeviceAttr(NewVD); + } + if (getLangOpts().CUDA || getLangOpts().OpenMPIsDevice || getLangOpts().SYCLIsDevice) { if (EmitTLSUnsupportedError && Index: clang/lib/Sema/SemaTemplateInstantiateDecl.cpp =================================================================== --- clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -5031,6 +5031,8 @@ NewVar->setObjCForDecl(OldVar->isObjCForDecl()); NewVar->setConstexpr(OldVar->isConstexpr()); MaybeAddCUDAConstantAttr(NewVar); + if (getLangOpts().CUDA) + MaybeAddCUDADeviceAttr(NewVar); NewVar->setInitCapture(OldVar->isInitCapture()); NewVar->setPreviousDeclInSameBlockScope( OldVar->isPreviousDeclInSameBlockScope()); Index: clang/lib/Serialization/ASTReader.cpp =================================================================== --- clang/lib/Serialization/ASTReader.cpp +++ clang/lib/Serialization/ASTReader.cpp @@ -3786,6 +3786,14 @@ ForceCUDAHostDeviceDepth = Record[0]; break; + case CUDA_PRAGMA_FORCE_DEVICE_GLOBALS_DEPTH: + if (Record.size() != 1) { + Error("invalid cuda pragma options record"); + return Failure; + } + ForceCUDADeviceGlobalsDepth = Record[0]; + break; + case ALIGN_PACK_PRAGMA_OPTIONS: { if (Record.size() < 3) { Error("invalid pragma pack record"); @@ -7928,6 +7936,7 @@ PointersToMembersPragmaLocation); } SemaObj->ForceCUDAHostDeviceDepth = ForceCUDAHostDeviceDepth; + SemaObj->ForceCUDADeviceGlobalsDepth = ForceCUDADeviceGlobalsDepth; if (PragmaAlignPackCurrentValue) { // The bottom of the stack might have a default value. It must be adjusted Index: clang/lib/Serialization/ASTWriter.cpp =================================================================== --- clang/lib/Serialization/ASTWriter.cpp +++ clang/lib/Serialization/ASTWriter.cpp @@ -789,6 +789,7 @@ RECORD(CUDA_PRAGMA_FORCE_HOST_DEVICE_DEPTH); RECORD(PP_CONDITIONAL_STACK); RECORD(DECLS_TO_CHECK_FOR_DEFERRED_DIAGS); + RECORD(CUDA_PRAGMA_FORCE_DEVICE_GLOBALS_DEPTH); // SourceManager Block. BLOCK(SOURCE_MANAGER_BLOCK); @@ -4036,6 +4037,10 @@ RecordData::value_type Record[] = {SemaRef.ForceCUDAHostDeviceDepth}; Stream.EmitRecord(CUDA_PRAGMA_FORCE_HOST_DEVICE_DEPTH, Record); } + if (SemaRef.ForceCUDADeviceGlobalsDepth > 0) { + RecordData::value_type Record[] = {SemaRef.ForceCUDADeviceGlobalsDepth}; + Stream.EmitRecord(CUDA_PRAGMA_FORCE_DEVICE_GLOBALS_DEPTH, Record); + } } void ASTWriter::WriteObjCCategories() { Index: clang/test/PCH/pragma-cuda-force-device-globals.cu =================================================================== --- /dev/null +++ clang/test/PCH/pragma-cuda-force-device-globals.cu @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device -emit-pch %s -o %t +// RUN: %clang_cc1 -triple nvptx -verify -verify-ignore-unexpected=note \ +// RUN: -fcuda-is-device -include-pch %t -S -o /dev/null %s +// +// This test checks that serialization code maintains push count in PCH. +// Also, non-zero count at the end of TU is okay. + +#ifndef HEADER +#define HEADER + +static int global_before_pragma = 1; + +#pragma clang force_cuda_device_globals begin +#pragma clang force_cuda_device_globals begin +#pragma clang force_cuda_device_globals end + +static int global1 = 1; +static const int const_global = 1; + +#else + +static int global2 = 1; + +#pragma clang force_cuda_device_globals end + +static int global_host_only = 1; + +__attribute__((device)) void device() { + int g = 0; + g += global_before_pragma; // expected-error {{reference to __host__ variable 'global_before_pragma' in __device__ function}} + g += global1; + g += const_global; + g += global2; + g+= global_host_only; // expected-error {{reference to __host__ variable 'global_host_only' in __device__ function}} +} + +#endif Index: clang/test/SemaCUDA/force-device-globals.cu =================================================================== --- /dev/null +++ clang/test/SemaCUDA/force-device-globals.cu @@ -0,0 +1,102 @@ +// RUN: %clang_cc1 -std=c++14 %s -o - -triple nvptx64-nvidia-cuda \ +// RUN: -fcuda-is-device -verify -fsyntax-only +#include "Inputs/cuda.h" + +static int global_before_pragma = 1; + +#pragma clang force_cuda_device_globals end +// expected-error@-1 {{force_cuda_device_globals end pragma without matching force_cuda_device_globals begin}} + +#pragma clang force_cuda_device_globals begin + +static int global = 1; +static const int const_global = 1; + +struct S { + static const int static_const_field = 1; + static int static_field; +}; + +namespace NS1 { + namespace NS2 { + int ns_global = 1; + } + + struct S { + static const int ns1_static_const_field = 1; + static int ns1_static_field; + }; +} + +int S::static_field = 1; +int NS1::S::ns1_static_field = 1; + +#pragma clang force_cuda_device_globals end + +__device__ void device_func(int &a) { + static constexpr int local_static_constexpr = 1; + constexpr int local_constexpr = 2; + static int local_static_before_pragma = 3; +#pragma clang force_cuda_device_globals begin + static int local_static = 3; +#pragma clang force_cuda_device_globals end + a += local_static_constexpr; + a += local_constexpr; + a += local_static_before_pragma; + // expected-error@-1 {{reference to __host__ variable 'local_static_before_pragma' in __device__ function}} + a += local_static; + a += global_before_pragma; + // expected-error@-1 {{reference to __host__ variable 'global_before_pragma' in __device__ function}} + a += global; + a += const_global; + a += S::static_const_field; + a += S::static_field; + a += NS1::S::ns1_static_const_field; + a += NS1::S::ns1_static_field; + a += NS1::NS2::ns_global; +} + +void host_func(int &a) { + static constexpr int local_static_constexpr = 1; + constexpr int local_constexpr = 2; + static int local_static = 3; + a += local_static_constexpr; + a += local_constexpr; + a += local_static; + a += global_before_pragma; + // Note: variables below should not be allowed to read from host. + // Clang allows reading their shadows, but that is not OK. + a += global; + a += const_global; + a += S::static_const_field; + a += S::static_field; + a += NS1::S::ns1_static_const_field; + a += NS1::S::ns1_static_field; + a += NS1::NS2::ns_global; +} + +__host__ __device__ void host_device_func(int &a) { + static constexpr int local_static_constexpr = 1; + constexpr int local_constexpr = 2; + static int local_static_before_pragma = 3; +#pragma clang force_cuda_device_globals begin + static int local_static = 3; +#pragma clang force_cuda_device_globals end + a += local_static_constexpr; + a += local_constexpr; + a += local_static_before_pragma; + // expected-error@-1 {{reference to __host__ variable 'local_static_before_pragma' in __host__ __device__ function}} + a += local_static; + a += global_before_pragma; + // expected-error@-1 {{reference to __host__ variable 'global_before_pragma' in __host__ __device__ function}} + a += global; + a += const_global; + a += S::static_const_field; + a += S::static_field; + a += NS1::S::ns1_static_const_field; + a += NS1::S::ns1_static_field; + a += NS1::NS2::ns_global; +} + +#pragma clang force_cuda_device_globals begin foo +// expected-warning@-1 {{incorrect use of #pragma clang force_cuda_device_globals begin|end}}