Index: cfe/trunk/include/clang/Basic/LangOptions.def =================================================================== --- cfe/trunk/include/clang/Basic/LangOptions.def +++ cfe/trunk/include/clang/Basic/LangOptions.def @@ -157,6 +157,7 @@ LANGOPT(HalfArgsAndReturns, 1, 0, "half args and returns") LANGOPT(CUDA , 1, 0, "CUDA") LANGOPT(OpenMP , 1, 0, "OpenMP support") +LANGOPT(CUDAIsDevice , 1, 0, "Compiling for CUDA device") LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators") LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions") Index: cfe/trunk/lib/Basic/Targets.cpp =================================================================== --- cfe/trunk/lib/Basic/Targets.cpp +++ cfe/trunk/lib/Basic/Targets.cpp @@ -1377,6 +1377,16 @@ class NVPTXTargetInfo : public TargetInfo { static const char * const GCCRegNames[]; static const Builtin::Info BuiltinInfo[]; + + // The GPU profiles supported by the NVPTX backend + enum GPUKind { + GK_NONE, + GK_SM20, + GK_SM21, + GK_SM30, + GK_SM35, + } GPU; + public: NVPTXTargetInfo(const llvm::Triple &Triple) : TargetInfo(Triple) { BigEndian = false; @@ -1387,11 +1397,34 @@ // Define available target features // These must be defined in sorted order! NoAsmVariants = true; + // Set the default GPU to sm20 + GPU = GK_SM20; } void getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const override { Builder.defineMacro("__PTX__"); Builder.defineMacro("__NVPTX__"); + if (Opts.CUDAIsDevice) { + // Set __CUDA_ARCH__ for the GPU specified. + std::string CUDAArchCode; + switch (GPU) { + case GK_SM20: + CUDAArchCode = "200"; + break; + case GK_SM21: + CUDAArchCode = "210"; + break; + case GK_SM30: + CUDAArchCode = "300"; + break; + case GK_SM35: + CUDAArchCode = "350"; + break; + default: + llvm_unreachable("Unhandled target CPU"); + } + Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode); + } } void getTargetBuiltins(const Builtin::Info *&Records, unsigned &NumRecords) const override { @@ -1434,14 +1467,14 @@ return TargetInfo::CharPtrBuiltinVaList; } bool setCPU(const std::string &Name) override { - bool Valid = llvm::StringSwitch(Name) - .Case("sm_20", true) - .Case("sm_21", true) - .Case("sm_30", true) - .Case("sm_35", true) - .Default(false); + GPU = llvm::StringSwitch(Name) + .Case("sm_20", GK_SM20) + .Case("sm_21", GK_SM21) + .Case("sm_30", GK_SM30) + .Case("sm_35", GK_SM35) + .Default(GK_NONE); - return Valid; + return GPU != GK_NONE; } }; Index: cfe/trunk/lib/Frontend/CompilerInvocation.cpp =================================================================== --- cfe/trunk/lib/Frontend/CompilerInvocation.cpp +++ cfe/trunk/lib/Frontend/CompilerInvocation.cpp @@ -1349,6 +1349,9 @@ if (Args.hasArg(OPT_fno_operator_names)) Opts.CXXOperatorNames = 0; + if (Args.hasArg(OPT_fcuda_is_device)) + Opts.CUDAIsDevice = 1; + if (Opts.ObjC1) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { StringRef value = arg->getValue(); Index: cfe/trunk/lib/Frontend/InitPreprocessor.cpp =================================================================== --- cfe/trunk/lib/Frontend/InitPreprocessor.cpp +++ cfe/trunk/lib/Frontend/InitPreprocessor.cpp @@ -870,6 +870,13 @@ Builder.defineMacro("_OPENMP", "201307"); } + // CUDA device path compilaton + if (LangOpts.CUDAIsDevice) { + // The CUDA_ARCH value is set for the GPU target specified in the NVPTX + // backend's target defines. + Builder.defineMacro("__CUDA_ARCH__"); + } + // Get other target #defines. TI.getTargetDefines(LangOpts, Builder); } Index: cfe/trunk/lib/Sema/SemaCUDA.cpp =================================================================== --- cfe/trunk/lib/Sema/SemaCUDA.cpp +++ cfe/trunk/lib/Sema/SemaCUDA.cpp @@ -14,6 +14,7 @@ #include "clang/Sema/Sema.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Decl.h" +#include "clang/Lex/Preprocessor.h" #include "clang/Sema/SemaDiagnostic.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" @@ -72,21 +73,29 @@ if (CallerTarget == CFT_InvalidTarget || CalleeTarget == CFT_InvalidTarget) return true; - // CUDA B.1.1 "The __device__ qualifier declares a function that is... + // CUDA B.1.1 "The __device__ qualifier declares a function that is [...] // Callable from the device only." if (CallerTarget == CFT_Host && CalleeTarget == CFT_Device) return true; - // CUDA B.1.2 "The __global__ qualifier declares a function that is... + // CUDA B.1.2 "The __global__ qualifier declares a function that is [...] // Callable from the host only." - // CUDA B.1.3 "The __host__ qualifier declares a function that is... + // CUDA B.1.3 "The __host__ qualifier declares a function that is [...] // Callable from the host only." if ((CallerTarget == CFT_Device || CallerTarget == CFT_Global) && (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)) return true; - if (CallerTarget == CFT_HostDevice && CalleeTarget != CFT_HostDevice) - return true; + // CUDA B.1.3 "The __device__ and __host__ qualifiers can be used together + // however, in which case the function is compiled for both the host and the + // device. The __CUDA_ARCH__ macro [...] can be used to differentiate code + // paths between host and device." + bool InDeviceMode = getLangOpts().CUDAIsDevice; + if (CallerTarget == CFT_HostDevice && CalleeTarget != CFT_HostDevice) { + if ((InDeviceMode && CalleeTarget != CFT_Device) || + (!InDeviceMode && CalleeTarget != CFT_Host)) + return true; + } return false; } Index: cfe/trunk/test/SemaCUDA/function-target.cu =================================================================== --- cfe/trunk/test/SemaCUDA/function-target.cu +++ cfe/trunk/test/SemaCUDA/function-target.cu @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s #include "Inputs/cuda.h" @@ -31,14 +32,40 @@ d1g<<<1, 1>>>(); // expected-error {{reference to __global__ function 'd1g' in __device__ function}} } -__host__ void hd1h(void); // expected-note {{candidate function not viable: call to __host__ function from __host__ __device__ function}} -__device__ void hd1d(void); // expected-note {{candidate function not viable: call to __device__ function from __host__ __device__ function}} +// Expected 0-1 as in one of host/device side compilation it is an error, while +// not in the other +__host__ void hd1h(void); // expected-note 0-1 {{candidate function not viable: call to __host__ function from __host__ __device__ function}} +__device__ void hd1d(void); // expected-note 0-1 {{candidate function not viable: call to __device__ function from __host__ __device__ function}} +__host__ void hd1hg(void); +__device__ void hd1dg(void); +#ifdef __CUDA_ARCH__ +__host__ void hd1hig(void); // expected-note {{candidate function not viable: call to __host__ function from __host__ __device__ function}} +#else +__device__ void hd1dig(void); // expected-note {{candidate function not viable: call to __device__ function from __host__ __device__ function}} +#endif __host__ __device__ void hd1hd(void); __global__ void hd1g(void); // expected-note {{'hd1g' declared here}} __host__ __device__ void hd1(void) { - hd1h(); // expected-error {{no matching function}} - hd1d(); // expected-error {{no matching function}} + // Expected 0-1 as in one of host/device side compilation it is an error, + // while not in the other + hd1d(); // expected-error 0-1 {{no matching function}} + hd1h(); // expected-error 0-1 {{no matching function}} + + // No errors as guarded +#ifdef __CUDA_ARCH__ + hd1d(); +#else + hd1h(); +#endif + + // Errors as incorrectly guarded +#ifndef __CUDA_ARCH__ + hd1dig(); // expected-error {{no matching function}} +#else + hd1hig(); // expected-error {{no matching function}} +#endif + hd1hd(); hd1g<<<1, 1>>>(); // expected-error {{reference to __global__ function 'hd1g' in __host__ __device__ function}} }