Index: docs/ClangCommandLineReference.rst =================================================================== --- docs/ClangCommandLineReference.rst +++ docs/ClangCommandLineReference.rst @@ -144,6 +144,10 @@ CUDA GPU architecture (e.g. sm\_35). May be specified more than once. +.. option:: --cuda-include-ptx=<arg>, --no-cuda-include-ptx=<arg> + +Include (or not) PTX along with CUDA GPU binary for the given architecture (e.g. sm\_35). Argument may be 'all'. The option may be specified more than once. Default: --cuda-include-ptx=all + .. option:: --cuda-host-only Compile CUDA code for host only. Has no effect on non-CUDA compilations. Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -546,6 +546,10 @@ def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">, HelpText<"Compile CUDA code for both host and device (default). Has no " "effect on non-CUDA compilations.">; +def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">, Flags<[DriverOption]>, + HelpText<"Include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">; +def no_cuda_include_ptx_EQ : Joined<["--"], "no-cuda-include-ptx=">, Flags<[DriverOption]>, + HelpText<"Do not include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">; def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>, HelpText<"CUDA GPU architecture (e.g. sm_35). 
May be specified more than once.">; def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>, Index: lib/Driver/ToolChains/Cuda.cpp =================================================================== --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -377,6 +377,22 @@ C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); } +static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) { /* Decides whether PTX is included for gpu_arch. Walks every Arg in Args; arguments other than --cuda-include-ptx= / --no-cuda-include-ptx= are skipped, and each argument of either option is claimed even when its value does not apply. When an argument's value is "all" or equals gpu_arch, the result becomes true for --cuda-include-ptx= and false for --no-cuda-include-ptx=, so a later applicable argument overrides an earlier one. Returns true when no argument applies. */ + bool includePTX = true; + for (Arg *A : Args) { + if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) || + A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ))) + continue; + A->claim(); + const StringRef ArchStr = A->getValue(); + if (ArchStr == "all" || ArchStr == gpu_arch) { + includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ); + continue; /* no effect: this is already the last statement of the loop body */ + } + } + return includePTX; +} + // All inputs to this linker must be from CudaDeviceActions, as we need to look // at the Inputs' Actions in order to figure out which GPU architecture they // correspond to. @@ -404,6 +420,9 @@ "Device action expected to have associated a GPU architecture!"); CudaArch gpu_arch = StringToCudaArch(gpu_arch_str); + if (II.getType() == types::TY_PP_Asm && + !shouldIncludePTX(Args, gpu_arch_str)) + continue; // We need to pass an Arch of the form "sm_XX" for cubin files and // "compute_XX" for ptx. const char *Arch = Index: test/Driver/cuda-options.cu =================================================================== --- test/Driver/cuda-options.cu +++ test/Driver/cuda-options.cu @@ -142,6 +142,48 @@ // RUN: -c %s 2>&1 \ // RUN: | FileCheck -check-prefix ARCHALLERROR %s + +// Verify that --[no-]cuda-include-ptx arguments are handled correctly. +// a) by default we're including PTX for all GPUs. 
+// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \ +// RUN: -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s + +// b) --no-cuda-include-ptx=all disables PTX inclusion for all GPUs +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \ +// RUN: --no-cuda-include-ptx=all \ +// RUN: -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes FATBIN-COMMON,NOPTX-SM35,NOPTX-SM30 %s + +// c) --no-cuda-include-ptx=sm_XX disables PTX inclusion for that GPU only. +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \ +// RUN: --no-cuda-include-ptx=sm_35 \ +// RUN: -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes FATBIN-COMMON,NOPTX-SM35,PTX-SM30 %s +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \ +// RUN: --no-cuda-include-ptx=sm_30 \ +// RUN: -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,NOPTX-SM30 %s + +// d) --cuda-include-ptx=all overrides preceding --no-cuda-include-ptx=all +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \ +// RUN: --no-cuda-include-ptx=all --cuda-include-ptx=all \ +// RUN: -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s + +// e) --cuda-include-ptx=all overrides preceding --no-cuda-include-ptx=sm_XX +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \ +// RUN: --no-cuda-include-ptx=sm_30 --cuda-include-ptx=all \ +// RUN: -c %s 2>&1 \ +// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s + + // ARCH-SM20: "-cc1"{{.*}}"-target-cpu" "sm_20" // NOARCH-SM20-NOT: "-cc1"{{.*}}"-target-cpu" "sm_20" // ARCH-SM30: "-cc1"{{.*}}"-target-cpu" "sm_30" @@ -236,3 +278,12 @@ // Match no linker. 
// NOLINK-NOT: "{{.*}}{{ld|link}}{{(.exe)?}}" + +// FATBIN-COMMON:fatbinary +// FATBIN-COMMON: "--create" "[[FATBINARY:[^"]*]]" +// FATBIN-COMMON: "--image=profile=sm_30,file= +// PTX-SM30: "--image=profile=compute_30,file= +// NOPTX-SM30-NOT: "--image=profile=compute_30,file= +// FATBIN-COMMON: "--image=profile=sm_35,file= +// PTX-SM35: "--image=profile=compute_35,file= +// NOPTX-SM35-NOT: "--image=profile=compute_35,file=