Index: cfe/trunk/lib/Driver/Driver.cpp =================================================================== --- cfe/trunk/lib/Driver/Driver.cpp +++ cfe/trunk/lib/Driver/Driver.cpp @@ -1233,11 +1233,13 @@ } } -// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE input -// action and then wraps each in CudaDeviceAction paired with appropriate GPU -// arch name. If we're only building device-side code, each action remains -// independent. Otherwise we pass device-side actions as inputs to a new -// CudaHostAction which combines both host and device side actions. +// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE +// input action and then wraps each in CudaDeviceAction paired with +// appropriate GPU arch name. In case of partial (i.e. preprocessing +// only) or device-only compilation, each device action is added to \p +// Actions and \p Current is released. Otherwise the function creates +// and returns a new CudaHostAction which wraps \p Current and device +// side actions. static std::unique_ptr buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args, const Arg *InputArg, std::unique_ptr HostAction, @@ -1421,22 +1423,14 @@ } phases::ID CudaInjectionPhase; - if (isSaveTempsEnabled()) { - // All phases are done independently, inject GPU blobs during compilation - // phase as that's where we generate glue code to init them. - CudaInjectionPhase = phases::Compile; - } else { - // Assumes that clang does everything up until linking phase, so we inject - // cuda device actions at the last step before linking. Otherwise CUDA - // host action forces preprocessor into a separate invocation. 
- CudaInjectionPhase = FinalPhase; - if (FinalPhase == phases::Link) - for (auto PI = PL.begin(), PE = PL.end(); PI != PE; ++PI) { - auto next = PI + 1; - if (next != PE && *next == phases::Link) - CudaInjectionPhase = *PI; - } - } + bool InjectCuda = (InputType == types::TY_CUDA && + !Args.hasArg(options::OPT_cuda_host_only)); + CudaInjectionPhase = FinalPhase; + for (auto &Phase : PL) + if (Phase <= FinalPhase && Phase == phases::Compile) { + CudaInjectionPhase = Phase; + break; + } // Build the pipeline for this file. std::unique_ptr Current(new InputAction(*InputArg, InputType)); @@ -1464,8 +1458,7 @@ // Otherwise construct the appropriate action. Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current)); - if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase && - !Args.hasArg(options::OPT_cuda_host_only)) { + if (InjectCuda && Phase == CudaInjectionPhase) { Current = buildCudaActions(*this, TC, Args, InputArg, std::move(Current), Actions); if (!Current) @@ -1679,10 +1672,17 @@ } } -static const Tool *SelectToolForJob(Compilation &C, bool SaveTemps, +// Returns a Tool for a given JobAction. In case the action and its +// predecessors can be combined, updates Inputs with the inputs of the +// first combined action. If one of the collapsed actions is a +// CudaHostAction, updates CollapsedCHA with the pointer to it so the +// caller can deal with extra handling such action requires. +static const Tool *selectToolForJob(Compilation &C, bool SaveTemps, const ToolChain *TC, const JobAction *JA, - const ActionList *&Inputs) { + const ActionList *&Inputs, + const CudaHostAction *&CollapsedCHA) { const Tool *ToolForJob = nullptr; + CollapsedCHA = nullptr; // See if we should look for a compiler with an integrated assembler. We match // bottom up, so what we are actually looking for is an assembler job with a @@ -1699,13 +1699,19 @@ // checking the backend tool, check if the tool for the CompileJob // has an integrated assembler. 
const ActionList *BackendInputs = &(*Inputs)[0]->getInputs(); - JobAction *CompileJA = cast(*BackendInputs->begin()); + // Compile job may be wrapped in CudaHostAction, extract it if + // that's the case and update CollapsedCHA if we combine phases. + CudaHostAction *CHA = dyn_cast(*BackendInputs->begin()); + JobAction *CompileJA = + cast(CHA ? *CHA->begin() : *BackendInputs->begin()); + assert(CompileJA && "Backend job is not preceeded by compile job."); const Tool *Compiler = TC->SelectTool(*CompileJA); if (!Compiler) return nullptr; if (Compiler->hasIntegratedAssembler()) { - Inputs = &(*BackendInputs)[0]->getInputs(); + Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; + CollapsedCHA = CHA; } } @@ -1715,19 +1721,19 @@ if (isa(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - JobAction *CompileJA; - // Extract real host action, if it's a CudaHostAction. - if (CudaHostAction *CudaHA = dyn_cast(*Inputs->begin())) - CompileJA = cast(*CudaHA->begin()); - else - CompileJA = cast(*Inputs->begin()); - + // Compile job may be wrapped in CudaHostAction, extract it if + // that's the case and update CollapsedCHA if we combine phases. + CudaHostAction *CHA = dyn_cast(*Inputs->begin()); + JobAction *CompileJA = + cast(CHA ? 
*CHA->begin() : *Inputs->begin()); + assert(CompileJA && "Backend job is not preceeded by compile job."); const Tool *Compiler = TC->SelectTool(*CompileJA); if (!Compiler) return nullptr; if (!Compiler->canEmitIR() || !SaveTemps) { - Inputs = &(*Inputs)[0]->getInputs(); + Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; + CollapsedCHA = CHA; } } @@ -1811,10 +1817,23 @@ const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast(A); - const Tool *T = SelectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs); + const CudaHostAction *CollapsedCHA = nullptr; + const Tool *T = + selectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs, CollapsedCHA); if (!T) return; + // If we've collapsed action list that contained CudaHostAction we + // need to build jobs for device-side inputs it may have held. + if (CollapsedCHA) { + InputInfo II; + for (const Action *DA : CollapsedCHA->getDeviceActions()) { + BuildJobsForAction(C, DA, TC, "", AtTopLevel, + /*MultipleArchs*/ false, LinkingOutput, II); + CudaDeviceInputInfos.push_back(II); + } + } + // Only use pipes when there is exactly one input. InputInfoList InputInfos; for (const Action *Input : *Inputs) { Index: cfe/trunk/test/Driver/cuda-options.cu =================================================================== --- cfe/trunk/test/Driver/cuda-options.cu +++ cfe/trunk/test/Driver/cuda-options.cu @@ -6,7 +6,7 @@ // Simple compilation case: // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \ // Compile device-side to PTX assembly and make sure we use it on the host side. -// RUN: | FileCheck -check-prefix CUDA-D1 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\ // Then compile host side and incorporate device code. // RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ // Make sure we don't link anything. 
@@ -15,7 +15,7 @@ // Typical compilation + link case: // RUN: %clang -### -target x86_64-linux-gnu %s 2>&1 \ // Compile device-side to PTX assembly and make sure we use it on the host side -// RUN: | FileCheck -check-prefix CUDA-D1 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\ // Then compile host side and incorporate device code. // RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ // Then link things. @@ -33,7 +33,7 @@ // Verify that -cuda-no-host disables host-side compilation and linking // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only %s 2>&1 \ // Compile device-side to PTX assembly -// RUN: | FileCheck -check-prefix CUDA-D1 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\ // Make sure there are no host cmpilation or linking. // RUN: -check-prefix CUDA-NH -check-prefix CUDA-NL %s @@ -41,7 +41,7 @@ // and incorporate device code on the host side. // RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \ // Compile device-side to PTX assembly -// RUN: | FileCheck -check-prefix CUDA-D1 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\ // Then compile host side and incorporate GPU code. // RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ // Make sure we don't link anything. @@ -51,7 +51,8 @@ // archtecture info to device compilation. // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \ // Compile device-side to PTX assembly. -// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \ +// RUN: -check-prefix CUDA-D1-SM35 \ // Then compile host side and incorporate GPU code. // RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ // Make sure we don't link anything. @@ -59,16 +60,47 @@ // Verify that there is device-side compilation per --cuda-gpu-arch args // and that all results are included on the host side. 
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \ +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \ // Compile both device-sides to PTX assembly // RUN: | FileCheck \ -// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \ +// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1NS -check-prefix CUDA-D1-SM35 \ // RUN: -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \ // Then compile host side and incorporate both device-side outputs -// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \ +// RUN: -check-prefix CUDA-H -check-prefix CUDA-HNS \ +// RUN: -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \ // Make sure we don't link anything. // RUN: -check-prefix CUDA-NL %s +// Verify that device-side results are passed to correct tool when +// -save-temps is used +// RUN: %clang -### -target x86_64-linux-gnu -save-temps -c %s 2>&1 \ +// Compile device-side to PTX assembly and make sure we use it on the host side. +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1S \ +// Then compile host side and incorporate device code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-HS -check-prefix CUDA-HS-I1 \ +// Make sure we don't link anything. +// RUN: -check-prefix CUDA-NL %s + +// Verify that device-side results are passed to correct tool when +// -fno-integrated-as is used +// RUN: %clang -### -target x86_64-linux-gnu -fno-integrated-as -c %s 2>&1 \ +// Compile device-side to PTX assembly and make sure we use it on the host side. +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \ +// Then compile host side and incorporate device code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-HNS -check-prefix CUDA-HS-I1 \ +// RUN: -check-prefix CUDA-H-AS \ +// Make sure we don't link anything. 
+// RUN: -check-prefix CUDA-NL %s + +// Match device-side preprocessor, and compiler phases with -save-temps +// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-D1S-SAME: "-fcuda-is-device" +// CUDA-D1S-SAME: "-x" "cuda" +// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-D1S-SAME: "-fcuda-is-device" +// CUDA-D1S-SAME: "-x" "cuda-cpp-output" + // --cuda-host-only should never trigger unused arg warning. // RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \ // RUN: FileCheck -check-prefix CUDA-NO-UNUSED-CHO %s @@ -83,12 +115,13 @@ // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \ // RUN: FileCheck -check-prefix CUDA-UNUSED-CDO %s -// Match device-side compilation +// Match the job that produces PTX assembly // CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" // CUDA-D1-SAME: "-fcuda-is-device" // CUDA-D1-SM35-SAME: "-target-cpu" "sm_35" // CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]" -// CUDA-D1-SAME: "-x" "cuda" +// CUDA-D1NS-SAME: "-x" "cuda" +// CUDA-D1S-SAME: "-x" "ir" // Match anothe device-side compilation // CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" @@ -98,18 +131,28 @@ // CUDA-D2-SAME: "-x" "cuda" // Match no device-side compilation -// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda" +// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" // CUDA-ND-SAME-NOT: "-fcuda-is-device" +// Match host-side preprocessor job with -save-temps +// CUDA-HS: "-cc1" "-triple" +// CUDA-HS-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-HS-SAME-NOT: "-fcuda-is-device" +// CUDA-HS-SAME: "-x" "cuda" + // Match host-side compilation // CUDA-H: "-cc1" "-triple" -// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda" +// CUDA-H-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda" // CUDA-H-SAME-NOT: "-fcuda-is-device" -// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]" -// CUDA-H-SAME: "-x" "cuda" +// CUDA-H-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]" +// CUDA-HNS-SAME: "-x" "cuda" +// CUDA-HS-SAME: 
"-x" "cuda-cpp-output" // CUDA-H-I1-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]" // CUDA-H-I2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]" +// Match external assembler that uses compilation output +// CUDA-H-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]" + // Match no GPU code inclusion. // CUDA-H-NI-NOT: "-fcuda-include-gpubinary" @@ -119,7 +162,7 @@ // Match linker // CUDA-L: "{{.*}}{{ld|link}}{{(.exe)?}}" -// CUDA-L-SAME: "[[HOSTOBJ]]" +// CUDA-L-SAME: "[[HOSTOUTPUT]]" // Match no linker // CUDA-NL-NOT: "{{.*}}{{ld|link}}{{(.exe)?}}"