Index: cfe/trunk/lib/Driver/Driver.cpp =================================================================== --- cfe/trunk/lib/Driver/Driver.cpp +++ cfe/trunk/lib/Driver/Driver.cpp @@ -1233,11 +1233,13 @@ } } -// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE input -// action and then wraps each in CudaDeviceAction paired with appropriate GPU -// arch name. If we're only building device-side code, each action remains -// independent. Otherwise we pass device-side actions as inputs to a new -// CudaHostAction which combines both host and device side actions. +// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE +// input action and then wraps each in CudaDeviceAction paired with +// appropriate GPU arch name. In case of partial (i.e. preprocessing +// only) or device-only compilation, each device action is added to \p +// Actions and \p Current is released. Otherwise the function creates +// and returns a new CudaHostAction which wraps \p Current and device +// side actions. static std::unique_ptr buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args, const Arg *InputArg, std::unique_ptr HostAction, @@ -1421,22 +1423,14 @@ } phases::ID CudaInjectionPhase; - if (isSaveTempsEnabled()) { - // All phases are done independently, inject GPU blobs during compilation - // phase as that's where we generate glue code to init them. - CudaInjectionPhase = phases::Compile; - } else { - // Assumes that clang does everything up until linking phase, so we inject - // cuda device actions at the last step before linking. Otherwise CUDA - // host action forces preprocessor into a separate invocation. 
- CudaInjectionPhase = FinalPhase; - if (FinalPhase == phases::Link) - for (auto PI = PL.begin(), PE = PL.end(); PI != PE; ++PI) { - auto next = PI + 1; - if (next != PE && *next == phases::Link) - CudaInjectionPhase = *PI; - } - } + bool InjectCuda = (InputType == types::TY_CUDA && + !Args.hasArg(options::OPT_cuda_host_only)); + CudaInjectionPhase = FinalPhase; + for (auto &Phase : PL) + if (Phase <= FinalPhase && Phase == phases::Compile) { + CudaInjectionPhase = Phase; + break; + } // Build the pipeline for this file. std::unique_ptr Current(new InputAction(*InputArg, InputType)); @@ -1464,8 +1458,7 @@ // Otherwise construct the appropriate action. Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current)); - if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase && - !Args.hasArg(options::OPT_cuda_host_only)) { + if (InjectCuda && Phase == CudaInjectionPhase) { Current = buildCudaActions(*this, TC, Args, InputArg, std::move(Current), Actions); if (!Current) @@ -1679,10 +1672,17 @@ } } -static const Tool *SelectToolForJob(Compilation &C, bool SaveTemps, +// Returns a Tool for a given JobAction. In case the action and its +// predecessors can be combined, updates Inputs with the inputs of the +// first combined action. If one of the collapsed actions is a +// CudaHostAction, updates CollapsedCHA with the pointer to it so the +// caller can deal with extra handling such action requires. +static const Tool *selectToolForJob(Compilation &C, bool SaveTemps, const ToolChain *TC, const JobAction *JA, - const ActionList *&Inputs) { + const ActionList *&Inputs, + const CudaHostAction *&CollapsedCHA) { const Tool *ToolForJob = nullptr; + CollapsedCHA = nullptr; // See if we should look for a compiler with an integrated assembler. We match // bottom up, so what we are actually looking for is an assembler job with a @@ -1699,13 +1699,19 @@ // checking the backend tool, check if the tool for the CompileJob // has an integrated assembler. 
const ActionList *BackendInputs = &(*Inputs)[0]->getInputs(); - JobAction *CompileJA = cast(*BackendInputs->begin()); + // Compile job may be wrapped in CudaHostAction, extract it if + // that's the case and update CollapsedCHA if we combine phases. + CudaHostAction *CHA = dyn_cast(*BackendInputs->begin()); + JobAction *CompileJA = + cast(CHA ? *CHA->begin() : *BackendInputs->begin()); + assert(CompileJA && "Backend job is not preceeded by compile job."); const Tool *Compiler = TC->SelectTool(*CompileJA); if (!Compiler) return nullptr; if (Compiler->hasIntegratedAssembler()) { - Inputs = &(*BackendInputs)[0]->getInputs(); + Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; + CollapsedCHA = CHA; } } @@ -1715,19 +1721,19 @@ if (isa(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - JobAction *CompileJA; - // Extract real host action, if it's a CudaHostAction. - if (CudaHostAction *CudaHA = dyn_cast(*Inputs->begin())) - CompileJA = cast(*CudaHA->begin()); - else - CompileJA = cast(*Inputs->begin()); - + // Compile job may be wrapped in CudaHostAction, extract it if + // that's the case and update CollapsedCHA if we combine phases. + CudaHostAction *CHA = dyn_cast(*Inputs->begin()); + JobAction *CompileJA = + cast(CHA ? 
*CHA->begin() : *Inputs->begin()); + assert(CompileJA && "Backend job is not preceeded by compile job."); const Tool *Compiler = TC->SelectTool(*CompileJA); if (!Compiler) return nullptr; if (!Compiler->canEmitIR() || !SaveTemps) { - Inputs = &(*Inputs)[0]->getInputs(); + Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; + CollapsedCHA = CHA; } } @@ -1811,10 +1817,23 @@ const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast(A); - const Tool *T = SelectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs); + const CudaHostAction *CollapsedCHA = nullptr; + const Tool *T = + selectToolForJob(C, isSaveTempsEnabled(), TC, JA, Inputs, CollapsedCHA); if (!T) return; + // If we've collapsed action list that contained CudaHostAction we + // need to build jobs for device-side inputs it may have held. + if (CollapsedCHA) { + InputInfo II; + for (const Action *DA : CollapsedCHA->getDeviceActions()) { + BuildJobsForAction(C, DA, TC, "", AtTopLevel, + /*MultipleArchs*/ false, LinkingOutput, II); + CudaDeviceInputInfos.push_back(II); + } + } + // Only use pipes when there is exactly one input. InputInfoList InputInfos; for (const Action *Input : *Inputs) { Index: cfe/trunk/test/Driver/cuda-options.cu =================================================================== --- cfe/trunk/test/Driver/cuda-options.cu +++ cfe/trunk/test/Driver/cuda-options.cu @@ -6,7 +6,7 @@ // Simple compilation case: // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \ // Compile device-side to PTX assembly and make sure we use it on the host side. -// RUN: | FileCheck -check-prefix CUDA-D1 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\ // Then compile host side and incorporate device code. // RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ // Make sure we don't link anything. 
@@ -15,7 +15,7 @@ // Typical compilation + link case: // RUN: %clang -### -target x86_64-linux-gnu %s 2>&1 \ // Compile device-side to PTX assembly and make sure we use it on the host side -// RUN: | FileCheck -check-prefix CUDA-D1 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\ // Then compile host side and incorporate device code. // RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ // Then link things. @@ -33,7 +33,7 @@ // Verify that -cuda-no-host disables host-side compilation and linking // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only %s 2>&1 \ // Compile device-side to PTX assembly -// RUN: | FileCheck -check-prefix CUDA-D1 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\ // Make sure there are no host cmpilation or linking. // RUN: -check-prefix CUDA-NH -check-prefix CUDA-NL %s @@ -41,7 +41,7 @@ // and incorporate device code on the host side. // RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \ // Compile device-side to PTX assembly -// RUN: | FileCheck -check-prefix CUDA-D1 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS\ // Then compile host side and incorporate GPU code. // RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ // Make sure we don't link anything. @@ -51,7 +51,8 @@ // archtecture info to device compilation. // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \ // Compile device-side to PTX assembly. -// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \ +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \ +// RUN: -check-prefix CUDA-D1-SM35 \ // Then compile host side and incorporate GPU code. // RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \ // Make sure we don't link anything. @@ -59,16 +60,47 @@ // Verify that there is device-side compilation per --cuda-gpu-arch args // and that all results are included on the host side. 
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \ +// RUN: %clang -### -target x86_64-linux-gnu \ +// RUN: --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \ // Compile both device-sides to PTX assembly // RUN: | FileCheck \ -// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \ +// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1NS -check-prefix CUDA-D1-SM35 \ // RUN: -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \ // Then compile host side and incorporate both device-side outputs -// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \ +// RUN: -check-prefix CUDA-H -check-prefix CUDA-HNS \ +// RUN: -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \ // Make sure we don't link anything. // RUN: -check-prefix CUDA-NL %s +// Verify that device-side results are passed to correct tool when +// -save-temps is used +// RUN: %clang -### -target x86_64-linux-gnu -save-temps -c %s 2>&1 \ +// Compile device-side to PTX assembly and make sure we use it on the host side. +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1S \ +// Then compile host side and incorporate device code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-HS -check-prefix CUDA-HS-I1 \ +// Make sure we don't link anything. +// RUN: -check-prefix CUDA-NL %s + +// Verify that device-side results are passed to correct tool when +// -fno-integrated-as is used +// RUN: %clang -### -target x86_64-linux-gnu -fno-integrated-as -c %s 2>&1 \ +// Compile device-side to PTX assembly and make sure we use it on the host side. +// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1NS \ +// Then compile host side and incorporate device code. +// RUN: -check-prefix CUDA-H -check-prefix CUDA-HNS -check-prefix CUDA-HS-I1 \ +// RUN: -check-prefix CUDA-H-AS \ +// Make sure we don't link anything. 
+// RUN: -check-prefix CUDA-NL %s + +// Match device-side preprocessor, and compiler phases with -save-temps +// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-D1S-SAME: "-fcuda-is-device" +// CUDA-D1S-SAME: "-x" "cuda" +// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-D1S-SAME: "-fcuda-is-device" +// CUDA-D1S-SAME: "-x" "cuda-cpp-output" + // --cuda-host-only should never trigger unused arg warning. // RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \ // RUN: FileCheck -check-prefix CUDA-NO-UNUSED-CHO %s @@ -83,12 +115,13 @@ // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \ // RUN: FileCheck -check-prefix CUDA-UNUSED-CDO %s -// Match device-side compilation +// Match the job that produces PTX assembly // CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" // CUDA-D1-SAME: "-fcuda-is-device" // CUDA-D1-SM35-SAME: "-target-cpu" "sm_35" // CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]" -// CUDA-D1-SAME: "-x" "cuda" +// CUDA-D1NS-SAME: "-x" "cuda" +// CUDA-D1S-SAME: "-x" "ir" // Match anothe device-side compilation // CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" @@ -98,18 +131,28 @@ // CUDA-D2-SAME: "-x" "cuda" // Match no device-side compilation -// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda" +// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda" // CUDA-ND-SAME-NOT: "-fcuda-is-device" +// Match host-side preprocessor job with -save-temps +// CUDA-HS: "-cc1" "-triple" +// CUDA-HS-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda" +// CUDA-HS-SAME-NOT: "-fcuda-is-device" +// CUDA-HS-SAME: "-x" "cuda" + // Match host-side compilation // CUDA-H: "-cc1" "-triple" -// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda" +// CUDA-H-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda" // CUDA-H-SAME-NOT: "-fcuda-is-device" -// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]" -// CUDA-H-SAME: "-x" "cuda" +// CUDA-H-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]" +// CUDA-HNS-SAME: "-x" "cuda" +// CUDA-HS-SAME: 
"-x" "cuda-cpp-output" // CUDA-H-I1-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]" // CUDA-H-I2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]" +// Match external assembler that uses compilation output +// CUDA-H-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]" + // Match no GPU code inclusion. // CUDA-H-NI-NOT: "-fcuda-include-gpubinary" @@ -119,7 +162,7 @@ // Match linker // CUDA-L: "{{.*}}{{ld|link}}{{(.exe)?}}" -// CUDA-L-SAME: "[[HOSTOBJ]]" +// CUDA-L-SAME: "[[HOSTOUTPUT]]" // Match no linker // CUDA-NL-NOT: "{{.*}}{{ld|link}}{{(.exe)?}}"