Index: docs/ClangCommandLineReference.rst
===================================================================
--- docs/ClangCommandLineReference.rst
+++ docs/ClangCommandLineReference.rst
@@ -144,6 +144,10 @@
 
 CUDA GPU architecture (e.g. sm\_35).  May be specified more than once.
 
+.. option:: --cuda-include-ptx=<arg>, --no-cuda-include-ptx=<arg>
+
+Include (or not) PTX along with CUDA GPU binary for the given architecture (e.g. sm\_35). Argument may be 'all'. The option may be specified more than once. Default: --cuda-include-ptx=all
+
 .. option:: --cuda-host-only
 
 Compile CUDA code for host only.  Has no effect on non-CUDA compilations.
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -546,6 +546,10 @@
 def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">,
   HelpText<"Compile CUDA code for both host and device (default).  Has no "
            "effect on non-CUDA compilations.">;
+def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">, Flags<[DriverOption]>,
+  HelpText<"Include PTX for the follwing GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
+def no_cuda_include_ptx_EQ : Joined<["--"], "no-cuda-include-ptx=">, Flags<[DriverOption]>,
+  HelpText<"Do not include PTX for the follwing GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
 def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">, Flags<[DriverOption]>,
   HelpText<"CUDA GPU architecture (e.g. sm_35).  May be specified more than once.">;
 def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">, Flags<[DriverOption]>,
Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -377,6 +377,22 @@
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
 }
 
+static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
+  bool includePTX = true;
+  for (Arg *A : Args) {
+    if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) ||
+          A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ)))
+      continue;
+    A->claim();
+    const StringRef ArchStr = A->getValue();
+    if (ArchStr == "all" || ArchStr == gpu_arch) {
+      includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ);
+      continue;
+    }
+  }
+  return includePTX;
+}
+
 // All inputs to this linker must be from CudaDeviceActions, as we need to look
 // at the Inputs' Actions in order to figure out which GPU architecture they
 // correspond to.
@@ -404,6 +420,9 @@
            "Device action expected to have associated a GPU architecture!");
     CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
 
+    if (II.getType() == types::TY_PP_Asm &&
+        !shouldIncludePTX(Args, gpu_arch_str))
+      continue;
     // We need to pass an Arch of the form "sm_XX" for cubin files and
     // "compute_XX" for ptx.
     const char *Arch =
Index: test/Driver/cuda-options.cu
===================================================================
--- test/Driver/cuda-options.cu
+++ test/Driver/cuda-options.cu
@@ -142,6 +142,48 @@
 // RUN:   -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix ARCHALLERROR %s
 
+
+// Verify that --[no-]cuda-include-ptx arguments are handled correctly.
+// a) by default we're including PTX for all GPUs.
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
+// RUN:   -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s
+
+// b) --no-cuda-include-ptx=all disables PTX inclusion for all GPUs
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
+// RUN:   --no-cuda-include-ptx=all \
+// RUN:   -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes FATBIN-COMMON,NOPTX-SM35,NOPTX-SM30 %s
+
+// c) --no-cuda-include-ptx=sm_XX disables PTX inclusion for that GPU only.
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
+// RUN:   --no-cuda-include-ptx=sm_35 \
+// RUN:   -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes FATBIN-COMMON,NOPTX-SM35,PTX-SM30 %s
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
+// RUN:   --no-cuda-include-ptx=sm_30 \
+// RUN:   -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,NOPTX-SM30 %s
+
+// d) --cuda-include-ptx=all overrides preceding --no-cuda-include-ptx=all
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
+// RUN:   --no-cuda-include-ptx=all --cuda-include-ptx=all \
+// RUN:   -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s
+
+// e) --cuda-include-ptx=all overrides preceding --no-cuda-include-ptx=sm_XX
+// RUN: %clang -### -target x86_64-linux-gnu \
+// RUN:   --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 \
+// RUN:   --no-cuda-include-ptx=sm_30 --cuda-include-ptx=all \
+// RUN:   -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM35,PTX-SM30 %s
+
+
 // ARCH-SM20: "-cc1"{{.*}}"-target-cpu" "sm_20"
 // NOARCH-SM20-NOT: "-cc1"{{.*}}"-target-cpu" "sm_20"
 // ARCH-SM30: "-cc1"{{.*}}"-target-cpu" "sm_30"
@@ -236,3 +278,12 @@
 
 // Match no linker.
 // NOLINK-NOT: "{{.*}}{{ld|link}}{{(.exe)?}}"
+
+// FATBIN-COMMON:fatbinary
+// FATBIN-COMMON: "--create" "[[FATBINARY:[^"]*]]"
+// FATBIN-COMMON: "--image=profile=sm_30,file=
+// PTX-SM30: "--image=profile=compute_30,file=
+// NOPTX-SM30-NOT: "--image=profile=compute_30,file=
+// FATBIN-COMMON: "--image=profile=sm_35,file=
+// PTX-SM35: "--image=profile=compute_35,file=
+// NOPTX-SM35-NOT: "--image=profile=compute_35,file=