diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -409,9 +409,6 @@
     CmdArgs.push_back("--return-at-end");
   } else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
     // Map the -O we received to -O{0,1,2,3}.
-    //
-    // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's
-    // default, so it may correspond more closely to the spirit of clang -O2.
 
     // -O3 seems like the least-bad option when -Osomething is specified to
     // clang but it isn't handled below.
@@ -433,9 +430,9 @@
     }
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
   } else {
-    // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond
-    // to no optimizations, but ptxas's default is -O3.
-    CmdArgs.push_back("-O0");
+    // If no -O was passed, pass -O3 to ptxas -- this makes ptxas's
+    // optimization level the same as the ptxjitcompiler's.
+    CmdArgs.push_back("-O3");
   }
   if (DIKind == DebugDirectivesOnly)
     CmdArgs.push_back("-lineinfo");
diff --git a/clang/test/Driver/cuda-external-tools.cu b/clang/test/Driver/cuda-external-tools.cu
--- a/clang/test/Driver/cuda-external-tools.cu
+++ b/clang/test/Driver/cuda-external-tools.cu
@@ -40,10 +40,10 @@
 // RUN:   --no-cuda-noopt-device-debug -O2 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
 
-// Regular compile without -O.  This should result in us passing -O0 to ptxas.
+// Regular compile without -O.  This should result in us passing -O3 to ptxas.
 // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
-// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
 
 // Regular compiles with -Os and -Oz.  For lack of a better option, we map
 // these to ptxas -O3.
@@ -75,7 +75,7 @@
 // Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
 // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
-// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
 // Check that we still pass -c when generating relocatable device code.
 // RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
 // RUN:   --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \