Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -10645,10 +10645,35 @@ ArgStringList CmdArgs; CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32"); - // Clang's default optimization level is -O0, but ptxas's default is -O3. - CmdArgs.push_back(Args.MakeArgString( - llvm::Twine("-O") + - Args.getLastArgValue(options::OPT_O_Group, "0").data())); + // Map the -O we received to -O{0,1,2,3}. + // + // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's default, + // so it may correspond more closely to the spirit of clang -O2. + if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { + // -O3 seems like the least-bad option when -Osomething is specified to + // clang but it isn't handled below. + StringRef OOpt = "3"; + if (A->getOption().matches(options::OPT_O4) || + A->getOption().matches(options::OPT_Ofast)) + OOpt = "3"; + else if (A->getOption().matches(options::OPT_O0)) + OOpt = "0"; + else if (A->getOption().matches(options::OPT_O)) { + // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options. + OOpt = llvm::StringSwitch<const char *>(A->getValue()) + .Case("1", "1") + .Case("2", "2") + .Case("3", "3") + .Case("s", "2") + .Case("z", "2") + .Default("2"); + } + CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); + } else { + // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond + // to no optimizations, but ptxas's default is -O3. + CmdArgs.push_back("-O0"); + } // Don't bother passing -g to ptxas: It's enabled by default at -O0, and // not supported at other optimization levels. Index: test/Driver/cuda-external-tools.cu =================================================================== --- test/Driver/cuda-external-tools.cu +++ test/Driver/cuda-external-tools.cu @@ -4,14 +4,31 @@ // REQUIRES: x86-registered-target // REQUIRES: nvptx-registered-target -// Regular compile with -O2. 
+// Regular compiles with -O{0,1,2,3,4,fast}. -O4 and -Ofast map to ptxas -O3. +// RUN: %clang -### -target x86_64-linux-gnu -O0 -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s +// RUN: %clang -### -target x86_64-linux-gnu -O1 -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT1 %s // RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s +// RUN: %clang -### -target x86_64-linux-gnu -O3 -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s +// RUN: %clang -### -target x86_64-linux-gnu -O4 -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s +// RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s // Regular compile without -O. This should result in us passing -O0 to ptxas. // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \ // RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s +// Regular compiles with -Os and -Oz. For lack of a better option, we map +// these to ptxas -O2. +// RUN: %clang -### -target x86_64-linux-gnu -Os -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s +// RUN: %clang -### -target x86_64-linux-gnu -Oz -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s + // Regular compile targeting sm_35. // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s @@ -42,7 +59,9 @@ // ARCH64: "-m64" // ARCH32: "-m32" // OPT0: "-O0" +// OPT1: "-O1" // OPT2: "-O2" +// OPT3: "-O3" // SM20: "--gpu-name" "sm_20" // SM35: "--gpu-name" "sm_35" // SM20: "--output-file" "[[CUBINFILE:[^"]*]]"