diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -84,6 +84,9 @@ /// LTO mode selected via -f(no-)?lto(=.*)? options. LTOKind LTOMode; + /// LTO mode selected via -f(no-offload-)?lto(=.*)? options. + LTOKind OffloadLTOMode; + public: enum OpenMPRuntimeKind { /// An unknown OpenMP runtime. We can't generate effective OpenMP code @@ -559,10 +562,14 @@ bool ShouldEmitStaticLibrary(const llvm::opt::ArgList &Args) const; /// Returns true if we are performing any kind of LTO. - bool isUsingLTO() const { return LTOMode != LTOK_None; } + bool isUsingLTO(bool IsOffload = false) const { + return getLTOMode(IsOffload) != LTOK_None; + } /// Get the specific kind of LTO being performed. - LTOKind getLTOMode() const { return LTOMode; } + LTOKind getLTOMode(bool IsOffload = false) const { + return IsOffload ? OffloadLTOMode : LTOMode; + } private: diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1964,6 +1964,12 @@ HelpText<"Enable LTO in 'full' mode">; def fno_lto : Flag<["-"], "fno-lto">, Flags<[CoreOption, CC1Option]>, Group, HelpText<"Disable LTO mode (default)">; +def foffload_lto_EQ : Joined<["-"], "foffload-lto=">, Flags<[CoreOption]>, Group, + HelpText<"Set LTO mode to either 'full' or 'thin' for offload compilation">, Values<"thin,full">; +def foffload_lto : Flag<["-"], "foffload-lto">, Flags<[CoreOption]>, Group, + HelpText<"Enable LTO in 'full' mode for offload compilation">; +def fno_offload_lto : Flag<["-"], "fno-offload-lto">, Flags<[CoreOption]>, Group, + HelpText<"Disable LTO mode (default) for offload compilation">; def flto_jobs_EQ : Joined<["-"], "flto-jobs=">, Flags<[CC1Option]>, Group, HelpText<"Controls the backend parallelism of -flto=thin (default " diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -594,19 +594,24 @@ } // Parse the LTO options and record the type of LTO compilation -// based on which -f(no-)?lto(=.*)? option occurs last. -void Driver::setLTOMode(const llvm::opt::ArgList &Args) { - LTOMode = LTOK_None; - if (!Args.hasFlag(options::OPT_flto, options::OPT_flto_EQ, - options::OPT_fno_lto, false) && - !Args.hasFlag(options::OPT_flto_EQ_auto, options::OPT_fno_lto, false) && - !Args.hasFlag(options::OPT_flto_EQ_jobserver, options::OPT_fno_lto, - false)) - return; +// based on which -f(no-)?lto(=.*)? or -f(no-)?offload-lto(=.*)? +// option occurs last. +static llvm::Optional +parseLTOMode(Driver &D, const llvm::opt::ArgList &Args, OptSpecifier OptPos, + OptSpecifier OptNeg, OptSpecifier OptEq, bool IsOffload) { + driver::LTOKind LTOMode = LTOK_None; + // Non-offload LTO allows -flto=auto and -flto=jobserver. Offload LTO does + // not support those options. + if (!Args.hasFlag(OptPos, OptEq, OptNeg, false) && + (IsOffload || + (!Args.hasFlag(options::OPT_flto_EQ_auto, options::OPT_fno_lto, false) && + !Args.hasFlag(options::OPT_flto_EQ_jobserver, options::OPT_fno_lto, + false)))) + return None; StringRef LTOName("full"); - const Arg *A = Args.getLastArg(options::OPT_flto_EQ); + const Arg *A = Args.getLastArg(OptEq); if (A) LTOName = A->getValue(); @@ -617,9 +622,27 @@ if (LTOMode == LTOK_Unknown) { assert(A); - Diag(diag::err_drv_unsupported_option_argument) << A->getOption().getName() - << A->getValue(); + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getOption().getName() << A->getValue(); + return None; } + return LTOMode; +} + +// Parse the LTO options. +void Driver::setLTOMode(const llvm::opt::ArgList &Args) { + LTOMode = LTOK_None; + if (auto M = parseLTOMode(*this, Args, options::OPT_flto, + options::OPT_fno_lto, options::OPT_flto_EQ, + /*IsOffload=*/false)) + LTOMode = M.getValue(); + + OffloadLTOMode = LTOK_None; + if (auto M = parseLTOMode(*this, Args, options::OPT_foffload_lto, + options::OPT_fno_offload_lto, + options::OPT_foffload_lto_EQ, + /*IsOffload=*/true)) + OffloadLTOMode = M.getValue(); } /// Compute the desired OpenMP runtime from the flags provided. @@ -2933,11 +2956,12 @@ // a fat binary containing all the code objects for different GPU's. // The fat binary is then an input to the host action. for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { - if (GPUSanitize) { + if (GPUSanitize || C.getDriver().isUsingLTO(/*IsOffload=*/true)) { // When GPU sanitizer is enabled, since we need to link in the // the sanitizer runtime library after the sanitize pass, we have // to skip the backend and assemble phases and use lld to link - // the bitcode. + // the bitcode. The same happens if users request to use LTO + // explicitly. ActionList AL; AL.push_back(CudaDeviceActions[I]); // Create a link action to link device IR with device library diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4200,6 +4200,10 @@ bool IsHIPDevice = JA.isDeviceOffloading(Action::OFK_HIP); bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP); bool IsHeaderModulePrecompile = isa(JA); + bool IsDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) || + JA.isDeviceOffloading(Action::OFK_Host)); + bool IsUsingLTO = D.isUsingLTO(IsDeviceOffloadAction); + auto LTOMode = D.getLTOMode(IsDeviceOffloadAction); // A header module compilation doesn't have a main input file, so invent a // fake one as a placeholder. @@ -4457,20 +4461,30 @@ if (JA.getType() == types::TY_LLVM_BC) CmdArgs.push_back("-emit-llvm-uselists"); - // Device-side jobs do not support LTO. - bool isDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) || - JA.isDeviceOffloading(Action::OFK_Host)); - - if (D.isUsingLTO() && !isDeviceOffloadAction) { - if (Args.hasArg(options::OPT_flto)) - CmdArgs.push_back("-flto"); - else { - if (D.getLTOMode() == LTOK_Thin) - CmdArgs.push_back("-flto=thin"); - else - CmdArgs.push_back("-flto=full"); + if (IsUsingLTO) { + if (!IsDeviceOffloadAction) { + if (Args.hasArg(options::OPT_flto)) + CmdArgs.push_back("-flto"); + else { + if (D.getLTOMode() == LTOK_Thin) + CmdArgs.push_back("-flto=thin"); + else + CmdArgs.push_back("-flto=full"); + } + CmdArgs.push_back("-flto-unit"); + } else if (Triple.isAMDGPU()) { + // Only AMDGPU supports device-side LTO + assert(LTOMode == LTOK_Full || LTOMode == LTOK_Thin); + CmdArgs.push_back(Args.MakeArgString( + Twine("-flto=") + (LTOMode == LTOK_Thin ? "thin" : "full"))); + CmdArgs.push_back("-flto-unit"); + } else { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << Args.getLastArg(options::OPT_foffload_lto, + options::OPT_foffload_lto_EQ) + ->getAsString(Args) + << Triple.getTriple(); } - CmdArgs.push_back("-flto-unit"); } } @@ -4495,7 +4509,7 @@ // Embed-bitcode option. // Only white-listed flags below are allowed to be embedded. - if (C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO() && + if (C.getDriver().embedBitcodeInObject() && !IsUsingLTO && (isa(JA) || isa(JA))) { // Add flags implied by -fembed-bitcode. Args.AddLastArg(CmdArgs, options::OPT_fembed_bitcode_EQ); @@ -4612,7 +4626,7 @@ return; } - if (C.getDriver().embedBitcodeMarkerOnly() && !C.getDriver().isUsingLTO()) + if (C.getDriver().embedBitcodeMarkerOnly() && !IsUsingLTO) CmdArgs.push_back("-fembed-bitcode=marker"); // We normally speed up the clang process a bit by skipping destructors at @@ -6475,7 +6489,7 @@ // be added so both IR can be captured. if ((C.getDriver().isSaveTempsEnabled() || JA.isHostOffloading(Action::OFK_OpenMP)) && - !(C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO()) && + !(C.getDriver().embedBitcodeInObject() && !IsUsingLTO) && isa(JA)) CmdArgs.push_back("-disable-llvm-passes"); @@ -6608,7 +6622,7 @@ if (VirtualFunctionElimination) { // VFE requires full LTO (currently, this might be relaxed to allow ThinLTO // in the future). - if (D.getLTOMode() != LTOK_Full) + if (LTOMode != LTOK_Full) D.Diag(diag::err_drv_argument_only_allowed_with) << "-fvirtual-function-elimination" << "-flto=full"; @@ -6627,7 +6641,7 @@ } if (WholeProgramVTables) { - if (!D.isUsingLTO()) + if (!IsUsingLTO) D.Diag(diag::err_drv_argument_only_allowed_with) << "-fwhole-program-vtables" << "-flto"; @@ -6636,7 +6650,7 @@ bool DefaultsSplitLTOUnit = (WholeProgramVTables || Sanitize.needsLTO()) && - (D.getLTOMode() == LTOK_Full || TC.canSplitThinLTOUnit()); + (LTOMode == LTOK_Full || TC.canSplitThinLTOUnit()); bool SplitLTOUnit = Args.hasFlag(options::OPT_fsplit_lto_unit, options::OPT_fno_split_lto_unit, DefaultsSplitLTOUnit); @@ -6682,7 +6696,7 @@ // Enable order file instrumentation when ThinLTO is not on. When ThinLTO is // on, we need to pass these flags as linker flags and that will be handled // outside of the compiler. - if (!D.isUsingLTO()) { + if (!IsUsingLTO) { CmdArgs.push_back("-mllvm"); CmdArgs.push_back("-enable-order-file-instrumentation"); } diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -49,8 +49,8 @@ auto &TC = getToolChain(); auto &D = TC.getDriver(); assert(!Inputs.empty() && "Must have at least one input."); - addLTOOptions(TC, Args, LldArgs, Output, Inputs[0], - D.getLTOMode() == LTOK_Thin); + bool IsThinLTO = D.getLTOMode(/*IsOffload=*/true) == LTOK_Thin; + addLTOOptions(TC, Args, LldArgs, Output, Inputs[0], IsThinLTO); // Extract all the -m options std::vector Features; @@ -66,6 +66,12 @@ if (!Features.empty()) LldArgs.push_back(Args.MakeArgString(MAttrString)); + // ToDo: Remove this option after AMDGPU backend supports ISA-level linking. + // Since AMDGPU backend currently does not support ISA-level linking, all + // called functions need to be imported. + if (IsThinLTO) + LldArgs.push_back(Args.MakeArgString("-plugin-opt=-force-import-all")); + for (const Arg *A : Args.filtered(options::OPT_mllvm)) { LldArgs.push_back( Args.MakeArgString(Twine("-plugin-opt=") + A->getValue(0))); diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip --- a/clang/test/Driver/hip-options.hip +++ b/clang/test/Driver/hip-options.hip @@ -56,3 +56,17 @@ // RUN: --offload-arch=gfx906 -fgpu-inline-threshold=1000 %s 2>&1 | FileCheck -check-prefix=THRESH %s // THRESH: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-mllvm" "-inline-threshold=1000" // THRESH-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-inline-threshold=1000" + +// Check -foffload-lto=thin translated correctly. + +// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx906 -foffload-lto=thin %s 2>&1 \ +// RUN: | FileCheck -check-prefix=THINLTO %s + +// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx906 -fgpu-rdc -foffload-lto=thin %s 2>&1 \ +// RUN: | FileCheck -check-prefix=THINLTO %s + +// THINLTO-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-flto-unit" +// THINLTO: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-flto=thin" "-flto-unit" +// THINLTO: lld{{.*}}"-plugin-opt=mcpu=gfx906" "-plugin-opt=thinlto" "-plugin-opt=-force-import-all" diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -84,6 +84,10 @@ "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"), cl::desc("Only import first N functions if N>=0 (default -1)")); +static cl::opt + ForceImportAll("force-import-all", cl::init(false), cl::Hidden, + cl::desc("Import functions with noinline attribute")); + static cl::opt ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7), cl::Hidden, cl::value_desc("x"), @@ -227,7 +231,7 @@ } if ((Summary->instCount() > Threshold) && - !Summary->fflags().AlwaysInline) { + !Summary->fflags().AlwaysInline && !ForceImportAll) { Reason = FunctionImporter::ImportFailureReason::TooLarge; return false; } @@ -240,7 +244,7 @@ } // Don't bother importing if we can't inline it anyway. - if (Summary->fflags().NoInline) { + if (Summary->fflags().NoInline && !ForceImportAll) { Reason = FunctionImporter::ImportFailureReason::NoInline; return false; } @@ -487,17 +491,28 @@ FailureInfo = std::make_unique( VI, Edge.second.getHotness(), Reason, 1); } - LLVM_DEBUG( - dbgs() << "ignored! No qualifying callee with summary found.\n"); - continue; + if (ForceImportAll) { + std::string Msg = std::string("Failed to import function ") + + VI.name().str() + " due to " + + getFailureName(Reason); + auto Error = make_error( + Msg, std::make_error_code(std::errc::operation_not_supported)); + logAllUnhandledErrors(std::move(Error), errs(), + "Error importing module: "); + break; + } else { + LLVM_DEBUG(dbgs() + << "ignored! No qualifying callee with summary found.\n"); + continue; + } } // "Resolve" the summary CalleeSummary = CalleeSummary->getBaseObject(); ResolvedCalleeSummary = cast(CalleeSummary); - assert((ResolvedCalleeSummary->fflags().AlwaysInline || - (ResolvedCalleeSummary->instCount() <= NewThreshold)) && + assert((ResolvedCalleeSummary->fflags().AlwaysInline || ForceImportAll || + (ResolvedCalleeSummary->instCount() <= NewThreshold)) && "selectCallee() didn't honor the threshold"); auto ExportModulePath = ResolvedCalleeSummary->modulePath(); diff --git a/llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll b/llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll --- a/llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll +++ b/llvm/test/Transforms/FunctionImport/Inputs/funcimport.ll @@ -1,3 +1,6 @@ +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + @globalvar = global i32 1, align 4 @staticvar = internal global i32 1, align 4 @staticconstvar = internal unnamed_addr constant [2 x i32] [i32 10, i32 20], align 4 diff --git a/llvm/test/Transforms/FunctionImport/Inputs/noinline.ll b/llvm/test/Transforms/FunctionImport/Inputs/noinline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/FunctionImport/Inputs/noinline.ll @@ -0,0 +1,8 @@ +define void @foo(i64* %v) #0 { +entry: + %v.addr = alloca i64*, align 8 + store i64* %v, i64** %v.addr, align 8 + ret void +} + +attributes #0 = { noinline } \ No newline at end of file diff --git a/llvm/test/Transforms/FunctionImport/adjustable_threshold.ll b/llvm/test/Transforms/FunctionImport/adjustable_threshold.ll --- a/llvm/test/Transforms/FunctionImport/adjustable_threshold.ll +++ b/llvm/test/Transforms/FunctionImport/adjustable_threshold.ll @@ -11,7 +11,15 @@ ; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=10 -import-instr-evolution-factor=0.5 -S | FileCheck %s --check-prefix=INSTLIM-PROGRESSIVE ; INSTLIM-PROGRESSIVE-NOT: call void @staticfunc - +; Test force import all +; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc \ +; RUN: -import-instr-limit=1 -force-import-all -S \ +; RUN: | FileCheck %s --check-prefix=IMPORTALL +; IMPORTALL-DAG: define available_externally void @globalfunc1() +; IMPORTALL-DAG: define available_externally void @trampoline() +; IMPORTALL-DAG: define available_externally void @largefunction() +; IMPORTALL-DAG: define available_externally hidden void @staticfunc2.llvm.0() +; IMPORTALL-DAG: define available_externally void @globalfunc2() declare void @globalfunc1() declare void @globalfunc2() diff --git a/llvm/test/Transforms/FunctionImport/funcimport.ll b/llvm/test/Transforms/FunctionImport/funcimport.ll --- a/llvm/test/Transforms/FunctionImport/funcimport.ll +++ b/llvm/test/Transforms/FunctionImport/funcimport.ll @@ -1,3 +1,5 @@ +; REQUIRES: x86-registered-target + ; Do setup work for all below tests: generate bitcode and combined index ; RUN: opt -module-summary %s -o %t.bc ; RUN: opt -module-summary %p/Inputs/funcimport.ll -o %t2.bc @@ -15,6 +17,13 @@ ; RUN: opt -function-import -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIM5 ; INSTLIM5-NOT: @staticfunc.llvm. +; Test force import all +; RUN: llvm-lto -thinlto-action=run -force-import-all %t.bc %t2.bc 2>&1 \ +; RUN: | FileCheck %s --check-prefix=IMPORTALL +; IMPORTALL-DAG: Error importing module: Failed to import function weakalias due to InterposableLinkage + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" define i32 @main() #0 { entry: diff --git a/llvm/test/Transforms/FunctionImport/noinline.ll b/llvm/test/Transforms/FunctionImport/noinline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/FunctionImport/noinline.ll @@ -0,0 +1,23 @@ +; Do setup work for all below tests: generate bitcode and combined index +; RUN: opt -module-summary %s -o %t.main.bc +; RUN: opt -module-summary %p/Inputs/noinline.ll -o %t.inputs.noinline.bc +; RUN: llvm-lto -thinlto -o %t.summary %t.main.bc %t.inputs.noinline.bc + +; Attempt the import now, ensure below that file containing noinline +; is not imported by default but imported with -force-import-all. + +; RUN: opt -function-import -summary-file %t.summary.thinlto.bc %t.main.bc -S 2>&1 \ +; RUN: | FileCheck -check-prefix=NOIMPORT %s +; RUN: opt -function-import -force-import-all -summary-file %t.summary.thinlto.bc \ +; RUN: %t.main.bc -S 2>&1 | FileCheck -check-prefix=IMPORT %s + +define i32 @main() #0 { +entry: + %f = alloca i64, align 8 + call void @foo(i64* %f) + ret i32 0 +} + +; NOIMPORT: declare void @foo(i64*) +; IMPORT: define available_externally void @foo +declare void @foo(i64*) #1