Index: clang/include/clang/Driver/Driver.h =================================================================== --- clang/include/clang/Driver/Driver.h +++ clang/include/clang/Driver/Driver.h @@ -84,6 +84,9 @@ /// LTO mode selected via -f(no-)?lto(=.*)? options. LTOKind LTOMode; + /// LTO mode selected via -f(no-)?offload-lto(=.*)? options. + LTOKind OffloadLTOMode; + public: enum OpenMPRuntimeKind { /// An unknown OpenMP runtime. We can't generate effective OpenMP code @@ -559,10 +562,14 @@ bool ShouldEmitStaticLibrary(const llvm::opt::ArgList &Args) const; /// Returns true if we are performing any kind of LTO. - bool isUsingLTO() const { return LTOMode != LTOK_None; } + bool isUsingLTO(bool IsOffload = false) const { + return getLTOMode(IsOffload) != LTOK_None; + } /// Get the specific kind of LTO being performed. - LTOKind getLTOMode() const { return LTOMode; } + LTOKind getLTOMode(bool IsOffload = false) const { + return IsOffload ? OffloadLTOMode : LTOMode; + } private: Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -1892,6 +1892,13 @@ HelpText<"Enable LTO in 'full' mode">; def fno_lto : Flag<["-"], "fno-lto">, Flags<[CoreOption, CC1Option]>, Group, HelpText<"Disable LTO mode (default)">; +def foffload_lto_EQ : Joined<["-"], "foffload-lto=">, Flags<[CoreOption]>, Group, + HelpText<"Set LTO mode to either 'full' or 'thin' for offload compilation">, Values<"thin,full">; +defm offload_lto : BoolFOption<"offload-lto", EmptyKPM, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], " for offload compilation">>; + def flto_jobs_EQ : Joined<["-"], "flto-jobs=">, Flags<[CC1Option]>, Group, HelpText<"Controls the backend parallelism of -flto=thin (default " Index: clang/lib/Driver/Driver.cpp =================================================================== --- clang/lib/Driver/Driver.cpp +++ clang/lib/Driver/Driver.cpp 
@@ -594,16 +594,18 @@ } // Parse the LTO options and record the type of LTO compilation -// based on which -f(no-)?lto(=.*)? option occurs last. -void Driver::setLTOMode(const llvm::opt::ArgList &Args) { - LTOMode = LTOK_None; - if (!Args.hasFlag(options::OPT_flto, options::OPT_flto_EQ, - options::OPT_fno_lto, false)) - return; +// based on which -f(no-)?lto(=.*)? or -f(no-)?offload-lto(=.*)? +// option occurs last. +static llvm::Optional +parseLTOMode(Driver &D, const llvm::opt::ArgList &Args, OptSpecifier OptPos, + OptSpecifier OptNeg, OptSpecifier OptEq) { + driver::LTOKind LTOMode = LTOK_None; + if (!Args.hasFlag(OptPos, OptEq, OptNeg, false)) + return None; StringRef LTOName("full"); - const Arg *A = Args.getLastArg(options::OPT_flto_EQ); + const Arg *A = Args.getLastArg(OptEq); if (A) LTOName = A->getValue(); @@ -614,9 +616,25 @@ if (LTOMode == LTOK_Unknown) { assert(A); - Diag(diag::err_drv_unsupported_option_argument) << A->getOption().getName() - << A->getValue(); + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getOption().getName() << A->getValue(); + return None; } + return LTOMode; +} + +// Parse the LTO options. +void Driver::setLTOMode(const llvm::opt::ArgList &Args) { + LTOMode = LTOK_None; + if (auto M = parseLTOMode(*this, Args, options::OPT_flto, + options::OPT_fno_lto, options::OPT_flto_EQ)) + LTOMode = M.getValue(); + + OffloadLTOMode = LTOK_None; + if (auto M = parseLTOMode(*this, Args, options::OPT_foffload_lto, + options::OPT_fno_offload_lto, + options::OPT_foffload_lto_EQ)) + OffloadLTOMode = M.getValue(); } /// Compute the desired OpenMP runtime from the flags provided. 
Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -4159,6 +4159,10 @@ bool IsHIP = JA.isOffloading(Action::OFK_HIP); bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP); bool IsHeaderModulePrecompile = isa(JA); + bool IsDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) || + JA.isDeviceOffloading(Action::OFK_Host)); + bool IsUsingLTO = D.isUsingLTO(IsDeviceOffloadAction); + auto LTOMode = D.getLTOMode(IsDeviceOffloadAction); // A header module compilation doesn't have a main input file, so invent a // fake one as a placeholder. @@ -4410,11 +4414,8 @@ if (JA.getType() == types::TY_LLVM_BC) CmdArgs.push_back("-emit-llvm-uselists"); - // Device-side jobs do not support LTO. - bool isDeviceOffloadAction = !(JA.isDeviceOffloading(Action::OFK_None) || - JA.isDeviceOffloading(Action::OFK_Host)); - - if (D.isUsingLTO() && !isDeviceOffloadAction) { + // Only AMDGPU supports device-side LTO + if (IsUsingLTO && (!IsDeviceOffloadAction || Triple.isAMDGPU())) { Args.AddLastArg(CmdArgs, options::OPT_flto, options::OPT_flto_EQ); CmdArgs.push_back("-flto-unit"); } @@ -4441,7 +4442,7 @@ // Embed-bitcode option. // Only white-listed flags below are allowed to be embedded. - if (C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO() && + if (C.getDriver().embedBitcodeInObject() && !IsUsingLTO && (isa(JA) || isa(JA))) { // Add flags implied by -fembed-bitcode. Args.AddLastArg(CmdArgs, options::OPT_fembed_bitcode_EQ); @@ -4558,7 +4559,7 @@ return; } - if (C.getDriver().embedBitcodeMarkerOnly() && !C.getDriver().isUsingLTO()) + if (C.getDriver().embedBitcodeMarkerOnly() && !IsUsingLTO) CmdArgs.push_back("-fembed-bitcode=marker"); // We normally speed up the clang process a bit by skipping destructors at @@ -6380,7 +6381,7 @@ // be added so both IR can be captured. 
if ((C.getDriver().isSaveTempsEnabled() || JA.isHostOffloading(Action::OFK_OpenMP)) && - !(C.getDriver().embedBitcodeInObject() && !C.getDriver().isUsingLTO()) && + !(C.getDriver().embedBitcodeInObject() && !IsUsingLTO) && isa(JA)) CmdArgs.push_back("-disable-llvm-passes"); @@ -6503,7 +6504,7 @@ if (VirtualFunctionElimination) { // VFE requires full LTO (currently, this might be relaxed to allow ThinLTO // in the future). - if (D.getLTOMode() != LTOK_Full) + if (LTOMode != LTOK_Full) D.Diag(diag::err_drv_argument_only_allowed_with) << "-fvirtual-function-elimination" << "-flto=full"; @@ -6522,7 +6523,7 @@ } if (WholeProgramVTables) { - if (!D.isUsingLTO()) + if (!IsUsingLTO) D.Diag(diag::err_drv_argument_only_allowed_with) << "-fwhole-program-vtables" << "-flto"; @@ -6531,7 +6532,7 @@ bool DefaultsSplitLTOUnit = (WholeProgramVTables || Sanitize.needsLTO()) && - (D.getLTOMode() == LTOK_Full || TC.canSplitThinLTOUnit()); + (LTOMode == LTOK_Full || TC.canSplitThinLTOUnit()); bool SplitLTOUnit = Args.hasFlag(options::OPT_fsplit_lto_unit, options::OPT_fno_split_lto_unit, DefaultsSplitLTOUnit); @@ -6577,7 +6578,7 @@ // Enable order file instrumentation when ThinLTO is not on. When ThinLTO is // on, we need to pass these flags as linker flags and that will be handled // outside of the compiler. 
- if (!D.isUsingLTO()) { + if (!IsUsingLTO) { CmdArgs.push_back("-mllvm"); CmdArgs.push_back("-enable-order-file-instrumentation"); } Index: clang/lib/Driver/ToolChains/HIP.cpp =================================================================== --- clang/lib/Driver/ToolChains/HIP.cpp +++ clang/lib/Driver/ToolChains/HIP.cpp @@ -49,8 +49,8 @@ auto &TC = getToolChain(); auto &D = TC.getDriver(); assert(!Inputs.empty() && "Must have at least one input."); - addLTOOptions(TC, Args, LldArgs, Output, Inputs[0], - D.getLTOMode() == LTOK_Thin); + bool IsThinLTO = D.getLTOMode(/*IsOffload=*/true) == LTOK_Thin; + addLTOOptions(TC, Args, LldArgs, Output, Inputs[0], IsThinLTO); // Extract all the -m options std::vector Features; @@ -66,6 +66,14 @@ if (!Features.empty()) LldArgs.push_back(Args.MakeArgString(MAttrString)); + // ToDo: Remove these options after AMDGPU backend supports ISA-level linking. + // Since AMDGPU backend currently does not support ISA-level linking, all + // called functions need to be imported. + if (IsThinLTO) + LldArgs.append( + {Args.MakeArgString("-plugin-opt=-import-instr-limit=100000"), + Args.MakeArgString("-plugin-opt=-import-noinline")}); + for (const Arg *A : Args.filtered(options::OPT_mllvm)) { LldArgs.push_back( Args.MakeArgString(Twine("-plugin-opt=") + A->getValue(0))); Index: clang/test/Driver/hip-options.hip =================================================================== --- clang/test/Driver/hip-options.hip +++ clang/test/Driver/hip-options.hip @@ -51,3 +51,12 @@ // RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=CTA %s // CTA: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-mconstructor-aliases" // CTA-NOT: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-mconstructor-aliases" + +// Check -foffload-lto=thin translated correctly. 
+ +// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx906 -fgpu-rdc -foffload-lto=thin %s 2>&1 \ +// RUN: | FileCheck -check-prefix=THINLTO %s +// THINLTO-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-flto-unit" +// THINLTO: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-flto-unit" +// THINLTO: lld{{.*}}"-plugin-opt=mcpu=gfx906" "-plugin-opt=thinlto" "-plugin-opt=-import-instr-limit=100000" "-plugin-opt=-import-noinline" Index: llvm/lib/Transforms/IPO/FunctionImport.cpp =================================================================== --- llvm/lib/Transforms/IPO/FunctionImport.cpp +++ llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -84,6 +84,10 @@ "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"), cl::desc("Only import first N functions if N>=0 (default -1)")); +static cl::opt + ImportNoInline("import-noinline", cl::init(false), cl::Hidden, + cl::desc("Import functions with noinline attribute")); + static cl::opt ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7), cl::Hidden, cl::value_desc("x"), @@ -240,7 +244,7 @@ } // Don't bother importing if we can't inline it anyway. 
- if (Summary->fflags().NoInline) { + if (Summary->fflags().NoInline && !ImportNoInline) { Reason = FunctionImporter::ImportFailureReason::NoInline; return false; } Index: llvm/test/Transforms/FunctionImport/Inputs/noinline.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/FunctionImport/Inputs/noinline.ll @@ -0,0 +1,8 @@ +define void @foo(i64* %v) #0 { +entry: + %v.addr = alloca i64*, align 8 + store i64* %v, i64** %v.addr, align 8 + ret void +} + +attributes #0 = { noinline } \ No newline at end of file Index: llvm/test/Transforms/FunctionImport/noinline.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/FunctionImport/noinline.ll @@ -0,0 +1,23 @@ +; Do setup work for all below tests: generate bitcode and combined index +; RUN: opt -module-summary %s -o %t.main.bc +; RUN: opt -module-summary %p/Inputs/noinline.ll -o %t.inputs.noinline.bc +; RUN: llvm-lto -thinlto -o %t.summary %t.main.bc %t.inputs.noinline.bc + +; Attempt the import now, ensure below that file containing noinline +; is not imported by default but imported with -import-noinline. + +; RUN: opt -function-import -summary-file %t.summary.thinlto.bc %t.main.bc -S 2>&1 \ +; RUN: | FileCheck -check-prefix=NOIMPORT %s +; RUN: opt -function-import -import-noinline -summary-file %t.summary.thinlto.bc \ +; RUN: %t.main.bc -S 2>&1 | FileCheck -check-prefix=IMPORT %s + +define i32 @main() #0 { +entry: + %f = alloca i64, align 8 + call void @foo(i64* %f) + ret i32 0 +} + +; NOIMPORT: declare void @foo(i64*) +; IMPORT: define available_externally void @foo +declare void @foo(i64*) #1