diff --git a/clang/include/clang/Driver/Compilation.h b/clang/include/clang/Driver/Compilation.h --- a/clang/include/clang/Driver/Compilation.h +++ b/clang/include/clang/Driver/Compilation.h @@ -143,6 +143,8 @@ return ActiveOffloadMask & Kind; } + unsigned getActiveOffloadKinds() const { return ActiveOffloadMask; } + /// Iterator that visits device toolchains of a given kind. using const_offload_toolchains_iterator = const std::multimap bool hasOffloadToolChain() const { return OrderedOffloadingToolchains.find(Kind) != diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -3830,11 +3830,6 @@ // Builder to be used to build offloading actions. OffloadingActionBuilder OffloadBuilder(C, Args, Inputs); - // Offload kinds active for this compilation. - unsigned OffloadKinds = Action::OFK_None; - if (C.hasOffloadToolChain()) - OffloadKinds |= Action::OFK_OpenMP; - // Construct the actions to perform. HeaderModulePrecompileJobAction *HeaderModuleAction = nullptr; ActionList LinkerInputs; @@ -3935,7 +3930,7 @@ if (!Args.hasArg(options::OPT_fopenmp_new_driver)) OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg); else if (Current) - Current->propagateHostOffloadInfo(OffloadKinds, + Current->propagateHostOffloadInfo(C.getActiveOffloadKinds(), /*BoundArch=*/nullptr); } @@ -3956,9 +3951,9 @@ if (ShouldEmitStaticLibrary(Args)) { LA = C.MakeAction(LinkerInputs, types::TY_Image); } else if (Args.hasArg(options::OPT_fopenmp_new_driver) && - OffloadKinds != Action::OFK_None) { + C.getActiveOffloadKinds() != Action::OFK_None) { LA = C.MakeAction(LinkerInputs, types::TY_Image); - LA->propagateHostOffloadInfo(OffloadKinds, + LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(), /*BoundArch=*/nullptr); } else { LA = C.MakeAction(LinkerInputs, types::TY_Image); @@ -4050,6 +4045,67 @@ Args.ClaimAllArgs(options::OPT_cuda_compile_host_device); } +static StringRef 
getCanonicalArchString(Compilation &C, + llvm::opt::DerivedArgList &Args, + StringRef ArchStr, + Action::OffloadKind Kind) { + if (Kind == Action::OFK_Cuda) { + CudaArch Arch = StringToCudaArch(ArchStr); + if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) { + C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr; + return StringRef(); + } + return Args.MakeArgStringRef(CudaArchToString(Arch)); + } else { + llvm::StringMap Features; + // getHIPOffloadTargetTriple() is known to return a valid value as it has + // been called successfully in CreateOffloadingDeviceToolChains(). + auto Arch = parseTargetID( + *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()), ArchStr, + &Features); + if (!Arch) { + C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr; + C.setContainsError(); + return StringRef(); + } + return Args.MakeArgStringRef( + getCanonicalTargetID(Arch.getValue(), Features)); + } + return StringRef(); +} + +static llvm::DenseSet +getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args, + Action::OffloadKind Kind) { + + // If this is OpenMP offloading we don't use a bound architecture. + if (Kind == Action::OFK_OpenMP) + return llvm::DenseSet{StringRef()}; + + // --offload and --offload-arch options are mutually exclusive. 
+ if (Args.hasArgNoClaim(options::OPT_offload_EQ) && + Args.hasArgNoClaim(options::OPT_offload_arch_EQ, + options::OPT_no_offload_arch_EQ)) { + C.getDriver().Diag(diag::err_opt_not_valid_with_opt) << "--offload-arch" + << "--offload"; + } + + llvm::DenseSet Archs; + for (auto &Arg : Args.getAllArgValues(options::OPT_offload_arch_EQ)) + Archs.insert(getCanonicalArchString(C, Args, Arg, Kind)); + for (auto &Arg : Args.getAllArgValues(options::OPT_no_offload_arch_EQ)) + Archs.erase(getCanonicalArchString(C, Args, Arg, Kind)); + + if (Archs.empty()) { + if (Kind == Action::OFK_Cuda) + Archs.insert(CudaArchToString(CudaArch::SM_35)); + else if (Kind == Action::OFK_HIP) + Archs.insert(CudaArchToString(CudaArch::GFX803)); + } + + return Archs; +} + Action *Driver::BuildOffloadingActions(Compilation &C, llvm::opt::DerivedArgList &Args, const InputTy &Input, @@ -4057,53 +4113,73 @@ if (!isa(HostAction)) return HostAction; - SmallVector ToolChains; - ActionList DeviceActions; + OffloadAction::DeviceDependences DDeps; types::ID InputType = Input.first; const Arg *InputArg = Input.second; - auto OpenMPTCRange = C.getOffloadToolChains(); - for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE; ++TI) - ToolChains.push_back(TI->second); + const Action::OffloadKind OffloadKinds[] = { + Action::OFK_OpenMP, Action::OFK_Cuda, Action::OFK_HIP}; - for (unsigned I = 0; I < ToolChains.size(); ++I) - DeviceActions.push_back(C.MakeAction(*InputArg, InputType)); + for (Action::OffloadKind Kind : OffloadKinds) { + SmallVector ToolChains; + ActionList DeviceActions; - if (DeviceActions.empty()) - return HostAction; + auto TCRange = C.getOffloadToolChains(Kind); + for (auto TI = TCRange.first, TE = TCRange.second; TI != TE; ++TI) + ToolChains.push_back(TI->second); - auto PL = types::getCompilationPhases(*this, Args, InputType); + if (ToolChains.empty()) + continue; - for (phases::ID Phase : PL) { - if (Phase == phases::Link) { - assert(Phase == PL.back() && "linking must be 
final compilation step."); - break; - } + // TODO: Handle CUID. - auto TC = ToolChains.begin(); - for (Action *&A : DeviceActions) { - A = ConstructPhaseAction(C, Args, Phase, A, Action::OFK_OpenMP); + // Get the product of all bound architectures and toolchains. + SmallVector> TCAndArchs; + for (const ToolChain *TC : ToolChains) + for (StringRef Arch : getOffloadArchs(C, Args, Kind)) + TCAndArchs.push_back(std::make_pair(TC, Arch)); - if (isa(A)) { - HostAction->setCannotBeCollapsedWithNextDependentAction(); - OffloadAction::HostDependence HDep( - *HostAction, *C.getSingleOffloadToolChain(), - /*BourdArch=*/nullptr, Action::OFK_OpenMP); - OffloadAction::DeviceDependences DDep; - DDep.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP); - A = C.MakeAction(HDep, DDep); + for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I) + DeviceActions.push_back(C.MakeAction(*InputArg, InputType)); + + if (DeviceActions.empty()) + return HostAction; + + auto PL = types::getCompilationPhases(*this, Args, InputType); + + for (phases::ID Phase : PL) { + if (Phase == phases::Link) { + assert(Phase == PL.back() && "linking must be final compilation step."); + break; } - ++TC; - } - } - OffloadAction::DeviceDependences DDeps; + auto TCAndArch = TCAndArchs.begin(); + for (Action *&A : DeviceActions) { + A = ConstructPhaseAction(C, Args, Phase, A, Kind); - auto TC = ToolChains.begin(); - for (Action *A : DeviceActions) { - DDeps.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP); - TC++; + if (isa(A) && Kind == Action::OFK_OpenMP) { + HostAction->setCannotBeCollapsedWithNextDependentAction(); + OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch=*/nullptr, Kind); + OffloadAction::DeviceDependences DDep; + DDep.add(*A, *TCAndArch->first, /*BoundArch=*/nullptr, Kind); + A = C.MakeAction(HDep, DDep); + ++TCAndArch; + } + } + } + + auto TCAndArch = TCAndArchs.begin(); + for (Action *A : DeviceActions) { + if (isa(A) && + (Kind == 
Action::OFK_Cuda || Kind == Action::OFK_HIP)) + DDeps.add(*A->getInputs()[0], *TCAndArch->first, + TCAndArch->second.data(), Kind); + DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind); + ++TCAndArch; + } } OffloadAction::HostDependence HDep( @@ -4204,7 +4280,7 @@ return C.MakeAction(Input, Output); } if (isUsingLTO(/* IsOffload */ true) && - TargetDeviceOffloadKind == Action::OFK_OpenMP) { + TargetDeviceOffloadKind != Action::OFK_None) { types::ID Output = Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC; return C.MakeAction(Input, Output); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4374,6 +4374,7 @@ // one input. bool IsCuda = JA.isOffloading(Action::OFK_Cuda); bool IsCudaDevice = JA.isDeviceOffloading(Action::OFK_Cuda); + bool IsCudaHost = JA.isHostOffloading(Action::OFK_Cuda); bool IsHIP = JA.isOffloading(Action::OFK_HIP); bool IsHIPDevice = JA.isDeviceOffloading(Action::OFK_HIP); bool IsOpenMPDevice = JA.isDeviceOffloading(Action::OFK_OpenMP); @@ -4397,6 +4398,7 @@ InputInfoList ModuleHeaderInputs; InputInfoList OpenMPHostInputs; + InputInfoList CudaHostInputs; const InputInfo *CudaDeviceInput = nullptr; const InputInfo *OpenMPDeviceInput = nullptr; for (const InputInfo &I : Inputs) { @@ -4411,6 +4413,8 @@ << types::getTypeName(Expected); } ModuleHeaderInputs.push_back(I); + } else if (IsCudaHost && Args.hasArg(options::OPT_fopenmp_new_driver)) { + CudaHostInputs.push_back(I); } else if ((IsCuda || IsHIP) && !CudaDeviceInput) { CudaDeviceInput = &I; } else if (IsOpenMPDevice && !OpenMPDeviceInput) { @@ -6929,6 +6933,7 @@ auto OpenMPTCs = C.getOffloadToolChains(); for (auto TI = OpenMPTCs.first, TE = OpenMPTCs.second; TI != TE; ++TI, ++InputFile) { + assert(InputFile->isFilename() && "Offloading requires a filename"); const ToolChain *TC = TI->second; const ArgList &TCArgs = 
C.getArgsForToolChain(TC, "", Action::OFK_OpenMP); StringRef File = @@ -6941,6 +6946,25 @@ TC->getTripleString() + "." + TCArgs.getLastArgValue(options::OPT_march_EQ) + "." + InputName)); } + } else if (IsCudaHost && !CudaHostInputs.empty()) { + const ToolChain *CudaTC = C.getSingleOffloadToolChain(); + for (const auto &InputFile : CudaHostInputs) { + assert(InputFile.isFilename() && "Offloading requires a filename"); + StringRef File = + C.getArgs().MakeArgString(CudaTC->getInputFilename(InputFile)); + StringRef InputName = Clang::getBaseInputStem(Args, Inputs); + // The CUDA toolchain should have a bound arch appended to the filename. + StringRef Arch = File.split(".").first.rsplit('-').second; + // CUDA offloads both the PTX and Cubin so we need a unique section name. + if (File.endswith(".s")) + CmdArgs.push_back(Args.MakeArgString( + "-fembed-offload-object=" + File + "," + "cuda." + + CudaTC->getTripleString() + "." + Arch + ".ptx." + InputName)); + else + CmdArgs.push_back(Args.MakeArgString( + "-fembed-offload-object=" + File + "," + "cuda." + + CudaTC->getTripleString() + "." + Arch + "." + InputName)); + } } if (Triple.isAMDGPU()) { @@ -8189,6 +8213,7 @@ const Driver &D = getToolChain().getDriver(); const llvm::Triple TheTriple = getToolChain().getTriple(); auto OpenMPTCRange = C.getOffloadToolChains(); + auto CudaTCRange = C.getOffloadToolChains(); ArgStringList CmdArgs; // Pass the CUDA path to the linker wrapper tool. @@ -8202,6 +8227,16 @@ break; } } + for (auto &I : llvm::make_range(CudaTCRange.first, CudaTCRange.second)) { + const ToolChain *TC = I.second; + if (TC->getTriple().isNVPTX()) { + CudaInstallationDetector CudaInstallation(D, TheTriple, Args); + if (CudaInstallation.isValid()) + CmdArgs.push_back(Args.MakeArgString( + "--cuda-path=" + CudaInstallation.getInstallPath())); + break; + } + } // Get the AMDGPU math libraries. // FIXME: This method is bad, remove once AMDGPU has a proper math library