diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -121,6 +121,9 @@ "invalid argument in '%0', only integer or 'auto' is supported">; def err_drv_missing_argument : Error< "argument to '%0' is missing (expected %1 value%s1)">; +def err_drv_missing_Xopenmptarget_or_march: Error< + "The option -fopenmp-targets= requires additional options -Xopenmp-target= and -march= .">, + DefaultFatal; def err_drv_invalid_Xarch_argument_with_args : Error< "invalid Xarch argument: '%0', options requiring arguments are unsupported">; def err_drv_Xopenmp_target_missing_triple : Error< diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -170,6 +170,9 @@ mutable llvm::Optional runtimeLibType; mutable llvm::Optional unwindLibType; + // OpenMP creates a toolchain for each target arch. eg - gfx908 + std::string OffloadArch; + protected: MultilibSet Multilibs; Multilib SelectedMultilib; @@ -246,6 +249,12 @@ return EffectiveTriple; } + const std::string getOffloadArch() const { return OffloadArch; } + + void setOffloadArch(std::string OffloadArch) { + this->OffloadArch = std::move(OffloadArch); + } + path_list &getLibraryPaths() { return LibraryPaths; } const path_list &getLibraryPaths() const { return LibraryPaths; } diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp --- a/clang/lib/Driver/Action.cpp +++ b/clang/lib/Driver/Action.cpp @@ -206,11 +206,23 @@ const DeviceDependences &DDeps) : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()), DevToolChains(DDeps.getToolChains()) { - // We use the kinds of the host dependence for this action. 
- OffloadingArch = HDep.getBoundArch(); + auto &OKinds = DDeps.getOffloadKinds(); + auto &BArchs = DDeps.getBoundArchs(); + + // If all inputs agree on the same kind, use it also for this action. + if (llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); })) + OffloadingDeviceKind = OKinds.front(); + + // If we have a single dependency, inherit the architecture from it. + if (OKinds.size() == 1) + OffloadingArch = BArchs.front(); + else + // We use the kinds of the host dependence for this action. + OffloadingArch = HDep.getBoundArch(); + ActiveOffloadKindMask = HDep.getOffloadKinds(); HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), - HDep.getBoundArch()); + OffloadingArch); // Add device inputs and propagate info to the device actions. Do work only if // we have dependencies. diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -678,6 +678,38 @@ return RT; } +bool GetTargetInfoFromMArch(Compilation &C, + std::set &OffloadArchs) { + StringRef OpenMPTargetArch; + for (Arg *A : C.getInputArgs()) { + if (A->getOption().matches(options::OPT_Xopenmp_target_EQ)) { + for (auto *V : A->getValues()) { + StringRef VStr = StringRef(V); + if (VStr.startswith("-march=") || VStr.startswith("--march=")) { + OpenMPTargetArch = VStr.split('=').second; + CudaArch Arch = StringToCudaArch(StringRef(OpenMPTargetArch)); + if (Arch == CudaArch::UNKNOWN) { + C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) + << OpenMPTargetArch; + C.setContainsError(); + return false; + } + StringRef OpenMPTargetTriple = StringRef(A->getValue(0)); + llvm::Triple TargetTriple(OpenMPTargetTriple); + + // Append Triple and Arch to form a unique key for each instance of + // the ToolChain + if (!OpenMPTargetTriple.empty() && !OpenMPTargetArch.empty()) + OffloadArchs.insert(TargetTriple.normalize().append("^").append( + OpenMPTargetArch.str())); + } + A->claim(); + } + } + } + 
return true; +} + void Driver::CreateOffloadingDeviceToolChains(Compilation &C, InputList &Inputs) { @@ -729,17 +761,58 @@ *this, HIPTriple, *HostTC, C.getInputArgs()); } C.addOffloadDeviceToolChain(HIPTC.get(), OFK); - } + } else { + // + // OpenMP + // - // - // OpenMP - // - // We need to generate an OpenMP toolchain if the user specified targets with - // the -fopenmp-targets option. - if (Arg *OpenMPTargets = - C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) { - if (OpenMPTargets->getNumValues()) { - // We expect that -fopenmp-targets is always used in conjunction with the + std::set OffloadArchs; + + if (Arg *OpenMPTargets = + C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) { + + if (!OpenMPTargets->getNumValues()) { + Diag(clang::diag::warn_drv_empty_joined_argument) + << OpenMPTargets->getAsString(C.getInputArgs()); + return; + } + + // First, handle errors in command line for OpenMP target offload + bool is_host_offloading = + (OpenMPTargets->getNumValues() == 1) && + StringRef(OpenMPTargets->getValue()) + .startswith_insensitive( + C.getSingleOffloadToolChain() + ->getTriple() + .getArchName()); + if (!is_host_offloading) { + // Ensure at least one -Xopenmp-target exists with a gpu -march + if (Arg *XOpenMPTargets = + C.getInputArgs().getLastArg(options::OPT_Xopenmp_target_EQ)) { + bool has_valid_march = false; + for (auto *V : XOpenMPTargets->getValues()) + if (StringRef(V).startswith("-march=") || + StringRef(V).startswith("--march=")) + has_valid_march = true; + if (!has_valid_march) { + Diag(diag::err_drv_missing_Xopenmptarget_or_march); + return; + } + } else { + Diag(diag::err_drv_missing_Xopenmptarget_or_march); + return; + } + } + + // process legacy option -fopenmp-targets -Xopenmp-target and -march + auto status = GetTargetInfoFromMArch(C, OffloadArchs); + if (!status) + return; + } + + if (!OffloadArchs.empty()) { + + // We expect that an offload target is always used in conjunction with // option -fopenmp specifying 
a valid runtime with offloading support, // i.e. libomp or libiomp. bool HasValidOpenMPRuntime = C.getInputArgs().hasFlag( @@ -750,61 +823,65 @@ HasValidOpenMPRuntime = OpenMPKind == OMPRT_OMP || OpenMPKind == OMPRT_IOMP5; } + if (!HasValidOpenMPRuntime) { + Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets); + return; + } - if (HasValidOpenMPRuntime) { - llvm::StringMap FoundNormalizedTriples; - for (const char *Val : OpenMPTargets->getValues()) { - llvm::Triple TT(Val); - std::string NormalizedName = TT.normalize(); - - // Make sure we don't have a duplicate triple. - auto Duplicate = FoundNormalizedTriples.find(NormalizedName); - if (Duplicate != FoundNormalizedTriples.end()) { - Diag(clang::diag::warn_drv_omp_offload_target_duplicate) - << Val << Duplicate->second; - continue; - } + llvm::StringMap FoundNormalizedTriples; + for (auto &Target : OffloadArchs) { + size_t Loc = Target.find('^'); + std::string TripleStr = Target.substr(0, Loc); + std::string OpenMPTargetArch = Target.substr(Loc + 1); + llvm::Triple TT(TripleStr); + std::string NormalizedName = Target; + + // Make sure we don't have a duplicate triple. + auto Duplicate = FoundNormalizedTriples.find(NormalizedName); + if (Duplicate != FoundNormalizedTriples.end()) { + Diag(clang::diag::warn_drv_omp_offload_target_duplicate) + << NormalizedName << Duplicate->second; + continue; + } + + // Store the current triple so that we can check for duplicates in the + // following iterations. + FoundNormalizedTriples[NormalizedName] = NormalizedName.c_str(); - // Store the current triple so that we can check for duplicates in the - // following iterations. - FoundNormalizedTriples[NormalizedName] = Val; - - // If the specified target is invalid, emit a diagnostic. - if (TT.getArch() == llvm::Triple::UnknownArch) - Diag(clang::diag::err_drv_invalid_omp_target) << Val; - else { - const ToolChain *TC; - // Device toolchains have to be selected differently. 
They pair host - // and device in their implementation. - if (TT.isNVPTX() || TT.isAMDGCN()) { - const ToolChain *HostTC = - C.getSingleOffloadToolChain(); - assert(HostTC && "Host toolchain should be always defined."); - auto &DeviceTC = - ToolChains[TT.str() + "/" + HostTC->getTriple().normalize()]; - if (!DeviceTC) { - if (TT.isNVPTX()) - DeviceTC = std::make_unique( - *this, TT, *HostTC, C.getInputArgs(), Action::OFK_OpenMP); - else if (TT.isAMDGCN()) - DeviceTC = - std::make_unique( - *this, TT, *HostTC, C.getInputArgs()); - else - assert(DeviceTC && "Device toolchain not defined."); - } - - TC = DeviceTC.get(); - } else - TC = &getToolChain(C.getInputArgs(), TT); - C.addOffloadDeviceToolChain(TC, Action::OFK_OpenMP); + // If the specified target is invalid, emit a diagnostic. + if (TT.getArch() == llvm::Triple::UnknownArch) { + Diag(clang::diag::err_drv_invalid_omp_target) << NormalizedName; + return; + } + + const ToolChain *TC; + // Device toolchains have to be selected differently. They pair host + // and device in their implementation. 
+ if (TT.isNVPTX() || TT.isAMDGCN()) { + const ToolChain *HostTC = + C.getSingleOffloadToolChain(); + assert(HostTC && "Host toolchain should be always defined."); + auto &DeviceTC = ToolChains[NormalizedName + "/" + + HostTC->getTriple().normalize()]; + if (!DeviceTC) { + if (TT.isNVPTX()) + DeviceTC = std::make_unique( + *this, TT, *HostTC, C.getInputArgs(), Action::OFK_OpenMP, + OpenMPTargetArch); + else if (TT.isAMDGCN()) + DeviceTC = std::make_unique( + *this, TT, *HostTC, C.getInputArgs(), OpenMPTargetArch); + else + assert(DeviceTC && "Device toolchain not defined."); } + TC = DeviceTC.get(); + } else { + TC = &getToolChain(C.getInputArgs(), TT); } - } else - Diag(clang::diag::err_drv_expecting_fopenmp_with_fopenmp_targets); - } else - Diag(clang::diag::warn_drv_empty_joined_argument) - << OpenMPTargets->getAsString(C.getInputArgs()); + // Each value of -fopenmp-targets gets instance of offload toolchain + C.addOffloadDeviceToolChain(TC, Action::OFK_OpenMP); + } // end foreach openmp target + } // end has openmp offload targets } // @@ -2406,6 +2483,19 @@ ABRT_Ignore_Host, }; + /// ID to identify each device compilation. For CUDA it is simply the + /// GPU arch string. For HIP it is either the GPU arch string or GPU + /// arch string plus feature strings delimited by a plus sign, e.g. + /// gfx906+xnack. + struct TargetID { + /// Target ID string which is persistent throughout the compilation. + const char *ID; + TargetID(CudaArch Arch) { ID = CudaArchToString(Arch); } + TargetID(const char *ID) : ID(ID) {} + operator const char *() { return ID; } + operator StringRef() { return StringRef(ID); } + }; + protected: /// Compilation associated with this builder. Compilation &C; @@ -2487,18 +2577,6 @@ bool EmitLLVM = false; bool EmitAsm = false; - /// ID to identify each device compilation. For CUDA it is simply the - /// GPU arch string. For HIP it is either the GPU arch string or GPU - /// arch string plus feature strings delimited by a plus sign, e.g. 
- /// gfx906+xnack. - struct TargetID { - /// Target ID string which is persistent throughout the compilation. - const char *ID; - TargetID(CudaArch Arch) { ID = CudaArchToString(Arch); } - TargetID(const char *ID) : ID(ID) {} - operator const char *() { return ID; } - operator StringRef() { return StringRef(ID); } - }; /// List of GPU architectures to use in this compilation. SmallVector GpuArchList; @@ -3121,6 +3199,12 @@ /// The OpenMP actions for the current input. ActionList OpenMPDeviceActions; + bool CompileHostOnly = false; + bool CompileDeviceOnly = false; + + /// List of GPU architectures to use in this compilation. + SmallVector GpuArchList; + /// The linker inputs obtained for each toolchain. SmallVector DeviceLinkerInputs; @@ -3154,14 +3238,26 @@ // We passed the device action as a host dependence, so we don't need to // do anything else with them. OpenMPDeviceActions.clear(); - return ABRT_Success; + return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success; + ; } + bool LastActionIsCompile = false; // By default, we produce an action for each device arch. - for (Action *&A : OpenMPDeviceActions) - A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A); - - return ABRT_Success; + for (unsigned I = 0; I < ToolChains.size(); ++I) { + Action *&A = OpenMPDeviceActions[I]; + // AMDGPU does not support linking of object files, so we skip + // assemble and backend actions to produce LLVM IR. + if (ToolChains[I]->getTriple().isAMDGCN() && + (CurPhase == phases::Assemble || CurPhase == phases::Backend)) + continue; + A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A, + Action::OFK_OpenMP); + LastActionIsCompile = + (A->getKind() == Action::ActionClass::CompileJobClass); + } + return (CompileDeviceOnly && LastActionIsCompile) ? ABRT_Ignore_Host + : ABRT_Success; } ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override { @@ -3169,9 +3265,15 @@ // If this is an input action replicate it for each OpenMP toolchain. 
if (auto *IA = dyn_cast(HostAction)) { OpenMPDeviceActions.clear(); - for (unsigned I = 0; I < ToolChains.size(); ++I) - OpenMPDeviceActions.push_back( - C.MakeAction(IA->getInputArg(), IA->getType())); + // Only process input actions for files that have extensions + std::string FileName = IA->getInputArg().getAsString(Args); + if (!llvm::sys::path::has_extension(FileName)) { + return ABRT_Inactive; + } + for (unsigned I = 0; I < ToolChains.size(); ++I) { + OpenMPDeviceActions.push_back(C.MakeAction( + IA->getInputArg(), IA->getType(), GpuArchList[I].ID)); + } return ABRT_Success; } @@ -3191,8 +3293,9 @@ return ABRT_Inactive; for (unsigned I = 0; I < ToolChains.size(); ++I) { OpenMPDeviceActions.push_back(UA); - UA->registerDependentActionInfo( - ToolChains[I], /*BoundArch=*/StringRef(), Action::OFK_OpenMP); + UA->registerDependentActionInfo(ToolChains[I], + /*BoundArch=*/GpuArchList[I].ID, + Action::OFK_OpenMP); } return ABRT_Success; } @@ -3209,10 +3312,11 @@ *HostAction, *C.getSingleOffloadToolChain(), /*BoundArch=*/nullptr, Action::OFK_OpenMP); auto TC = ToolChains.begin(); + unsigned arch_count = 0; for (Action *&A : OpenMPDeviceActions) { assert(isa(A)); OffloadAction::DeviceDependences DDep; - DDep.add(*A, **TC, /*BoundArch=*/nullptr, Action::OFK_OpenMP); + DDep.add(*A, **TC, GpuArchList[arch_count++].ID, Action::OFK_OpenMP); A = C.MakeAction(HDep, DDep); ++TC; } @@ -3228,11 +3332,13 @@ assert(OpenMPDeviceActions.size() == ToolChains.size() && "Number of OpenMP actions and toolchains do not match."); + unsigned arch_count = 0; // Append all device actions followed by the proper offload action. 
auto TI = ToolChains.begin(); for (auto *A : OpenMPDeviceActions) { OffloadAction::DeviceDependences Dep; - Dep.add(*A, **TI, /*BoundArch=*/nullptr, Action::OFK_OpenMP); + Dep.add(*A, **TI, /*BoundArch=*/GpuArchList[arch_count++].ID, + Action::OFK_OpenMP); AL.push_back(C.MakeAction(Dep, A->getType())); ++TI; } @@ -3243,17 +3349,17 @@ void appendLinkDeviceActions(ActionList &AL) override { assert(ToolChains.size() == DeviceLinkerInputs.size() && "Toolchains and linker inputs sizes do not match."); - // Append a new link action for each device. auto TC = ToolChains.begin(); + unsigned arch_count = 0; for (auto &LI : DeviceLinkerInputs) { auto *DeviceLinkAction = C.MakeAction(LI, types::TY_Image); OffloadAction::DeviceDependences DeviceLinkDeps; - DeviceLinkDeps.add(*DeviceLinkAction, **TC, /*BoundArch=*/nullptr, - Action::OFK_OpenMP); + DeviceLinkDeps.add(*DeviceLinkAction, **TC, + GpuArchList[arch_count++].ID, Action::OFK_OpenMP); AL.push_back(C.MakeAction(DeviceLinkDeps, - DeviceLinkAction->getType())); + DeviceLinkAction->getType())); ++TC; } DeviceLinkerInputs.clear(); @@ -3270,12 +3376,21 @@ void appendLinkDependences(OffloadAction::DeviceDependences &DA) override {} bool initialize() override { + if (Arg *cu_dev_only = + C.getInputArgs().getLastArg(options::OPT_cuda_device_only)) { + cu_dev_only->claim(); + CompileDeviceOnly = true; + // TODO: Check emitting IR for OpenMP when cuda-device-only is set + } // Get the OpenMP toolchains. If we don't get any, the action builder will // know there is nothing to do related to OpenMP offloading. 
auto OpenMPTCRange = C.getOffloadToolChains(); for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE; - ++TI) + ++TI) { + GpuArchList.push_back( + TI->second->getTriple().getEnvironmentName().data()); ToolChains.push_back(TI->second); + } DeviceLinkerInputs.resize(ToolChains.size()); return false; @@ -4593,6 +4708,7 @@ OA->doOnEachDependence( /*IsHostDependence=*/BuildingForOffloadDevice, [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDependencesInputInfo.push_back(BuildJobsForAction( C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false, /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults, @@ -4645,25 +4761,6 @@ if (!T) return InputInfo(); - if (BuildingForOffloadDevice && - A->getOffloadingDeviceKind() == Action::OFK_OpenMP) { - if (TC->getTriple().isAMDGCN()) { - // AMDGCN treats backend and assemble actions as no-op because - // linker does not support object files. - if (const BackendJobAction *BA = dyn_cast(A)) { - return BuildJobsForAction(C, *BA->input_begin(), TC, BoundArch, - AtTopLevel, MultipleArchs, LinkingOutput, - CachedResults, TargetDeviceOffloadKind); - } - - if (const AssembleJobAction *AA = dyn_cast(A)) { - return BuildJobsForAction(C, *AA->input_begin(), TC, BoundArch, - AtTopLevel, MultipleArchs, LinkingOutput, - CachedResults, TargetDeviceOffloadKind); - } - } - } - // If we've collapsed action list that contained OffloadAction we // need to build jobs for host/device-side inputs it may have held. for (const auto *OA : CollapsedOffloadActions) @@ -4747,17 +4844,23 @@ UI.DependentOffloadKind == Action::OFK_HIP, OffloadingPrefix), BaseInput); + if (UI.DependentOffloadKind == Action::OFK_Host && + llvm::sys::path::extension(InputInfos[0].getFilename()) == ".a") + CurI = InputInfos[0]; // Save the unbundling result. UnbundlingResults.push_back(CurI); // Get the unique string identifier for this dependence and cache the // result. 
StringRef Arch; - if (TargetDeviceOffloadKind == Action::OFK_HIP) { + if (TargetDeviceOffloadKind == Action::OFK_HIP || + TargetDeviceOffloadKind == Action::OFK_OpenMP) { if (UI.DependentOffloadKind == Action::OFK_Host) Arch = StringRef(); - else + else if (TargetDeviceOffloadKind == Action::OFK_HIP) Arch = UI.DependentBoundArch; + else if (TargetDeviceOffloadKind == Action::OFK_OpenMP) + Arch = UI.DependentToolChain->getOffloadArch(); } else Arch = BoundArch; @@ -4787,8 +4890,9 @@ BaseInput = FinalOutput->getValue(); else BaseInput = getDefaultImageName(); - BaseInput = - C.getArgs().MakeArgString(std::string(BaseInput) + "-wrapper"); + std::string BaseNm = std::string(BaseInput); + std::replace(BaseNm.begin(), BaseNm.end(), '.', '_'); + BaseInput = C.getArgs().MakeArgString(BaseNm + "-wrapper"); } Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch, AtTopLevel, MultipleArchs, diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.h @@ -68,6 +68,10 @@ const ToolChain &HostTC, const llvm::opt::ArgList &Args); + AMDGPUOpenMPToolChain(const Driver &D, const llvm::Triple &Triple, + const ToolChain &HostTC, const llvm::opt::ArgList &Args, + const std::string OffloadArch); + const llvm::Triple *getAuxTriple() const override { return &HostTC.getTriple(); } diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -166,7 +166,7 @@ const toolchains::AMDGPUOpenMPToolChain &AMDGPUOpenMPTC = static_cast(TC); - std::string GPUArch = Args.getLastArgValue(options::OPT_march_EQ).str(); + std::string GPUArch = AMDGPUOpenMPTC.getOffloadArch(); if (GPUArch.empty()) { if (!checkSystemForAMDGPU(Args, AMDGPUOpenMPTC, GPUArch)) return; @@ -202,12 +202,21 @@ 
getProgramPaths().push_back(getDriver().Dir); } +AMDGPUOpenMPToolChain::AMDGPUOpenMPToolChain(const Driver &D, + const llvm::Triple &Triple, + const ToolChain &HostTC, + const ArgList &Args, + const std::string OffloadArch) + : ROCMToolChain(D, Triple, Args), HostTC(HostTC) { + getProgramPaths().push_back(getDriver().Dir); + setOffloadArch(OffloadArch); +} + void AMDGPUOpenMPToolChain::addClangTargetOptions( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadingKind) const { HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); - - std::string GPUArch = DriverArgs.getLastArgValue(options::OPT_march_EQ).str(); + std::string GPUArch = getOffloadArch(); if (GPUArch.empty()) { if (!checkSystemForAMDGPU(DriverArgs, *this, GPUArch)) return; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6659,20 +6659,32 @@ } // For all the host OpenMP offloading compile jobs we need to pass the targets - // information using -fopenmp-targets= option. + // information using `-fopenmp-targets=` option. if (JA.isHostOffloading(Action::OFK_OpenMP)) { SmallString<128> TargetInfo("-fopenmp-targets="); Arg *Tgts = Args.getLastArg(options::OPT_fopenmp_targets_EQ); - assert(Tgts && Tgts->getNumValues() && - "OpenMP offloading has to have targets specified."); - for (unsigned i = 0; i < Tgts->getNumValues(); ++i) { - if (i) - TargetInfo += ','; - // We need to get the string from the triple because it may be not exactly - // the same as the one we get directly from the arguments. 
- llvm::Triple T(Tgts->getValue(i)); - TargetInfo += T.getTriple(); + // Get list of device Toolchains + auto OpenMPTCRange = C.getOffloadToolChains(); + + if (Tgts && Tgts->getNumValues()) { + for (unsigned i = 0; i < Tgts->getNumValues(); ++i) { + if (i) + TargetInfo += ','; + // We need to get the string from the triple because it may be not + // exactly the same as the one we get directly from the arguments. + llvm::Triple T(Tgts->getValue(i)); + TargetInfo += T.getTriple(); + } + } else if (OpenMPTCRange.first != OpenMPTCRange.second) { + for (auto TI = OpenMPTCRange.first, TE = OpenMPTCRange.second; TI != TE; + ++TI) { + auto *deviceTC = TI->second; + TargetInfo += deviceTC->getTriple().str(); + } + } else { + assert("OpenMP offloading requires target devices use \ + `-fopenmp-targets=`"); } CmdArgs.push_back(Args.MakeArgString(TargetInfo.str())); } @@ -7668,18 +7680,17 @@ }); } Triples += Action::GetOffloadKindName(CurKind); - Triples += "-"; - std::string NormalizedTriple = CurTC->getTriple().normalize(); - Triples += NormalizedTriple; - - if (CurDep->getOffloadingArch() != nullptr) { - // If OffloadArch is present it can only appear as the 6th hypen - // sepearated field of Bundle Entry ID. So, pad required number of - // hyphens in Triple. 
- for (int i = 4 - StringRef(NormalizedTriple).count("-"); i > 0; i--) - Triples += "-"; + Triples += '-'; + Triples += CurTC->getTriple().normalize(); + if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_Cuda) && + CurDep->getOffloadingArch()) { + Triples += '-'; Triples += CurDep->getOffloadingArch(); } + if (CurKind == Action::OFK_OpenMP && !CurTC->getOffloadArch().empty()) { + Triples += '-'; + Triples += CurTC->getOffloadArch(); + } } CmdArgs.push_back(TCArgs.MakeArgString(Triples)); @@ -7711,7 +7722,7 @@ C.addCommand(std::make_unique( JA, *this, ResponseFileSupport::None(), TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())), - CmdArgs, None, Output)); + CmdArgs, Inputs, Output)); } void OffloadBundler::ConstructJobMultipleOutputs( @@ -7746,20 +7757,21 @@ Triples += ','; auto &Dep = DepInfo[I]; - Triples += Action::GetOffloadKindName(Dep.DependentOffloadKind); - Triples += "-"; - std::string NormalizedTriple = - Dep.DependentToolChain->getTriple().normalize(); - Triples += NormalizedTriple; - - if (!Dep.DependentBoundArch.empty()) { - // If OffloadArch is present it can only appear as the 6th hypen - // sepearated field of Bundle Entry ID. So, pad required number of - // hyphens in Triple. 
- for (int i = 4 - StringRef(NormalizedTriple).count("-"); i > 0; i--) - Triples += "-"; + auto OffloadKind = Dep.DependentOffloadKind; + Triples += Action::GetOffloadKindName(OffloadKind); + Triples += '-'; + Triples += Dep.DependentToolChain->getTriple().normalize(); + if ((Dep.DependentOffloadKind == Action::OFK_HIP || + Dep.DependentOffloadKind == Action::OFK_Cuda) && + !Dep.DependentBoundArch.empty()) { + Triples += '-'; Triples += Dep.DependentBoundArch; } + if (OffloadKind == Action::OFK_OpenMP && + !Dep.DependentToolChain->getOffloadArch().empty()) { + Triples += '-'; + Triples += Dep.DependentToolChain->getOffloadArch(); + } } CmdArgs.push_back(TCArgs.MakeArgString(Triples)); @@ -7805,9 +7817,30 @@ CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); - // Add inputs. + auto TCs = C.getOffloadToolChains(); + + // Add runtime requirements on each image which includes the offload-arch + auto II = TCs.first; for (const InputInfo &I : Inputs) { assert(I.isFilename() && "Invalid input."); + if (I.getAction()) { + auto TC = II->second; + II++; + std::string requirements("--requirements="); + requirements.append(TC->getOffloadArch()); + // targetid could have user specified features such as :xnack-:sramecc+ + // so replace ":" with "__" in requirements used for + // clang-offload-wrapper. 
+ size_t start_pos = 0; + while ((start_pos = requirements.find(":", start_pos)) != + std::string::npos) { + requirements.replace(start_pos, 1, "__"); + start_pos += 2; + } + + // FIXME: Add other architecture requirements here + CmdArgs.push_back(Args.MakeArgString(requirements.c_str())); + } CmdArgs.push_back(I.getFilename()); } diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h --- a/clang/lib/Driver/ToolChains/Cuda.h +++ b/clang/lib/Driver/ToolChains/Cuda.h @@ -134,6 +134,10 @@ const ToolChain &HostTC, const llvm::opt::ArgList &Args, const Action::OffloadKind OK); + CudaToolChain(const Driver &D, const llvm::Triple &Triple, + const ToolChain &HostTC, const llvm::opt::ArgList &Args, + const Action::OffloadKind OK, const std::string OffloadArch); + const llvm::Triple *getAuxTriple() const override { return &HostTC.getTriple(); } diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -404,6 +404,8 @@ // flag or the default value. 
if (JA.isDeviceOffloading(Action::OFK_OpenMP)) { GPUArchName = Args.getLastArgValue(options::OPT_march_EQ); + if (GPUArchName.empty()) + GPUArchName = TC.getOffloadArch(); assert(!GPUArchName.empty() && "Must have an architecture passed in."); } else GPUArchName = JA.getOffloadingArch(); @@ -597,6 +599,9 @@ StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ); + if (GPUArch.empty()) + GPUArch = getToolChain().getOffloadArch(); + assert(!GPUArch.empty() && "At least one GPU Arch required for ptxas."); CmdArgs.push_back("-arch"); @@ -659,6 +664,22 @@ getProgramPaths().push_back(getDriver().Dir); } +CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple, + const ToolChain &HostTC, const ArgList &Args, + const Action::OffloadKind OK, + const std::string OffloadArch) + : ToolChain(D, Triple, Args), HostTC(HostTC), + CudaInstallation(D, HostTC.getTriple(), Args), OK(OK) { + if (CudaInstallation.isValid()) { + CudaInstallation.WarnIfUnsupportedVersion(); + getProgramPaths().push_back(std::string(CudaInstallation.getBinPath())); + } + // Lookup binaries into the driver directory, this is used to + // discover the clang-offload-bundler executable. + getProgramPaths().push_back(getDriver().Dir); + setOffloadArch(OffloadArch); +} + std::string CudaToolChain::getInputFilename(const InputInfo &Input) const { // Only object files are changed, for example assembly files keep their .s // extensions. 
CUDA also continues to use .o as they don't use nvlink but @@ -680,6 +701,8 @@ HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); + if (GpuArch.empty()) + GpuArch = getOffloadArch(); assert(!GpuArch.empty() && "Must have an explicit GPU arch."); assert((DeviceOffloadingKind == Action::OFK_OpenMP || DeviceOffloadingKind == Action::OFK_Cuda) && @@ -844,6 +867,8 @@ } StringRef Arch = DAL->getLastArgValue(options::OPT_march_EQ); + if (Arch.empty()) + Arch = getOffloadArch(); if (Arch.empty()) DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), CLANG_OPENMP_NVPTX_DEFAULT_ARCH); diff --git a/clang/test/Driver/amdgpu-openmp-system-arch-fail.c b/clang/test/Driver/amdgpu-openmp-system-arch-fail.c --- a/clang/test/Driver/amdgpu-openmp-system-arch-fail.c +++ b/clang/test/Driver/amdgpu-openmp-system-arch-fail.c @@ -15,14 +15,9 @@ // case when amdgpu_arch returns nothing or fails // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_fail %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR -// NO-OUTPUT-ERROR: error: Cannot determine AMDGPU architecture{{.*}}Exited with error code 1. Consider passing it via --march - -// case when amdgpu_arch returns multiple gpus but all are different -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_different %s 2>&1 \ -// RUN: | FileCheck %s --check-prefix=MULTIPLE-OUTPUT-ERROR -// MULTIPLE-OUTPUT-ERROR: error: Cannot determine AMDGPU architecture: Multiple AMD GPUs found with different archs. 
Consider passing it via --march +// NO-OUTPUT-ERROR: fatal error: The option -fopenmp-targets= requires additional options -Xopenmp-target= and -march= // case when amdgpu_arch does not return anything with successful execution // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=EMPTY-OUTPUT -// EMPTY-OUTPUT: error: Cannot determine AMDGPU architecture: No AMD GPU detected in the system. Consider passing it via --march +// EMPTY-OUTPUT: fatal error: The option -fopenmp-targets= requires additional options -Xopenmp-target= and -march= diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c --- a/clang/test/Driver/amdgpu-openmp-toolchain.c +++ b/clang/test/Driver/amdgpu-openmp-toolchain.c @@ -10,9 +10,9 @@ // CHECK: llvm-link{{.*}}"-o" "{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked-{{.*}}.bc" // CHECK: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-{{.*}}.o" // CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}.out" "{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-{{.*}}.o" -// CHECK: clang-offload-wrapper{{.*}}"-target" "x86_64-unknown-linux-gnu" "-o" "{{.*}}a-{{.*}}.bc" {{.*}}amdgpu-openmp-toolchain-{{.*}}.out" -// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a-{{.*}}.o" "-x" "ir" "{{.*}}a-{{.*}}.bc" -// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a-{{.*}}.o" "-lomp" "-lomptarget" +// CHECK: clang-offload-wrapper{{.*}}" "-target" "x86_64-unknown-linux-gnu" "-o" "{{.*}}a_{{.*}}.bc" "--requirements=gfx906" "{{.*}}amdgpu-openmp-toolchain-{{.*}}.out" +// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" 
"{{.*}}a_{{.*}}.o" "-x" "ir" "{{.*}}a_{{.*}}.bc" +// CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a_{{.*}}.o" "-lomp" "-lomptarget" // RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PHASES %s @@ -26,14 +26,12 @@ // CHECK-PHASES: 6: preprocessor, {5}, cpp-output, (device-openmp) // CHECK-PHASES: 7: compiler, {6}, ir, (device-openmp) // CHECK-PHASES: 8: offload, "host-openmp (x86_64-unknown-linux-gnu)" {2}, "device-openmp (amdgcn-amd-amdhsa)" {7}, ir -// CHECK-PHASES: 9: backend, {8}, assembler, (device-openmp) -// CHECK-PHASES: 10: assembler, {9}, object, (device-openmp) -// CHECK-PHASES: 11: linker, {10}, image, (device-openmp) -// CHECK-PHASES: 12: offload, "device-openmp (amdgcn-amd-amdhsa)" {11}, image -// CHECK-PHASES: 13: clang-offload-wrapper, {12}, ir, (host-openmp) -// CHECK-PHASES: 14: backend, {13}, assembler, (host-openmp) -// CHECK-PHASES: 15: assembler, {14}, object, (host-openmp) -// CHECK-PHASES: 16: linker, {4, 15}, image, (host-openmp) +// CHECK-PHASES: 9: linker, {8}, image, (device-openmp) +// CHECK-PHASES: 10: offload, "device-openmp (amdgcn-amd-amdhsa)" {9}, image +// CHECK-PHASES: 11: clang-offload-wrapper, {10}, ir, (host-openmp) +// CHECK-PHASES: 12: backend, {11}, assembler, (host-openmp) +// CHECK-PHASES: 13: assembler, {12}, object, (host-openmp) +// CHECK-PHASES: 14: linker, {4, 13}, image, (host-openmp) // handling of --libomptarget-amdgcn-bc-path // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET @@ -73,4 +71,4 @@ // CHECK-C: "x86_64-unknown-linux-gnu" - "offload bundler" // RUN: %clang -### 
--target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR -// CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm" +// CHECK-EMIT-LLVM-IR: clang{{.*}}" "-cc1" "-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm" diff --git a/clang/test/Driver/hip-rdc-device-only.hip b/clang/test/Driver/hip-rdc-device-only.hip --- a/clang/test/Driver/hip-rdc-device-only.hip +++ b/clang/test/Driver/hip-rdc-device-only.hip @@ -82,7 +82,7 @@ // COMMON-SAME: {{.*}} {{".*a.cu"}} // COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" -// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // COMMON-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" // COMMON: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -112,7 +112,7 @@ // COMMON-SAME: {{.*}} {{".*b.hip"}} // COMMON: "{{.*}}clang-offload-bundler" "-type={{(bc|ll)}}" -// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900" +// COMMON-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // COMMON-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.{{(bc|ll)}}" // SAVETEMP: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" @@ -142,7 +142,7 @@ // SAVETEMP-SAME: {{.*}} "-o" {{"a.*.ll"}} "-x" "ir" [[A_GFX900_TMP_BC]] // SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" -// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // SAVETEMP-SAME: "-outputs=a-hip-amdgcn-amd-amdhsa.ll" // SAVETEMP: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" "-aux-triple" "x86_64-unknown-linux-gnu" @@ -172,7 +172,7 @@ // SAVETEMP-SAME: {{.*}} "-o" 
{{"b.*.ll"}} "-x" "ir" [[B_GFX900_TMP_BC]] // SAVETEMP: "{{.*}}clang-offload-bundler" "-type=ll" -// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900" +// SAVETEMP-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // SAVETEMP-SAME: "-outputs=b-hip-amdgcn-amd-amdhsa.ll" // FAIL: error: cannot specify -o when generating multiple output files diff --git a/clang/test/Driver/hip-toolchain-rdc-separate.hip b/clang/test/Driver/hip-toolchain-rdc-separate.hip --- a/clang/test/Driver/hip-toolchain-rdc-separate.hip +++ b/clang/test/Driver/hip-toolchain-rdc-separate.hip @@ -44,7 +44,7 @@ // CHECK-SAME: {{.*}} [[A_SRC]] // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900,host-x86_64-unknown-linux-gnu" +// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900,host-x86_64-unknown-linux-gnu" // CHECK-SAME: "-outputs=[[A_O:.*a.o]]" "-inputs=[[A_BC1]],[[A_BC2]],[[A_OBJ_HOST]]" // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" @@ -79,7 +79,7 @@ // CHECK-SAME: {{.*}} [[B_SRC]] // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900,host-x86_64-unknown-linux-gnu" +// CHECK-SAME: "-targets=hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900,host-x86_64-unknown-linux-gnu" // CHECK-SAME: "-outputs=[[B_O:.*b.o]]" "-inputs=[[B_BC1]],[[B_BC2]],[[B_OBJ_HOST]]" // RUN: touch %T/a.o @@ -91,22 +91,22 @@ // RUN: 2>&1 | FileCheck -check-prefix=LINK %s // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900" +// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // LINK-SAME: "-inputs=[[A_O:.*a.o]]" "-outputs=[[A_OBJ_HOST:.*o]],{{.*o}},{{.*o}}" // 
LINK: "-unbundle" "-allow-missing-bundles" // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900" +// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // LINK-SAME: "-inputs=[[B_O:.*b.o]]" "-outputs=[[B_OBJ_HOST:.*o]],{{.*o}},{{.*o}}" // LINK: "-unbundle" "-allow-missing-bundles" // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900" +// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // LINK-SAME: "-inputs=[[A_O]]" "-outputs={{.*o}},[[A_BC1:.*o]],[[A_BC2:.*o]]" // LINK: "-unbundle" "-allow-missing-bundles" // LINK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" -// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx803,hip-amdgcn-amd-amdhsa--gfx900" +// LINK-SAME: "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // LINK-SAME: "-inputs=[[B_O]]" "-outputs={{.*o}},[[B_BC1:.*o]],[[B_BC2:.*o]]" // LINK: "-unbundle" "-allow-missing-bundles" diff --git a/clang/test/Driver/openmp-offload-multi.c b/clang/test/Driver/openmp-offload-multi.c new file mode 100644 --- /dev/null +++ b/clang/test/Driver/openmp-offload-multi.c @@ -0,0 +1,34 @@ +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: amdgpu-registered-target + +// +// Legacy mode (-fopenmp-targets,-Xopenmp-target,-march) tests for +// multi arch compilation +// +// RUN: %clang -### -target x86_64-linux-gnu -fopenmp\ +// RUN: -fopenmp-targets=amdgcn-amd-amdhsa,amdgcn-amd-amdhsa \ +// RUN: -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 \ +// RUN: -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908 \ +// RUN: %s 2>&1 | FileCheck %s + +// CHECK: clang{{.*}}"-cc1" "-triple" 
"x86_64-unknown-linux-gnu"{{.*}}"-x" "c"{{.*}} +// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "[[HOSTOBJ:.*.o]]" "-x" "ir"{{.*}} + +// compilation for offload target 1 : gfx906 +// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm-bc"{{.*}}"-target-cpu" "gfx906" "-fcuda-is-device"{{.*}}"-o" "{{.*}}.bc" "-x" "c"{{.*}}.c +// CHECK: llvm-link"{{.*}}openmp-offload-multi-{{.*}}.bc"{{.*}}"-o" "{{.*}}openmp-offload-multi-{{.*}}-gfx906-linked-{{.*}}.bc" +// CHECK: llc{{.*}}openmp-offload-multi-{{.*}}-gfx906-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj"{{.*}}"-o"{{.*}}openmp-offload-multi-{{.*}}-gfx906-{{.*}}.o" +// CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o" "[[GFX906OUT:.*.out]]" "{{.*}}openmp-offload-multi-{{.*}}-gfx906-{{.*}}.o" + +// compilation for offload target 1 : gfx908 +// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm-bc"{{.*}}"-target-cpu" "gfx908" "-fcuda-is-device"{{.*}}"-o" "{{.*}}.bc" "-x" "c"{{.*}}.c +// CHECK: llvm-link"{{.*}}openmp-offload-multi-{{.*}}.bc"{{.*}}"-o" "{{.*}}openmp-offload-multi-{{.*}}-gfx908-linked-{{.*}}.bc" +// CHECK: llc{{.*}}openmp-offload-multi-{{.*}}-gfx908-linked-{{.*}}.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx908" "-filetype=obj"{{.*}}"-o"{{.*}}openmp-offload-multi-{{.*}}-gfx908-{{.*}}.o" +// CHECK: lld{{.*}}"-flavor" "gnu" "--no-undefined" "-shared" "-o" "[[GFX908OUT:.*.out]]" "{{.*}}openmp-offload-multi-{{.*}}-gfx908-{{.*}}.o" + +// Combining device images for offload targets +// CHECK: clang-offload-wrapper"{{.*}}" "-o" "[[COMBINEDIR:.*.bc]]" "--requirements=gfx906" "[[GFX906OUT]]" "--requirements=gfx908" "[[GFX908OUT]]" + +// CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fopenmp-targets=amdgcn-amd-amdhsa,amdgcn-amd-amdhsa"{{.*}}"-o" "[[COMBINEDOBJ:.*.o]]" "-x" "ir" "[[COMBINEDIR]]" +// CHECK: ld.lld"{{.*}}" "-o" "a.out{{.*}}[[HOSTOBJ]]" "[[COMBINEDOBJ]]{{.*}}" 
"-lomp{{.*}}-lomptarget" diff --git a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp --- a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp +++ b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp @@ -60,6 +60,11 @@ cl::desc("Target triple for the output module"), cl::value_desc("triple"), cl::cat(ClangOffloadWrapperCategory)); +static cl::list + OffloadArchs("requirements", cl::desc("requirements contains offload-arch"), + cl::value_desc("requirements"), + cl::cat(ClangOffloadWrapperCategory)); + namespace { class BinaryWrapper { @@ -69,6 +74,7 @@ StructType *EntryTy = nullptr; StructType *ImageTy = nullptr; StructType *DescTy = nullptr; + StructType *ImageInfoTy = nullptr; private: IntegerType *getSizeTTy() { @@ -134,6 +140,27 @@ return PointerType::getUnqual(getBinDescTy()); } + // This matches the runtime struct definition of __tgt_image_info + // declared in openmp/libomptarget/include/omptarget.h / + // struct __tgt_image_info { + // int32_t version; + // int32_t image_number; + // int32_t number_images; + // char* requirements; + // char* target_compile_opts; + // }; + StructType *getImageInfoTy() { + if (!ImageInfoTy) + ImageInfoTy = StructType::create( + "__tgt_image_info", Type::getInt32Ty(C), Type::getInt32Ty(C), + Type::getInt32Ty(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C)); + return ImageInfoTy; + } + + PointerType *getImageInfoPtrTy() { + return PointerType::getUnqual(getImageInfoTy()); + } + /// Creates binary descriptor for the given device images. 
Binary descriptor /// is an object that is passed to the offloading runtime at program startup /// and it describes all device images available in the executable or shared @@ -245,7 +272,9 @@ ".omp_offloading.descriptor"); } - void createRegisterFunction(GlobalVariable *BinDesc) { + void createRegisterFunction(GlobalVariable *BinDesc, + ArrayRef<ArrayRef<char>> Requirements) { + auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, ".omp_offloading.descriptor_reg", &M); @@ -259,6 +288,47 @@ // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); + + // Call __tgt_register_image_info once per requirement string, ahead of the BinDesc registration call + auto *NullPtr = llvm::ConstantPointerNull::get(Builder.getInt8PtrTy()); + auto *Zero = ConstantInt::get(getSizeTTy(), 0u); + auto *RegInfoFuncTy = + FunctionType::get(Type::getVoidTy(C), getImageInfoPtrTy(), false); + FunctionCallee RegInfoFuncC = + M.getOrInsertFunction("__tgt_register_image_info", RegInfoFuncTy); + unsigned int img_count = 0; // image number of the requirement being emitted + for (ArrayRef<char> Requirement : Requirements) { + Constant *RequirementV = ConstantDataArray::get(C, Requirement); + auto *GV = + new GlobalVariable(M, RequirementV->getType(), /*isConstant*/ true, + GlobalValue::InternalLinkage, RequirementV, + Twine("__offload_arch_" + Twine(img_count))); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // store value of these variables (i.e. offload archs) into a custom + // section which will be used by "offload-arch -f". It won't be + // removed during binary stripping.
+ GV->setSection(".offload_arch_list"); + + auto *RequirementVPtr = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zero); + RequirementVPtr = + ConstantExpr::getBitCast(RequirementVPtr, Type::getInt8PtrTy(C)); + auto *InfoInit = ConstantStruct::get( + getImageInfoTy(), ConstantInt::get(Type::getInt32Ty(C), 1), // version + ConstantInt::get(Type::getInt32Ty(C), img_count), // image_number + ConstantInt::get(Type::getInt32Ty(C), (uint32_t)Requirements.size()), // number_images + RequirementVPtr, // requirements + NullPtr // TODO: capture target-compile-opts from clang driver + ); + auto *ImageInfoGV = new GlobalVariable( + M, InfoInit->getType(), + /*isConstant*/ true, GlobalValue::InternalLinkage, InfoInit, + Twine(".offload_image_info_" + Twine(img_count++))); + ImageInfoGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + Builder.CreateCall(RegInfoFuncC, ImageInfoGV); + } + Builder.CreateCall(RegFuncC, BinDesc); Builder.CreateRetVoid(); @@ -298,10 +368,11 @@ M.setTargetTriple(Target); } - const Module &wrapBinaries(ArrayRef<ArrayRef<char>> Binaries) { + const Module &wrapBinaries(ArrayRef<ArrayRef<char>> Binaries, + ArrayRef<ArrayRef<char>> Requirements) { GlobalVariable *Desc = createBinDesc(Binaries); assert(Desc && "no binary descriptor"); - createRegisterFunction(Desc); + createRegisterFunction(Desc, Requirements); createUnregisterFunction(Desc); return M; } @@ -363,10 +434,20 @@ return 1; } + SmallVector<ArrayRef<char>, 4u> Requirements; + Requirements.reserve(OffloadArchs.size()); + for (unsigned i = 0; i != OffloadArchs.size(); ++i) { + // std::string::data() is NUL-terminated; size() + 1 keeps that terminator. + Requirements.emplace_back(OffloadArchs[i].data(), + OffloadArchs[i].size() + 1); + } + // Create a wrapper for device binaries and write its bitcode to the file.
- WriteBitcodeToFile(BinaryWrapper(Target).wrapBinaries( - makeArrayRef(Images.data(), Images.size())), - Out.os()); + WriteBitcodeToFile( + BinaryWrapper(Target).wrapBinaries( + makeArrayRef(Images.data(), Images.size()), + makeArrayRef(Requirements.data(), Requirements.size())), + Out.os()); if (Out.os().has_error()) { reportError(createFileError(Output, Out.os().error())); return 1; diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -120,6 +120,44 @@ __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) }; +/// __tgt_image_info: +/// +/// The information in this struct is provided in clang-offload-wrapper +/// as a call to __tgt_register_image_info for each image in the library +/// of images also created by clang-offload-wrapper. +/// __tgt_register_image_info is called for each image BEFORE the single +/// call to __tgt_register_lib so that image information is available +/// before they are loaded. clang-offload-wrapper gets this image information +/// from command line arguments provided by the clang driver when it creates +/// the call to the clang-offload-wrapper command. +/// This architecture allows the binary image (pointed to by ImageStart and +/// ImageEnd in __tgt_device_image) to remain architecture independent. +/// That is, the architecture independent part of the libomptarget runtime +/// does not need to peer inside the image to determine if it is loadable +/// even though in most cases the image is an ELF object. +/// There is one __tgt_image_info for each __tgt_device_image. For backward +/// compatibility, no changes are allowed to either __tgt_device_image or +/// __tgt_bin_desc. The absence of __tgt_image_info is the indication that +/// the runtime is being used on a binary created by an old version of +/// the compiler.
+/// +struct __tgt_image_info { + int32_t version; // The version of this struct + int32_t image_number; // Image number in image library starting from 0 + int32_t number_images; // Number of images, used for initial allocation + char *requirements; // e.g. sm_30, sm_70, gfx906, includes features + char *compile_opts; // reserved for future use +}; + +/// __tgt_active_offload_env +/// +/// This structure is created by __tgt_get_active_offload_env and is used +/// to determine compatibility of the images with the current environment +/// that is "in play". +struct __tgt_active_offload_env { + char *capabilities; // string returned by offload-arch -r +}; + /// This struct contains the offload entries identified by the target runtime struct __tgt_target_table { __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries @@ -210,6 +248,13 @@ /// adds a target shared library to the target execution image void __tgt_register_lib(__tgt_bin_desc *desc); +/// adds an image information struct, called for each image +void __tgt_register_image_info(__tgt_image_info *imageInfo); + +/// gets pointer to image information for specified image number +/// Returns nullptr when none was registered (e.g. app built by an old compiler) +__tgt_image_info *__tgt_get_image_info(uint32_t image_num); + /// removes a target shared library from the target execution image void __tgt_unregister_lib(__tgt_bin_desc *desc); diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -2,6 +2,7 @@ global: __tgt_register_requires; __tgt_register_lib; + __tgt_register_image_info; __tgt_unregister_lib; __tgt_target_data_begin; __tgt_target_data_end; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -43,6 +43,36 @@ PM->RTLs.RegisterLib(desc); } +static __tgt_image_info
**__tgt_AllImageInfos; +static int __tgt_num_registered_images = 0; // capacity of the table above +EXTERN void __tgt_register_image_info(__tgt_image_info *imageInfo) { + + DP(" register_image_info image %d of %d requirements:%s VERSION:%d\n", + imageInfo->image_number, imageInfo->number_images, imageInfo->requirements, + imageInfo->version); + + // The table is sized by the first registration. calloc keeps slots that are + // never registered null, so lookups cannot return uninitialized pointers. + if (!__tgt_AllImageInfos && imageInfo->number_images > 0) { + __tgt_AllImageInfos = (__tgt_image_info **)calloc( + imageInfo->number_images, sizeof(__tgt_image_info *)); + if (__tgt_AllImageInfos) + __tgt_num_registered_images = imageInfo->number_images; + } + // Ignore records that do not fit the table instead of writing out of bounds. + if (imageInfo->image_number >= 0 && + imageInfo->image_number < __tgt_num_registered_images) + __tgt_AllImageInfos[imageInfo->image_number] = imageInfo; +} + +//////////////////////////////////////////////////////////////////////////////// +/// Return pointer to image information if it was registered, else nullptr +EXTERN __tgt_image_info *__tgt_get_image_info(uint32_t image_number) { + if (image_number < static_cast<uint32_t>(__tgt_num_registered_images)) + return __tgt_AllImageInfos[image_number]; + return nullptr; +} + //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { @@ -55,6 +85,11 @@ } } } + // Null the pointer as well: a later registration must reallocate instead of + // writing through a dangling pointer. + free(__tgt_AllImageInfos); + __tgt_AllImageInfos = nullptr; + __tgt_num_registered_images = 0; } /// creates host-to-target data mapping, stores it in the diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -20,6 +20,7 @@ #include <dlfcn.h> #include <mutex> #include <string> +#include <sys/stat.h> // List of all plugins that can support offloading. static const char *RTLNames[] = { @@ -288,18 +289,131 @@ flags, RequiresFlags); } +/// Query runtime capabilities of this system by running the arch query tool. +/// offload_arch_output_buffer is caller-provided persistent storage that backs +/// the string handed back through active_env by __tgt_get_active_offload_env.
+static void +__tgt_get_active_offload_env(__tgt_active_offload_env *active_env, + char *offload_arch_output_buffer, + size_t offload_arch_output_buffer_size) { + static char no_capabilities[] = ""; // callers read capabilities unchecked + active_env->capabilities = no_capabilities; + if (!offload_arch_output_buffer || offload_arch_output_buffer_size == 0) + return; + offload_arch_output_buffer[0] = '\0'; // stays empty on every failure path + active_env->capabilities = offload_arch_output_buffer; + char libomptarget_dir_name[PATH_MAX]; + void *handle = dlopen("libomptarget.so", RTLD_NOW); + if (!handle) { + DP("dlopen() failed: %s\n", dlerror()); + return; + } + int origin_rc = dlinfo(handle, RTLD_DI_ORIGIN, libomptarget_dir_name); + if (origin_rc == -1) + DP("RTLD_DI_ORIGIN failed: %s\n", dlerror()); + dlclose(handle); // the handle was only needed for the origin lookup + if (origin_rc == -1) + return; + std::string cmd_bin; + cmd_bin.assign(libomptarget_dir_name).append("/../bin/amdgpu-arch"); + struct stat stat_buffer; + FILE *stream = nullptr; + if (stat(cmd_bin.c_str(), &stat_buffer) || + !(stream = popen(cmd_bin.c_str(), "r"))) { + DP("Missing offload-arch command at %s \n", cmd_bin.c_str()); + return; + } + while (fgets(offload_arch_output_buffer, offload_arch_output_buffer_size, + stream) != NULL) + ; // each line overwrites the previous one, so the last line is kept + pclose(stream); + offload_arch_output_buffer[strcspn(offload_arch_output_buffer, "\n")] = '\0'; +} + +std::vector<std::string> _splitstrings(char *input, const char *sep) { + std::vector<std::string> split_strings; + if (!input || !sep || !*sep) // null input, or "" which would never advance + return split_strings; + std::string s(input); + std::string delimiter(sep); + size_t pos = 0; + while ((pos = s.find(delimiter)) != std::string::npos) { + if (pos != 0) // skip empty tokens between adjacent separators + split_strings.push_back(s.substr(0, pos)); + s.erase(0, pos + delimiter.length()); + } + if (!s.empty()) // keep the tail even when it is a single character + split_strings.push_back(s); + return split_strings; +} + +static bool _ImageIsCompatibleWithEnv(__tgt_image_info *img_info, + __tgt_active_offload_env *active_env) { + // No registered image information means the application was built with an + // old compiler: report the image as compatible so the RTLs still check it.
+ if (!img_info) + return true; + // Requirements (img_info->requirements) are separated by "__"; capabilities + // (active_env->capabilities, printed by the arch query tool) by spaces. The + // image is compatible only if every requirement has a matching capability. + std::vector<std::string> reqs = _splitstrings(img_info->requirements, "__"); + std::vector<std::string> caps = _splitstrings(active_env->capabilities, " "); + bool is_compatible = true; + for (const auto &req : reqs) { + bool missing_capability = true; + for (const auto &capability : caps) + if (capability == req) + missing_capability = false; + if (missing_capability) { + DP("Image requires %s but runtime capability %s is missing.\n", + img_info->requirements, req.c_str()); + is_compatible = false; + } + } + return is_compatible; +} + +#define MAX_CAPS_STR_SIZE 1024 void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { + + // Get the current active offload environment + __tgt_active_offload_env offload_env; + // Need a buffer to hold results of offload-arch -c command + size_t offload_arch_output_buffer_size = MAX_CAPS_STR_SIZE; + char *offload_arch_output_buffer = + (char *)malloc(offload_arch_output_buffer_size); + __tgt_get_active_offload_env(&offload_env, offload_arch_output_buffer, + offload_arch_output_buffer_size); + + bool requires_usm = (bool)(RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY); + bool has_xnack = (std::string(offload_env.capabilities).find("xnack+") != + std::string::npos); + bool is_amd = (std::string(offload_env.capabilities).find("gfx") == 0); + if (is_amd && requires_usm && !has_xnack) { + fprintf(stderr, "WARNING: USM SET WITHOUT XNACK ENABLED.\n"); + fprintf(stderr, " THIS WILL BECOME FATAL ERROR IN FUTURE.\n"); + } +#if 0 + FATAL_MESSAGE0(1, "'#pragma omp requires unified_shared_memory' requires " + "environment with xnack+ capability!"); +#endif + + RTLInfoTy *FoundRTL =
NULL; PM->RTLsMtx.lock(); // Register the images with the RTLs that understand them, if any. for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { // Obtain the image. __tgt_device_image *img = &desc->DeviceImages[i]; - RTLInfoTy *FoundRTL = NULL; - + // Get corresponding image info requirements and check with runtime + __tgt_image_info *img_info = __tgt_get_image_info(i); + if (!_ImageIsCompatibleWithEnv(img_info, &offload_env)) + continue; + FoundRTL = NULL; // Scan the RTLs that have associated images until we find one that supports // the current image. for (auto &R : AllRTLs) { + if (!R.is_valid_binary(img)) { DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", DPxPTR(img->ImageStart), R.RTLName.c_str()); @@ -368,7 +482,41 @@ } PM->RTLsMtx.unlock(); + if (!FoundRTL) { + if (PM->TargetOffloadPolicy == tgt_mandatory) + fprintf(stderr, "ERROR:\ + Runtime capabilities do NOT meet any offload image requirements\n\ + and the OMP_TARGET_OFFLOAD policy is mandatory. Terminating!\n\ + Runtime capabilities : %s\n", + offload_env.capabilities); + else if (PM->TargetOffloadPolicy == tgt_disabled) + fprintf(stderr, "WARNING: Offloading is disabled.\n"); + else + fprintf( + stderr, + "WARNING: Runtime capabilities do NOT meet any image requirements.\n\ + So device offloading is now disabled.\n\ + Runtime capabilities : %s\n", + offload_env.capabilities); + if (PM->TargetOffloadPolicy != tgt_disabled) { + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + __tgt_image_info *img_info = __tgt_get_image_info(i); + if (img_info) + fprintf(stderr, "\ + Image %d requirements : %s\n", + i, img_info->requirements); + else + fprintf(stderr, "\ + Image %d has no requirements. Could be from older compiler\n", + i); + } + } + if (PM->TargetOffloadPolicy == tgt_mandatory) + exit(1); + } + DP("Done registering entries!\n"); + free(offload_arch_output_buffer); } void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {