Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -12,6 +12,7 @@ #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" namespace llvm { @@ -26,6 +27,8 @@ namespace clang { namespace driver { +class ToolChain; + /// Action - Represent an abstract compilation step to perform. /// /// An action represents an edge in the compilation graph; typically @@ -49,8 +52,7 @@ enum ActionClass { InputClass = 0, BindArchClass, - CudaDeviceClass, - CudaHostClass, + OffloadClass, PreprocessJobClass, PrecompileJobClass, AnalyzeJobClass, @@ -70,10 +72,6 @@ // The offloading kind determines if this action is binded to a particular // programming model. Each entry reserves one bit. - // - // FIXME: This is currently used to indicate that toolchains are used in a - // given programming as well, but will be used here as well once a generic - // offloading action is implemented. enum OffloadKind { OFFLOAD_None = 0x00, OFFLOAD_CUDA = 0x01, @@ -90,13 +88,24 @@ ActionList Inputs; protected: + /// Offload information. It has to be mutable as it needs to be adjusted if + /// actions are integrated. + /// \brief Multiple programming models may be supported simultaneously by the + /// same host. Therefore, the host offloading kind is a combination of kinds. + mutable unsigned OffloadingHostKind; + /// \brief Offloading kind of the device. + mutable OffloadKind OffloadingDeviceKind; + /// \brief The Offloading architecture associated with this action. 
+ mutable const char *OffloadingArch; + Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {} Action(ActionClass Kind, Action *Input, types::ID Type) : Action(Kind, ActionList({Input}), Type) {} Action(ActionClass Kind, Action *Input) : Action(Kind, ActionList({Input}), Input->getType()) {} Action(ActionClass Kind, const ActionList &Inputs, types::ID Type) - : Kind(Kind), Type(Type), Inputs(Inputs) {} + : Kind(Kind), Type(Type), Inputs(Inputs), OffloadingHostKind(0u), + OffloadingDeviceKind(OFFLOAD_None), OffloadingArch(nullptr) {} public: virtual ~Action(); @@ -119,6 +128,36 @@ input_const_range inputs() const { return input_const_range(input_begin(), input_end()); } + + std::string getOffloadingKindPrefix() const; + std::string getOffloadingFileNamePrefix(const ToolChain *TC) const; + + /// \brief Set the device offload info of this action and propagate it to its + /// dependences. + void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) const; + /// \brief Append the host offload info of this action and propagate it to its + /// dependences. + void propagateHostOffloadInfo(unsigned OKinds, const char *OArch) const; + /// \brief Set the offload info of this action to be the same as the provided + /// action, and propagate it to its dependences. + void propagateOffloadInfo(const Action *A) const; + + unsigned getOffloadingHostKinds() const { return OffloadingHostKind; } + OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; } + const char *getOffloadingArch() const { return OffloadingArch; } + + /// \brief Check if this action have any offload kinds. Note that host offload + /// kinds are only set if the action is a dependence to an host offload + /// action. 
+ bool isHostOffloading(OffloadKind OKind) const { + return OffloadingHostKind & OKind; + } + bool isDeviceOffloading(OffloadKind OKind) const { + return OffloadingDeviceKind == OKind; + } + bool isOffloading(OffloadKind OKind) const { + return isHostOffloading(OKind) || isDeviceOffloading(OKind); + } }; class InputAction : public Action { @@ -151,43 +190,102 @@ } }; -class CudaDeviceAction : public Action { +/// \brief An offload action combines host or/and device actions according to +/// the programming model implementation needs and propagates the offloading +/// kind to its dependences. +class OffloadAction : public Action { virtual void anchor(); - /// GPU architecture to bind. Always of the form /sm_\d+/ or null (when the - /// action applies to multiple architectures). - const char *GpuArchName; - /// True when action results are not consumed by the host action (e.g when - /// -fsyntax-only or --cuda-device-only options are used). - bool AtTopLevel; - public: - CudaDeviceAction(Action *Input, const char *ArchName, bool AtTopLevel); + /// \brief Type used to communicate device actions. It associates bound + /// architecture, toolchain, and offload kind to each action. + class DeviceDependences { + public: + typedef SmallVector ToolChainList; + typedef SmallVector BoundArchList; + typedef SmallVector OffloadKindList; + + private: + /// \brief The dependence action. + ActionList AL; + /// \brief The offloading toolchains that should be used with the action. + SmallVector TCL; + /// \brief The architectures that should be used with this action. + SmallVector BAL; + /// \brief The offload kind of each dependence. + SmallVector KL; + + public: + /// \brief Add a action along with the associated toolchain, bound arch, and + /// offload kind. + void add(Action *A, const ToolChain *TC, const char *BoundArch, + OffloadKind OKind); + + /// \brief Get each of the individual arrays. 
+ const ActionList &getActions() const { return AL; }; + const ToolChainList &getToolChains() const { return TCL; }; + const BoundArchList &getBoundArchs() const { return BAL; }; + const OffloadKindList &getOffloadKinds() const { return KL; }; + }; + + /// \brief Type used to communicate host actions. It associates bound + /// architecture, toolchain, and offload kinds to each action. + class HostDependence { + /// \brief The dependence action. + Action *A; + /// \brief The offloading toolchain that should be used with the action. + const ToolChain *TC; + /// \brief The architectures that should be used with this action. + const char *BoundArch; + /// \brief The offload kind of each dependence. + unsigned OffloadKinds; + + public: + HostDependence(Action *A, const ToolChain *TC, const char *BoundArch, + const unsigned OffloadKinds) + : A(A), TC(TC), BoundArch(BoundArch), OffloadKinds(OffloadKinds){}; + /// \brief Constructor version that obtains the offload kinds from the + /// device dependencies. + HostDependence(Action *A, const ToolChain *TC, const char *BoundArch, + const DeviceDependences &DDeps); + Action *getAction() const { return A; }; + const ToolChain *getToolChain() const { return TC; }; + const char *getBoundArch() const { return BoundArch; }; + unsigned getOffloadKinds() const { return OffloadKinds; }; + }; - const char *getGpuArchName() const { return GpuArchName; } + typedef llvm::function_ref + OffloadActionWorkTy; - /// Gets the compute_XX that corresponds to getGpuArchName(). Returns null - /// when getGpuArchName() is null. - const char *getComputeArchName() const; +private: + /// \brief The offloading toolchain that should be used with the action. + const ToolChain *HostTC; - bool isAtTopLevel() const { return AtTopLevel; } + /// \brief The tool chains associated with the list of actions. 
+ DeviceDependences::ToolChainList DevToolChains; - static bool IsValidGpuArchName(llvm::StringRef ArchName); +public: + OffloadAction(const HostDependence &HDep); + OffloadAction(const DeviceDependences &DDeps, types::ID Ty); + OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps); - static bool classof(const Action *A) { - return A->getKind() == CudaDeviceClass; - } -}; + /// \brief Execute the work specified in \a Work on the host dependence. + void doOnHostDependence(const OffloadActionWorkTy &Work) const; -class CudaHostAction : public Action { - virtual void anchor(); - ActionList DeviceActions; + /// \brief Execute the work specified in \a Work on each device dependence. + void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const; -public: - CudaHostAction(Action *Input, const ActionList &DeviceActions); + /// \brief Execute the work specified in \a Work on each dependence. + void doOnEachDependence(const OffloadActionWorkTy &Work) const; + + /// \brief Return the host dependence of this action, or null if we don't have + /// any. + Action *getHostDependence() const; - const ActionList &getDeviceActions() const { return DeviceActions; } + /// \brief Return the single device dependence of this action, or null if we + /// don't have one or we have more than one. + Action *getSingleDeviceDependence() const; - static bool classof(const Action *A) { return A->getKind() == CudaHostClass; } + static bool classof(const Action *A) { return A->getKind() == OffloadClass; } }; class JobAction : public Action { Index: include/clang/Driver/Driver.h =================================================================== --- include/clang/Driver/Driver.h +++ include/clang/Driver/Driver.h @@ -415,12 +415,11 @@ /// \param BoundArch - The bound architecture. /// \param AtTopLevel - Whether this is a "top-level" action. /// \param MultipleArchs - Whether multiple -arch options were supplied. 
- const char *GetNamedOutputPath(Compilation &C, - const JobAction &JA, - const char *BaseInput, - const char *BoundArch, - bool AtTopLevel, - bool MultipleArchs) const; + /// \param TC - Toolchain associated with the output. + const char *GetNamedOutputPath(Compilation &C, const JobAction &JA, + const char *BaseInput, const char *BoundArch, + bool AtTopLevel, bool MultipleArchs, + const ToolChain *TC) const; /// GetTemporaryPath - Return the pathname of a temporary file to use /// as part of compilation; the file will have the given prefix and suffix. Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp +++ lib/Driver/Action.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "clang/Driver/Action.h" +#include "clang/Driver/ToolChain.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" @@ -21,8 +22,7 @@ switch (AC) { case InputClass: return "input"; case BindArchClass: return "bind-arch"; - case CudaDeviceClass: return "cuda-device"; - case CudaHostClass: return "cuda-host"; + case OffloadClass: return "offload"; case PreprocessJobClass: return "preprocessor"; case PrecompileJobClass: return "precompiler"; case AnalyzeJobClass: return "analyzer"; @@ -40,6 +40,79 @@ llvm_unreachable("invalid class"); } +void Action::propagateDeviceOffloadInfo(OffloadKind OKind, + const char *OArch) const { + // Offload action set its own kinds on their dependences. 
+ if (Kind == OffloadClass) + return; + + assert( + (OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFFLOAD_None) && + "Setting device kind to a different device??"); + assert(!OffloadingHostKind && "Setting a device kind in a host action??"); + OffloadingDeviceKind = OKind; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch); +} + +void Action::propagateHostOffloadInfo(unsigned OKinds, + const char *OArch) const { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert(OffloadingDeviceKind == OFFLOAD_None && + "Setting a host kind in a device action."); + OffloadingHostKind |= OKinds; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateHostOffloadInfo(OffloadingHostKind, OArch); +} + +void Action::propagateOffloadInfo(const Action *A) const { + if (unsigned HK = A->getOffloadingHostKinds()) + propagateHostOffloadInfo(HK, A->getOffloadingArch()); + else + propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(), + A->getOffloadingArch()); +} + +std::string Action::getOffloadingKindPrefix() const { + switch (OffloadingDeviceKind) { + case OFFLOAD_None: + break; + case OFFLOAD_CUDA: + return "device-cuda"; + // Add other programming models here. + } + + if (!OffloadingHostKind) + return ""; + + std::string Res("host"); + if (OffloadingHostKind & OFFLOAD_CUDA) + Res += "-cuda"; + // Add other programming models here. + + return Res; +} + +std::string Action::getOffloadingFileNamePrefix(const ToolChain *TC) const { + // A file prefix is only generated for device actions and consists of the + // offload kind and triple. 
+ if (!OffloadingDeviceKind) + return ""; + + std::string Res("-"); + Res += getOffloadingKindPrefix(); + Res += "-"; + Res += TC->getTriple().normalize(); + return Res; +} + void InputAction::anchor() {} InputAction::InputAction(const Arg &_Input, types::ID _Type) @@ -51,45 +124,106 @@ BindArchAction::BindArchAction(Action *Input, const char *_ArchName) : Action(BindArchClass, Input), ArchName(_ArchName) {} -// Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual -// compute arch, e.g. "compute_20". Returns null if the input arch is null or -// doesn't match an existing arch. -static const char* GpuArchToComputeName(const char *ArchName) { - if (!ArchName) - return nullptr; - return llvm::StringSwitch(ArchName) - .Cases("sm_20", "sm_21", "compute_20") - .Case("sm_30", "compute_30") - .Case("sm_32", "compute_32") - .Case("sm_35", "compute_35") - .Case("sm_37", "compute_37") - .Case("sm_50", "compute_50") - .Case("sm_52", "compute_52") - .Case("sm_53", "compute_53") - .Default(nullptr); +void OffloadAction::anchor() {} + +OffloadAction::OffloadAction(const HostDependence &HDep) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) { + OffloadingArch = HDep.getBoundArch(); + OffloadingHostKind = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); +}; + +OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty) + : Action(OffloadClass, DDeps.getActions(), Ty), HostTC(nullptr), + DevToolChains(DDeps.getToolChains()) { + auto &OKinds = DDeps.getOffloadKinds(); + auto &BArchs = DDeps.getBoundArchs(); + + // If we have a single dependency, inherit the offloading info from it. + if (OKinds.size() == 1) { + OffloadingDeviceKind = OKinds.front(); + OffloadingArch = BArchs.front(); + } + // Propagate info to the dependencies. 
+ for (unsigned i = 0; i < getInputs().size(); ++i) + getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]); } -void CudaDeviceAction::anchor() {} +OffloadAction::OffloadAction(const HostDependence &HDep, + const DeviceDependences &DDeps) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()), + DevToolChains(DDeps.getToolChains()) { + // We use the kinds of the host dependence for this action. + OffloadingArch = HDep.getBoundArch(); + OffloadingHostKind = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); + + // Add device inputs and propagate info to the device actions. + for (unsigned i = 0; i < DDeps.getActions().size(); ++i) { + auto *A = DDeps.getActions()[i]; + // Skip actions of empty dependences. + if (!A) + continue; + getInputs().push_back(A); + A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i], + DDeps.getBoundArchs()[i]); + } +} -CudaDeviceAction::CudaDeviceAction(Action *Input, const char *ArchName, - bool AtTopLevel) - : Action(CudaDeviceClass, Input), GpuArchName(ArchName), - AtTopLevel(AtTopLevel) { - assert(!GpuArchName || IsValidGpuArchName(GpuArchName)); +void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const { + if (!HostTC) + return; + auto *A = getInputs().front(); + Work(A, HostTC, A->getOffloadingArch()); } -const char *CudaDeviceAction::getComputeArchName() const { - return GpuArchToComputeName(GpuArchName); +void OffloadAction::doOnEachDeviceDependence( + const OffloadActionWorkTy &Work) const { + auto I = getInputs().begin(); + auto E = getInputs().end(); + if (I == E) + return; + + // Skip host action + if (HostTC) + ++I; + + auto TI = DevToolChains.begin(); + for (; I != E; ++I) + Work(*I, *TI, (*I)->getOffloadingArch()); } -bool CudaDeviceAction::IsValidGpuArchName(llvm::StringRef ArchName) { - return GpuArchToComputeName(ArchName.data()) != nullptr; +void OffloadAction::doOnEachDependence(const 
OffloadActionWorkTy &Work) const { + doOnHostDependence(Work); + doOnEachDeviceDependence(Work); } -void CudaHostAction::anchor() {} +Action *OffloadAction::getHostDependence() const { + return HostTC ? getInputs().front() : nullptr; +} + +Action *OffloadAction::getSingleDeviceDependence() const { + return (!HostTC && getInputs().size() == 1) ? getInputs().front() : nullptr; +} -CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions) - : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {} +void OffloadAction::DeviceDependences::add(Action *A, const ToolChain *TC, + const char *BoundArch, + OffloadKind OKind) { + AL.push_back(A); + TCL.push_back(TC); + BAL.push_back(BoundArch); + KL.push_back(OKind); +} + +OffloadAction::HostDependence::HostDependence(Action *A, const ToolChain *TC, + const char *BoundArch, + const DeviceDependences &DDeps) + : A(A), TC(TC), BoundArch(BoundArch), OffloadKinds(0u) { + for (auto K : DDeps.getOffloadKinds()) + OffloadKinds |= K; +} void JobAction::anchor() {} Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -987,18 +987,33 @@ } else if (BindArchAction *BIA = dyn_cast(A)) { os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->input_begin(), Ids) << "}"; - } else if (CudaDeviceAction *CDA = dyn_cast(A)) { - os << '"' - << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)") - << '"' << ", {" << PrintActions1(C, *CDA->input_begin(), Ids) << "}"; + } else if (OffloadAction *OA = dyn_cast(A)) { + bool IsFirst = true; + OA->doOnEachDependence( + [&](Action *A, const ToolChain *TC, const char *BoundArch) { + // E.g. 
for two CUDA device dependences whose bound arch is sm_20 and + // sm_35 this will generate: + // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device" + // (nvptx64-nvidia-cuda:sm_35) {#ID} + if (!IsFirst) + os << ", "; + os << '"'; + if (TC) + os << A->getOffloadingKindPrefix(); + else + os << "host"; + os << " ("; + os << TC->getTriple().normalize(); + + if (BoundArch) + os << ":" << BoundArch; + os << ")"; + os << '"'; + os << " {" << PrintActions1(C, A, Ids) << "}"; + IsFirst = false; + }); } else { - const ActionList *AL; - if (CudaHostAction *CHA = dyn_cast(A)) { - os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}" - << ", gpu binaries "; - AL = &CHA->getDeviceActions(); - } else - AL = &A->getInputs(); + const ActionList *AL = &A->getInputs(); if (AL->size()) { const char *Prefix = "{"; @@ -1011,10 +1026,24 @@ os << "{}"; } + // Append offload info for all options other than the offloading action + // itself (e.g. (cuda-device, sm_20) or (cuda-host)). + std::string offload_str; + llvm::raw_string_ostream offload_os(offload_str); + if (!isa(A)) { + auto S = A->getOffloadingKindPrefix(); + if (!S.empty()) { + offload_os << ", (" << S; + if (A->getOffloadingArch()) + offload_os << ", " << A->getOffloadingArch(); + offload_os << ")"; + } + } + unsigned Id = Ids.size(); Ids[A] = Id; llvm::errs() << Id << ": " << os.str() << ", " - << types::getTypeName(A->getType()) << "\n"; + << types::getTypeName(A->getType()) << offload_os.str() << "\n"; return Id; } @@ -1327,8 +1356,12 @@ options::OPT_cuda_device_only); // Host-only compilation case. 
if (PartialCompilationArg && - PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) - return C.MakeAction(HostAction, ActionList()); + PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) { + OffloadAction::HostDependence HDep( + HostAction, C.getOffloadingHostToolChain(), /*BoundArch=*/nullptr, + Action::OFFLOAD_CUDA); + return C.MakeAction(HDep); + } // Collect all cuda_gpu_arch parameters, removing duplicates. SmallVector GpuArchList; @@ -1339,7 +1372,7 @@ A->claim(); const auto& Arch = A->getValue(); - if (!CudaDeviceAction::IsValidGpuArchName(Arch)) + if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch)) C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch; else if (GpuArchNames.insert(Arch).second) GpuArchList.push_back(Arch); @@ -1355,9 +1388,11 @@ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); + const ToolChain *CudaTC = + C.getSingleOffloadDeviceToolChain(); + // Build actions for all device inputs. - assert(C.getSingleOffloadDeviceToolChain() && - "Missing toolchain for device-side compilation."); + assert(CudaTC && "Missing toolchain for device-side compilation."); ActionList CudaDeviceActions; C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions); assert(GpuArchList.size() == CudaDeviceActions.size() && @@ -1385,10 +1420,13 @@ return nullptr; } - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - Actions.push_back(C.MakeAction(CudaDeviceActions[I], - GpuArchList[I], - /* AtTopLevel */ true)); + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(CudaDeviceActions[I], CudaTC, GpuArchList[I], + Action::OFFLOAD_CUDA); + Actions.push_back( + C.MakeAction(DDep, CudaDeviceActions[I]->getType())); + } // Kill host action in case of device-only compilation. 
if (DeviceOnlyCompilation) return nullptr; @@ -1408,19 +1446,23 @@ Action* BackendAction = AssembleAction->getInputs()[0]; assert(BackendAction->getType() == types::TY_PP_Asm); - for (const auto& A : {AssembleAction, BackendAction}) { - DeviceActions.push_back(C.MakeAction( - A, GpuArchList[I], /* AtTopLevel */ false)); + for (auto &A : {AssembleAction, BackendAction}) { + OffloadAction::DeviceDependences DDep; + DDep.add(A, CudaTC, GpuArchList[I], Action::OFFLOAD_CUDA); + DeviceActions.push_back(C.MakeAction(DDep, A->getType())); } } - auto FatbinAction = C.MakeAction( - C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN), - /* GpuArchName = */ nullptr, - /* AtTopLevel = */ false); + auto FatbinAction = + C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); + // Return a new host action that incorporates original host action and all // device actions. - return C.MakeAction(std::move(HostAction), - ActionList({FatbinAction})); + OffloadAction::HostDependence HDep(HostAction, C.getOffloadingHostToolChain(), + /*BoundArch=*/nullptr, + Action::OFFLOAD_CUDA); + OffloadAction::DeviceDependences DDep; + DDep.add(FatbinAction, CudaTC, /*BoundArch=*/nullptr, Action::OFFLOAD_CUDA); + return C.MakeAction(HDep, DDep); } void Driver::BuildActions(Compilation &C, DerivedArgList &Args, @@ -1825,7 +1867,28 @@ } } } - +// Collapse an offloading action looking for a job of the given type. The input +// action is changed to the input of the collapsed sequence. If we effectively +// had a collapse return the corresponding offloading action, otherwise return +// null. 
+template +static OffloadAction *collapseOffloadingAction(Action *&CurAction) { + if (!CurAction) + return nullptr; + if (auto *OA = dyn_cast(CurAction)) { + if (auto *HDep = OA->getHostDependence()) + if (isa(HDep)) { + CurAction = HDep; + return OA; + } + if (auto *DDep = OA->getSingleDeviceDependence()) + if (isa(DDep)) { + CurAction = DDep; + return OA; + } + } + return nullptr; +} // Returns a Tool for a given JobAction. In case the action and its // predecessors can be combined, updates Inputs with the inputs of the // first combined action. If one of the collapsed actions is a @@ -1835,34 +1898,39 @@ bool EmbedBitcode, const ToolChain *TC, const JobAction *JA, const ActionList *&Inputs, - const CudaHostAction *&CollapsedCHA) { + ActionList &CollapsedOffloadAction) { const Tool *ToolForJob = nullptr; - CollapsedCHA = nullptr; + CollapsedOffloadAction.clear(); // See if we should look for a compiler with an integrated assembler. We match // bottom up, so what we are actually looking for is an assembler job with a // compiler input. + // Look through offload actions between assembler and backend actions. + Action *BackendJA = (isa(JA) && Inputs->size() == 1) + ? *Inputs->begin() + : nullptr; + auto *BackendOA = collapseOffloadingAction(BackendJA); + if (TC->useIntegratedAs() && !SaveTemps && !C.getArgs().hasArg(options::OPT_via_file_asm) && !C.getArgs().hasArg(options::OPT__SLASH_FA) && - !C.getArgs().hasArg(options::OPT__SLASH_Fa) && - isa(JA) && Inputs->size() == 1 && - isa(*Inputs->begin())) { + !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA && + isa(BackendJA)) { // A BackendJob is always preceded by a CompileJob, and without -save-temps // or -fembed-bitcode, they will always get combined together, so instead of // checking the backend tool, check if the tool for the CompileJob has an // integrated assembler. For -fembed-bitcode, CompileJob is still used to // look up tools for BackendJob, but they need to match before we can split // them. 
- const ActionList *BackendInputs = &(*Inputs)[0]->getInputs(); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*BackendInputs->begin()); - JobAction *CompileJA = cast( - CHA ? *CHA->input_begin() : *BackendInputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *BackendJA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; // When using -fembed-bitcode, it is required to have the same tool (clang) @@ -1876,7 +1944,15 @@ if (Compiler->hasIntegratedAssembler()) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + // Save the collapsed offload actions because they may still contain + // device action. Also propagate the offloading info of the inputs to the + // other action that are being integrated. + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); + if (BackendOA) + CollapsedOffloadAction.push_back(BackendOA); + if (CompileOA || BackendOA) + JA->propagateOffloadInfo(CompileJA); } } @@ -1886,20 +1962,25 @@ if (isa(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*Inputs->begin()); - JobAction *CompileJA = - cast(CHA ? 
*CHA->input_begin() : *Inputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *JA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; if (!Compiler->canEmitIR() || (!SaveTemps && !EmbedBitcode)) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + + if (CompileOA) { + CollapsedOffloadAction.push_back(CompileOA); + JA->propagateOffloadInfo(CompileJA); + } } } @@ -1910,12 +1991,23 @@ // See if we should use an integrated preprocessor. We do so when we have // exactly one input, since this is the only use case we care about // (irrelevant since we don't support combine yet). - if (Inputs->size() == 1 && isa(*Inputs->begin()) && + + // Look through offload actions after preprocessing. + Action *PreprocessJA = (Inputs->size() == 1) ? 
*Inputs->begin() : nullptr; + auto *PreprocessOA = + collapseOffloadingAction(PreprocessJA); + + if (PreprocessJA && isa(PreprocessJA) && !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && !C.getArgs().hasArg(options::OPT_rewrite_objc) && - ToolForJob->hasIntegratedCPP()) - Inputs = &(*Inputs)[0]->getInputs(); + ToolForJob->hasIntegratedCPP()) { + Inputs = &PreprocessJA->getInputs(); + if (PreprocessOA) { + CollapsedOffloadAction.push_back(PreprocessOA); + JA->propagateOffloadInfo(PreprocessJA); + } + } return ToolForJob; } @@ -1952,17 +2044,31 @@ const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); - InputInfoList CudaDeviceInputInfos; - if (const CudaHostAction *CHA = dyn_cast(A)) { - // Append outputs of device jobs to the input list. - for (const Action *DA : CHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, nullptr, AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); + InputInfoList OffloadDeviceInputInfos; + if (const OffloadAction *OA = dyn_cast(A)) { + Action *HostAction = nullptr; + OA->doOnEachDeviceDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDeviceInputInfos.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults)); + }); + OA->doOnHostDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + HostAction = DepA; + }); + + // If we have a single device action, just return its info. + if (!HostAction && OffloadDeviceInputInfos.size() == 1) { + return OffloadDeviceInputInfos.back(); } + + assert(HostAction && "Device actions are only expected to be used by the " + "host, not by each other."); + // Override current action with a real host compile action and continue // processing it. 
- A = *CHA->input_begin(); + A = HostAction; } if (const InputAction *IA = dyn_cast(A)) { @@ -1992,38 +2098,27 @@ MultipleArchs, LinkingOutput, CachedResults); } - if (const CudaDeviceAction *CDA = dyn_cast(A)) { - // Initial processing of CudaDeviceAction carries host params. - // Call BuildJobsForAction() again, now with correct device parameters. - InputInfo II = BuildJobsForAction( - C, *CDA->input_begin(), - C.getSingleOffloadDeviceToolChain(), - CDA->getGpuArchName(), CDA->isAtTopLevel(), /*MultipleArchs=*/true, - LinkingOutput, CachedResults); - // Currently II's Action is *CDA->input_begin(). Set it to CDA instead, so - // that one can retrieve II's GPU arch. - II.setAction(A); - return II; - } const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast(A); - const CudaHostAction *CollapsedCHA = nullptr; + ActionList CollapsedOffloadActions; + const Tool *T = selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA, - Inputs, CollapsedCHA); + Inputs, CollapsedOffloadActions); if (!T) return InputInfo(); - // If we've collapsed action list that contained CudaHostAction we + // If we've collapsed action list that contained OffloadAction we // need to build jobs for device-side inputs it may have held. - if (CollapsedCHA) { - for (const Action *DA : CollapsedCHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, "", AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); - } + for (const auto *OA : CollapsedOffloadActions) { + cast(OA)->doOnEachDeviceDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDeviceInputInfos.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults)); + }); } // Only use pipes when there is exactly one input. 
@@ -2047,9 +2142,10 @@ if (JA->getType() == types::TY_dSYM) BaseInput = InputInfos[0].getFilename(); - // Append outputs of cuda device jobs to the input list - if (CudaDeviceInputInfos.size()) - InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end()); + // Append outputs of offload device jobs to the input list + if (!OffloadDeviceInputInfos.empty()) + InputInfos.append(OffloadDeviceInputInfos.begin(), + OffloadDeviceInputInfos.end()); // Determine the place to write output to, if any. InputInfo Result; @@ -2057,7 +2153,7 @@ Result = InputInfo(A, BaseInput); else Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch, - AtTopLevel, MultipleArchs), + AtTopLevel, MultipleArchs, TC), BaseInput); if (CCCPrintBindings && !CCGenDiagnostics) { @@ -2117,7 +2213,8 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, const char *BaseInput, const char *BoundArch, bool AtTopLevel, - bool MultipleArchs) const { + bool MultipleArchs, + const ToolChain *TC) const { llvm::PrettyStackTraceString CrashInfo("Computing output path"); // Output to a user requested destination? 
if (AtTopLevel && !isa<DsymutilJobAction>(JA) && !isa<VerifyJobAction>(JA)) { @@ -2203,6 +2300,7 @@ MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image); } else if (MultipleArchs && BoundArch) { SmallString<128> Output(getDefaultImageName()); + Output += JA.getOffloadingFileNamePrefix(TC); Output += "-"; Output.append(BoundArch); NamedOutput = C.getArgs().MakeArgString(Output.c_str()); @@ -2219,6 +2317,7 @@ if (!types::appendSuffixForType(JA.getType())) End = BaseName.rfind('.'); SmallString<128> Suffixed(BaseName.substr(0, End)); + Suffixed += JA.getOffloadingFileNamePrefix(TC); if (MultipleArchs && BoundArch) { Suffixed += "-"; Suffixed.append(BoundArch); Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -248,8 +248,7 @@ case Action::InputClass: case Action::BindArchClass: - case Action::CudaDeviceClass: - case Action::CudaHostClass: + case Action::OffloadClass: case Action::LipoJobClass: case Action::DsymutilJobClass: case Action::VerifyDebugInfoJobClass: Index: lib/Driver/ToolChains.h =================================================================== --- lib/Driver/ToolChains.h +++ lib/Driver/ToolChains.h @@ -833,6 +833,11 @@ // ptxas. bool useIntegratedAs() const override { return false; } + // Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual + // compute arch, e.g. "compute_20". Returns null if the input arch is null or + // doesn't match an existing arch.
+ static const char *GpuArchToComputeName(const char *ArchName); + protected: Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains.cpp =================================================================== --- lib/Driver/ToolChains.cpp +++ lib/Driver/ToolChains.cpp @@ -4291,6 +4291,21 @@ return DAL; } +const char *CudaToolChain::GpuArchToComputeName(const char *ArchName) { + if (!ArchName) + return nullptr; + return llvm::StringSwitch<const char *>(ArchName) + .Cases("sm_20", "sm_21", "compute_20") + .Case("sm_30", "compute_30") + .Case("sm_32", "compute_32") + .Case("sm_35", "compute_35") + .Case("sm_37", "compute_37") + .Case("sm_50", "compute_50") + .Case("sm_52", "compute_52") + .Case("sm_53", "compute_53") + .Default(nullptr); +} + Tool *CudaToolChain::buildAssembler() const { return new tools::NVPTX::Assembler(*this); } Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -3565,7 +3565,7 @@ // CUDA compilation may have multiple inputs (source file + results of // device-side compilations). All other jobs are expected to have exactly one // input. - bool IsCuda = types::isCuda(Input.getType()); + bool IsCuda = JA.isOffloading(Action::OFFLOAD_CUDA); assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs."); // Invoke ourselves in -cc1 mode. @@ -3583,13 +3583,13 @@ // particular compilation pass we're constructing here. For now we // can check which toolchain we're using and pick the other one to // extract the triple.
- if (&getToolChain() == - C.getSingleOffloadDeviceToolChain()) + if (JA.isDeviceOffloading(Action::OFFLOAD_CUDA)) AuxToolChain = C.getOffloadingHostToolChain(); - else if (&getToolChain() == C.getOffloadingHostToolChain()) + else { + assert(C.isOffloadingHostKind(Action::OFFLOAD_CUDA) && + "Expecting CUDA host toolchain."); AuxToolChain = C.getSingleOffloadDeviceToolChain(); - else - llvm_unreachable("Can't figure out CUDA compilation mode."); + } assert(AuxToolChain != nullptr && "No aux toolchain."); CmdArgs.push_back("-aux-triple"); CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str())); @@ -10883,10 +10883,9 @@ static_cast<const toolchains::CudaToolChain &>(getToolChain()); assert(TC.getTriple().isNVPTX() && "Wrong platform"); - std::vector<std::string> gpu_archs = - Args.getAllArgValues(options::OPT_march_EQ); - assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas."); - const std::string& gpu_arch = gpu_archs[0]; + // Obtain architecture from the action. + const char *gpu_arch = JA.getOffloadingArch(); + assert(gpu_arch && "Device action expected to have an architecture."); ArgStringList CmdArgs; CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32"); @@ -10960,12 +10959,19 @@ CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); for (const auto& II : Inputs) { - auto* A = cast<CudaDeviceAction>(II.getAction()); + auto *A = II.getAction(); + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + const char *gpu_arch = A->getOffloadingArch(); + assert(gpu_arch && + "Device action expected to have associated a GPU architecture!"); + // We need to pass an Arch of the form "sm_XX" for cubin files and // "compute_XX" for ptx. - const char *Arch = (II.getType() == types::TY_PP_Asm) - ? A->getComputeArchName() - : A->getGpuArchName(); + const char *Arch = + (II.getType() == types::TY_PP_Asm) + ?
toolchains::CudaToolChain::GpuArchToComputeName(gpu_arch) + : gpu_arch; CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + Arch + ",file=" + II.getFilename())); } Index: lib/Frontend/CreateInvocationFromCommandLine.cpp =================================================================== --- lib/Frontend/CreateInvocationFromCommandLine.cpp +++ lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -60,25 +60,25 @@ } // We expect to get back exactly one command job, if we didn't something - // failed. CUDA compilation is an exception as it creates multiple jobs. If - // that's the case, we proceed with the first job. If caller needs particular - // CUDA job, it should be controlled via --cuda-{host|device}-only option - // passed to the driver. + // failed. Offload compilation is an exception as it creates multiple jobs. If + // that's the case, we proceed with the first job. If caller needs a + // particular job, it should be controlled via options (e.g. + // --cuda-{host|device}-only for CUDA) passed to the driver. const driver::JobList &Jobs = C->getJobs(); - bool CudaCompilation = false; + bool OffloadCompilation = false; if (Jobs.size() > 1) { for (auto &A : C->getActions()){ // On MacOSX real actions may end up being wrapped in BindArchAction if (isa<driver::BindArchAction>(A)) A = *A->input_begin(); - if (isa<driver::CudaDeviceAction>(A)) { - CudaCompilation = true; + if (isa<driver::OffloadAction>(A)) { + OffloadCompilation = true; break; } } } if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) || - (Jobs.size() > 1 && !CudaCompilation)) { + (Jobs.size() > 1 && !OffloadCompilation)) { SmallString<256> Msg; llvm::raw_svector_ostream OS(Msg); Jobs.Print(OS, "; ", true);