Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -12,6 +12,7 @@ #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" namespace llvm { @@ -26,6 +27,8 @@ namespace clang { namespace driver { +class ToolChain; + /// Action - Represent an abstract compilation step to perform. /// /// An action represents an edge in the compilation graph; typically @@ -49,8 +52,7 @@ enum ActionClass { InputClass = 0, BindArchClass, - CudaDeviceClass, - CudaHostClass, + OffloadClass, PreprocessJobClass, PrecompileJobClass, AnalyzeJobClass, @@ -71,10 +73,6 @@ // The offloading kind determines if this action is binded to a particular // programming model. Each entry reserves one bit. We also have a special kind // to designate the host offloading tool chain. - // - // FIXME: This is currently used to indicate that tool chains are used in a - // given programming, but will be used here as well once a generic offloading - // action is implemented. enum OffloadKind { OFK_None = 0x00, // The host offloading tool chain. @@ -94,6 +92,19 @@ ActionList Inputs; protected: + /// + /// Offload information. + /// + + /// The host offloading kind - a combination of kinds encoded in a mask. + /// Multiple programming models may be supported simultaneously by the same + /// host. + unsigned ActiveOffloadKindMask = 0u; + /// Offloading kind of the device. + OffloadKind OffloadingDeviceKind = OFK_None; + /// The Offloading architecture associated with this action. + const char *OffloadingArch = nullptr; + Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {} Action(ActionClass Kind, Action *Input, types::ID Type) : Action(Kind, ActionList({Input}), Type) {} @@ -123,6 +134,40 @@ input_const_range inputs() const { return input_const_range(input_begin(), input_end()); } + + /// Return a string containing the offload kind of the action. + std::string getOffloadingKindPrefix() const; + /// Return a string that can be used as prefix in order to generate unique + /// files for each offloading kind. + std::string getOffloadingFileNamePrefix(StringRef NormalizedTriple) const; + + /// Set the device offload info of this action and propagate it to its + /// dependences. + void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch); + /// Append the host offload info of this action and propagate it to its + /// dependences. + void propagateHostOffloadInfo(unsigned OKinds, const char *OArch); + /// Set the offload info of this action to be the same as the provided action, + /// and propagate it to its dependences. + void propagateOffloadInfo(const Action *A); + + unsigned getOffloadingHostActiveKinds() const { + return ActiveOffloadKindMask; + } + OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; } + const char *getOffloadingArch() const { return OffloadingArch; } + + /// Check if this action have any offload kinds. Note that host offload kinds + /// are only set if the action is a dependence to a host offload action. + bool isHostOffloading(OffloadKind OKind) const { + return ActiveOffloadKindMask & OKind; + } + bool isDeviceOffloading(OffloadKind OKind) const { + return OffloadingDeviceKind == OKind; + } + bool isOffloading(OffloadKind OKind) const { + return isHostOffloading(OKind) || isDeviceOffloading(OKind); + } }; class InputAction : public Action { @@ -155,43 +200,125 @@ } }; -class CudaDeviceAction : public Action { +/// An offload action combines host or/and device actions according to the +/// programming model implementation needs and propagates the offloading kind to +/// its dependences. +class OffloadAction final : public Action { virtual void anchor(); - /// GPU architecture to bind. Always of the form /sm_\d+/ or null (when the - /// action applies to multiple architectures). - const char *GpuArchName; - /// True when action results are not consumed by the host action (e.g when - /// -fsyntax-only or --cuda-device-only options are used). - bool AtTopLevel; +public: + /// Type used to communicate device actions. It associates bound architecture, + /// toolchain, and offload kind to each action. + class DeviceDependences final { + public: + typedef SmallVector ToolChainList; + typedef SmallVector BoundArchList; + typedef SmallVector OffloadKindList; + + private: + // Lists that keep the information for each dependency. All the lists are + // meant to be updated in sync. We are adopting separate lists instead of a + // list of structs, because that simplifies forwarding the actions list to + // initialize the inputs of the base Action class. + + /// The dependence actions. + ActionList DeviceActions; + /// The offloading toolchains that should be used with the action. + ToolChainList DeviceToolChains; + /// The architectures that should be used with this action. + BoundArchList DeviceBoundArchs; + /// The offload kind of each dependence. + OffloadKindList DeviceOffloadKinds; + + public: + /// Add a action along with the associated toolchain, bound arch, and + /// offload kind. + void add(Action &A, const ToolChain &TC, const char *BoundArch, + OffloadKind OKind); + + /// Get each of the individual arrays. + const ActionList &getActions() const { return DeviceActions; }; + const ToolChainList &getToolChains() const { return DeviceToolChains; }; + const BoundArchList &getBoundArchs() const { return DeviceBoundArchs; }; + const OffloadKindList &getOffloadKinds() const { + return DeviceOffloadKinds; + }; + }; + + /// Type used to communicate host actions. It associates bound architecture, + /// toolchain, and offload kinds to the host action. + class HostDependence final { + /// The dependence action. + Action &HostAction; + /// The offloading toolchain that should be used with the action. + const ToolChain &HostToolChain; + /// The architectures that should be used with this action. + const char *HostBoundArch = nullptr; + /// The offload kind of each dependence. + unsigned HostOffloadKinds = 0u; + + public: + HostDependence(Action &A, const ToolChain &TC, const char *BoundArch, + const unsigned OffloadKinds) + : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch), + HostOffloadKinds(OffloadKinds){}; + /// Constructor version that obtains the offload kinds from the device + /// dependencies. + HostDependence(Action &A, const ToolChain &TC, const char *BoundArch, + const DeviceDependences &DDeps); + Action *getAction() const { return &HostAction; }; + const ToolChain *getToolChain() const { return &HostToolChain; }; + const char *getBoundArch() const { return HostBoundArch; }; + unsigned getOffloadKinds() const { return HostOffloadKinds; }; + }; + + typedef llvm::function_ref + OffloadActionWorkTy; + +private: + /// The host offloading toolchain that should be used with the action. + const ToolChain *HostTC = nullptr; + + /// The tool chains associated with the list of actions. + DeviceDependences::ToolChainList DevToolChains; public: - CudaDeviceAction(Action *Input, const char *ArchName, bool AtTopLevel); + OffloadAction(const HostDependence &HDep); + OffloadAction(const DeviceDependences &DDeps, types::ID Ty); + OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps); - const char *getGpuArchName() const { return GpuArchName; } + /// Execute the work specified in \a Work on the host dependence. + void doOnHostDependence(const OffloadActionWorkTy &Work) const; - /// Gets the compute_XX that corresponds to getGpuArchName(). Returns null - /// when getGpuArchName() is null. - const char *getComputeArchName() const; + /// Execute the work specified in \a Work on each device dependence. + void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const; - bool isAtTopLevel() const { return AtTopLevel; } + /// Execute the work specified in \a Work on each dependence. + void doOnEachDependence(const OffloadActionWorkTy &Work) const; - static bool IsValidGpuArchName(llvm::StringRef ArchName); + /// Execute the work specified in \a Work on each host or device dependence if + /// \a IsHostDependenceto is true or false, respectively. + void doOnEachDependence(bool IsHostDependence, + const OffloadActionWorkTy &Work) const; - static bool classof(const Action *A) { - return A->getKind() == CudaDeviceClass; - } -}; + /// Return true if the action has a host dependence. + bool hasHostDependence() const; -class CudaHostAction : public Action { - virtual void anchor(); - ActionList DeviceActions; + /// Return the host dependence of this action. This function is only expected + /// to be called if the host dependence exists. + Action *getHostDependence() const; -public: - CudaHostAction(Action *Input, const ActionList &DeviceActions); + /// Return true if the action has a single device dependence. If \a + /// DoNotConsiderHostActions is set, ignore the host dependence, if any, while + /// accounting for the number of dependences. + bool hasSingleDeviceDependence(bool DoNotConsiderHostActions = false) const; - const ActionList &getDeviceActions() const { return DeviceActions; } + /// Return the single device dependence of this action. This function is only + /// expected to be called if a single device dependence exists. If \a + /// DoNotConsiderHostActions is set, a host dependence is allowed. + Action * + getSingleDeviceDependence(bool DoNotConsiderHostActions = false) const; - static bool classof(const Action *A) { return A->getKind() == CudaHostClass; } + static bool classof(const Action *A) { return A->getKind() == OffloadClass; } }; class JobAction : public Action { Index: include/clang/Driver/Compilation.h =================================================================== --- include/clang/Driver/Compilation.h +++ include/clang/Driver/Compilation.h @@ -98,12 +98,7 @@ const Driver &getDriver() const { return TheDriver; } const ToolChain &getDefaultToolChain() const { return DefaultToolChain; } - const ToolChain *getOffloadingHostToolChain() const { - auto It = OrderedOffloadingToolchains.find(Action::OFK_Host); - if (It != OrderedOffloadingToolchains.end()) - return It->second; - return nullptr; - } + unsigned isOffloadingHostKind(Action::OffloadKind Kind) const { return ActiveOffloadMask & Kind; } @@ -121,8 +116,8 @@ return OrderedOffloadingToolchains.equal_range(Kind); } - // Return an offload toolchain of the provided kind. Only one is expected to - // exist. + /// Return an offload toolchain of the provided kind. Only one is expected to + /// exist. template const ToolChain *getSingleOffloadToolChain() const { auto TCs = getOffloadToolChains(); Index: include/clang/Driver/Driver.h =================================================================== --- include/clang/Driver/Driver.h +++ include/clang/Driver/Driver.h @@ -394,12 +394,13 @@ /// BuildJobsForAction - Construct the jobs to perform for the action \p A and /// return an InputInfo for the result of running \p A. Will only construct /// jobs for a given (Action, ToolChain, BoundArch) tuple once. - InputInfo BuildJobsForAction(Compilation &C, const Action *A, - const ToolChain *TC, const char *BoundArch, - bool AtTopLevel, bool MultipleArchs, - const char *LinkingOutput, - std::map, - InputInfo> &CachedResults) const; + InputInfo + BuildJobsForAction(Compilation &C, const Action *A, const ToolChain *TC, + const char *BoundArch, bool AtTopLevel, bool MultipleArchs, + const char *LinkingOutput, + std::map, InputInfo> + &CachedResults, + bool BuildForOffloadDevice) const; /// Returns the default name for linked images (e.g., "a.out"). const char *getDefaultImageName() const; @@ -415,12 +416,11 @@ /// \param BoundArch - The bound architecture. /// \param AtTopLevel - Whether this is a "top-level" action. /// \param MultipleArchs - Whether multiple -arch options were supplied. - const char *GetNamedOutputPath(Compilation &C, - const JobAction &JA, - const char *BaseInput, - const char *BoundArch, - bool AtTopLevel, - bool MultipleArchs) const; + /// \param NormalizedTriple - The normalized triple of the relevant target. + const char *GetNamedOutputPath(Compilation &C, const JobAction &JA, + const char *BaseInput, const char *BoundArch, + bool AtTopLevel, bool MultipleArchs, + StringRef NormalizedTriple) const; /// GetTemporaryPath - Return the pathname of a temporary file to use /// as part of compilation; the file will have the given prefix and suffix. @@ -467,7 +467,8 @@ const char *BoundArch, bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput, std::map, InputInfo> - &CachedResults) const; + &CachedResults, + bool BuildForOffloadDevice) const; public: /// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp +++ lib/Driver/Action.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "clang/Driver/Action.h" +#include "clang/Driver/ToolChain.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" @@ -21,8 +22,8 @@ switch (AC) { case InputClass: return "input"; case BindArchClass: return "bind-arch"; - case CudaDeviceClass: return "cuda-device"; - case CudaHostClass: return "cuda-host"; + case OffloadClass: + return "offload"; case PreprocessJobClass: return "preprocessor"; case PrecompileJobClass: return "precompiler"; case AnalyzeJobClass: return "analyzer"; @@ -40,6 +41,82 @@ llvm_unreachable("invalid class"); } +void Action::propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert((OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFK_None) && + "Setting device kind to a different device??"); + assert(!ActiveOffloadKindMask && "Setting a device kind in a host action??"); + OffloadingDeviceKind = OKind; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch); +} + +void Action::propagateHostOffloadInfo(unsigned OKinds, const char *OArch) { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert(OffloadingDeviceKind == OFK_None && + "Setting a host kind in a device action."); + ActiveOffloadKindMask |= OKinds; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateHostOffloadInfo(ActiveOffloadKindMask, OArch); +} + +void Action::propagateOffloadInfo(const Action *A) { + if (unsigned HK = A->getOffloadingHostActiveKinds()) + propagateHostOffloadInfo(HK, A->getOffloadingArch()); + else + propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(), + A->getOffloadingArch()); +} + +std::string Action::getOffloadingKindPrefix() const { + switch (OffloadingDeviceKind) { + case OFK_None: + break; + case OFK_Host: + llvm_unreachable("Host kind is not an offloading device kind."); + break; + case OFK_Cuda: + return "device-cuda"; + + // TODO: Add other programming models here. + } + + if (!ActiveOffloadKindMask) + return ""; + + std::string Res("host"); + if (ActiveOffloadKindMask & OFK_Cuda) + Res += "-cuda"; + + // TODO: Add other programming models here. + + return Res; +} + +std::string +Action::getOffloadingFileNamePrefix(StringRef NormalizedTriple) const { + // A file prefix is only generated for device actions and consists of the + // offload kind and triple. + if (!OffloadingDeviceKind) + return ""; + + std::string Res("-"); + Res += getOffloadingKindPrefix(); + Res += "-"; + Res += NormalizedTriple; + return Res; +} + void InputAction::anchor() {} InputAction::InputAction(const Arg &_Input, types::ID _Type) @@ -51,45 +128,130 @@ BindArchAction::BindArchAction(Action *Input, const char *_ArchName) : Action(BindArchClass, Input), ArchName(_ArchName) {} -// Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual -// compute arch, e.g. "compute_20". Returns null if the input arch is null or -// doesn't match an existing arch. -static const char* GpuArchToComputeName(const char *ArchName) { - if (!ArchName) - return nullptr; - return llvm::StringSwitch(ArchName) - .Cases("sm_20", "sm_21", "compute_20") - .Case("sm_30", "compute_30") - .Case("sm_32", "compute_32") - .Case("sm_35", "compute_35") - .Case("sm_37", "compute_37") - .Case("sm_50", "compute_50") - .Case("sm_52", "compute_52") - .Case("sm_53", "compute_53") - .Default(nullptr); +void OffloadAction::anchor() {} + +OffloadAction::OffloadAction(const HostDependence &HDep) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) { + OffloadingArch = HDep.getBoundArch(); + ActiveOffloadKindMask = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); +}; + +OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty) + : Action(OffloadClass, DDeps.getActions(), Ty), + DevToolChains(DDeps.getToolChains()) { + auto &OKinds = DDeps.getOffloadKinds(); + auto &BArchs = DDeps.getBoundArchs(); + + // If all inputs agree on the same kind, use it also for this action. + if (llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); })) + OffloadingDeviceKind = OKinds.front(); + + // If we have a single dependency, inherit the architecture from it. + if (OKinds.size() == 1) + OffloadingArch = BArchs.front(); + + // Propagate info to the dependencies. + for (unsigned i = 0; i < getInputs().size(); ++i) + getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]); } -void CudaDeviceAction::anchor() {} +OffloadAction::OffloadAction(const HostDependence &HDep, + const DeviceDependences &DDeps) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()), + DevToolChains(DDeps.getToolChains()) { + // We use the kinds of the host dependence for this action. + OffloadingArch = HDep.getBoundArch(); + ActiveOffloadKindMask = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); + + // Add device inputs and propagate info to the device actions. + for (unsigned i = 0; i < DDeps.getActions().size(); ++i) { + auto *A = DDeps.getActions()[i]; + // Skip actions of empty dependences. + if (!A) + continue; + getInputs().push_back(A); + A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i], + DDeps.getBoundArchs()[i]); + } +} -CudaDeviceAction::CudaDeviceAction(Action *Input, const char *ArchName, - bool AtTopLevel) - : Action(CudaDeviceClass, Input), GpuArchName(ArchName), - AtTopLevel(AtTopLevel) { - assert(!GpuArchName || IsValidGpuArchName(GpuArchName)); +void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const { + if (!HostTC) + return; + auto *A = getInputs().front(); + Work(A, HostTC, A->getOffloadingArch()); } -const char *CudaDeviceAction::getComputeArchName() const { - return GpuArchToComputeName(GpuArchName); +void OffloadAction::doOnEachDeviceDependence( + const OffloadActionWorkTy &Work) const { + auto I = getInputs().begin(); + auto E = getInputs().end(); + if (I == E) + return; + + // Skip host action + if (HostTC) + ++I; + + auto TI = DevToolChains.begin(); + for (; I != E; ++I, ++TI) + Work(*I, *TI, (*I)->getOffloadingArch()); +} + +void OffloadAction::doOnEachDependence(const OffloadActionWorkTy &Work) const { + doOnHostDependence(Work); + doOnEachDeviceDependence(Work); } -bool CudaDeviceAction::IsValidGpuArchName(llvm::StringRef ArchName) { - return GpuArchToComputeName(ArchName.data()) != nullptr; +void OffloadAction::doOnEachDependence(bool IsHostDependence, + const OffloadActionWorkTy &Work) const { + if (IsHostDependence) + doOnHostDependence(Work); + else + doOnEachDeviceDependence(Work); } -void CudaHostAction::anchor() {} +bool OffloadAction::hasHostDependence() const { return HostTC != nullptr; } -CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions) - : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {} +Action *OffloadAction::getHostDependence() const { + assert(hasHostDependence() && "Host dependence does not exist!"); + return HostTC ? getInputs().front() : nullptr; +} + +bool OffloadAction::hasSingleDeviceDependence( + bool DoNotConsiderHostActions) const { + if (DoNotConsiderHostActions) + return getInputs().size() == (HostTC ? 2 : 1); + return !HostTC && getInputs().size() == 1; +} + +Action * +OffloadAction::getSingleDeviceDependence(bool DoNotConsiderHostActions) const { + assert(hasSingleDeviceDependence(DoNotConsiderHostActions) && + "Single device dependence does not exist!"); + return HostTC ? getInputs()[1] : getInputs().front(); +} + +void OffloadAction::DeviceDependences::add(Action &A, const ToolChain &TC, + const char *BoundArch, + OffloadKind OKind) { + DeviceActions.push_back(&A); + DeviceToolChains.push_back(&TC); + DeviceBoundArchs.push_back(BoundArch); + DeviceOffloadKinds.push_back(OKind); +} + +OffloadAction::HostDependence::HostDependence(Action &A, const ToolChain &TC, + const char *BoundArch, + const DeviceDependences &DDeps) + : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch) { + for (auto K : DDeps.getOffloadKinds()) + HostOffloadKinds |= K; +} void JobAction::anchor() {} Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -434,7 +434,9 @@ })) { const ToolChain &TC = getToolChain( C.getInputArgs(), - llvm::Triple(C.getOffloadingHostToolChain()->getTriple().isArch64Bit() + llvm::Triple(C.getSingleOffloadToolChain() + ->getTriple() + .isArch64Bit() ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda")); C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda); @@ -1021,18 +1023,33 @@ } else if (BindArchAction *BIA = dyn_cast(A)) { os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->input_begin(), Ids) << "}"; - } else if (CudaDeviceAction *CDA = dyn_cast(A)) { - os << '"' - << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)") - << '"' << ", {" << PrintActions1(C, *CDA->input_begin(), Ids) << "}"; + } else if (OffloadAction *OA = dyn_cast(A)) { + bool IsFirst = true; + OA->doOnEachDependence( + [&](Action *A, const ToolChain *TC, const char *BoundArch) { + // E.g. for two CUDA device dependences whose bound arch is sm_20 and + // sm_35 this will generate: + // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device" + // (nvptx64-nvidia-cuda:sm_35) {#ID} + if (!IsFirst) + os << ", "; + os << '"'; + if (TC) + os << A->getOffloadingKindPrefix(); + else + os << "host"; + os << " ("; + os << TC->getTriple().normalize(); + + if (BoundArch) + os << ":" << BoundArch; + os << ")"; + os << '"'; + os << " {" << PrintActions1(C, A, Ids) << "}"; + IsFirst = false; + }); } else { - const ActionList *AL; - if (CudaHostAction *CHA = dyn_cast(A)) { - os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}" - << ", gpu binaries "; - AL = &CHA->getDeviceActions(); - } else - AL = &A->getInputs(); + const ActionList *AL = &A->getInputs(); if (AL->size()) { const char *Prefix = "{"; @@ -1045,10 +1062,24 @@ os << "{}"; } + // Append offload info for all options other than the offloading action + // itself (e.g. (cuda-device, sm_20) or (cuda-host)). + std::string offload_str; + llvm::raw_string_ostream offload_os(offload_str); + if (!isa(A)) { + auto S = A->getOffloadingKindPrefix(); + if (!S.empty()) { + offload_os << ", (" << S; + if (A->getOffloadingArch()) + offload_os << ", " << A->getOffloadingArch(); + offload_os << ")"; + } + } + unsigned Id = Ids.size(); Ids[A] = Id; llvm::errs() << Id << ": " << os.str() << ", " - << types::getTypeName(A->getType()) << "\n"; + << types::getTypeName(A->getType()) << offload_os.str() << "\n"; return Id; } @@ -1376,8 +1407,12 @@ PartialCompilationArg && PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only); - if (CompileHostOnly) - return C.MakeAction(HostAction, ActionList()); + if (CompileHostOnly) { + OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch=*/nullptr, Action::OFK_Cuda); + return C.MakeAction(HDep); + } // Collect all cuda_gpu_arch parameters, removing duplicates. SmallVector GpuArchList; @@ -1388,7 +1423,7 @@ A->claim(); const auto& Arch = A->getValue(); - if (!CudaDeviceAction::IsValidGpuArchName(Arch)) + if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch)) C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch; else if (GpuArchNames.insert(Arch).second) GpuArchList.push_back(Arch); @@ -1405,8 +1440,6 @@ CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); // Build actions for all device inputs. - assert(C.getSingleOffloadToolChain() && - "Missing toolchain for device-side compilation."); ActionList CudaDeviceActions; C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions); assert(GpuArchList.size() == CudaDeviceActions.size() && @@ -1418,6 +1451,8 @@ return a->getKind() != Action::AssembleJobClass; }); + const ToolChain *CudaTC = C.getSingleOffloadToolChain(); + // Figure out what to do with device actions -- pass them as inputs to the // host action or run each of them independently. if (PartialCompilation || CompileDeviceOnly) { @@ -1433,10 +1468,13 @@ return nullptr; } - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - Actions.push_back(C.MakeAction(CudaDeviceActions[I], - GpuArchList[I], - /* AtTopLevel */ true)); + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *CudaTC, GpuArchList[I], + Action::OFK_Cuda); + Actions.push_back( + C.MakeAction(DDep, CudaDeviceActions[I]->getType())); + } // Kill host action in case of device-only compilation. if (CompileDeviceOnly) return nullptr; @@ -1456,19 +1494,23 @@ Action* BackendAction = AssembleAction->getInputs()[0]; assert(BackendAction->getType() == types::TY_PP_Asm); - for (const auto& A : {AssembleAction, BackendAction}) { - DeviceActions.push_back(C.MakeAction( - A, GpuArchList[I], /* AtTopLevel */ false)); + for (auto &A : {AssembleAction, BackendAction}) { + OffloadAction::DeviceDependences DDep; + DDep.add(*A, *CudaTC, GpuArchList[I], Action::OFK_Cuda); + DeviceActions.push_back(C.MakeAction(DDep, A->getType())); } } - auto FatbinAction = C.MakeAction( - C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN), - /* GpuArchName = */ nullptr, - /* AtTopLevel = */ false); + auto FatbinAction = + C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); + // Return a new host action that incorporates original host action and all // device actions. - return C.MakeAction(std::move(HostAction), - ActionList({FatbinAction})); + OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch=*/nullptr, Action::OFK_Cuda); + OffloadAction::DeviceDependences DDep; + DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda); + return C.MakeAction(HDep, DDep); } void Driver::BuildActions(Compilation &C, DerivedArgList &Args, @@ -1577,6 +1619,9 @@ YcArg = YuArg = nullptr; } + // Track the host offload kinds used on this compilation. + unsigned CompilationActiveOffloadHostKinds = 0u; + // Construct the actions to perform. ActionList LinkerInputs; @@ -1645,6 +1690,9 @@ ? phases::Compile : FinalPhase; + // Track the host offload kinds used on this input. + unsigned InputActiveOffloadHostKinds = 0u; + // Build the pipeline for this file. Action *Current = C.MakeAction(*InputArg, InputType); for (SmallVectorImpl::iterator i = PL.begin(), e = PL.end(); @@ -1676,21 +1724,36 @@ Current = buildCudaActions(C, Args, InputArg, Current, Actions); if (!Current) break; + + // We produced a CUDA action for this input, so the host has to support + // CUDA. + InputActiveOffloadHostKinds |= Action::OFK_Cuda; + CompilationActiveOffloadHostKinds |= Action::OFK_Cuda; } if (Current->getType() == types::TY_Nothing) break; } - // If we ended with something, add to the output list. - if (Current) + // If we ended with something, add to the output list. Also, propagate the + // offload information to the top-level host action related with the current + // input. + if (Current) { + if (InputActiveOffloadHostKinds) + Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds, + /*BoundArch=*/nullptr); Actions.push_back(Current); + } } - // Add a link action if necessary. - if (!LinkerInputs.empty()) + // Add a link action if necessary and propagate the offload information for + // the current compilation. + if (!LinkerInputs.empty()) { Actions.push_back( C.MakeAction(LinkerInputs, types::TY_Image)); + Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds, + /*BoundArch=*/nullptr); + } // If we are linking, claim any options which are obviously only used for // compilation. @@ -1826,7 +1889,8 @@ /*BoundArch*/ nullptr, /*AtTopLevel*/ true, /*MultipleArchs*/ ArchNames.size() > 1, - /*LinkingOutput*/ LinkingOutput, CachedResults); + /*LinkingOutput*/ LinkingOutput, CachedResults, + /*BuildForOffloadDevice*/ false); } // If the user passed -Qunused-arguments or there were errors, don't warn @@ -1875,7 +1939,28 @@ } } } - +/// Collapse an offloading action looking for a job of the given type. The input +/// action is changed to the input of the collapsed sequence. If we effectively +/// had a collapse return the corresponding offloading action, otherwise return +/// null. +template +static OffloadAction *collapseOffloadingAction(Action *&CurAction) { + if (!CurAction) + return nullptr; + if (auto *OA = dyn_cast(CurAction)) { + if (OA->hasHostDependence()) + if (auto *HDep = dyn_cast(OA->getHostDependence())) { + CurAction = HDep; + return OA; + } + if (OA->hasSingleDeviceDependence()) + if (auto *DDep = dyn_cast(OA->getSingleDeviceDependence())) { + CurAction = DDep; + return OA; + } + } + return nullptr; +} // Returns a Tool for a given JobAction. In case the action and its // predecessors can be combined, updates Inputs with the inputs of the // first combined action. If one of the collapsed actions is a @@ -1885,34 +1970,39 @@ bool EmbedBitcode, const ToolChain *TC, const JobAction *JA, const ActionList *&Inputs, - const CudaHostAction *&CollapsedCHA) { + ActionList &CollapsedOffloadAction) { const Tool *ToolForJob = nullptr; - CollapsedCHA = nullptr; + CollapsedOffloadAction.clear(); // See if we should look for a compiler with an integrated assembler. We match // bottom up, so what we are actually looking for is an assembler job with a // compiler input. + // Look through offload actions between assembler and backend actions. + Action *BackendJA = (isa(JA) && Inputs->size() == 1) + ? *Inputs->begin() + : nullptr; + auto *BackendOA = collapseOffloadingAction(BackendJA); + if (TC->useIntegratedAs() && !SaveTemps && !C.getArgs().hasArg(options::OPT_via_file_asm) && !C.getArgs().hasArg(options::OPT__SLASH_FA) && - !C.getArgs().hasArg(options::OPT__SLASH_Fa) && - isa(JA) && Inputs->size() == 1 && - isa(*Inputs->begin())) { + !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA && + isa(BackendJA)) { // A BackendJob is always preceded by a CompileJob, and without -save-temps // or -fembed-bitcode, they will always get combined together, so instead of // checking the backend tool, check if the tool for the CompileJob has an // integrated assembler. For -fembed-bitcode, CompileJob is still used to // look up tools for BackendJob, but they need to match before we can split // them. - const ActionList *BackendInputs = &(*Inputs)[0]->getInputs(); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*BackendInputs->begin()); - JobAction *CompileJA = cast( - CHA ? *CHA->input_begin() : *BackendInputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *BackendJA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; // When using -fembed-bitcode, it is required to have the same tool (clang) @@ -1926,7 +2016,12 @@ if (Compiler->hasIntegratedAssembler()) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + // Save the collapsed offload actions because they may still contain + // device actions. + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); + if (BackendOA) + CollapsedOffloadAction.push_back(BackendOA); } } @@ -1936,20 +2031,23 @@ if (isa(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*Inputs->begin()); - JobAction *CompileJA = - cast(CHA ? *CHA->input_begin() : *Inputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *JA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; if (!Compiler->canEmitIR() || (!SaveTemps && !EmbedBitcode)) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); } } @@ -1960,12 +2058,21 @@ // See if we should use an integrated preprocessor. We do so when we have // exactly one input, since this is the only use case we care about // (irrelevant since we don't support combine yet). - if (Inputs->size() == 1 && isa(*Inputs->begin()) && + + // Look through offload actions after preprocessing. + Action *PreprocessJA = (Inputs->size() == 1) ? *Inputs->begin() : nullptr; + auto *PreprocessOA = + collapseOffloadingAction(PreprocessJA); + + if (PreprocessJA && isa(PreprocessJA) && !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && !C.getArgs().hasArg(options::OPT_rewrite_objc) && - ToolForJob->hasIntegratedCPP()) - Inputs = &(*Inputs)[0]->getInputs(); + ToolForJob->hasIntegratedCPP()) { + Inputs = &PreprocessJA->getInputs(); + if (PreprocessOA) + CollapsedOffloadAction.push_back(PreprocessOA); + } return ToolForJob; } @@ -1973,8 +2080,8 @@ InputInfo Driver::BuildJobsForAction( Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch, bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput, - std::map, InputInfo> &CachedResults) - const { + std::map, InputInfo> &CachedResults, + bool BuildForOffloadDevice) const { // The bound arch is not necessarily represented in the toolchain's triple -- // for example, armv7 and armv7s both map to the same triple -- so we need // both in our map. @@ -1988,9 +2095,9 @@ if (CachedResult != CachedResults.end()) { return CachedResult->second; } - InputInfo Result = - BuildJobsForActionNoCache(C, A, TC, BoundArch, AtTopLevel, MultipleArchs, - LinkingOutput, CachedResults); + InputInfo Result = BuildJobsForActionNoCache( + C, A, TC, BoundArch, AtTopLevel, MultipleArchs, LinkingOutput, + CachedResults, BuildForOffloadDevice); CachedResults[ActionTC] = Result; return Result; } @@ -1998,21 +2105,65 @@ InputInfo Driver::BuildJobsForActionNoCache( Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch, bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput, - std::map, InputInfo> &CachedResults) - const { + std::map, InputInfo> &CachedResults, + bool BuildForOffloadDevice) const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); - InputInfoList CudaDeviceInputInfos; - if (const CudaHostAction *CHA = dyn_cast(A)) { - // Append outputs of device jobs to the input list. - for (const Action *DA : CHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, nullptr, AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); + InputInfoList OffloadDependencesInputInfo; + if (const OffloadAction *OA = dyn_cast(A)) { + // The offload action is expected to be used in four different situations. + // + // a) Set a toolchain/architecture/kind for a host action: + // Host Action 1 -> OffloadAction -> Host Action 2 + // + // b) Set a toolchain/architecture/kind for a device action; + // Device Action 1 -> OffloadAction -> Device Action 2 + // + // c) Specify a device dependences to a host action; + // Device Action 1 _ + // \ + // Host Action 1 ---> OffloadAction -> Host Action 2 + // + // d) Specify a host dependence to a device action. + // Host Action 1 _ + // \ + // Device Action 1 ---> OffloadAction -> Device Action 2 + // + // For a) and b), we just return the job generated for the dependence. For + // c) and d) we override the current action with the host/device dependence + // if the current toolchain is host/device and set the offload dependences + // info with the jobs obtained from the device/host dependence(s). + + // If there is a single device option, just generate the job for it. + if (OA->hasSingleDeviceDependence()) { + InputInfo DevA; + OA->doOnEachDeviceDependence([&](Action *DepA, const ToolChain *DepTC, + const char *DepBoundArch) { + DevA = + BuildJobsForAction(C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, + CachedResults, /*BuildForOffloadDevice=*/true); + }); + return DevA; } - // Override current action with a real host compile action and continue - // processing it. - A = *CHA->input_begin(); + + // If 'Action 2' is host, we generate jobs for the device dependences and + // override the current action with the host dependence. Otherwise, we + // generate the host dependences and override the action with the device + // dependence. The dependences can't therefore be a top-level action. + OA->doOnEachDependence( + /*IsHostDependence=*/BuildForOffloadDevice, + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDependencesInputInfo.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false, + /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults, + /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() != + Action::OFK_None)); + }); + + A = BuildForOffloadDevice + ? OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true) + : OA->getHostDependence(); } if (const InputAction *IA = dyn_cast(A)) { @@ -2039,41 +2190,34 @@ TC = &C.getDefaultToolChain(); return BuildJobsForAction(C, *BAA->input_begin(), TC, ArchName, AtTopLevel, - MultipleArchs, LinkingOutput, CachedResults); + MultipleArchs, LinkingOutput, CachedResults, + BuildForOffloadDevice); } - if (const CudaDeviceAction *CDA = dyn_cast(A)) { - // Initial processing of CudaDeviceAction carries host params. - // Call BuildJobsForAction() again, now with correct device parameters. - InputInfo II = BuildJobsForAction( - C, *CDA->input_begin(), C.getSingleOffloadToolChain(), - CDA->getGpuArchName(), CDA->isAtTopLevel(), /*MultipleArchs=*/true, - LinkingOutput, CachedResults); - // Currently II's Action is *CDA->input_begin(). Set it to CDA instead, so - // that one can retrieve II's GPU arch. - II.setAction(A); - return II; - } const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast(A); - const CudaHostAction *CollapsedCHA = nullptr; + ActionList CollapsedOffloadActions; + const Tool *T = selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA, - Inputs, CollapsedCHA); + Inputs, CollapsedOffloadActions); if (!T) return InputInfo(); - // If we've collapsed action list that contained CudaHostAction we - // need to build jobs for device-side inputs it may have held. - if (CollapsedCHA) { - for (const Action *DA : CollapsedCHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, "", AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); - } - } + // If we've collapsed action list that contained OffloadAction we + // need to build jobs for host/device-side inputs it may have held. + for (const auto *OA : CollapsedOffloadActions) + cast(OA)->doOnEachDependence( + /*IsHostDependence=*/BuildForOffloadDevice, + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDependencesInputInfo.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults, + /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() != + Action::OFK_None)); + }); // Only use pipes when there is exactly one input. InputInfoList InputInfos; @@ -2083,9 +2227,9 @@ // FIXME: Clean this up. bool SubJobAtTopLevel = AtTopLevel && (isa(A) || isa(A)); - InputInfos.push_back(BuildJobsForAction(C, Input, TC, BoundArch, - SubJobAtTopLevel, MultipleArchs, - LinkingOutput, CachedResults)); + InputInfos.push_back(BuildJobsForAction( + C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput, + CachedResults, BuildForOffloadDevice)); } // Always use the first input as the base input. @@ -2096,9 +2240,10 @@ if (JA->getType() == types::TY_dSYM) BaseInput = InputInfos[0].getFilename(); - // Append outputs of cuda device jobs to the input list - if (CudaDeviceInputInfos.size()) - InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end()); + // Append outputs of offload device jobs to the input list + if (!OffloadDependencesInputInfo.empty()) + InputInfos.append(OffloadDependencesInputInfo.begin(), + OffloadDependencesInputInfo.end()); // Determine the place to write output to, if any. InputInfo Result; @@ -2106,7 +2251,8 @@ Result = InputInfo(A, BaseInput); else Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch, - AtTopLevel, MultipleArchs), + AtTopLevel, MultipleArchs, + TC->getTriple().normalize()), BaseInput); if (CCCPrintBindings && !CCGenDiagnostics) { @@ -2166,7 +2312,8 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, const char *BaseInput, const char *BoundArch, bool AtTopLevel, - bool MultipleArchs) const { + bool MultipleArchs, + StringRef NormalizedTriple) const { llvm::PrettyStackTraceString CrashInfo("Computing output path"); // Output to a user requested destination? if (AtTopLevel && !isa(JA) && !isa(JA)) { @@ -2252,6 +2399,7 @@ MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image); } else if (MultipleArchs && BoundArch) { SmallString<128> Output(getDefaultImageName()); + Output += JA.getOffloadingFileNamePrefix(NormalizedTriple); Output += "-"; Output.append(BoundArch); NamedOutput = C.getArgs().MakeArgString(Output.c_str()); @@ -2268,6 +2416,7 @@ if (!types::appendSuffixForType(JA.getType())) End = BaseName.rfind('.'); SmallString<128> Suffixed(BaseName.substr(0, End)); + Suffixed += JA.getOffloadingFileNamePrefix(NormalizedTriple); if (MultipleArchs && BoundArch) { Suffixed += "-"; Suffixed.append(BoundArch); Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -248,8 +248,7 @@ case Action::InputClass: case Action::BindArchClass: - case Action::CudaDeviceClass: - case Action::CudaHostClass: + case Action::OffloadClass: case Action::LipoJobClass: case Action::DsymutilJobClass: case Action::VerifyDebugInfoJobClass: Index: lib/Driver/ToolChains.h =================================================================== --- lib/Driver/ToolChains.h +++ lib/Driver/ToolChains.h @@ -852,6 +852,11 @@ // ptxas. bool useIntegratedAs() const override { return false; } + // Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual + // compute arch, e.g. "compute_20". Returns null if the input arch is null or + // doesn't match an existing arch. + static const char *GpuArchToComputeName(const char *ArchName); + protected: Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains.cpp =================================================================== --- lib/Driver/ToolChains.cpp +++ lib/Driver/ToolChains.cpp @@ -4713,6 +4713,21 @@ return DAL; } +const char *CudaToolChain::GpuArchToComputeName(const char *ArchName) { + if (!ArchName) + return nullptr; + return llvm::StringSwitch(ArchName) + .Cases("sm_20", "sm_21", "compute_20") + .Case("sm_30", "compute_30") + .Case("sm_32", "compute_32") + .Case("sm_35", "compute_35") + .Case("sm_37", "compute_37") + .Case("sm_50", "compute_50") + .Case("sm_52", "compute_52") + .Case("sm_53", "compute_53") + .Default(nullptr); +} + Tool *CudaToolChain::buildAssembler() const { return new tools::NVPTX::Assembler(*this); } Index: lib/Driver/Tools.h =================================================================== --- lib/Driver/Tools.h +++ lib/Driver/Tools.h @@ -57,8 +57,7 @@ const Driver &D, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const InputInfo &Output, - const InputInfoList &Inputs, - const ToolChain *AuxToolChain) const; + const InputInfoList &Inputs) const; void AddAArch64TargetArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const; Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -288,12 +288,45 @@ !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput); } +/// Add the C++ include args of other offloading toolchains. If this is a host +/// job, the device toolchains are added. If this is a device job, the host +/// toolchains will be added. +static void addExtraOffloadCXXStdlibIncludeArgs(Compilation &C, + const JobAction &JA, + const ArgList &Args, + ArgStringList &CmdArgs) { + + if (JA.isHostOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain() + ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + else if (JA.isDeviceOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain() + ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + + // TODO: Add support for other programming models here. +} + +/// Add the include args that are specific of each offloading programming model. +static void addExtraOffloadSpecificIncludeArgs(Compilation &C, + const JobAction &JA, + const ArgList &Args, + ArgStringList &CmdArgs) { + + if (JA.isHostOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain()->AddCudaIncludeArgs( + Args, CmdArgs); + else if (JA.isDeviceOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain()->AddCudaIncludeArgs( + Args, CmdArgs); + + // TODO: Add support for other programming models here. +} + void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, const Driver &D, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, - const InputInfoList &Inputs, - const ToolChain *AuxToolChain) const { + const InputInfoList &Inputs) const { Arg *A; const bool IsIAMCU = getToolChain().getTriple().isOSIAMCU(); @@ -558,31 +591,27 @@ // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++. addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH"); - // Optional AuxToolChain indicates that we need to include headers - // for more than one target. If that's the case, add include paths - // from AuxToolChain right after include paths of the same kind for - // the current target. + // While adding the include arguments, we also attempt to retrieve the + // arguments of related offloading toolchains or arguments that are specific + // of an offloading programming model. // Add C++ include arguments, if needed. if (types::isCXX(Inputs[0].getType())) { getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs); - if (AuxToolChain) - AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs); } // Add system include arguments for all targets but IAMCU. if (!IsIAMCU) { getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs); - if (AuxToolChain) - AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs); } else { // For IAMCU add special include arguments. getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs); } - // Add CUDA include arguments, if needed. - if (types::isCuda(Inputs[0].getType())) - getToolChain().AddCudaIncludeArgs(Args, CmdArgs); + // Add offload include arguments, if needed. + addExtraOffloadSpecificIncludeArgs(C, JA, Args, CmdArgs); } // FIXME: Move to target hook. @@ -3767,7 +3796,7 @@ // CUDA compilation may have multiple inputs (source file + results of // device-side compilations). All other jobs are expected to have exactly one // input. - bool IsCuda = types::isCuda(Input.getType()); + bool IsCuda = JA.isOffloading(Action::OFK_Cuda); assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs."); // C++ is not supported for IAMCU. @@ -3783,21 +3812,21 @@ CmdArgs.push_back("-triple"); CmdArgs.push_back(Args.MakeArgString(TripleStr)); - const ToolChain *AuxToolChain = nullptr; if (IsCuda) { - // FIXME: We need a (better) way to pass information about - // particular compilation pass we're constructing here. For now we - // can check which toolchain we're using and pick the other one to - // extract the triple. - if (&getToolChain() == C.getSingleOffloadToolChain()) - AuxToolChain = C.getOffloadingHostToolChain(); - else if (&getToolChain() == C.getOffloadingHostToolChain()) - AuxToolChain = C.getSingleOffloadToolChain(); + // We have to pass the triple of the host if compiling for a CUDA device and + // vice-versa. + StringRef NormalizedTriple; + if (JA.isDeviceOffloading(Action::OFK_Cuda)) + NormalizedTriple = C.getSingleOffloadToolChain() + ->getTriple() + .normalize(); else - llvm_unreachable("Can't figure out CUDA compilation mode."); - assert(AuxToolChain != nullptr && "No aux toolchain."); + NormalizedTriple = C.getSingleOffloadToolChain() + ->getTriple() + .normalize(); + CmdArgs.push_back("-aux-triple"); - CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str())); + CmdArgs.push_back(Args.MakeArgString(NormalizedTriple)); } if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm || @@ -4675,8 +4704,7 @@ // // FIXME: Support -fpreprocessed if (types::getPreprocessedType(InputType) != types::TY_INVALID) - AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs, - AuxToolChain); + AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs); // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes // that "The compiler can only warn and ignore the option if not recognized". @@ -11132,10 +11160,9 @@ static_cast(getToolChain()); assert(TC.getTriple().isNVPTX() && "Wrong platform"); - std::vector gpu_archs = - Args.getAllArgValues(options::OPT_march_EQ); - assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas."); - const std::string& gpu_arch = gpu_archs[0]; + // Obtain architecture from the action. + const char *gpu_arch = JA.getOffloadingArch(); + assert(gpu_arch && "Device action expected to have an architecture."); ArgStringList CmdArgs; CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32"); @@ -11210,12 +11237,19 @@ CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); for (const auto& II : Inputs) { - auto* A = cast(II.getAction()); + auto *A = II.getAction(); + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + const char *gpu_arch = A->getOffloadingArch(); + assert(gpu_arch && + "Device action expected to have associated a GPU architecture!"); + // We need to pass an Arch of the form "sm_XX" for cubin files and // "compute_XX" for ptx. - const char *Arch = (II.getType() == types::TY_PP_Asm) - ? A->getComputeArchName() - : A->getGpuArchName(); + const char *Arch = + (II.getType() == types::TY_PP_Asm) + ? toolchains::CudaToolChain::GpuArchToComputeName(gpu_arch) + : gpu_arch; CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + Arch + ",file=" + II.getFilename())); } Index: lib/Frontend/CreateInvocationFromCommandLine.cpp =================================================================== --- lib/Frontend/CreateInvocationFromCommandLine.cpp +++ lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -60,25 +60,25 @@ } // We expect to get back exactly one command job, if we didn't something - // failed. CUDA compilation is an exception as it creates multiple jobs. If - // that's the case, we proceed with the first job. If caller needs particular - // CUDA job, it should be controlled via --cuda-{host|device}-only option - // passed to the driver. + // failed. Offload compilation is an exception as it creates multiple jobs. If + // that's the case, we proceed with the first job. If caller needs a + // particular job, it should be controlled via options (e.g. + // --cuda-{host|device}-only for CUDA) passed to the driver. const driver::JobList &Jobs = C->getJobs(); - bool CudaCompilation = false; + bool OffloadCompilation = false; if (Jobs.size() > 1) { for (auto &A : C->getActions()){ // On MacOSX real actions may end up being wrapped in BindArchAction if (isa(A)) A = *A->input_begin(); - if (isa(A)) { - CudaCompilation = true; + if (isa(A)) { + OffloadCompilation = true; break; } } } if (Jobs.size() == 0 || !isa(*Jobs.begin()) || - (Jobs.size() > 1 && !CudaCompilation)) { + (Jobs.size() > 1 && !OffloadCompilation)) { SmallString<256> Msg; llvm::raw_svector_ostream OS(Msg); Jobs.Print(OS, "; ", true);