Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -13,6 +13,7 @@ #include "clang/Basic/Cuda.h" #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" namespace llvm { @@ -27,6 +28,8 @@ namespace clang { namespace driver { +class ToolChain; + /// Action - Represent an abstract compilation step to perform. /// /// An action represents an edge in the compilation graph; typically @@ -50,8 +53,7 @@ enum ActionClass { InputClass = 0, BindArchClass, - CudaDeviceClass, - CudaHostClass, + OffloadClass, PreprocessJobClass, PrecompileJobClass, AnalyzeJobClass, @@ -65,17 +67,13 @@ VerifyDebugInfoJobClass, VerifyPCHJobClass, - JobClassFirst=PreprocessJobClass, - JobClassLast=VerifyPCHJobClass + JobClassFirst = PreprocessJobClass, + JobClassLast = VerifyPCHJobClass }; // The offloading kind determines if this action is binded to a particular // programming model. Each entry reserves one bit. We also have a special kind // to designate the host offloading tool chain. - // - // FIXME: This is currently used to indicate that tool chains are used in a - // given programming, but will be used here as well once a generic offloading - // action is implemented. enum OffloadKind { OFK_None = 0x00, // The host offloading tool chain. @@ -95,6 +93,19 @@ ActionList Inputs; protected: + /// + /// Offload information. + /// + + /// The host offloading kind - a combination of kinds encoded in a mask. + /// Multiple programming models may be supported simultaneously by the same + /// host. + unsigned ActiveOffloadKindMask = 0u; + /// Offloading kind of the device. + OffloadKind OffloadingDeviceKind = OFK_None; + /// The Offloading architecture associated with this action. + const char *OffloadingArch = nullptr; + Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {} Action(ActionClass Kind, Action *Input, types::ID Type) : Action(Kind, ActionList({Input}), Type) {} @@ -124,6 +135,40 @@ input_const_range inputs() const { return input_const_range(input_begin(), input_end()); } + + /// Return a string containing the offload kind of the action. + std::string getOffloadingKindPrefix() const; + /// Return a string that can be used as prefix in order to generate unique + /// files for each offloading kind. + std::string getOffloadingFileNamePrefix(StringRef NormalizedTriple) const; + + /// Set the device offload info of this action and propagate it to its + /// dependences. + void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch); + /// Append the host offload info of this action and propagate it to its + /// dependences. + void propagateHostOffloadInfo(unsigned OKinds, const char *OArch); + /// Set the offload info of this action to be the same as the provided action, + /// and propagate it to its dependences. + void propagateOffloadInfo(const Action *A); + + unsigned getOffloadingHostActiveKinds() const { + return ActiveOffloadKindMask; + } + OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; } + const char *getOffloadingArch() const { return OffloadingArch; } + + /// Check if this action have any offload kinds. Note that host offload kinds + /// are only set if the action is a dependence to a host offload action. + bool isHostOffloading(OffloadKind OKind) const { + return ActiveOffloadKindMask & OKind; + } + bool isDeviceOffloading(OffloadKind OKind) const { + return OffloadingDeviceKind == OKind; + } + bool isOffloading(OffloadKind OKind) const { + return isHostOffloading(OKind) || isDeviceOffloading(OKind); + } }; class InputAction : public Action { @@ -156,39 +201,126 @@ } }; -class CudaDeviceAction : public Action { +/// An offload action combines host or/and device actions according to the +/// programming model implementation needs and propagates the offloading kind to +/// its dependences. +class OffloadAction final : public Action { virtual void anchor(); - const CudaArch GpuArch; +public: + /// Type used to communicate device actions. It associates bound architecture, + /// toolchain, and offload kind to each action. + class DeviceDependences final { + public: + typedef SmallVector ToolChainList; + typedef SmallVector BoundArchList; + typedef SmallVector OffloadKindList; + + private: + // Lists that keep the information for each dependency. All the lists are + // meant to be updated in sync. We are adopting separate lists instead of a + // list of structs, because that simplifies forwarding the actions list to + // initialize the inputs of the base Action class. + + /// The dependence actions. + ActionList DeviceActions; + /// The offloading toolchains that should be used with the action. + ToolChainList DeviceToolChains; + /// The architectures that should be used with this action. + BoundArchList DeviceBoundArchs; + /// The offload kind of each dependence. + OffloadKindList DeviceOffloadKinds; + + public: + /// Add a action along with the associated toolchain, bound arch, and + /// offload kind. + void add(Action &A, const ToolChain &TC, const char *BoundArch, + OffloadKind OKind); + + /// Get each of the individual arrays. + const ActionList &getActions() const { return DeviceActions; }; + const ToolChainList &getToolChains() const { return DeviceToolChains; }; + const BoundArchList &getBoundArchs() const { return DeviceBoundArchs; }; + const OffloadKindList &getOffloadKinds() const { + return DeviceOffloadKinds; + }; + }; + + /// Type used to communicate host actions. It associates bound architecture, + /// toolchain, and offload kinds to the host action. + class HostDependence final { + /// The dependence action. + Action &HostAction; + /// The offloading toolchain that should be used with the action. + const ToolChain &HostToolChain; + /// The architectures that should be used with this action. + const char *HostBoundArch = nullptr; + /// The offload kind of each dependence. + unsigned HostOffloadKinds = 0u; + + public: + HostDependence(Action &A, const ToolChain &TC, const char *BoundArch, + const unsigned OffloadKinds) + : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch), + HostOffloadKinds(OffloadKinds){}; + /// Constructor version that obtains the offload kinds from the device + /// dependencies. + HostDependence(Action &A, const ToolChain &TC, const char *BoundArch, + const DeviceDependences &DDeps); + Action *getAction() const { return &HostAction; }; + const ToolChain *getToolChain() const { return &HostToolChain; }; + const char *getBoundArch() const { return HostBoundArch; }; + unsigned getOffloadKinds() const { return HostOffloadKinds; }; + }; + + typedef llvm::function_ref + OffloadActionWorkTy; + +private: + /// The host offloading toolchain that should be used with the action. + const ToolChain *HostTC = nullptr; - /// True when action results are not consumed by the host action (e.g when - /// -fsyntax-only or --cuda-device-only options are used). - bool AtTopLevel; + /// The tool chains associated with the list of actions. + DeviceDependences::ToolChainList DevToolChains; public: - CudaDeviceAction(Action *Input, CudaArch Arch, bool AtTopLevel); + OffloadAction(const HostDependence &HDep); + OffloadAction(const DeviceDependences &DDeps, types::ID Ty); + OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps); - /// Get the CUDA GPU architecture to which this Action corresponds. Returns - /// UNKNOWN if this Action corresponds to multiple architectures. - CudaArch getGpuArch() const { return GpuArch; } + /// Execute the work specified in \a Work on the host dependence. + void doOnHostDependence(const OffloadActionWorkTy &Work) const; - bool isAtTopLevel() const { return AtTopLevel; } + /// Execute the work specified in \a Work on each device dependence. + void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const; - static bool classof(const Action *A) { - return A->getKind() == CudaDeviceClass; - } -}; + /// Execute the work specified in \a Work on each dependence. + void doOnEachDependence(const OffloadActionWorkTy &Work) const; -class CudaHostAction : public Action { - virtual void anchor(); - ActionList DeviceActions; + /// Execute the work specified in \a Work on each host or device dependence if + /// \a IsHostDependenceto is true or false, respectively. + void doOnEachDependence(bool IsHostDependence, + const OffloadActionWorkTy &Work) const; -public: - CudaHostAction(Action *Input, const ActionList &DeviceActions); + /// Return true if the action has a host dependence. + bool hasHostDependence() const; + + /// Return the host dependence of this action. This function is only expected + /// to be called if the host dependence exists. + Action *getHostDependence() const; + + /// Return true if the action has a single device dependence. If \a + /// DoNotConsiderHostActions is set, ignore the host dependence, if any, while + /// accounting for the number of dependences. + bool hasSingleDeviceDependence(bool DoNotConsiderHostActions = false) const; - const ActionList &getDeviceActions() const { return DeviceActions; } + /// Return the single device dependence of this action. This function is only + /// expected to be called if a single device dependence exists. If \a + /// DoNotConsiderHostActions is set, a host dependence is allowed. + Action * + getSingleDeviceDependence(bool DoNotConsiderHostActions = false) const; - static bool classof(const Action *A) { return A->getKind() == CudaHostClass; } + static bool classof(const Action *A) { return A->getKind() == OffloadClass; } }; class JobAction : public Action { Index: include/clang/Driver/Compilation.h =================================================================== --- include/clang/Driver/Compilation.h +++ include/clang/Driver/Compilation.h @@ -98,12 +98,7 @@ const Driver &getDriver() const { return TheDriver; } const ToolChain &getDefaultToolChain() const { return DefaultToolChain; } - const ToolChain *getOffloadingHostToolChain() const { - auto It = OrderedOffloadingToolchains.find(Action::OFK_Host); - if (It != OrderedOffloadingToolchains.end()) - return It->second; - return nullptr; - } + unsigned isOffloadingHostKind(Action::OffloadKind Kind) const { return ActiveOffloadMask & Kind; } @@ -121,8 +116,8 @@ return OrderedOffloadingToolchains.equal_range(Kind); } - // Return an offload toolchain of the provided kind. Only one is expected to - // exist. + /// Return an offload toolchain of the provided kind. Only one is expected to + /// exist. template const ToolChain *getSingleOffloadToolChain() const { auto TCs = getOffloadToolChains(); Index: include/clang/Driver/Driver.h =================================================================== --- include/clang/Driver/Driver.h +++ include/clang/Driver/Driver.h @@ -394,12 +394,13 @@ /// BuildJobsForAction - Construct the jobs to perform for the action \p A and /// return an InputInfo for the result of running \p A. Will only construct /// jobs for a given (Action, ToolChain, BoundArch) tuple once. - InputInfo BuildJobsForAction(Compilation &C, const Action *A, - const ToolChain *TC, const char *BoundArch, - bool AtTopLevel, bool MultipleArchs, - const char *LinkingOutput, - std::map, - InputInfo> &CachedResults) const; + InputInfo + BuildJobsForAction(Compilation &C, const Action *A, const ToolChain *TC, + const char *BoundArch, bool AtTopLevel, bool MultipleArchs, + const char *LinkingOutput, + std::map, InputInfo> + &CachedResults, + bool BuildForOffloadDevice) const; /// Returns the default name for linked images (e.g., "a.out"). const char *getDefaultImageName() const; @@ -415,12 +416,11 @@ /// \param BoundArch - The bound architecture. /// \param AtTopLevel - Whether this is a "top-level" action. /// \param MultipleArchs - Whether multiple -arch options were supplied. - const char *GetNamedOutputPath(Compilation &C, - const JobAction &JA, - const char *BaseInput, - const char *BoundArch, - bool AtTopLevel, - bool MultipleArchs) const; + /// \param NormalizedTriple - The normalized triple of the relevant target. + const char *GetNamedOutputPath(Compilation &C, const JobAction &JA, + const char *BaseInput, const char *BoundArch, + bool AtTopLevel, bool MultipleArchs, + StringRef NormalizedTriple) const; /// GetTemporaryPath - Return the pathname of a temporary file to use /// as part of compilation; the file will have the given prefix and suffix. @@ -467,7 +467,8 @@ const char *BoundArch, bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput, std::map, InputInfo> - &CachedResults) const; + &CachedResults, + bool BuildForOffloadDevice) const; public: /// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp +++ lib/Driver/Action.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "clang/Driver/Action.h" +#include "clang/Driver/ToolChain.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" @@ -21,8 +22,8 @@ switch (AC) { case InputClass: return "input"; case BindArchClass: return "bind-arch"; - case CudaDeviceClass: return "cuda-device"; - case CudaHostClass: return "cuda-host"; + case OffloadClass: + return "offload"; case PreprocessJobClass: return "preprocessor"; case PrecompileJobClass: return "precompiler"; case AnalyzeJobClass: return "analyzer"; @@ -40,6 +41,82 @@ llvm_unreachable("invalid class"); } +void Action::propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert((OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFK_None) && + "Setting device kind to a different device??"); + assert(!ActiveOffloadKindMask && "Setting a device kind in a host action??"); + OffloadingDeviceKind = OKind; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch); +} + +void Action::propagateHostOffloadInfo(unsigned OKinds, const char *OArch) { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert(OffloadingDeviceKind == OFK_None && + "Setting a host kind in a device action."); + ActiveOffloadKindMask |= OKinds; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateHostOffloadInfo(ActiveOffloadKindMask, OArch); +} + +void Action::propagateOffloadInfo(const Action *A) { + if (unsigned HK = A->getOffloadingHostActiveKinds()) + propagateHostOffloadInfo(HK, A->getOffloadingArch()); + else + propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(), + A->getOffloadingArch()); +} + +std::string Action::getOffloadingKindPrefix() const { + switch (OffloadingDeviceKind) { + case OFK_None: + break; + case OFK_Host: + llvm_unreachable("Host kind is not an offloading device kind."); + break; + case OFK_Cuda: + return "device-cuda"; + + // TODO: Add other programming models here. + } + + if (!ActiveOffloadKindMask) + return ""; + + std::string Res("host"); + if (ActiveOffloadKindMask & OFK_Cuda) + Res += "-cuda"; + + // TODO: Add other programming models here. + + return Res; +} + +std::string +Action::getOffloadingFileNamePrefix(StringRef NormalizedTriple) const { + // A file prefix is only generated for device actions and consists of the + // offload kind and triple. + if (!OffloadingDeviceKind) + return ""; + + std::string Res("-"); + Res += getOffloadingKindPrefix(); + Res += "-"; + Res += NormalizedTriple; + return Res; +} + void InputAction::anchor() {} InputAction::InputAction(const Arg &_Input, types::ID _Type) @@ -51,16 +128,138 @@ BindArchAction::BindArchAction(Action *Input, const char *_ArchName) : Action(BindArchClass, Input), ArchName(_ArchName) {} -void CudaDeviceAction::anchor() {} +void OffloadAction::anchor() {} + +OffloadAction::OffloadAction(const HostDependence &HDep) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) { + OffloadingArch = HDep.getBoundArch(); + ActiveOffloadKindMask = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); +}; + +OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty) + : Action(OffloadClass, DDeps.getActions(), Ty), + DevToolChains(DDeps.getToolChains()) { + auto &OKinds = DDeps.getOffloadKinds(); + auto &BArchs = DDeps.getBoundArchs(); + + // If all inputs agree on the same kind, use it also for this action. + if (llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); })) + OffloadingDeviceKind = OKinds.front(); + + // If we have a single dependency, inherit the architecture from it. + if (OKinds.size() == 1) + OffloadingArch = BArchs.front(); + + // Propagate info to the dependencies. + for (unsigned i = 0, e = getInputs().size(); i != e; ++i) + getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]); +} + +OffloadAction::OffloadAction(const HostDependence &HDep, + const DeviceDependences &DDeps) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()), + DevToolChains(DDeps.getToolChains()) { + // We use the kinds of the host dependence for this action. + OffloadingArch = HDep.getBoundArch(); + ActiveOffloadKindMask = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); + + // Add device inputs and propagate info to the device actions. Do work only if + // we have dependencies. + for (unsigned i = 0, e = DDeps.getActions().size(); i != e; ++i) + if (auto *A = DDeps.getActions()[i]) { + getInputs().push_back(A); + A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i], + DDeps.getBoundArchs()[i]); + } +} + +void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const { + if (!HostTC) + return; + assert(!getInputs().empty() && "No dependencies for offload action??"); + auto *A = getInputs().front(); + Work(A, HostTC, A->getOffloadingArch()); +} -CudaDeviceAction::CudaDeviceAction(Action *Input, clang::CudaArch Arch, - bool AtTopLevel) - : Action(CudaDeviceClass, Input), GpuArch(Arch), AtTopLevel(AtTopLevel) {} +void OffloadAction::doOnEachDeviceDependence( + const OffloadActionWorkTy &Work) const { + auto I = getInputs().begin(); + auto E = getInputs().end(); + if (I == E) + return; + + // We expect to have the same number of input dependences and device tool + // chains, except if we also have a host dependence. In that case we have one + // more dependence than we have device tool chains. + assert(getInputs().size() == DevToolChains.size() + (HostTC ? 1 : 0) && + "Sizes of action dependences and toolchains are not consistent!"); + + // Skip host action + if (HostTC) + ++I; + + auto TI = DevToolChains.begin(); + for (; I != E; ++I, ++TI) + Work(*I, *TI, (*I)->getOffloadingArch()); +} + +void OffloadAction::doOnEachDependence(const OffloadActionWorkTy &Work) const { + doOnHostDependence(Work); + doOnEachDeviceDependence(Work); +} + +void OffloadAction::doOnEachDependence(bool IsHostDependence, + const OffloadActionWorkTy &Work) const { + if (IsHostDependence) + doOnHostDependence(Work); + else + doOnEachDeviceDependence(Work); +} -void CudaHostAction::anchor() {} +bool OffloadAction::hasHostDependence() const { return HostTC != nullptr; } -CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions) - : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {} +Action *OffloadAction::getHostDependence() const { + assert(hasHostDependence() && "Host dependence does not exist!"); + assert(!getInputs().empty() && "No dependencies for offload action??"); + return HostTC ? getInputs().front() : nullptr; +} + +bool OffloadAction::hasSingleDeviceDependence( + bool DoNotConsiderHostActions) const { + if (DoNotConsiderHostActions) + return getInputs().size() == (HostTC ? 2 : 1); + return !HostTC && getInputs().size() == 1; +} + +Action * +OffloadAction::getSingleDeviceDependence(bool DoNotConsiderHostActions) const { + assert(hasSingleDeviceDependence(DoNotConsiderHostActions) && + "Single device dependence does not exist!"); + // The previous assert ensures the number of entries in getInputs() is + // consistent with what we are doing here. + return HostTC ? getInputs()[1] : getInputs().front(); +} + +void OffloadAction::DeviceDependences::add(Action &A, const ToolChain &TC, + const char *BoundArch, + OffloadKind OKind) { + DeviceActions.push_back(&A); + DeviceToolChains.push_back(&TC); + DeviceBoundArchs.push_back(BoundArch); + DeviceOffloadKinds.push_back(OKind); +} + +OffloadAction::HostDependence::HostDependence(Action &A, const ToolChain &TC, + const char *BoundArch, + const DeviceDependences &DDeps) + : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch) { + for (auto K : DDeps.getOffloadKinds()) + HostOffloadKinds |= K; +} void JobAction::anchor() {} Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -435,7 +435,9 @@ })) { const ToolChain &TC = getToolChain( C.getInputArgs(), - llvm::Triple(C.getOffloadingHostToolChain()->getTriple().isArch64Bit() + llvm::Triple(C.getSingleOffloadToolChain() + ->getTriple() + .isArch64Bit() ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda")); C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda); @@ -1022,19 +1024,33 @@ } else if (BindArchAction *BIA = dyn_cast(A)) { os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->input_begin(), Ids) << "}"; - } else if (CudaDeviceAction *CDA = dyn_cast(A)) { - CudaArch Arch = CDA->getGpuArch(); - if (Arch != CudaArch::UNKNOWN) - os << "'" << CudaArchToString(Arch) << "', "; - os << "{" << PrintActions1(C, *CDA->input_begin(), Ids) << "}"; + } else if (OffloadAction *OA = dyn_cast(A)) { + bool IsFirst = true; + OA->doOnEachDependence( + [&](Action *A, const ToolChain *TC, const char *BoundArch) { + // E.g. for two CUDA device dependences whose bound arch is sm_20 and + // sm_35 this will generate: + // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device" + // (nvptx64-nvidia-cuda:sm_35) {#ID} + if (!IsFirst) + os << ", "; + os << '"'; + if (TC) + os << A->getOffloadingKindPrefix(); + else + os << "host"; + os << " ("; + os << TC->getTriple().normalize(); + + if (BoundArch) + os << ":" << BoundArch; + os << ")"; + os << '"'; + os << " {" << PrintActions1(C, A, Ids) << "}"; + IsFirst = false; + }); } else { - const ActionList *AL; - if (CudaHostAction *CHA = dyn_cast(A)) { - os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}" - << ", gpu binaries "; - AL = &CHA->getDeviceActions(); - } else - AL = &A->getInputs(); + const ActionList *AL = &A->getInputs(); if (AL->size()) { const char *Prefix = "{"; @@ -1047,10 +1063,24 @@ os << "{}"; } + // Append offload info for all options other than the offloading action + // itself (e.g. (cuda-device, sm_20) or (cuda-host)). + std::string offload_str; + llvm::raw_string_ostream offload_os(offload_str); + if (!isa(A)) { + auto S = A->getOffloadingKindPrefix(); + if (!S.empty()) { + offload_os << ", (" << S; + if (A->getOffloadingArch()) + offload_os << ", " << A->getOffloadingArch(); + offload_os << ")"; + } + } + unsigned Id = Ids.size(); Ids[A] = Id; llvm::errs() << Id << ": " << os.str() << ", " - << types::getTypeName(A->getType()) << "\n"; + << types::getTypeName(A->getType()) << offload_os.str() << "\n"; return Id; } @@ -1378,8 +1408,12 @@ PartialCompilationArg && PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only); - if (CompileHostOnly) - return C.MakeAction(HostAction, ActionList()); + if (CompileHostOnly) { + OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch=*/nullptr, Action::OFK_Cuda); + return C.MakeAction(HDep); + } // Collect all cuda_gpu_arch parameters, removing duplicates. SmallVector GpuArchList; @@ -1408,8 +1442,6 @@ CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); // Build actions for all device inputs. - assert(C.getSingleOffloadToolChain() && - "Missing toolchain for device-side compilation."); ActionList CudaDeviceActions; C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions); assert(GpuArchList.size() == CudaDeviceActions.size() && @@ -1421,6 +1453,8 @@ return a->getKind() != Action::AssembleJobClass; }); + const ToolChain *CudaTC = C.getSingleOffloadToolChain(); + // Figure out what to do with device actions -- pass them as inputs to the // host action or run each of them independently. if (PartialCompilation || CompileDeviceOnly) { @@ -1436,10 +1470,13 @@ return nullptr; } - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - Actions.push_back(C.MakeAction(CudaDeviceActions[I], - GpuArchList[I], - /* AtTopLevel */ true)); + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *CudaTC, CudaArchToString(GpuArchList[I]), + Action::OFK_Cuda); + Actions.push_back( + C.MakeAction(DDep, CudaDeviceActions[I]->getType())); + } // Kill host action in case of device-only compilation. if (CompileDeviceOnly) return nullptr; @@ -1459,19 +1496,23 @@ Action* BackendAction = AssembleAction->getInputs()[0]; assert(BackendAction->getType() == types::TY_PP_Asm); - for (const auto& A : {AssembleAction, BackendAction}) { - DeviceActions.push_back(C.MakeAction( - A, GpuArchList[I], /* AtTopLevel */ false)); + for (auto &A : {AssembleAction, BackendAction}) { + OffloadAction::DeviceDependences DDep; + DDep.add(*A, *CudaTC, CudaArchToString(GpuArchList[I]), Action::OFK_Cuda); + DeviceActions.push_back(C.MakeAction(DDep, A->getType())); } } - auto FatbinAction = C.MakeAction( - C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN), - CudaArch::UNKNOWN, - /* AtTopLevel = */ false); + auto FatbinAction = + C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); + // Return a new host action that incorporates original host action and all // device actions. - return C.MakeAction(std::move(HostAction), - ActionList({FatbinAction})); + OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch=*/nullptr, Action::OFK_Cuda); + OffloadAction::DeviceDependences DDep; + DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda); + return C.MakeAction(HDep, DDep); } void Driver::BuildActions(Compilation &C, DerivedArgList &Args, @@ -1580,6 +1621,9 @@ YcArg = YuArg = nullptr; } + // Track the host offload kinds used on this compilation. + unsigned CompilationActiveOffloadHostKinds = 0u; + // Construct the actions to perform. ActionList LinkerInputs; @@ -1648,6 +1692,9 @@ ? phases::Compile : FinalPhase; + // Track the host offload kinds used on this input. + unsigned InputActiveOffloadHostKinds = 0u; + // Build the pipeline for this file. Action *Current = C.MakeAction(*InputArg, InputType); for (SmallVectorImpl::iterator i = PL.begin(), e = PL.end(); @@ -1679,21 +1726,36 @@ Current = buildCudaActions(C, Args, InputArg, Current, Actions); if (!Current) break; + + // We produced a CUDA action for this input, so the host has to support + // CUDA. + InputActiveOffloadHostKinds |= Action::OFK_Cuda; + CompilationActiveOffloadHostKinds |= Action::OFK_Cuda; } if (Current->getType() == types::TY_Nothing) break; } - // If we ended with something, add to the output list. - if (Current) + // If we ended with something, add to the output list. Also, propagate the + // offload information to the top-level host action related with the current + // input. + if (Current) { + if (InputActiveOffloadHostKinds) + Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds, + /*BoundArch=*/nullptr); Actions.push_back(Current); + } } - // Add a link action if necessary. - if (!LinkerInputs.empty()) + // Add a link action if necessary and propagate the offload information for + // the current compilation. + if (!LinkerInputs.empty()) { Actions.push_back( C.MakeAction(LinkerInputs, types::TY_Image)); + Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds, + /*BoundArch=*/nullptr); + } // If we are linking, claim any options which are obviously only used for // compilation. @@ -1829,7 +1891,8 @@ /*BoundArch*/ nullptr, /*AtTopLevel*/ true, /*MultipleArchs*/ ArchNames.size() > 1, - /*LinkingOutput*/ LinkingOutput, CachedResults); + /*LinkingOutput*/ LinkingOutput, CachedResults, + /*BuildForOffloadDevice*/ false); } // If the user passed -Qunused-arguments or there were errors, don't warn @@ -1878,7 +1941,28 @@ } } } - +/// Collapse an offloading action looking for a job of the given type. The input +/// action is changed to the input of the collapsed sequence. If we effectively +/// had a collapse return the corresponding offloading action, otherwise return +/// null. +template +static OffloadAction *collapseOffloadingAction(Action *&CurAction) { + if (!CurAction) + return nullptr; + if (auto *OA = dyn_cast(CurAction)) { + if (OA->hasHostDependence()) + if (auto *HDep = dyn_cast(OA->getHostDependence())) { + CurAction = HDep; + return OA; + } + if (OA->hasSingleDeviceDependence()) + if (auto *DDep = dyn_cast(OA->getSingleDeviceDependence())) { + CurAction = DDep; + return OA; + } + } + return nullptr; +} // Returns a Tool for a given JobAction. In case the action and its // predecessors can be combined, updates Inputs with the inputs of the // first combined action. If one of the collapsed actions is a @@ -1888,34 +1972,39 @@ bool EmbedBitcode, const ToolChain *TC, const JobAction *JA, const ActionList *&Inputs, - const CudaHostAction *&CollapsedCHA) { + ActionList &CollapsedOffloadAction) { const Tool *ToolForJob = nullptr; - CollapsedCHA = nullptr; + CollapsedOffloadAction.clear(); // See if we should look for a compiler with an integrated assembler. We match // bottom up, so what we are actually looking for is an assembler job with a // compiler input. + // Look through offload actions between assembler and backend actions. + Action *BackendJA = (isa(JA) && Inputs->size() == 1) + ? *Inputs->begin() + : nullptr; + auto *BackendOA = collapseOffloadingAction(BackendJA); + if (TC->useIntegratedAs() && !SaveTemps && !C.getArgs().hasArg(options::OPT_via_file_asm) && !C.getArgs().hasArg(options::OPT__SLASH_FA) && - !C.getArgs().hasArg(options::OPT__SLASH_Fa) && - isa(JA) && Inputs->size() == 1 && - isa(*Inputs->begin())) { + !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA && + isa(BackendJA)) { // A BackendJob is always preceded by a CompileJob, and without -save-temps // or -fembed-bitcode, they will always get combined together, so instead of // checking the backend tool, check if the tool for the CompileJob has an // integrated assembler. For -fembed-bitcode, CompileJob is still used to // look up tools for BackendJob, but they need to match before we can split // them. - const ActionList *BackendInputs = &(*Inputs)[0]->getInputs(); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*BackendInputs->begin()); - JobAction *CompileJA = cast( - CHA ? *CHA->input_begin() : *BackendInputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *BackendJA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; // When using -fembed-bitcode, it is required to have the same tool (clang) @@ -1929,7 +2018,12 @@ if (Compiler->hasIntegratedAssembler()) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + // Save the collapsed offload actions because they may still contain + // device actions. + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); + if (BackendOA) + CollapsedOffloadAction.push_back(BackendOA); } } @@ -1939,20 +2033,23 @@ if (isa(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*Inputs->begin()); - JobAction *CompileJA = - cast(CHA ? *CHA->input_begin() : *Inputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *JA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; if (!Compiler->canEmitIR() || (!SaveTemps && !EmbedBitcode)) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); } } @@ -1963,12 +2060,21 @@ // See if we should use an integrated preprocessor. We do so when we have // exactly one input, since this is the only use case we care about // (irrelevant since we don't support combine yet). - if (Inputs->size() == 1 && isa(*Inputs->begin()) && + + // Look through offload actions after preprocessing. + Action *PreprocessJA = (Inputs->size() == 1) ? *Inputs->begin() : nullptr; + auto *PreprocessOA = + collapseOffloadingAction(PreprocessJA); + + if (PreprocessJA && isa(PreprocessJA) && !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && !C.getArgs().hasArg(options::OPT_rewrite_objc) && - ToolForJob->hasIntegratedCPP()) - Inputs = &(*Inputs)[0]->getInputs(); + ToolForJob->hasIntegratedCPP()) { + Inputs = &PreprocessJA->getInputs(); + if (PreprocessOA) + CollapsedOffloadAction.push_back(PreprocessOA); + } return ToolForJob; } @@ -1976,8 +2082,8 @@ InputInfo Driver::BuildJobsForAction( Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch, bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput, - std::map, InputInfo> &CachedResults) - const { + std::map, InputInfo> &CachedResults, + bool BuildForOffloadDevice) const { // The bound arch is not necessarily represented in the toolchain's triple -- // for example, armv7 and armv7s both map to the same triple -- so we need // both in our map. @@ -1991,9 +2097,9 @@ if (CachedResult != CachedResults.end()) { return CachedResult->second; } - InputInfo Result = - BuildJobsForActionNoCache(C, A, TC, BoundArch, AtTopLevel, MultipleArchs, - LinkingOutput, CachedResults); + InputInfo Result = BuildJobsForActionNoCache( + C, A, TC, BoundArch, AtTopLevel, MultipleArchs, LinkingOutput, + CachedResults, BuildForOffloadDevice); CachedResults[ActionTC] = Result; return Result; } @@ -2001,21 +2107,65 @@ InputInfo Driver::BuildJobsForActionNoCache( Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch, bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput, - std::map, InputInfo> &CachedResults) - const { + std::map, InputInfo> &CachedResults, + bool BuildForOffloadDevice) const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); - InputInfoList CudaDeviceInputInfos; - if (const CudaHostAction *CHA = dyn_cast(A)) { - // Append outputs of device jobs to the input list. - for (const Action *DA : CHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, nullptr, AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); + InputInfoList OffloadDependencesInputInfo; + if (const OffloadAction *OA = dyn_cast(A)) { + // The offload action is expected to be used in four different situations. + // + // a) Set a toolchain/architecture/kind for a host action: + // Host Action 1 -> OffloadAction -> Host Action 2 + // + // b) Set a toolchain/architecture/kind for a device action; + // Device Action 1 -> OffloadAction -> Device Action 2 + // + // c) Specify a device dependences to a host action; + // Device Action 1 _ + // \ + // Host Action 1 ---> OffloadAction -> Host Action 2 + // + // d) Specify a host dependence to a device action. + // Host Action 1 _ + // \ + // Device Action 1 ---> OffloadAction -> Device Action 2 + // + // For a) and b), we just return the job generated for the dependence. For + // c) and d) we override the current action with the host/device dependence + // if the current toolchain is host/device and set the offload dependences + // info with the jobs obtained from the device/host dependence(s). + + // If there is a single device option, just generate the job for it. + if (OA->hasSingleDeviceDependence()) { + InputInfo DevA; + OA->doOnEachDeviceDependence([&](Action *DepA, const ToolChain *DepTC, + const char *DepBoundArch) { + DevA = + BuildJobsForAction(C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, + CachedResults, /*BuildForOffloadDevice=*/true); + }); + return DevA; } - // Override current action with a real host compile action and continue - // processing it. - A = *CHA->input_begin(); + + // If 'Action 2' is host, we generate jobs for the device dependences and + // override the current action with the host dependence. Otherwise, we + // generate the host dependences and override the action with the device + // dependence. The dependences can't therefore be a top-level action. + OA->doOnEachDependence( + /*IsHostDependence=*/BuildForOffloadDevice, + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDependencesInputInfo.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false, + /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults, + /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() != + Action::OFK_None)); + }); + + A = BuildForOffloadDevice + ? OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true) + : OA->getHostDependence(); } if (const InputAction *IA = dyn_cast(A)) { @@ -2042,41 +2192,34 @@ TC = &C.getDefaultToolChain(); return BuildJobsForAction(C, *BAA->input_begin(), TC, ArchName, AtTopLevel, - MultipleArchs, LinkingOutput, CachedResults); + MultipleArchs, LinkingOutput, CachedResults, + BuildForOffloadDevice); } - if (const CudaDeviceAction *CDA = dyn_cast(A)) { - // Initial processing of CudaDeviceAction carries host params. - // Call BuildJobsForAction() again, now with correct device parameters. - InputInfo II = BuildJobsForAction( - C, *CDA->input_begin(), C.getSingleOffloadToolChain(), - CudaArchToString(CDA->getGpuArch()), CDA->isAtTopLevel(), - /*MultipleArchs=*/true, LinkingOutput, CachedResults); - // Currently II's Action is *CDA->input_begin(). Set it to CDA instead, so - // that one can retrieve II's GPU arch. - II.setAction(A); - return II; - } const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast(A); - const CudaHostAction *CollapsedCHA = nullptr; + ActionList CollapsedOffloadActions; + const Tool *T = selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA, - Inputs, CollapsedCHA); + Inputs, CollapsedOffloadActions); if (!T) return InputInfo(); - // If we've collapsed action list that contained CudaHostAction we - // need to build jobs for device-side inputs it may have held. - if (CollapsedCHA) { - for (const Action *DA : CollapsedCHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, "", AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); - } - } + // If we've collapsed action list that contained OffloadAction we + // need to build jobs for host/device-side inputs it may have held. + for (const auto *OA : CollapsedOffloadActions) + cast(OA)->doOnEachDependence( + /*IsHostDependence=*/BuildForOffloadDevice, + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDependencesInputInfo.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults, + /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() != + Action::OFK_None)); + }); // Only use pipes when there is exactly one input. InputInfoList InputInfos; @@ -2086,9 +2229,9 @@ // FIXME: Clean this up. bool SubJobAtTopLevel = AtTopLevel && (isa(A) || isa(A)); - InputInfos.push_back(BuildJobsForAction(C, Input, TC, BoundArch, - SubJobAtTopLevel, MultipleArchs, - LinkingOutput, CachedResults)); + InputInfos.push_back(BuildJobsForAction( + C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput, + CachedResults, BuildForOffloadDevice)); } // Always use the first input as the base input. @@ -2099,9 +2242,10 @@ if (JA->getType() == types::TY_dSYM) BaseInput = InputInfos[0].getFilename(); - // Append outputs of cuda device jobs to the input list - if (CudaDeviceInputInfos.size()) - InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end()); + // Append outputs of offload device jobs to the input list + if (!OffloadDependencesInputInfo.empty()) + InputInfos.append(OffloadDependencesInputInfo.begin(), + OffloadDependencesInputInfo.end()); // Determine the place to write output to, if any. InputInfo Result; @@ -2109,7 +2253,8 @@ Result = InputInfo(A, BaseInput); else Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch, - AtTopLevel, MultipleArchs), + AtTopLevel, MultipleArchs, + TC->getTriple().normalize()), BaseInput); if (CCCPrintBindings && !CCGenDiagnostics) { @@ -2169,7 +2314,8 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, const char *BaseInput, const char *BoundArch, bool AtTopLevel, - bool MultipleArchs) const { + bool MultipleArchs, + StringRef NormalizedTriple) const { llvm::PrettyStackTraceString CrashInfo("Computing output path"); // Output to a user requested destination? if (AtTopLevel && !isa(JA) && !isa(JA)) { @@ -2255,6 +2401,7 @@ MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image); } else if (MultipleArchs && BoundArch) { SmallString<128> Output(getDefaultImageName()); + Output += JA.getOffloadingFileNamePrefix(NormalizedTriple); Output += "-"; Output.append(BoundArch); NamedOutput = C.getArgs().MakeArgString(Output.c_str()); @@ -2271,6 +2418,7 @@ if (!types::appendSuffixForType(JA.getType())) End = BaseName.rfind('.'); SmallString<128> Suffixed(BaseName.substr(0, End)); + Suffixed += JA.getOffloadingFileNamePrefix(NormalizedTriple); if (MultipleArchs && BoundArch) { Suffixed += "-"; Suffixed.append(BoundArch); Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -248,8 +248,7 @@ case Action::InputClass: case Action::BindArchClass: - case Action::CudaDeviceClass: - case Action::CudaHostClass: + case Action::OffloadClass: case Action::LipoJobClass: case Action::DsymutilJobClass: case Action::VerifyDebugInfoJobClass: Index: lib/Driver/Tools.h =================================================================== --- lib/Driver/Tools.h +++ lib/Driver/Tools.h @@ -57,8 +57,7 @@ const Driver &D, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const InputInfo &Output, - const InputInfoList &Inputs, - const ToolChain *AuxToolChain) const; + const InputInfoList &Inputs) const; void AddAArch64TargetArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const; Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -296,12 +296,45 @@ !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput); } +/// Add the C++ include args of other offloading toolchains. If this is a host +/// job, the device toolchains are added. If this is a device job, the host +/// toolchains will be added. +static void addExtraOffloadCXXStdlibIncludeArgs(Compilation &C, + const JobAction &JA, + const ArgList &Args, + ArgStringList &CmdArgs) { + + if (JA.isHostOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain() + ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + else if (JA.isDeviceOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain() + ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + + // TODO: Add support for other programming models here. +} + +/// Add the include args that are specific of each offloading programming model. +static void addExtraOffloadSpecificIncludeArgs(Compilation &C, + const JobAction &JA, + const ArgList &Args, + ArgStringList &CmdArgs) { + + if (JA.isHostOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain()->AddCudaIncludeArgs( + Args, CmdArgs); + else if (JA.isDeviceOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain()->AddCudaIncludeArgs( + Args, CmdArgs); + + // TODO: Add support for other programming models here. +} + void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, const Driver &D, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, - const InputInfoList &Inputs, - const ToolChain *AuxToolChain) const { + const InputInfoList &Inputs) const { Arg *A; const bool IsIAMCU = getToolChain().getTriple().isOSIAMCU(); @@ -566,31 +599,27 @@ // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++. addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH"); - // Optional AuxToolChain indicates that we need to include headers - // for more than one target. If that's the case, add include paths - // from AuxToolChain right after include paths of the same kind for - // the current target. + // While adding the include arguments, we also attempt to retrieve the + // arguments of related offloading toolchains or arguments that are specific + // of an offloading programming model. // Add C++ include arguments, if needed. if (types::isCXX(Inputs[0].getType())) { getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs); - if (AuxToolChain) - AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs); } // Add system include arguments for all targets but IAMCU. if (!IsIAMCU) { getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs); - if (AuxToolChain) - AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs); } else { // For IAMCU add special include arguments. getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs); } - // Add CUDA include arguments, if needed. - if (types::isCuda(Inputs[0].getType())) - getToolChain().AddCudaIncludeArgs(Args, CmdArgs); + // Add offload include arguments, if needed. + addExtraOffloadSpecificIncludeArgs(C, JA, Args, CmdArgs); } // FIXME: Move to target hook. @@ -3772,7 +3801,7 @@ // CUDA compilation may have multiple inputs (source file + results of // device-side compilations). All other jobs are expected to have exactly one // input. - bool IsCuda = types::isCuda(Input.getType()); + bool IsCuda = JA.isOffloading(Action::OFK_Cuda); assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs."); // C++ is not supported for IAMCU. @@ -3788,21 +3817,21 @@ CmdArgs.push_back("-triple"); CmdArgs.push_back(Args.MakeArgString(TripleStr)); - const ToolChain *AuxToolChain = nullptr; if (IsCuda) { - // FIXME: We need a (better) way to pass information about - // particular compilation pass we're constructing here. For now we - // can check which toolchain we're using and pick the other one to - // extract the triple. - if (&getToolChain() == C.getSingleOffloadToolChain()) - AuxToolChain = C.getOffloadingHostToolChain(); - else if (&getToolChain() == C.getOffloadingHostToolChain()) - AuxToolChain = C.getSingleOffloadToolChain(); + // We have to pass the triple of the host if compiling for a CUDA device and + // vice-versa. + StringRef NormalizedTriple; + if (JA.isDeviceOffloading(Action::OFK_Cuda)) + NormalizedTriple = C.getSingleOffloadToolChain() + ->getTriple() + .normalize(); else - llvm_unreachable("Can't figure out CUDA compilation mode."); - assert(AuxToolChain != nullptr && "No aux toolchain."); + NormalizedTriple = C.getSingleOffloadToolChain() + ->getTriple() + .normalize(); + CmdArgs.push_back("-aux-triple"); - CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str())); + CmdArgs.push_back(Args.MakeArgString(NormalizedTriple)); } if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm || @@ -4680,8 +4709,7 @@ // // FIXME: Support -fpreprocessed if (types::getPreprocessedType(InputType) != types::TY_INVALID) - AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs, - AuxToolChain); + AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs); // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes // that "The compiler can only warn and ignore the option if not recognized". @@ -11151,15 +11179,14 @@ static_cast(getToolChain()); assert(TC.getTriple().isNVPTX() && "Wrong platform"); - std::vector gpu_archs = - Args.getAllArgValues(options::OPT_march_EQ); - assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas."); - const std::string& gpu_arch = gpu_archs[0]; + // Obtain architecture from the action. + CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch()); + assert(gpu_arch != CudaArch::UNKNOWN && + "Device action expected to have an architecture."); // Check that our installation's ptxas supports gpu_arch. if (!Args.hasArg(options::OPT_no_cuda_version_check)) { - TC.cudaInstallation().CheckCudaVersionSupportsArch( - StringToCudaArch(gpu_arch)); + TC.cudaInstallation().CheckCudaVersionSupportsArch(gpu_arch); } ArgStringList CmdArgs; @@ -11203,7 +11230,7 @@ } CmdArgs.push_back("--gpu-name"); - CmdArgs.push_back(Args.MakeArgString(gpu_arch)); + CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch))); CmdArgs.push_back("--output-file"); CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); for (const auto& II : Inputs) @@ -11235,13 +11262,20 @@ CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); for (const auto& II : Inputs) { - auto* A = cast(II.getAction()); + auto *A = II.getAction(); + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + const char *gpu_arch_str = A->getOffloadingArch(); + assert(gpu_arch_str && + "Device action expected to have associated a GPU architecture!"); + CudaArch gpu_arch = StringToCudaArch(gpu_arch_str); + // We need to pass an Arch of the form "sm_XX" for cubin files and // "compute_XX" for ptx. const char *Arch = (II.getType() == types::TY_PP_Asm) - ? CudaVirtualArchToString(VirtualArchForCudaArch(A->getGpuArch())) - : CudaArchToString(A->getGpuArch()); + ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch)) + : gpu_arch_str; CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + Arch + ",file=" + II.getFilename())); } Index: lib/Frontend/CreateInvocationFromCommandLine.cpp =================================================================== --- lib/Frontend/CreateInvocationFromCommandLine.cpp +++ lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -60,25 +60,25 @@ } // We expect to get back exactly one command job, if we didn't something - // failed. CUDA compilation is an exception as it creates multiple jobs. If - // that's the case, we proceed with the first job. If caller needs particular - // CUDA job, it should be controlled via --cuda-{host|device}-only option - // passed to the driver. + // failed. Offload compilation is an exception as it creates multiple jobs. If + // that's the case, we proceed with the first job. If caller needs a + // particular job, it should be controlled via options (e.g. + // --cuda-{host|device}-only for CUDA) passed to the driver. const driver::JobList &Jobs = C->getJobs(); - bool CudaCompilation = false; + bool OffloadCompilation = false; if (Jobs.size() > 1) { for (auto &A : C->getActions()){ // On MacOSX real actions may end up being wrapped in BindArchAction if (isa(A)) A = *A->input_begin(); - if (isa(A)) { - CudaCompilation = true; + if (isa(A)) { + OffloadCompilation = true; break; } } } if (Jobs.size() == 0 || !isa(*Jobs.begin()) || - (Jobs.size() > 1 && !CudaCompilation)) { + (Jobs.size() > 1 && !OffloadCompilation)) { SmallString<256> Msg; llvm::raw_svector_ostream OS(Msg); Jobs.Print(OS, "; ", true); Index: test/Driver/cuda_phases.cu =================================================================== --- /dev/null +++ test/Driver/cuda_phases.cu @@ -0,0 +1,206 @@ +// Tests the phases generated for a CUDA offloading target for different +// combinations of: +// - Number of gpu architectures; +// - Host/device-only compilation; +// - User-requested final phase - binary or assembly. + +// REQUIRES: clang-driver +// REQUIRES: powerpc-registered-target +// REQUIRES: nvptx-registered-target + +// +// Test single gpu architecture with complete compilation. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=BIN %s +// BIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda) +// BIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) +// BIN: 2: compiler, {1}, ir, (host-cuda) +// BIN: 3: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30) +// BIN: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30) +// BIN: 5: compiler, {4}, ir, (device-cuda, sm_30) +// BIN: 6: backend, {5}, assembler, (device-cuda, sm_30) +// BIN: 7: assembler, {6}, object, (device-cuda, sm_30) +// BIN: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object +// BIN: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler +// BIN: 10: linker, {8, 9}, cuda-fatbin, (device-cuda) +// BIN: 11: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {10}, ir +// BIN: 12: backend, {11}, assembler, (host-cuda) +// BIN: 13: assembler, {12}, object, (host-cuda) +// BIN: 14: linker, {13}, image, (host-cuda) + +// +// Test single gpu architecture up to the assemble phase. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM %s +// ASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30) +// ASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) +// ASM: 2: compiler, {1}, ir, (device-cuda, sm_30) +// ASM: 3: backend, {2}, assembler, (device-cuda, sm_30) +// ASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler +// ASM: 5: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda) +// ASM: 6: preprocessor, {5}, cuda-cpp-output, (host-cuda) +// ASM: 7: compiler, {6}, ir, (host-cuda) +// ASM: 8: backend, {7}, assembler, (host-cuda) + +// +// Test two gpu architecture with complete compilation. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=BIN2 %s +// BIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda) +// BIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) +// BIN2: 2: compiler, {1}, ir, (host-cuda) +// BIN2: 3: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30) +// BIN2: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30) +// BIN2: 5: compiler, {4}, ir, (device-cuda, sm_30) +// BIN2: 6: backend, {5}, assembler, (device-cuda, sm_30) +// BIN2: 7: assembler, {6}, object, (device-cuda, sm_30) +// BIN2: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object +// BIN2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler +// BIN2: 10: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35) +// BIN2: 11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_35) +// BIN2: 12: compiler, {11}, ir, (device-cuda, sm_35) +// BIN2: 13: backend, {12}, assembler, (device-cuda, sm_35) +// BIN2: 14: assembler, {13}, object, (device-cuda, sm_35) +// BIN2: 15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {14}, object +// BIN2: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {13}, assembler +// BIN2: 17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda) +// BIN2: 18: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir +// BIN2: 19: backend, {18}, assembler, (host-cuda) +// BIN2: 20: assembler, {19}, object, (host-cuda) +// BIN2: 21: linker, {20}, image, (host-cuda) + +// +// Test two gpu architectures up to the assemble phase. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ +// RUN: | FileCheck -check-prefix=ASM2 %s +// ASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30) +// ASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) +// ASM2: 2: compiler, {1}, ir, (device-cuda, sm_30) +// ASM2: 3: backend, {2}, assembler, (device-cuda, sm_30) +// ASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler +// ASM2: 5: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35) +// ASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35) +// ASM2: 7: compiler, {6}, ir, (device-cuda, sm_35) +// ASM2: 8: backend, {7}, assembler, (device-cuda, sm_35) +// ASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler +// ASM2: 10: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda) +// ASM2: 11: preprocessor, {10}, cuda-cpp-output, (host-cuda) +// ASM2: 12: compiler, {11}, ir, (host-cuda) +// ASM2: 13: backend, {12}, assembler, (host-cuda) + +// +// Test single gpu architecture with complete compilation in host-only +// compilation mode. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN %s +// HBIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda) +// HBIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) +// HBIN: 2: compiler, {1}, ir, (host-cuda) +// HBIN: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir +// HBIN: 4: backend, {3}, assembler, (host-cuda) +// HBIN: 5: assembler, {4}, object, (host-cuda) +// HBIN: 6: linker, {5}, image, (host-cuda) + +// +// Test single gpu architecture up to the assemble phase in host-only +// compilation mode. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM %s +// HASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda) +// HASM: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) +// HASM: 2: compiler, {1}, ir, (host-cuda) +// HASM: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir +// HASM: 4: backend, {3}, assembler, (host-cuda) + +// +// Test two gpu architecture with complete compilation in host-only +// compilation mode. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ +// RUN: | FileCheck -check-prefix=HBIN2 %s +// HBIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda) +// HBIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) +// HBIN2: 2: compiler, {1}, ir, (host-cuda) +// HBIN2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir +// HBIN2: 4: backend, {3}, assembler, (host-cuda) +// HBIN2: 5: assembler, {4}, object, (host-cuda) +// HBIN2: 6: linker, {5}, image, (host-cuda) + +// +// Test two gpu architectures up to the assemble phase in host-only +// compilation mode. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=HASM2 %s +// HASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda) +// HASM2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) +// HASM2: 2: compiler, {1}, ir, (host-cuda) +// HASM2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir +// HASM2: 4: backend, {3}, assembler, (host-cuda) + +// +// Test single gpu architecture with complete compilation in device-only +// compilation mode. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN %s +// DBIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30) +// DBIN: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) +// DBIN: 2: compiler, {1}, ir, (device-cuda, sm_30) +// DBIN: 3: backend, {2}, assembler, (device-cuda, sm_30) +// DBIN: 4: assembler, {3}, object, (device-cuda, sm_30) +// DBIN: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object + +// +// Test single gpu architecture up to the assemble phase in device-only +// compilation mode. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM %s +// DASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30) +// DASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) +// DASM: 2: compiler, {1}, ir, (device-cuda, sm_30) +// DASM: 3: backend, {2}, assembler, (device-cuda, sm_30) +// DASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler + +// +// Test two gpu architecture with complete compilation in device-only +// compilation mode. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ +// RUN: | FileCheck -check-prefix=DBIN2 %s +// DBIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30) +// DBIN2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) +// DBIN2: 2: compiler, {1}, ir, (device-cuda, sm_30) +// DBIN2: 3: backend, {2}, assembler, (device-cuda, sm_30) +// DBIN2: 4: assembler, {3}, object, (device-cuda, sm_30) +// DBIN2: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object +// DBIN2: 6: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35) +// DBIN2: 7: preprocessor, {6}, cuda-cpp-output, (device-cuda, sm_35) +// DBIN2: 8: compiler, {7}, ir, (device-cuda, sm_35) +// DBIN2: 9: backend, {8}, assembler, (device-cuda, sm_35) +// DBIN2: 10: assembler, {9}, object, (device-cuda, sm_35) +// DBIN2: 11: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {10}, object + +// +// Test two gpu architectures up to the assemble phase in device-only +// compilation mode. +// +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ +// RUN: | FileCheck -check-prefix=DASM2 %s +// DASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30) +// DASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) +// DASM2: 2: compiler, {1}, ir, (device-cuda, sm_30) +// DASM2: 3: backend, {2}, assembler, (device-cuda, sm_30) +// DASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler +// DASM2: 5: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35) +// DASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35) +// DASM2: 7: compiler, {6}, ir, (device-cuda, sm_35) +// DASM2: 8: backend, {7}, assembler, (device-cuda, sm_35) +// DASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler