Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -12,6 +12,7 @@ #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" namespace llvm { @@ -26,6 +27,8 @@ namespace clang { namespace driver { +class ToolChain; + /// Action - Represent an abstract compilation step to perform. /// /// An action represents an edge in the compilation graph; typically @@ -49,8 +52,7 @@ enum ActionClass { InputClass = 0, BindArchClass, - CudaDeviceClass, - CudaHostClass, + OffloadClass, PreprocessJobClass, PrecompileJobClass, AnalyzeJobClass, @@ -71,10 +73,6 @@ // The offloading kind determines if this action is binded to a particular // programming model. Each entry reserves one bit. We also have a special kind // to designate the host offloading tool chain. - // - // FIXME: This is currently used to indicate that tool chains are used in a - // given programming, but will be used here as well once a generic offloading - // action is implemented. enum OffloadKind { OFK_None = 0x00, // The host offloading tool chain. @@ -94,13 +92,24 @@ ActionList Inputs; protected: + /// Offload information. + /// \brief Multiple programming models may be supported simultaneously by the + /// same host. Therefore, the host offloading kind is a combination of kinds + /// encoded in a mask. + unsigned ActiveOffloadKindMask; + /// \brief Offloading kind of the device. + OffloadKind OffloadingDeviceKind; + /// \brief The Offloading architecture associated with this action. + const char *OffloadingArch; + Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {} Action(ActionClass Kind, Action *Input, types::ID Type) : Action(Kind, ActionList({Input}), Type) {} Action(ActionClass Kind, Action *Input) : Action(Kind, ActionList({Input}), Input->getType()) {} Action(ActionClass Kind, const ActionList &Inputs, types::ID Type) - : Kind(Kind), Type(Type), Inputs(Inputs) {} + : Kind(Kind), Type(Type), Inputs(Inputs), ActiveOffloadKindMask(0u), + OffloadingDeviceKind(OFK_None), OffloadingArch(nullptr) {} public: virtual ~Action(); @@ -123,6 +132,38 @@ input_const_range inputs() const { return input_const_range(input_begin(), input_end()); } + + std::string getOffloadingKindPrefix() const; + std::string getOffloadingFileNamePrefix(StringRef NormalizedTriple) const; + + /// \brief Set the device offload info of this action and propagate it to its + /// dependences. + void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch); + /// \brief Append the host offload info of this action and propagate it to its + /// dependences. + void propagateHostOffloadInfo(unsigned OKinds, const char *OArch); + /// \brief Set the offload info of this action to be the same as the provided + /// action, and propagate it to its dependences. + void propagateOffloadInfo(const Action *A); + + unsigned getOffloadingHostActiveKinds() const { + return ActiveOffloadKindMask; + } + OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; } + const char *getOffloadingArch() const { return OffloadingArch; } + + /// \brief Check if this action have any offload kinds. Note that host offload + /// kinds are only set if the action is a dependence to an host offload + /// action. + bool isHostOffloading(OffloadKind OKind) const { + return ActiveOffloadKindMask & OKind; + } + bool isDeviceOffloading(OffloadKind OKind) const { + return OffloadingDeviceKind == OKind; + } + bool isOffloading(OffloadKind OKind) const { + return isHostOffloading(OKind) || isDeviceOffloading(OKind); + } }; class InputAction : public Action { @@ -155,43 +196,116 @@ } }; -class CudaDeviceAction : public Action { +/// \brief An offload action combines host or/and device actions according to +/// the programming model implementation needs and propagates the offloading +/// kind to its dependences. +class OffloadAction : public Action { virtual void anchor(); - /// GPU architecture to bind. Always of the form /sm_\d+/ or null (when the - /// action applies to multiple architectures). - const char *GpuArchName; - /// True when action results are not consumed by the host action (e.g when - /// -fsyntax-only or --cuda-device-only options are used). - bool AtTopLevel; - public: - CudaDeviceAction(Action *Input, const char *ArchName, bool AtTopLevel); + /// \brief Type used to communicate device actions. It associates bound + /// architecture, toolchain, and offload kind to each action. + class DeviceDependences { + public: + typedef SmallVector ToolChainList; + typedef SmallVector BoundArchList; + typedef SmallVector OffloadKindList; + + private: + // Lists that keep the information for each dependency. All the lists are + // meant to be updated in sync. We are adopting separate lists instead of a + // list of structs, because that simplifies forwarding the actions list to + // initialize the inputs of the base Action class. + // + /// \brief The dependence actions. + ActionList DeviceActions; + /// \brief The offloading toolchains that should be used with the action. + ToolChainList DeviceToolChains; + /// \brief The architectures that should be used with this action. + BoundArchList DeviceBoundArchs; + /// \brief The offload kind of each dependence. + OffloadKindList DeviceOffloadKinds; + + public: + /// \brief Add a action along with the associated toolchain, bound arch, and + /// offload kind. + void add(Action &A, const ToolChain &TC, const char *BoundArch, + OffloadKind OKind); + + /// \brief Get each of the individual arrays. + const ActionList &getActions() const { return DeviceActions; }; + const ToolChainList &getToolChains() const { return DeviceToolChains; }; + const BoundArchList &getBoundArchs() const { return DeviceBoundArchs; }; + const OffloadKindList &getOffloadKinds() const { + return DeviceOffloadKinds; + }; + }; - const char *getGpuArchName() const { return GpuArchName; } + /// \brief Type used to communicate host actions. It associates bound + /// architecture, toolchain, and offload kinds to the host action. + class HostDependence { + /// \brief The dependence action. + Action &HostAction; + /// \brief The offloading toolchain that should be used with the action. + const ToolChain &HostToolChain; + /// \brief The architectures that should be used with this action. + const char *HostBoundArch; + /// \brief The offload kind of each dependence. + unsigned HostOffloadKinds; + + public: + HostDependence(Action &A, const ToolChain &TC, const char *BoundArch, + const unsigned OffloadKinds) + : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch), + HostOffloadKinds(OffloadKinds){}; + /// \brief Constructor version that obtains the offload kinds from the + /// device dependencies. + HostDependence(Action &A, const ToolChain &TC, const char *BoundArch, + const DeviceDependences &DDeps); + Action *getAction() const { return &HostAction; }; + const ToolChain *getToolChain() const { return &HostToolChain; }; + const char *getBoundArch() const { return HostBoundArch; }; + unsigned getOffloadKinds() const { return HostOffloadKinds; }; + }; - /// Gets the compute_XX that corresponds to getGpuArchName(). Returns null - /// when getGpuArchName() is null. - const char *getComputeArchName() const; + typedef llvm::function_ref + OffloadActionWorkTy; - bool isAtTopLevel() const { return AtTopLevel; } +private: + /// \brief The host offloading toolchain that should be used with the action. + const ToolChain *HostTC; - static bool IsValidGpuArchName(llvm::StringRef ArchName); + /// \brief The tool chains associated with the list of actions. + DeviceDependences::ToolChainList DevToolChains; - static bool classof(const Action *A) { - return A->getKind() == CudaDeviceClass; - } -}; +public: + OffloadAction(const HostDependence &HDep); + OffloadAction(const DeviceDependences &DDeps, types::ID Ty); + OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps); -class CudaHostAction : public Action { - virtual void anchor(); - ActionList DeviceActions; + /// \brief Execute the work specified in \a Work on the host dependence. + void doOnHostDependence(const OffloadActionWorkTy &Work) const; -public: - CudaHostAction(Action *Input, const ActionList &DeviceActions); + /// \brief Execute the work specified in \a Work on each device dependence. + void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const; + + /// \brief Execute the work specified in \a Work on each dependence. + void doOnEachDependence(const OffloadActionWorkTy &Work) const; + + /// \brief Return true if the action has a host dependence. + bool hasHostDependence() const; + + /// \brief Return the host dependence of this action. This function is only + /// expected to be called if the host dependence exists. + Action *getHostDependence() const; + + /// \brief Return true if the action has a single device dependence. + bool hasSingleDeviceDependence() const; - const ActionList &getDeviceActions() const { return DeviceActions; } + /// \brief Return the single device dependence of this action. This function + /// is only expected to be called if a single device dependence exists. + Action *getSingleDeviceDependence() const; - static bool classof(const Action *A) { return A->getKind() == CudaHostClass; } + static bool classof(const Action *A) { return A->getKind() == OffloadClass; } }; class JobAction : public Action { Index: include/clang/Driver/Compilation.h =================================================================== --- include/clang/Driver/Compilation.h +++ include/clang/Driver/Compilation.h @@ -98,12 +98,7 @@ const Driver &getDriver() const { return TheDriver; } const ToolChain &getDefaultToolChain() const { return DefaultToolChain; } - const ToolChain *getOffloadingHostToolChain() const { - auto It = OrderedOffloadingToolchains.find(Action::OFK_Host); - if (It != OrderedOffloadingToolchains.end()) - return It->second; - return nullptr; - } + unsigned isOffloadingHostKind(Action::OffloadKind Kind) const { return ActiveOffloadMask & Kind; } Index: include/clang/Driver/Driver.h =================================================================== --- include/clang/Driver/Driver.h +++ include/clang/Driver/Driver.h @@ -415,12 +415,11 @@ /// \param BoundArch - The bound architecture. /// \param AtTopLevel - Whether this is a "top-level" action. /// \param MultipleArchs - Whether multiple -arch options were supplied. - const char *GetNamedOutputPath(Compilation &C, - const JobAction &JA, - const char *BaseInput, - const char *BoundArch, - bool AtTopLevel, - bool MultipleArchs) const; + /// \param NormalizedTriple - The normalized triple of the relevant target. + const char *GetNamedOutputPath(Compilation &C, const JobAction &JA, + const char *BaseInput, const char *BoundArch, + bool AtTopLevel, bool MultipleArchs, + StringRef NormalizedTriple) const; /// GetTemporaryPath - Return the pathname of a temporary file to use /// as part of compilation; the file will have the given prefix and suffix. Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp +++ lib/Driver/Action.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "clang/Driver/Action.h" +#include "clang/Driver/ToolChain.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" @@ -21,8 +22,8 @@ switch (AC) { case InputClass: return "input"; case BindArchClass: return "bind-arch"; - case CudaDeviceClass: return "cuda-device"; - case CudaHostClass: return "cuda-host"; + case OffloadClass: + return "offload"; case PreprocessJobClass: return "preprocessor"; case PrecompileJobClass: return "precompiler"; case AnalyzeJobClass: return "analyzer"; @@ -40,6 +41,82 @@ llvm_unreachable("invalid class"); } +void Action::propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert((OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFK_None) && + "Setting device kind to a different device??"); + assert(!ActiveOffloadKindMask && "Setting a device kind in a host action??"); + OffloadingDeviceKind = OKind; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch); +} + +void Action::propagateHostOffloadInfo(unsigned OKinds, const char *OArch) { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert(OffloadingDeviceKind == OFK_None && + "Setting a host kind in a device action."); + ActiveOffloadKindMask |= OKinds; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateHostOffloadInfo(ActiveOffloadKindMask, OArch); +} + +void Action::propagateOffloadInfo(const Action *A) { + if (unsigned HK = A->getOffloadingHostActiveKinds()) + propagateHostOffloadInfo(HK, A->getOffloadingArch()); + else + propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(), + A->getOffloadingArch()); +} + +std::string Action::getOffloadingKindPrefix() const { + switch (OffloadingDeviceKind) { + case OFK_None: + break; + case OFK_Host: + llvm_unreachable("Host kind is not an offloading device kind."); + break; + case OFK_Cuda: + return "device-cuda"; + + // TODO: Add other programming models here. + } + + if (!ActiveOffloadKindMask) + return ""; + + std::string Res("host"); + if (ActiveOffloadKindMask & OFK_Cuda) + Res += "-cuda"; + + // TODO: Add other programming models here. + + return Res; +} + +std::string +Action::getOffloadingFileNamePrefix(StringRef NormalizedTriple) const { + // A file prefix is only generated for device actions and consists of the + // offload kind and triple. + if (!OffloadingDeviceKind) + return ""; + + std::string Res("-"); + Res += getOffloadingKindPrefix(); + Res += "-"; + Res += NormalizedTriple; + return Res; +} + void InputAction::anchor() {} InputAction::InputAction(const Arg &_Input, types::ID _Type) @@ -51,45 +128,116 @@ BindArchAction::BindArchAction(Action *Input, const char *_ArchName) : Action(BindArchClass, Input), ArchName(_ArchName) {} -// Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual -// compute arch, e.g. "compute_20". Returns null if the input arch is null or -// doesn't match an existing arch. -static const char* GpuArchToComputeName(const char *ArchName) { - if (!ArchName) - return nullptr; - return llvm::StringSwitch(ArchName) - .Cases("sm_20", "sm_21", "compute_20") - .Case("sm_30", "compute_30") - .Case("sm_32", "compute_32") - .Case("sm_35", "compute_35") - .Case("sm_37", "compute_37") - .Case("sm_50", "compute_50") - .Case("sm_52", "compute_52") - .Case("sm_53", "compute_53") - .Default(nullptr); +void OffloadAction::anchor() {} + +OffloadAction::OffloadAction(const HostDependence &HDep) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) { + OffloadingArch = HDep.getBoundArch(); + ActiveOffloadKindMask = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); +}; + +OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty) + : Action(OffloadClass, DDeps.getActions(), Ty), HostTC(nullptr), + DevToolChains(DDeps.getToolChains()) { + auto &OKinds = DDeps.getOffloadKinds(); + auto &BArchs = DDeps.getBoundArchs(); + + // If we have a single dependency, inherit the offloading info from it. + if (OKinds.size() == 1) { + OffloadingDeviceKind = OKinds.front(); + OffloadingArch = BArchs.front(); + } + // Propagate info to the dependencies. + for (unsigned i = 0; i < getInputs().size(); ++i) + getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]); +} + +OffloadAction::OffloadAction(const HostDependence &HDep, + const DeviceDependences &DDeps) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()), + DevToolChains(DDeps.getToolChains()) { + // We use the kinds of the host dependence for this action. + OffloadingArch = HDep.getBoundArch(); + ActiveOffloadKindMask = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); + + // Add device inputs and propagate info to the device actions. + for (unsigned i = 0; i < DDeps.getActions().size(); ++i) { + auto *A = DDeps.getActions()[i]; + // Skip actions of empty dependences. + if (!A) + continue; + getInputs().push_back(A); + A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i], + DDeps.getBoundArchs()[i]); + } +} + +void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const { + if (!HostTC) + return; + auto *A = getInputs().front(); + Work(A, HostTC, A->getOffloadingArch()); } -void CudaDeviceAction::anchor() {} +void OffloadAction::doOnEachDeviceDependence( + const OffloadActionWorkTy &Work) const { + auto I = getInputs().begin(); + auto E = getInputs().end(); + if (I == E) + return; -CudaDeviceAction::CudaDeviceAction(Action *Input, const char *ArchName, - bool AtTopLevel) - : Action(CudaDeviceClass, Input), GpuArchName(ArchName), - AtTopLevel(AtTopLevel) { - assert(!GpuArchName || IsValidGpuArchName(GpuArchName)); + // Skip host action + if (HostTC) + ++I; + + auto TI = DevToolChains.begin(); + for (; I != E; ++I) + Work(*I, *TI, (*I)->getOffloadingArch()); +} + +void OffloadAction::doOnEachDependence(const OffloadActionWorkTy &Work) const { + doOnHostDependence(Work); + doOnEachDeviceDependence(Work); +} + +bool OffloadAction::hasHostDependence() const { return HostTC != nullptr; } + +Action *OffloadAction::getHostDependence() const { + assert(hasHostDependence() && "Host dependence does not exist!"); + return HostTC ? getInputs().front() : nullptr; } -const char *CudaDeviceAction::getComputeArchName() const { - return GpuArchToComputeName(GpuArchName); +bool OffloadAction::hasSingleDeviceDependence() const { + return !HostTC && getInputs().size() == 1; } -bool CudaDeviceAction::IsValidGpuArchName(llvm::StringRef ArchName) { - return GpuArchToComputeName(ArchName.data()) != nullptr; +Action *OffloadAction::getSingleDeviceDependence() const { + assert(hasSingleDeviceDependence() && + "Single device dependence does not exist!"); + return getInputs().front(); } -void CudaHostAction::anchor() {} +void OffloadAction::DeviceDependences::add(Action &A, const ToolChain &TC, + const char *BoundArch, + OffloadKind OKind) { + DeviceActions.push_back(&A); + DeviceToolChains.push_back(&TC); + DeviceBoundArchs.push_back(BoundArch); + DeviceOffloadKinds.push_back(OKind); +} -CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions) - : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {} +OffloadAction::HostDependence::HostDependence(Action &A, const ToolChain &TC, + const char *BoundArch, + const DeviceDependences &DDeps) + : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch), + HostOffloadKinds(0u) { + for (auto K : DDeps.getOffloadKinds()) + HostOffloadKinds |= K; +} void JobAction::anchor() {} Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -408,7 +408,9 @@ })) { const ToolChain &TC = getToolChain( C.getInputArgs(), - llvm::Triple(C.getOffloadingHostToolChain()->getTriple().isArch64Bit() + llvm::Triple(C.getSingleOffloadToolChain() + ->getTriple() + .isArch64Bit() ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda")); C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda); @@ -986,18 +988,33 @@ } else if (BindArchAction *BIA = dyn_cast(A)) { os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->input_begin(), Ids) << "}"; - } else if (CudaDeviceAction *CDA = dyn_cast(A)) { - os << '"' - << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)") - << '"' << ", {" << PrintActions1(C, *CDA->input_begin(), Ids) << "}"; + } else if (OffloadAction *OA = dyn_cast(A)) { + bool IsFirst = true; + OA->doOnEachDependence( + [&](Action *A, const ToolChain *TC, const char *BoundArch) { + // E.g. for two CUDA device dependences whose bound arch is sm_20 and + // sm_35 this will generate: + // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device" + // (nvptx64-nvidia-cuda:sm_35) {#ID} + if (!IsFirst) + os << ", "; + os << '"'; + if (TC) + os << A->getOffloadingKindPrefix(); + else + os << "host"; + os << " ("; + os << TC->getTriple().normalize(); + + if (BoundArch) + os << ":" << BoundArch; + os << ")"; + os << '"'; + os << " {" << PrintActions1(C, A, Ids) << "}"; + IsFirst = false; + }); } else { - const ActionList *AL; - if (CudaHostAction *CHA = dyn_cast(A)) { - os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}" - << ", gpu binaries "; - AL = &CHA->getDeviceActions(); - } else - AL = &A->getInputs(); + const ActionList *AL = &A->getInputs(); if (AL->size()) { const char *Prefix = "{"; @@ -1010,10 +1027,24 @@ os << "{}"; } + // Append offload info for all options other than the offloading action + // itself (e.g. (cuda-device, sm_20) or (cuda-host)). + std::string offload_str; + llvm::raw_string_ostream offload_os(offload_str); + if (!isa(A)) { + auto S = A->getOffloadingKindPrefix(); + if (!S.empty()) { + offload_os << ", (" << S; + if (A->getOffloadingArch()) + offload_os << ", " << A->getOffloadingArch(); + offload_os << ")"; + } + } + unsigned Id = Ids.size(); Ids[A] = Id; llvm::errs() << Id << ": " << os.str() << ", " - << types::getTypeName(A->getType()) << "\n"; + << types::getTypeName(A->getType()) << offload_os.str() << "\n"; return Id; } @@ -1326,8 +1357,12 @@ options::OPT_cuda_device_only); // Host-only compilation case. if (PartialCompilationArg && - PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) - return C.MakeAction(HostAction, ActionList()); + PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) { + OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch=*/nullptr, Action::OFK_Cuda); + return C.MakeAction(HDep); + } // Collect all cuda_gpu_arch parameters, removing duplicates. SmallVector GpuArchList; @@ -1338,7 +1373,7 @@ A->claim(); const auto& Arch = A->getValue(); - if (!CudaDeviceAction::IsValidGpuArchName(Arch)) + if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch)) C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch; else if (GpuArchNames.insert(Arch).second) GpuArchList.push_back(Arch); @@ -1355,8 +1390,6 @@ CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); // Build actions for all device inputs. - assert(C.getSingleOffloadToolChain() && - "Missing toolchain for device-side compilation."); ActionList CudaDeviceActions; C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions); assert(GpuArchList.size() == CudaDeviceActions.size() && @@ -1368,6 +1401,8 @@ return a->getKind() != Action::AssembleJobClass; }); + const ToolChain *CudaTC = C.getSingleOffloadToolChain(); + // Figure out what to do with device actions -- pass them as inputs to the // host action or run each of them independently. bool DeviceOnlyCompilation = PartialCompilationArg != nullptr; @@ -1384,10 +1419,13 @@ return nullptr; } - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - Actions.push_back(C.MakeAction(CudaDeviceActions[I], - GpuArchList[I], - /* AtTopLevel */ true)); + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *CudaTC, GpuArchList[I], + Action::OFK_Cuda); + Actions.push_back( + C.MakeAction(DDep, CudaDeviceActions[I]->getType())); + } // Kill host action in case of device-only compilation. if (DeviceOnlyCompilation) return nullptr; @@ -1407,19 +1445,23 @@ Action* BackendAction = AssembleAction->getInputs()[0]; assert(BackendAction->getType() == types::TY_PP_Asm); - for (const auto& A : {AssembleAction, BackendAction}) { - DeviceActions.push_back(C.MakeAction( - A, GpuArchList[I], /* AtTopLevel */ false)); + for (auto &A : {AssembleAction, BackendAction}) { + OffloadAction::DeviceDependences DDep; + DDep.add(*A, *CudaTC, GpuArchList[I], Action::OFK_Cuda); + DeviceActions.push_back(C.MakeAction(DDep, A->getType())); } } - auto FatbinAction = C.MakeAction( - C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN), - /* GpuArchName = */ nullptr, - /* AtTopLevel = */ false); + auto FatbinAction = + C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); + // Return a new host action that incorporates original host action and all // device actions. - return C.MakeAction(std::move(HostAction), - ActionList({FatbinAction})); + OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch=*/nullptr, Action::OFK_Cuda); + OffloadAction::DeviceDependences DDep; + DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda); + return C.MakeAction(HDep, DDep); } void Driver::BuildActions(Compilation &C, DerivedArgList &Args, @@ -1528,6 +1570,9 @@ YcArg = YuArg = nullptr; } + // Track the host offload kinds used on this compilation. + unsigned CompilationActiveOffloadHostKinds = 0u; + // Construct the actions to perform. ActionList LinkerInputs; @@ -1596,6 +1641,9 @@ ? phases::Compile : FinalPhase; + // Track the host offload kinds used on this input. + unsigned InputActiveOffloadHostKinds = 0u; + // Build the pipeline for this file. Action *Current = C.MakeAction(*InputArg, InputType); for (SmallVectorImpl::iterator i = PL.begin(), e = PL.end(); @@ -1627,21 +1675,36 @@ Current = buildCudaActions(C, Args, InputArg, Current, Actions); if (!Current) break; + + // We produced a CUDA action for this input, so the host has to support + // CUDA. + InputActiveOffloadHostKinds |= Action::OFK_Cuda; + CompilationActiveOffloadHostKinds |= Action::OFK_Cuda; } if (Current->getType() == types::TY_Nothing) break; } - // If we ended with something, add to the output list. - if (Current) + // If we ended with something, add to the output list. Also, propagate the + // offload information to the top-level host action related with the current + // input. + if (Current) { + if (InputActiveOffloadHostKinds) + Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds, + /*BoundArch=*/nullptr); Actions.push_back(Current); + } } - // Add a link action if necessary. - if (!LinkerInputs.empty()) + // Add a link action if necessary and propagate the offload information for + // the current compilation. + if (!LinkerInputs.empty()) { Actions.push_back( C.MakeAction(LinkerInputs, types::TY_Image)); + Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds, + /*BoundArch=*/nullptr); + } // If we are linking, claim any options which are obviously only used for // compilation. @@ -1824,7 +1887,28 @@ } } } - +// Collapse an offloading action looking for a job of the given type. The input +// action is changed to the input of the collapsed sequence. If we effectively +// had a collapse return the corresponding offloading action, otherwise return +// null. +template +static OffloadAction *collapseOffloadingAction(Action *&CurAction) { + if (!CurAction) + return nullptr; + if (auto *OA = dyn_cast(CurAction)) { + if (OA->hasHostDependence()) + if (auto *HDep = dyn_cast(OA->getHostDependence())) { + CurAction = HDep; + return OA; + } + if (OA->hasSingleDeviceDependence()) + if (auto *DDep = dyn_cast(OA->getSingleDeviceDependence())) { + CurAction = DDep; + return OA; + } + } + return nullptr; +} // Returns a Tool for a given JobAction. In case the action and its // predecessors can be combined, updates Inputs with the inputs of the // first combined action. If one of the collapsed actions is a @@ -1834,34 +1918,39 @@ bool EmbedBitcode, const ToolChain *TC, const JobAction *JA, const ActionList *&Inputs, - const CudaHostAction *&CollapsedCHA) { + ActionList &CollapsedOffloadAction) { const Tool *ToolForJob = nullptr; - CollapsedCHA = nullptr; + CollapsedOffloadAction.clear(); // See if we should look for a compiler with an integrated assembler. We match // bottom up, so what we are actually looking for is an assembler job with a // compiler input. + // Look through offload actions between assembler and backend actions. + Action *BackendJA = (isa(JA) && Inputs->size() == 1) + ? *Inputs->begin() + : nullptr; + auto *BackendOA = collapseOffloadingAction(BackendJA); + if (TC->useIntegratedAs() && !SaveTemps && !C.getArgs().hasArg(options::OPT_via_file_asm) && !C.getArgs().hasArg(options::OPT__SLASH_FA) && - !C.getArgs().hasArg(options::OPT__SLASH_Fa) && - isa(JA) && Inputs->size() == 1 && - isa(*Inputs->begin())) { + !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA && + isa(BackendJA)) { // A BackendJob is always preceded by a CompileJob, and without -save-temps // or -fembed-bitcode, they will always get combined together, so instead of // checking the backend tool, check if the tool for the CompileJob has an // integrated assembler. For -fembed-bitcode, CompileJob is still used to // look up tools for BackendJob, but they need to match before we can split // them. - const ActionList *BackendInputs = &(*Inputs)[0]->getInputs(); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*BackendInputs->begin()); - JobAction *CompileJA = cast( - CHA ? *CHA->input_begin() : *BackendInputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *BackendJA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; // When using -fembed-bitcode, it is required to have the same tool (clang) @@ -1875,7 +1964,12 @@ if (Compiler->hasIntegratedAssembler()) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + // Save the collapsed offload actions because they may still contain + // device actions. + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); + if (BackendOA) + CollapsedOffloadAction.push_back(BackendOA); } } @@ -1885,20 +1979,23 @@ if (isa(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*Inputs->begin()); - JobAction *CompileJA = - cast(CHA ? *CHA->input_begin() : *Inputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *JA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; if (!Compiler->canEmitIR() || (!SaveTemps && !EmbedBitcode)) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); } } @@ -1909,12 +2006,21 @@ // See if we should use an integrated preprocessor. We do so when we have // exactly one input, since this is the only use case we care about // (irrelevant since we don't support combine yet). - if (Inputs->size() == 1 && isa(*Inputs->begin()) && + + // Look through offload actions after preprocessing. + Action *PreprocessJA = (Inputs->size() == 1) ? *Inputs->begin() : nullptr; + auto *PreprocessOA = + collapseOffloadingAction(PreprocessJA); + + if (PreprocessJA && isa(PreprocessJA) && !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && !C.getArgs().hasArg(options::OPT_rewrite_objc) && - ToolForJob->hasIntegratedCPP()) - Inputs = &(*Inputs)[0]->getInputs(); + ToolForJob->hasIntegratedCPP()) { + Inputs = &PreprocessJA->getInputs(); + if (PreprocessOA) + CollapsedOffloadAction.push_back(PreprocessOA); + } return ToolForJob; } @@ -1951,17 +2057,29 @@ const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); - InputInfoList CudaDeviceInputInfos; - if (const CudaHostAction *CHA = dyn_cast(A)) { - // Append outputs of device jobs to the input list. - for (const Action *DA : CHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, nullptr, AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); + InputInfoList OffloadDeviceInputInfos; + if (const OffloadAction *OA = dyn_cast(A)) { + + OA->doOnEachDeviceDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDeviceInputInfos.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults)); + }); + + // If we have a single device action, just return its info - it is a + // dependence to some other device action or host action. If we have + // multiple device dependences, we expect them to be combined with the host + // action (we do not expect having to combine different devices/programming + // models actions with an offload action). Therefore, when we have multiple + // device dependences, we expect to have an host dependence as well. + if (OA->hasSingleDeviceDependence()) { + return OffloadDeviceInputInfos.back(); } + // Override current action with a real host compile action and continue // processing it. - A = *CHA->input_begin(); + A = OA->getHostDependence(); } if (const InputAction *IA = dyn_cast(A)) { @@ -1991,37 +2109,27 @@ MultipleArchs, LinkingOutput, CachedResults); } - if (const CudaDeviceAction *CDA = dyn_cast(A)) { - // Initial processing of CudaDeviceAction carries host params. - // Call BuildJobsForAction() again, now with correct device parameters. - InputInfo II = BuildJobsForAction( - C, *CDA->input_begin(), C.getSingleOffloadToolChain(), - CDA->getGpuArchName(), CDA->isAtTopLevel(), /*MultipleArchs=*/true, - LinkingOutput, CachedResults); - // Currently II's Action is *CDA->input_begin(). Set it to CDA instead, so - // that one can retrieve II's GPU arch. - II.setAction(A); - return II; - } const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast(A); - const CudaHostAction *CollapsedCHA = nullptr; + ActionList CollapsedOffloadActions; + const Tool *T = selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA, - Inputs, CollapsedCHA); + Inputs, CollapsedOffloadActions); if (!T) return InputInfo(); - // If we've collapsed action list that contained CudaHostAction we + // If we've collapsed action list that contained OffloadAction we // need to build jobs for device-side inputs it may have held. - if (CollapsedCHA) { - for (const Action *DA : CollapsedCHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, "", AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); - } + for (const auto *OA : CollapsedOffloadActions) { + cast(OA)->doOnEachDeviceDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDeviceInputInfos.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults)); + }); } // Only use pipes when there is exactly one input. @@ -2045,9 +2153,10 @@ if (JA->getType() == types::TY_dSYM) BaseInput = InputInfos[0].getFilename(); - // Append outputs of cuda device jobs to the input list - if (CudaDeviceInputInfos.size()) - InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end()); + // Append outputs of offload device jobs to the input list + if (!OffloadDeviceInputInfos.empty()) + InputInfos.append(OffloadDeviceInputInfos.begin(), + OffloadDeviceInputInfos.end()); // Determine the place to write output to, if any. InputInfo Result; @@ -2055,7 +2164,8 @@ Result = InputInfo(A, BaseInput); else Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch, - AtTopLevel, MultipleArchs), + AtTopLevel, MultipleArchs, + TC->getTriple().normalize()), BaseInput); if (CCCPrintBindings && !CCGenDiagnostics) { @@ -2115,7 +2225,8 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, const char *BaseInput, const char *BoundArch, bool AtTopLevel, - bool MultipleArchs) const { + bool MultipleArchs, + StringRef NormalizedTriple) const { llvm::PrettyStackTraceString CrashInfo("Computing output path"); // Output to a user requested destination? if (AtTopLevel && !isa(JA) && !isa(JA)) { @@ -2201,6 +2312,7 @@ MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image); } else if (MultipleArchs && BoundArch) { SmallString<128> Output(getDefaultImageName()); + Output += JA.getOffloadingFileNamePrefix(NormalizedTriple); Output += "-"; Output.append(BoundArch); NamedOutput = C.getArgs().MakeArgString(Output.c_str()); @@ -2217,6 +2329,7 @@ if (!types::appendSuffixForType(JA.getType())) End = BaseName.rfind('.'); SmallString<128> Suffixed(BaseName.substr(0, End)); + Suffixed += JA.getOffloadingFileNamePrefix(NormalizedTriple); if (MultipleArchs && BoundArch) { Suffixed += "-"; Suffixed.append(BoundArch); Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -248,8 +248,7 @@ case Action::InputClass: case Action::BindArchClass: - case Action::CudaDeviceClass: - case Action::CudaHostClass: + case Action::OffloadClass: case Action::LipoJobClass: case Action::DsymutilJobClass: case Action::VerifyDebugInfoJobClass: Index: lib/Driver/ToolChains.h =================================================================== --- lib/Driver/ToolChains.h +++ lib/Driver/ToolChains.h @@ -834,6 +834,11 @@ // ptxas. bool useIntegratedAs() const override { return false; } + // Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual + // compute arch, e.g. "compute_20". Returns null if the input arch is null or + // doesn't match an existing arch. + static const char *GpuArchToComputeName(const char *ArchName); + protected: Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains.cpp =================================================================== --- lib/Driver/ToolChains.cpp +++ lib/Driver/ToolChains.cpp @@ -4270,6 +4270,21 @@ return DAL; } +const char *CudaToolChain::GpuArchToComputeName(const char *ArchName) { + if (!ArchName) + return nullptr; + return llvm::StringSwitch(ArchName) + .Cases("sm_20", "sm_21", "compute_20") + .Case("sm_30", "compute_30") + .Case("sm_32", "compute_32") + .Case("sm_35", "compute_35") + .Case("sm_37", "compute_37") + .Case("sm_50", "compute_50") + .Case("sm_52", "compute_52") + .Case("sm_53", "compute_53") + .Default(nullptr); +} + Tool *CudaToolChain::buildAssembler() const { return new tools::NVPTX::Assembler(*this); } Index: lib/Driver/Tools.h =================================================================== --- lib/Driver/Tools.h +++ lib/Driver/Tools.h @@ -57,8 +57,7 @@ const Driver &D, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const InputInfo &Output, - const InputInfoList &Inputs, - const ToolChain *AuxToolChain) const; + const InputInfoList &Inputs) const; void AddAArch64TargetArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const; Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -288,12 +288,46 @@ !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput); } +/// \brief Add the C++ include args of other offloading toolchains. If this is a +/// host job, the device toolchains are added. If this is a device job, the host +/// toolchains will be added. +static void addExtraOffloadCXXStdlibIncludeArgs(Compilation &C, + const JobAction &JA, + const ArgList &Args, + ArgStringList &CmdArgs) { + + if (JA.isHostOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain() + ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + else if (JA.isDeviceOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain() + ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + + // TODO: Add support for other programming models here. +} + +/// \brief Add the include args that are specific of each offloading programming +/// model. +static void addExtraOffloadSpecificIncludeArgs(Compilation &C, + const JobAction &JA, + const ArgList &Args, + ArgStringList &CmdArgs) { + + if (JA.isHostOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain()->AddCudaIncludeArgs( + Args, CmdArgs); + else if (JA.isDeviceOffloading(Action::OFK_Cuda)) + C.getSingleOffloadToolChain()->AddCudaIncludeArgs( + Args, CmdArgs); + + // TODO: Add support for other programming models here. +} + void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, const Driver &D, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, - const InputInfoList &Inputs, - const ToolChain *AuxToolChain) const { + const InputInfoList &Inputs) const { Arg *A; CheckPreprocessingOptions(D, Args); @@ -550,26 +584,22 @@ // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++. addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH"); - // Optional AuxToolChain indicates that we need to include headers - // for more than one target. If that's the case, add include paths - // from AuxToolChain right after include paths of the same kind for - // the current target. + // While adding the include arguments, we also attempt to retrieve the + // arguments of related offloading toolchains or arguments that are specific + // of an offloading programming model. // Add C++ include arguments, if needed. if (types::isCXX(Inputs[0].getType())) { getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs); - if (AuxToolChain) - AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs); } // Add system include arguments. getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs); - if (AuxToolChain) - AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs); + addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs); - // Add CUDA include arguments, if needed. - if (types::isCuda(Inputs[0].getType())) - getToolChain().AddCudaIncludeArgs(Args, CmdArgs); + // Add offload include arguments, if needed. + addExtraOffloadSpecificIncludeArgs(C, JA, Args, CmdArgs); } // FIXME: Move to target hook. @@ -3602,7 +3632,7 @@ // CUDA compilation may have multiple inputs (source file + results of // device-side compilations). All other jobs are expected to have exactly one // input. - bool IsCuda = types::isCuda(Input.getType()); + bool IsCuda = JA.isOffloading(Action::OFK_Cuda); assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs."); // Invoke ourselves in -cc1 mode. @@ -3614,21 +3644,21 @@ CmdArgs.push_back("-triple"); CmdArgs.push_back(Args.MakeArgString(TripleStr)); - const ToolChain *AuxToolChain = nullptr; if (IsCuda) { - // FIXME: We need a (better) way to pass information about - // particular compilation pass we're constructing here. For now we - // can check which toolchain we're using and pick the other one to - // extract the triple. - if (&getToolChain() == C.getSingleOffloadToolChain()) - AuxToolChain = C.getOffloadingHostToolChain(); - else if (&getToolChain() == C.getOffloadingHostToolChain()) - AuxToolChain = C.getSingleOffloadToolChain(); + // We have to pass the triple of the host if compiling for a CUDA device and + // vice-versa. + StringRef NormalizedTriple; + if (JA.isDeviceOffloading(Action::OFK_Cuda)) + NormalizedTriple = C.getSingleOffloadToolChain() + ->getTriple() + .normalize(); else - llvm_unreachable("Can't figure out CUDA compilation mode."); - assert(AuxToolChain != nullptr && "No aux toolchain."); + NormalizedTriple = C.getSingleOffloadToolChain() + ->getTriple() + .normalize(); + CmdArgs.push_back("-aux-triple"); - CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str())); + CmdArgs.push_back(Args.MakeArgString(NormalizedTriple)); } if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm || @@ -4526,8 +4556,7 @@ // // FIXME: Support -fpreprocessed if (types::getPreprocessedType(InputType) != types::TY_INVALID) - AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs, - AuxToolChain); + AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs); // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes // that "The compiler can only warn and ignore the option if not recognized". @@ -10919,10 +10948,9 @@ static_cast(getToolChain()); assert(TC.getTriple().isNVPTX() && "Wrong platform"); - std::vector gpu_archs = - Args.getAllArgValues(options::OPT_march_EQ); - assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas."); - const std::string& gpu_arch = gpu_archs[0]; + // Obtain architecture from the action. + const char *gpu_arch = JA.getOffloadingArch(); + assert(gpu_arch && "Device action expected to have an architecture."); ArgStringList CmdArgs; CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32"); @@ -10996,12 +11024,19 @@ CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); for (const auto& II : Inputs) { - auto* A = cast(II.getAction()); + auto *A = II.getAction(); + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + const char *gpu_arch = A->getOffloadingArch(); + assert(gpu_arch && + "Device action expected to have associated a GPU architecture!"); + // We need to pass an Arch of the form "sm_XX" for cubin files and // "compute_XX" for ptx. - const char *Arch = (II.getType() == types::TY_PP_Asm) - ? A->getComputeArchName() - : A->getGpuArchName(); + const char *Arch = + (II.getType() == types::TY_PP_Asm) + ? toolchains::CudaToolChain::GpuArchToComputeName(gpu_arch) + : gpu_arch; CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + Arch + ",file=" + II.getFilename())); } Index: lib/Frontend/CreateInvocationFromCommandLine.cpp =================================================================== --- lib/Frontend/CreateInvocationFromCommandLine.cpp +++ lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -60,25 +60,25 @@ } // We expect to get back exactly one command job, if we didn't something - // failed. CUDA compilation is an exception as it creates multiple jobs. If - // that's the case, we proceed with the first job. If caller needs particular - // CUDA job, it should be controlled via --cuda-{host|device}-only option - // passed to the driver. + // failed. Offload compilation is an exception as it creates multiple jobs. If + // that's the case, we proceed with the first job. If caller needs a + // particular job, it should be controlled via options (e.g. + // --cuda-{host|device}-only for CUDA) passed to the driver. const driver::JobList &Jobs = C->getJobs(); - bool CudaCompilation = false; + bool OffloadCompilation = false; if (Jobs.size() > 1) { for (auto &A : C->getActions()){ // On MacOSX real actions may end up being wrapped in BindArchAction if (isa(A)) A = *A->input_begin(); - if (isa(A)) { - CudaCompilation = true; + if (isa(A)) { + OffloadCompilation = true; break; } } } if (Jobs.size() == 0 || !isa(*Jobs.begin()) || - (Jobs.size() > 1 && !CudaCompilation)) { + (Jobs.size() > 1 && !OffloadCompilation)) { SmallString<256> Msg; llvm::raw_svector_ostream OS(Msg); Jobs.Print(OS, "; ", true);