Index: cfe/trunk/include/clang/Driver/Compilation.h =================================================================== --- cfe/trunk/include/clang/Driver/Compilation.h +++ cfe/trunk/include/clang/Driver/Compilation.h @@ -115,6 +115,12 @@ return OrderedOffloadingToolchains.equal_range(Kind); } + /// Return true if an offloading tool chain of a given kind exists. + template bool hasOffloadToolChain() const { + return OrderedOffloadingToolchains.find(Kind) != + OrderedOffloadingToolchains.end(); + } + /// Return an offload toolchain of the provided kind. Only one is expected to /// exist. template Index: cfe/trunk/lib/Driver/Driver.cpp =================================================================== --- cfe/trunk/lib/Driver/Driver.cpp +++ cfe/trunk/lib/Driver/Driver.cpp @@ -1400,139 +1400,536 @@ } } -// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE -// input action and then wraps each in CudaDeviceAction paired with -// appropriate GPU arch name. In case of partial (i.e preprocessing -// only) or device-only compilation, each device action is added to /p -// Actions and /p Current is released. Otherwise the function creates -// and returns a new CudaHostAction which wraps /p Current and device -// side actions. 
-static Action *buildCudaActions(Compilation &C, DerivedArgList &Args, - const Arg *InputArg, Action *HostAction, - ActionList &Actions) { - Arg *PartialCompilationArg = Args.getLastArg( - options::OPT_cuda_host_only, options::OPT_cuda_device_only, - options::OPT_cuda_compile_host_device); - bool CompileHostOnly = - PartialCompilationArg && - PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only); - bool CompileDeviceOnly = - PartialCompilationArg && - PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only); - const ToolChain *HostTC = C.getSingleOffloadToolChain(); - assert(HostTC && "No toolchain for host compilation."); - if (HostTC->getTriple().isNVPTX()) { - // We do not support targeting NVPTX for host compilation. Throw - // an error and abort pipeline construction early so we don't trip - // asserts that assume device-side compilation. - C.getDriver().Diag(diag::err_drv_cuda_nvptx_host); - return nullptr; - } +namespace { +/// Provides a convenient interface for different programming models to generate +/// the required device actions. +class OffloadingActionBuilder final { + /// Flag used to trace errors in the builder. + bool IsValid = false; + + /// The compilation that is using this builder. + Compilation &C; + + /// The derived arguments associated with this builder. + DerivedArgList &Args; + + /// Map between an input argument and the offload kinds used to process it. + std::map InputArgToOffloadKindMap; + + /// Builder interface. It doesn't build anything or keep any state. + class DeviceActionBuilder { + public: + typedef llvm::SmallVector PhasesTy; + + enum ActionBuilderReturnCode { + // The builder acted successfully on the current action. + ABRT_Success, + // The builder didn't have to act on the current action. + ABRT_Inactive, + // The builder was successful and requested the host action to not be + // generated. + ABRT_Ignore_Host, + }; + + protected: + /// Compilation associated with this builder. 
+ Compilation &C; + + /// Tool chains associated with this builder. The same programming + /// model may have associated one or more tool chains. + SmallVector ToolChains; + + /// The derived arguments associated with this builder. + DerivedArgList &Args; + + /// The inputs associated with this builder. + const Driver::InputList &Inputs; + + /// The associated offload kind. + Action::OffloadKind AssociatedOffloadKind = Action::OFK_None; + + public: + DeviceActionBuilder(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs, + Action::OffloadKind AssociatedOffloadKind) + : C(C), Args(Args), Inputs(Inputs), + AssociatedOffloadKind(AssociatedOffloadKind) {} + virtual ~DeviceActionBuilder() {} + + /// Fill up the array \a DA with all the device dependences that should be + /// added to the provided host action \a HostAction. By default it is + /// inactive. + virtual ActionBuilderReturnCode + getDeviceDepences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase, + phases::ID FinalPhase, PhasesTy &Phases) { + return ABRT_Inactive; + } + + /// Update the state to include the provided host action \a HostAction as a + /// dependency of the current device action. By default it is inactive. + virtual ActionBuilderReturnCode addDeviceDepences(Action *HostAction) { + return ABRT_Inactive; + } + + /// Append top level actions generated by the builder to \a AL. Nothing is + /// returned; by default no actions are appended. + virtual void appendTopLevelActions(ActionList &AL) {} + + /// Append linker actions generated by the builder to \a DA. Nothing is + /// returned; by default no dependences are appended. + virtual void appendLinkDependences(OffloadAction::DeviceDependences &DA) {} + + /// Initialize the builder. Return true if any initialization errors are + /// found. + virtual bool initialize() { return false; } + + /// Return true if this builder is valid. We have a valid builder if we have + /// associated device tool chains. 
+ bool isValid() { return !ToolChains.empty(); } + + /// Return the associated offload kind. + Action::OffloadKind getAssociatedOffloadKind() { + return AssociatedOffloadKind; + } + }; + + /// \brief CUDA action builder. It injects device code in the host backend + /// action. + class CudaActionBuilder final : public DeviceActionBuilder { + /// Flags to signal if the user requested host-only or device-only + /// compilation. + bool CompileHostOnly = false; + bool CompileDeviceOnly = false; + + /// List of GPU architectures to use in this compilation. + SmallVector GpuArchList; + + /// The CUDA actions for the current input. + ActionList CudaDeviceActions; + + /// The CUDA fat binary if it was generated for the current input. + Action *CudaFatBinary = nullptr; + + /// Flag that is set to true if this builder acted on the current input. + bool IsActive = false; + + public: + CudaActionBuilder(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs) + : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {} + + ActionBuilderReturnCode + getDeviceDepences(OffloadAction::DeviceDependences &DA, phases::ID CurPhase, + phases::ID FinalPhase, PhasesTy &Phases) override { + if (!IsActive) + return ABRT_Inactive; + + // If we don't have more CUDA actions, we don't have any dependences to + // create for the host. + if (CudaDeviceActions.empty()) + return ABRT_Success; + + assert(CudaDeviceActions.size() == GpuArchList.size() && + "Expecting one action per GPU architecture."); + assert(!CompileHostOnly && + "Not expecting CUDA actions in host-only compilation."); + + // If we are generating code for the device or we are in a backend phase, + // we attempt to generate the fat binary. We compile each arch to ptx and + // assemble to cubin, then feed the cubin *and* the ptx into a device + // "link" action, which uses fatbinary to combine these cubins into one + // fatbin. The fatbin is then an input to the host action if not in + // device-only mode. 
+ if (CompileDeviceOnly || CurPhase == phases::Backend) { + ActionList DeviceActions; + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + // Produce the device action from the current phase up to the assemble + // phase. + for (auto Ph : Phases) { + // Skip the phases that were already dealt with. + if (Ph < CurPhase) + continue; + // We have to be consistent with the host final phase. + if (Ph > FinalPhase) + break; + + CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction( + C, Args, Ph, CudaDeviceActions[I]); + + if (Ph == phases::Assemble) + break; + } + + // If we didn't reach the assemble phase, we can't generate the fat + // binary. We don't need to generate the fat binary if we are in + // device-only mode. + if (!isa(CudaDeviceActions[I]) || + CompileDeviceOnly) + continue; + + Action *AssembleAction = CudaDeviceActions[I]; + assert(AssembleAction->getType() == types::TY_Object); + assert(AssembleAction->getInputs().size() == 1); + + Action *BackendAction = AssembleAction->getInputs()[0]; + assert(BackendAction->getType() == types::TY_PP_Asm); + + for (auto &A : {AssembleAction, BackendAction}) { + OffloadAction::DeviceDependences DDep; + DDep.add(*A, *ToolChains.front(), CudaArchToString(GpuArchList[I]), + Action::OFK_Cuda); + DeviceActions.push_back( + C.MakeAction(DDep, A->getType())); + } + } + + // We generate the fat binary if we have device input actions. + if (!DeviceActions.empty()) { + CudaFatBinary = + C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); + + if (!CompileDeviceOnly) { + DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, + Action::OFK_Cuda); + // Clear the fat binary, it is already a dependence to a host + // action. + CudaFatBinary = nullptr; + } + + // Remove the CUDA actions as they are already connected to a host + // action or fat binary. + CudaDeviceActions.clear(); + } + + // We avoid creating host action in device-only mode. + return CompileDeviceOnly ? 
ABRT_Ignore_Host : ABRT_Success; + } + + assert(CurPhase < phases::Backend && "Generating single CUDA " + "instructions should only occur " + "before the backend phase!"); + + // By default, we produce an action for each device arch. + for (Action *&A : CudaDeviceActions) + A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A); + + return ABRT_Success; + } + + ActionBuilderReturnCode addDeviceDepences(Action *HostAction) override { + // While generating code for CUDA, we only depend on the host input action + // to trigger the creation of all the CUDA device actions. + + // If we are dealing with an input action, replicate it for each GPU + // architecture. If we are in host-only mode we return 'success' so that + // the host uses the CUDA offload kind. + if (auto *IA = dyn_cast(HostAction)) { + assert(!GpuArchList.empty() && + "We should have at least one GPU architecture."); + + // If the host input is not CUDA, we don't need to bother about this + // input. + if (IA->getType() != types::TY_CUDA) { + // The builder will ignore this input. + IsActive = false; + return ABRT_Inactive; + } + + // Set the flag to true, so that the builder acts on the current input. + IsActive = true; + + if (CompileHostOnly) + return ABRT_Success; + + // Replicate inputs for each GPU architecture. + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) + CudaDeviceActions.push_back(C.MakeAction( + IA->getInputArg(), types::TY_CUDA_DEVICE)); + + return ABRT_Success; + } + + return IsActive ? ABRT_Success : ABRT_Inactive; + } + + void appendTopLevelActions(ActionList &AL) override { + // Utility to append actions to the top level list. + auto AddTopLevel = [&](Action *A, CudaArch BoundArch) { + OffloadAction::DeviceDependences Dep; + Dep.add(*A, *ToolChains.front(), CudaArchToString(BoundArch), + Action::OFK_Cuda); + AL.push_back(C.MakeAction(Dep, A->getType())); + }; + + // If we have a fat binary, add it to the list. 
+ if (CudaFatBinary) { + AddTopLevel(CudaFatBinary, CudaArch::UNKNOWN); + CudaDeviceActions.clear(); + CudaFatBinary = nullptr; + return; + } + + if (CudaDeviceActions.empty()) + return; + + // If we have CUDA actions at this point, that's because we have a + // partial compilation, so we should have an action for each GPU + // architecture. + assert(CudaDeviceActions.size() == GpuArchList.size() && + "Expecting one action per GPU architecture."); + assert(ToolChains.size() == 1 && + "Expecting to have a sing CUDA toolchain."); + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) + AddTopLevel(CudaDeviceActions[I], GpuArchList[I]); + + CudaDeviceActions.clear(); + } + + bool initialize() override { + // We don't need to support CUDA. + if (!C.hasOffloadToolChain()) + return false; + + const ToolChain *HostTC = C.getSingleOffloadToolChain(); + assert(HostTC && "No toolchain for host compilation."); + if (HostTC->getTriple().isNVPTX()) { + // We do not support targeting NVPTX for host compilation. Throw + // an error and abort pipeline construction early so we don't trip + // asserts that assume device-side compilation. + C.getDriver().Diag(diag::err_drv_cuda_nvptx_host); + return true; + } + + ToolChains.push_back(C.getSingleOffloadToolChain()); + + Arg *PartialCompilationArg = Args.getLastArg( + options::OPT_cuda_host_only, options::OPT_cuda_device_only, + options::OPT_cuda_compile_host_device); + CompileHostOnly = PartialCompilationArg && + PartialCompilationArg->getOption().matches( + options::OPT_cuda_host_only); + CompileDeviceOnly = PartialCompilationArg && + PartialCompilationArg->getOption().matches( + options::OPT_cuda_device_only); + + // Collect all cuda_gpu_arch parameters, removing duplicates. 
+ llvm::SmallSet GpuArchs; + bool Error = false; + for (Arg *A : Args) { + if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ)) + continue; + A->claim(); + + const auto &ArchStr = A->getValue(); + CudaArch Arch = StringToCudaArch(ArchStr); + if (Arch == CudaArch::UNKNOWN) { + C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr; + Error = true; + } else if (GpuArchs.insert(Arch).second) + GpuArchList.push_back(Arch); + } + + // Default to sm_20 which is the lowest common denominator for supported + // GPUs. + // sm_20 code should work correctly, if suboptimally, on all newer GPUs. + if (GpuArchList.empty()) + GpuArchList.push_back(CudaArch::SM_20); + + return Error; + } + }; + + /// Add the implementation for other specialized builders here. + + /// Specialized builders being used by this offloading action builder. + SmallVector SpecializedBuilders; + +public: + OffloadingActionBuilder(Compilation &C, DerivedArgList &Args, + const Driver::InputList &Inputs) + : C(C), Args(Args) { + // Create a specialized builder for each device toolchain. + + IsValid = true; + + // Create a specialized builder for CUDA. + SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs)); + + // + // TODO: Build other specialized builders here. + // + + // Initialize all the builders, keeping track of errors. + for (auto *SB : SpecializedBuilders) + IsValid = IsValid && !SB->initialize(); + } + + ~OffloadingActionBuilder() { + for (auto *SB : SpecializedBuilders) + delete SB; + } + + /// Generate an action that adds device dependences (if any) to a host action. + /// If no device dependence actions exist, just return the host action \a + /// HostAction. If an error is found or if no builder requires the host action + /// to be generated, return nullptr. 
+ Action * + addDeviceDependencesToHostAction(Action *HostAction, const Arg *InputArg, + phases::ID CurPhase, phases::ID FinalPhase, + DeviceActionBuilder::PhasesTy &Phases) { + if (!IsValid) + return nullptr; + + if (SpecializedBuilders.empty()) + return HostAction; + + assert(HostAction && "Invalid host action!"); + + OffloadAction::DeviceDependences DDeps; + // Check if all the programming models agree we should not emit the host + // action. Also, keep track of the offloading kinds employed. + auto &OffloadKind = InputArgToOffloadKindMap[InputArg]; + unsigned InactiveBuilders = 0u; + unsigned IgnoringBuilders = 0u; + for (auto *SB : SpecializedBuilders) { + if (!SB->isValid()) { + ++InactiveBuilders; + continue; + } - if (CompileHostOnly) { - OffloadAction::HostDependence HDep(*HostAction, *HostTC, - /*BoundArch=*/nullptr, Action::OFK_Cuda); - return C.MakeAction(HDep); + auto RetCode = SB->getDeviceDepences(DDeps, CurPhase, FinalPhase, Phases); + + // If the builder explicitly says the host action should be ignored, + // we need to increment the variable that tracks the builders that request + // the host object to be ignored. + if (RetCode == DeviceActionBuilder::ABRT_Ignore_Host) + ++IgnoringBuilders; + + // Unless the builder was inactive for this action, we have to record the + // offload kind because the host will have to use it. + if (RetCode != DeviceActionBuilder::ABRT_Inactive) + OffloadKind |= SB->getAssociatedOffloadKind(); + } + + // If all builders agree that the host object should be ignored, just return + // nullptr. + if (IgnoringBuilders && + SpecializedBuilders.size() == (InactiveBuilders + IgnoringBuilders)) + return nullptr; + + if (DDeps.getActions().empty()) + return HostAction; + + // We have dependences we need to bundle together. We use an offload action + // for that. 
+ OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch=*/nullptr, DDeps); + return C.MakeAction(HDep, DDeps); + } + + /// Generate an action that adds a host dependence to a device action. The + /// results will be kept in this action builder. Return true if an error was + /// found. + bool addHostDependenceToDeviceActions(Action *HostAction, + const Arg *InputArg) { + if (!IsValid) + return true; + + assert(HostAction && "Invalid host action!"); + + // Register the offload kinds that are used. + auto &OffloadKind = InputArgToOffloadKindMap[InputArg]; + for (auto *SB : SpecializedBuilders) { + if (!SB->isValid()) + continue; + + auto RetCode = SB->addDeviceDepences(HostAction); + + // Host dependences for device actions are not compatible with that same + // action being ignored. + assert(RetCode != DeviceActionBuilder::ABRT_Ignore_Host && + "Host dependence not expected to be ignored.!"); + + // Unless the builder was inactive for this action, we have to record the + // offload kind because the host will have to use it. + if (RetCode != DeviceActionBuilder::ABRT_Inactive) + OffloadKind |= SB->getAssociatedOffloadKind(); + } + + return false; } - // Collect all cuda_gpu_arch parameters, removing duplicates. - SmallVector GpuArchList; - llvm::SmallSet GpuArchs; - for (Arg *A : Args) { - if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ)) - continue; - A->claim(); + /// Add the offloading top level actions to the provided action list. + bool appendTopLevelActions(ActionList &AL, Action *HostAction, + const Arg *InputArg) { + auto NumActions = AL.size(); - const auto &ArchStr = A->getValue(); - CudaArch Arch = StringToCudaArch(ArchStr); - if (Arch == CudaArch::UNKNOWN) - C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr; - else if (GpuArchs.insert(Arch).second) - GpuArchList.push_back(Arch); - } - - // Default to sm_20 which is the lowest common denominator for supported GPUs. 
- // sm_20 code should work correctly, if suboptimally, on all newer GPUs. - if (GpuArchList.empty()) - GpuArchList.push_back(CudaArch::SM_20); - - // Replicate inputs for each GPU architecture. - Driver::InputList CudaDeviceInputs; - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); - - // Build actions for all device inputs. - ActionList CudaDeviceActions; - C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions); - assert(GpuArchList.size() == CudaDeviceActions.size() && - "Failed to create actions for all devices"); - - // Check whether any of device actions stopped before they could generate PTX. - bool PartialCompilation = - llvm::any_of(CudaDeviceActions, [](const Action *a) { - return a->getKind() != Action::AssembleJobClass; - }); + for (auto *SB : SpecializedBuilders) { + if (!SB->isValid()) + continue; + SB->appendTopLevelActions(AL); + } + + assert(NumActions <= AL.size() && "Expecting more actions, not less!"); - const ToolChain *CudaTC = C.getSingleOffloadToolChain(); + // Propagate to the current host action (if any) the offload information + // associated with the current input. + if (HostAction) + HostAction->propagateHostOffloadInfo(InputArgToOffloadKindMap[InputArg], + /*BoundArch=*/nullptr); - // Figure out what to do with device actions -- pass them as inputs to the - // host action or run each of them independently. - if (PartialCompilation || CompileDeviceOnly) { - // In case of partial or device-only compilation results of device actions - // are not consumed by the host action device actions have to be added to - // top-level actions list with AtTopLevel=true and run independently. - - // -o is ambiguous if we have more than one top-level action. 
- if (Args.hasArg(options::OPT_o) && - (!CompileDeviceOnly || GpuArchList.size() > 1)) { + // If any action is added by the builders, -o is ambiguous if we have more + // than one top-level action. + if (NumActions < AL.size() && Args.hasArg(options::OPT_o) && + AL.size() > 1) { C.getDriver().Diag( clang::diag::err_drv_output_argument_with_multiple_files); - return nullptr; + return true; } - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { - OffloadAction::DeviceDependences DDep; - DDep.add(*CudaDeviceActions[I], *CudaTC, CudaArchToString(GpuArchList[I]), - Action::OFK_Cuda); - Actions.push_back( - C.MakeAction(DDep, CudaDeviceActions[I]->getType())); - } - // Kill host action in case of device-only compilation. - if (CompileDeviceOnly) - return nullptr; - return HostAction; + return false; } - // If we're not a partial or device-only compilation, we compile each arch to - // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device - // "link" action, which uses fatbinary to combine these cubins into one - // fatbin. The fatbin is then an input to the host compilation. - ActionList DeviceActions; - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { - Action* AssembleAction = CudaDeviceActions[I]; - assert(AssembleAction->getType() == types::TY_Object); - assert(AssembleAction->getInputs().size() == 1); - - Action* BackendAction = AssembleAction->getInputs()[0]; - assert(BackendAction->getType() == types::TY_PP_Asm); - - for (auto &A : {AssembleAction, BackendAction}) { - OffloadAction::DeviceDependences DDep; - DDep.add(*A, *CudaTC, CudaArchToString(GpuArchList[I]), Action::OFK_Cuda); - DeviceActions.push_back(C.MakeAction(DDep, A->getType())); - } - } - auto FatbinAction = - C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); - - // Return a new host action that incorporates original host action and all - // device actions. 
- OffloadAction::HostDependence HDep(*HostAction, *HostTC, - /*BoundArch=*/nullptr, Action::OFK_Cuda); - OffloadAction::DeviceDependences DDep; - DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda); - return C.MakeAction(HDep, DDep); -} + /// Processes the host linker action. This currently consists of replacing it + /// with an offload action if there are device link objects and propagate to + /// the host action all the offload kinds used in the current compilation. The + /// resulting action is returned. + Action *processHostLinkAction(Action *HostAction) { + // Add all the dependences from the device linking actions. + OffloadAction::DeviceDependences DDeps; + for (auto *SB : SpecializedBuilders) { + if (!SB->isValid()) + continue; + + SB->appendLinkDependences(DDeps); + } + + // Calculate all the offload kinds used in the current compilation. + unsigned ActiveOffloadKinds = 0u; + for (auto &I : InputArgToOffloadKindMap) + ActiveOffloadKinds |= I.second; + + // If we don't have device dependencies, we don't have to create an offload + // action. + if (DDeps.getActions().empty()) { + // Propagate all the active kinds to host action. Given that it is a link + // action it is assumed to depend on all actions generated so far. + HostAction->propagateHostOffloadInfo(ActiveOffloadKinds, + /*BoundArch=*/nullptr); + return HostAction; + } + + // Create the offload action with all dependences. When an offload action + // is created the kinds are propagated to the host action, so we don't have + // to do that explicitly here. + OffloadAction::HostDependence HDep( + *HostAction, *C.getSingleOffloadToolChain(), + /*BoundArch*/ nullptr, ActiveOffloadKinds); + return C.MakeAction(HDep, DDeps); + } +}; +} // anonymous namespace. void Driver::BuildActions(Compilation &C, DerivedArgList &Args, const InputList &Inputs, ActionList &Actions) const { @@ -1640,8 +2037,8 @@ YcArg = YuArg = nullptr; } - // Track the host offload kinds used on this compilation. 
- unsigned CompilationActiveOffloadHostKinds = 0u; + // Builder to be used to build offloading actions. + OffloadingActionBuilder OffloadBuilder(C, Args, Inputs); // Construct the actions to perform. ActionList LinkerInputs; @@ -1707,17 +2104,14 @@ } } - phases::ID CudaInjectionPhase = - (phases::Compile < FinalPhase && - llvm::find(PL, phases::Compile) != PL.end()) - ? phases::Compile - : FinalPhase; - - // Track the host offload kinds used on this input. - unsigned InputActiveOffloadHostKinds = 0u; - // Build the pipeline for this file. Action *Current = C.MakeAction(*InputArg, InputType); + + // Use the current host action in any of the offloading actions, if + // required. + if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg)) + break; + for (SmallVectorImpl::iterator i = PL.begin(), e = PL.end(); i != e; ++i) { phases::ID Phase = *i; @@ -1726,6 +2120,12 @@ if (Phase > FinalPhase) break; + // Add any offload action the host action depends on. + Current = OffloadBuilder.addDeviceDependencesToHostAction( + Current, InputArg, Phase, FinalPhase, PL); + if (!Current) + break; + // Queue linker inputs. if (Phase == phases::Link) { assert((i + 1) == e && "linking must be final compilation step."); @@ -1734,48 +2134,37 @@ break; } - // Some types skip the assembler phase (e.g., llvm-bc), but we can't - // encode this in the steps because the intermediate type depends on - // arguments. Just special case here. - if (Phase == phases::Assemble && Current->getType() != types::TY_PP_Asm) + // Otherwise construct the appropriate action. + auto *NewCurrent = ConstructPhaseAction(C, Args, Phase, Current); + + // We didn't create a new action, so we will just move to the next phase. + if (NewCurrent == Current) continue; - // Otherwise construct the appropriate action. 
- Current = ConstructPhaseAction(C, Args, Phase, Current); + Current = NewCurrent; - if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) { - Current = buildCudaActions(C, Args, InputArg, Current, Actions); - if (!Current) - break; - - // We produced a CUDA action for this input, so the host has to support - // CUDA. - InputActiveOffloadHostKinds |= Action::OFK_Cuda; - CompilationActiveOffloadHostKinds |= Action::OFK_Cuda; - } + // Use the current host action in any of the offloading actions, if + // required. + if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg)) + break; if (Current->getType() == types::TY_Nothing) break; } - // If we ended with something, add to the output list. Also, propagate the - // offload information to the top-level host action related with the current - // input. - if (Current) { - if (InputActiveOffloadHostKinds) - Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds, - /*BoundArch=*/nullptr); + // If we ended with something, add to the output list. + if (Current) Actions.push_back(Current); - } + + // Add any top level actions generated for offloading. + OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg); } - // Add a link action if necessary and propagate the offload information for - // the current compilation. + // Add a link action if necessary. 
if (!LinkerInputs.empty()) { - Actions.push_back( - C.MakeAction(LinkerInputs, types::TY_Image)); - Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds, - /*BoundArch=*/nullptr); + Action *LA = C.MakeAction(LinkerInputs, types::TY_Image); + LA = OffloadBuilder.processHostLinkAction(LA); + Actions.push_back(LA); } // If we are linking, claim any options which are obviously only used for @@ -1797,6 +2186,13 @@ Action *Driver::ConstructPhaseAction(Compilation &C, const ArgList &Args, phases::ID Phase, Action *Input) const { llvm::PrettyStackTraceString CrashInfo("Constructing phase actions"); + + // Some types skip the assembler phase (e.g., llvm-bc), but we can't + // encode this in the steps because the intermediate type depends on + // arguments. Just special case here. + if (Phase == phases::Assemble && Input->getType() != types::TY_PP_Asm) + return Input; + // Build the appropriate action. switch (Phase) { case phases::Link: Index: cfe/trunk/lib/Driver/Types.cpp =================================================================== --- cfe/trunk/lib/Driver/Types.cpp +++ cfe/trunk/lib/Driver/Types.cpp @@ -254,7 +254,7 @@ } } - if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) { + if (!onlyPrecompileType(Id)) { P.push_back(phases::Link); } assert(0 < P.size() && "Not enough phases in list"); Index: cfe/trunk/test/Driver/cuda-bindings.cu =================================================================== --- cfe/trunk/test/Driver/cuda-bindings.cu +++ cfe/trunk/test/Driver/cuda-bindings.cu @@ -34,8 +34,8 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --cuda-gpu-arch=sm_30 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM %s -// ASM: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s" -// ASM: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s" +// ASM-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: 
"cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s" +// ASM-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s" // // Test two gpu architectures with complete compilation. @@ -62,9 +62,9 @@ // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \ // RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s" -// ASM2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_35.s" -// ASM2: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s" +// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_30.s" +// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-device-cuda-nvptx64-nvidia-cuda-sm_35.s" +// ASM2-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s" // // Test one or more gpu architecture with complete compilation in host-only Index: cfe/trunk/test/Driver/cuda-phases.cu =================================================================== --- cfe/trunk/test/Driver/cuda-phases.cu +++ cfe/trunk/test/Driver/cuda-phases.cu @@ -13,84 +13,84 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \ // RUN: | FileCheck -check-prefix=BIN %s -// BIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) -// BIN: 2: compiler, {1}, ir, (host-cuda) -// BIN: 3: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30) -// BIN: 5: compiler, {4}, ir, (device-cuda, sm_30) -// BIN: 6: backend, {5}, assembler, (device-cuda, sm_30) -// BIN: 7: assembler, {6}, object, (device-cuda, sm_30) -// BIN: 8: offload, "device-cuda 
(nvptx64-nvidia-cuda:sm_30)" {7}, object -// BIN: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler -// BIN: 10: linker, {8, 9}, cuda-fatbin, (device-cuda) -// BIN: 11: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {10}, ir -// BIN: 12: backend, {11}, assembler, (host-cuda) -// BIN: 13: assembler, {12}, object, (host-cuda) -// BIN: 14: linker, {13}, image, (host-cuda) +// BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) +// BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) +// BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) +// BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, (device-cuda, sm_30) +// BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) +// BIN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) +// BIN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) +// BIN-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object +// BIN-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler +// BIN-DAG: [[P10:[0-9]+]]: linker, {[[P8]], [[P9]]}, cuda-fatbin, (device-cuda) +// BIN-DAG: [[P11:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P10]]}, ir +// BIN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-cuda) +// BIN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-cuda) +// BIN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-cuda) // // Test single gpu architecture up to the assemble phase. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM %s -// ASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) -// ASM: 2: compiler, {1}, ir, (device-cuda, sm_30) -// ASM: 3: backend, {2}, assembler, (device-cuda, sm_30) -// ASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler -// ASM: 5: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM: 6: preprocessor, {5}, cuda-cpp-output, (host-cuda) -// ASM: 7: compiler, {6}, ir, (host-cuda) -// ASM: 8: backend, {7}, assembler, (host-cuda) +// ASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) +// ASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) +// ASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) +// ASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) +// ASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// ASM-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) +// ASM-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (host-cuda) +// ASM-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (host-cuda) +// ASM-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (host-cuda) // // Test two gpu architectures with complete compilation. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ // RUN: | FileCheck -check-prefix=BIN2 %s -// BIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// BIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) -// BIN2: 2: compiler, {1}, ir, (host-cuda) -// BIN2: 3: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// BIN2: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30) -// BIN2: 5: compiler, {4}, ir, (device-cuda, sm_30) -// BIN2: 6: backend, {5}, assembler, (device-cuda, sm_30) -// BIN2: 7: assembler, {6}, object, (device-cuda, sm_30) -// BIN2: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object -// BIN2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler -// BIN2: 10: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// BIN2: 11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_35) -// BIN2: 12: compiler, {11}, ir, (device-cuda, sm_35) -// BIN2: 13: backend, {12}, assembler, (device-cuda, sm_35) -// BIN2: 14: assembler, {13}, object, (device-cuda, sm_35) -// BIN2: 15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {14}, object -// BIN2: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {13}, assembler -// BIN2: 17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda) -// BIN2: 18: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir -// BIN2: 19: backend, {18}, assembler, (host-cuda) -// BIN2: 20: assembler, {19}, object, (host-cuda) -// BIN2: 21: linker, {20}, image, (host-cuda) +// BIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) +// BIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// BIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) +// BIN2-DAG: [[P3:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) +// BIN2-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, cuda-cpp-output, 
(device-cuda, sm_30) +// BIN2-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-cuda, sm_30) +// BIN2-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-cuda, sm_30) +// BIN2-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-cuda, sm_30) +// BIN2-DAG: [[P8:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P7]]}, object +// BIN2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P6]]}, assembler +// BIN2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) +// BIN2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (device-cuda, sm_35) +// BIN2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (device-cuda, sm_35) +// BIN2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (device-cuda, sm_35) +// BIN2-DAG: [[P14:[0-9]+]]: assembler, {[[P13]]}, object, (device-cuda, sm_35) +// BIN2-DAG: [[P15:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P14]]}, object +// BIN2-DAG: [[P16:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P13]]}, assembler +// BIN2-DAG: [[P17:[0-9]+]]: linker, {[[P8]], [[P9]], [[P15]], [[P16]]}, cuda-fatbin, (device-cuda) +// BIN2-DAG: [[P18:[0-9]+]]: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {[[P2]]}, "device-cuda (nvptx64-nvidia-cuda)" {[[P17]]}, ir +// BIN2-DAG: [[P19:[0-9]+]]: backend, {[[P18]]}, assembler, (host-cuda) +// BIN2-DAG: [[P20:[0-9]+]]: assembler, {[[P19]]}, object, (host-cuda) +// BIN2-DAG: [[P21:[0-9]+]]: linker, {[[P20]]}, image, (host-cuda) // // Test two gpu architecturess up to the assemble phase. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \ // RUN: | FileCheck -check-prefix=ASM2 %s -// ASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// ASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) -// ASM2: 2: compiler, {1}, ir, (device-cuda, sm_30) -// ASM2: 3: backend, {2}, assembler, (device-cuda, sm_30) -// ASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler -// ASM2: 5: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// ASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35) -// ASM2: 7: compiler, {6}, ir, (device-cuda, sm_35) -// ASM2: 8: backend, {7}, assembler, (device-cuda, sm_35) -// ASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler -// ASM2: 10: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// ASM2: 11: preprocessor, {10}, cuda-cpp-output, (host-cuda) -// ASM2: 12: compiler, {11}, ir, (host-cuda) -// ASM2: 13: backend, {12}, assembler, (host-cuda) +// ASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) +// ASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) +// ASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) +// ASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) +// ASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// ASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) +// ASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) +// ASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) +// ASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) +// ASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler +// ASM2-DAG: [[P10:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) 
+// ASM2-DAG: [[P11:[0-9]+]]: preprocessor, {[[P10]]}, cuda-cpp-output, (host-cuda) +// ASM2-DAG: [[P12:[0-9]+]]: compiler, {[[P11]]}, ir, (host-cuda) +// ASM2-DAG: [[P13:[0-9]+]]: backend, {[[P12]]}, assembler, (host-cuda) // // Test single gpu architecture with complete compilation in host-only @@ -98,25 +98,22 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN %s -// HBIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) -// HBIN: 2: compiler, {1}, ir, (host-cuda) -// HBIN: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir -// HBIN: 4: backend, {3}, assembler, (host-cuda) -// HBIN: 5: assembler, {4}, object, (host-cuda) -// HBIN: 6: linker, {5}, image, (host-cuda) - +// HBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) +// HBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// HBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) +// HBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) +// HBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) +// HBIN-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda) // // Test single gpu architecture up to the assemble phase in host-only // compilation mode. 
// // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM %s -// HASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) -// HASM: 2: compiler, {1}, ir, (host-cuda) -// HASM: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir -// HASM: 4: backend, {3}, assembler, (host-cuda) +// HASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) +// HASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// HASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) +// HASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) // // Test two gpu architectures with complete compilation in host-only @@ -124,13 +121,12 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \ // RUN: | FileCheck -check-prefix=HBIN2 %s -// HBIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HBIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) -// HBIN2: 2: compiler, {1}, ir, (host-cuda) -// HBIN2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir -// HBIN2: 4: backend, {3}, assembler, (host-cuda) -// HBIN2: 5: assembler, {4}, object, (host-cuda) -// HBIN2: 6: linker, {5}, image, (host-cuda) +// HBIN2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) +// HBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// HBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) +// HBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) +// HBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (host-cuda) +// HBIN2-DAG: [[P5:[0-9]+]]: linker, {[[P4]]}, image, (host-cuda) // // Test two gpu architectures up to the assemble phase in host-only @@ -138,11 +134,10 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu 
-ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=HASM2 %s -// HASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) -// HASM2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda) -// HASM2: 2: compiler, {1}, ir, (host-cuda) -// HASM2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir -// HASM2: 4: backend, {3}, assembler, (host-cuda) +// HASM2-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (host-cuda) +// HASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (host-cuda) +// HASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-cuda) +// HASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (host-cuda) // // Test single gpu architecture with complete compilation in device-only @@ -150,12 +145,12 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN %s -// DBIN: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN: 2: compiler, {1}, ir, (device-cuda, sm_30) -// DBIN: 3: backend, {2}, assembler, (device-cuda, sm_30) -// DBIN: 4: assembler, {3}, object, (device-cuda, sm_30) -// DBIN: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object +// DBIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) +// DBIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) +// DBIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) +// DBIN-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) +// DBIN-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) +// DBIN-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object // // Test single gpu architecture up to the assemble phase in device-only @@ -163,11 +158,11 @@ // // RUN: %clang 
-target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM %s -// DASM: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) -// DASM: 2: compiler, {1}, ir, (device-cuda, sm_30) -// DASM: 3: backend, {2}, assembler, (device-cuda, sm_30) -// DASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler +// DASM-DAG: [[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) +// DASM-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) +// DASM-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) +// DASM-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) +// DASM-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler // // Test two gpu architectures with complete compilation in device-only @@ -175,18 +170,18 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \ // RUN: | FileCheck -check-prefix=DBIN2 %s -// DBIN2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DBIN2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) -// DBIN2: 2: compiler, {1}, ir, (device-cuda, sm_30) -// DBIN2: 3: backend, {2}, assembler, (device-cuda, sm_30) -// DBIN2: 4: assembler, {3}, object, (device-cuda, sm_30) -// DBIN2: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object -// DBIN2: 6: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DBIN2: 7: preprocessor, {6}, cuda-cpp-output, (device-cuda, sm_35) -// DBIN2: 8: compiler, {7}, ir, (device-cuda, sm_35) -// DBIN2: 9: backend, {8}, assembler, (device-cuda, sm_35) -// DBIN2: 10: assembler, {9}, object, (device-cuda, sm_35) -// DBIN2: 11: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {10}, object +// DBIN2-DAG: 
[[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) +// DBIN2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) +// DBIN2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) +// DBIN2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) +// DBIN2-DAG: [[P4:[0-9]+]]: assembler, {[[P3]]}, object, (device-cuda, sm_30) +// DBIN2-DAG: [[P5:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P4]]}, object +// DBIN2-DAG: [[P6:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) +// DBIN2-DAG: [[P7:[0-9]+]]: preprocessor, {[[P6]]}, cuda-cpp-output, (device-cuda, sm_35) +// DBIN2-DAG: [[P8:[0-9]+]]: compiler, {[[P7]]}, ir, (device-cuda, sm_35) +// DBIN2-DAG: [[P9:[0-9]+]]: backend, {[[P8]]}, assembler, (device-cuda, sm_35) +// DBIN2-DAG: [[P10:[0-9]+]]: assembler, {[[P9]]}, object, (device-cuda, sm_35) +// DBIN2-DAG: [[P11:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P10]]}, object // // Test two gpu architectures up to the assemble phase in device-only @@ -194,13 +189,13 @@ // // RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \ // RUN: | FileCheck -check-prefix=DASM2 %s -// DASM2: 0: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) -// DASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30) -// DASM2: 2: compiler, {1}, ir, (device-cuda, sm_30) -// DASM2: 3: backend, {2}, assembler, (device-cuda, sm_30) -// DASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler -// DASM2: 5: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) -// DASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35) -// DASM2: 7: compiler, {6}, ir, (device-cuda, sm_35) -// DASM2: 8: backend, {7}, assembler, (device-cuda, sm_35) -// DASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler +// DASM2-DAG: 
[[P0:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_30) +// DASM2-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, cuda-cpp-output, (device-cuda, sm_30) +// DASM2-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (device-cuda, sm_30) +// DASM2-DAG: [[P3:[0-9]+]]: backend, {[[P2]]}, assembler, (device-cuda, sm_30) +// DASM2-DAG: [[P4:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {[[P3]]}, assembler +// DASM2-DAG: [[P5:[0-9]+]]: input, "{{.*}}cuda-phases.cu", cuda, (device-cuda, sm_35) +// DASM2-DAG: [[P6:[0-9]+]]: preprocessor, {[[P5]]}, cuda-cpp-output, (device-cuda, sm_35) +// DASM2-DAG: [[P7:[0-9]+]]: compiler, {[[P6]]}, ir, (device-cuda, sm_35) +// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-cuda, sm_35) +// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {[[P8]]}, assembler