diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -3065,7 +3065,7 @@
 
       // amdgcn does not support linking of object files, therefore we skip
       // backend and assemble phases to output LLVM IR. Except for generating
-      // non-relocatable device coee, where we generate fat binary for device
+      // non-relocatable device code, where we generate fat binary for device
       // code and pass to host in Backend phase.
       if (CudaDeviceActions.empty())
         return ABRT_Success;
@@ -3074,7 +3074,7 @@
               CudaDeviceActions.size() == GpuArchList.size()) &&
              "Expecting one action per GPU architecture.");
       assert(!CompileHostOnly &&
-             "Not expecting CUDA actions in host-only compilation.");
+             "Not expecting HIP actions in host-only compilation.");
 
       if (!Relocatable && CurPhase == phases::Backend && !EmitLLVM &&
           !EmitAsm) {
@@ -3203,12 +3203,16 @@
              "Linker inputs and GPU arch list sizes do not match.");
 
       ActionList Actions;
-      // Append a new link action for each device.
       unsigned I = 0;
+      // With -emit-llvm, device linking emits bitcode rather than an image.
+      const types::ID Output = Args.hasArg(options::OPT_emit_llvm)
+                                   ? types::TY_LLVM_BC
+                                   : types::TY_Image;
+
+      // Append a new link action for each device.
+      // Each entry in DeviceLinkerInputs corresponds to a GPU arch.
       for (auto &LI : DeviceLinkerInputs) {
-        // Each entry in DeviceLinkerInputs corresponds to a GPU arch.
-        auto *DeviceLinkAction =
-            C.MakeAction<LinkJobAction>(LI, types::TY_Image);
+        auto *DeviceLinkAction = C.MakeAction<LinkJobAction>(LI, Output);
         // Linking all inputs for the current GPU arch.
         // LI contains all the inputs for the linker.
         OffloadAction::DeviceDependences DeviceLinkDeps;
@@ -3220,6 +3224,12 @@
       }
       DeviceLinkerInputs.clear();
 
+      // If emitting LLVM, do not generate final host/device compilation action.
+      if (Args.hasArg(options::OPT_emit_llvm)) {
+        AL.append(Actions);
+        return;
+      }
+
       // Create a host object from all the device images by embedding them
       // in a fat binary for mixed host-device compilation. For device-only
       // compilation, creates a fat binary.
@@ -3747,7 +3757,9 @@
   phases::ID FinalPhase = getFinalPhase(Args, &FinalPhaseArg);
 
   if (FinalPhase == phases::Link) {
-    if (Args.hasArg(options::OPT_emit_llvm))
+    // -emit-llvm while linking is only supported together with --hip-link.
+    if (Args.hasArg(options::OPT_emit_llvm) &&
+        !Args.hasArg(options::OPT_hip_link))
       Diag(clang::diag::err_drv_emit_llvm_link);
     if (IsCLMode() && LTOMode != LTOK_None &&
         !Args.getLastArgValue(options::OPT_fuse_ld_EQ)
@@ -3932,7 +3944,10 @@
       // Queue linker inputs.
       if (Phase == phases::Link) {
         assert(Phase == PL.back() && "linking must be final compilation step.");
-        LinkerInputs.push_back(Current);
+        // No extra link job is needed when emitting bitcode with --hip-link.
+        if (!(C.getInputArgs().hasArg(options::OPT_hip_link) &&
+              C.getInputArgs().hasArg(options::OPT_emit_llvm)))
+          LinkerInputs.push_back(Current);
         Current = nullptr;
         break;
       }
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.h b/clang/lib/Driver/ToolChains/HIPAMD.h
--- a/clang/lib/Driver/ToolChains/HIPAMD.h
+++ b/clang/lib/Driver/ToolChains/HIPAMD.h
@@ -36,6 +36,12 @@
   void constructLldCommand(Compilation &C, const JobAction &JA,
                            const InputInfoList &Inputs, const InputInfo &Output,
                            const llvm::opt::ArgList &Args) const;
+  // Constructs an llvm-link command that links the device bitcode inputs
+  // into a single bitcode file.
+  void constructLlvmLinkCommand(Compilation &C, const JobAction &JA,
+                                const InputInfoList &Inputs,
+                                const InputInfo &Output,
+                                const llvm::opt::ArgList &Args) const;
 };
 
 } // end namespace AMDGCN
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -72,6 +72,36 @@
   return false;
 }
 
+// Links the device bitcode inputs, plus any bitcode extracted from static
+// device libraries, into a single bitcode file using llvm-link.
+void AMDGCN::Linker::constructLlvmLinkCommand(
+    Compilation &C, const JobAction &JA, const InputInfoList &Inputs,
+    const InputInfo &Output, const llvm::opt::ArgList &Args) const {
+  // Construct llvm-link command.
+  // The output from llvm-link is a bitcode file.
+  ArgStringList LlvmLinkArgs;
+
+  assert(!Inputs.empty() && "Must have at least one input.");
+
+  LlvmLinkArgs.append({"-o", Output.getFilename()});
+  for (const auto &Input : Inputs)
+    LlvmLinkArgs.push_back(Input.getFilename());
+
+  // Look for archive of bundled bitcode in arguments, and add temporary files
+  // for the extracted archive of bitcode to inputs.
+  auto TargetID = Args.getLastArgValue(options::OPT_mcpu_EQ);
+  AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, LlvmLinkArgs, "amdgcn",
+                             TargetID,
+                             /*IsBitCodeSDL=*/true,
+                             /*PostClangLink=*/false);
+
+  const char *LlvmLink =
+      Args.MakeArgString(getToolChain().GetProgramPath("llvm-link"));
+  C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+                                         LlvmLink, LlvmLinkArgs, Inputs,
+                                         Output));
+}
+
 void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
                                          const InputInfoList &Inputs,
                                          const InputInfo &Output,
@@ -135,7 +165,8 @@
 }
 
 // For amdgcn the inputs of the linker job are device bitcode and output is
-// object file. It calls llvm-link, opt, llc, then lld steps.
+// either an object file or, with -emit-llvm, bitcode linked by llvm-link.
+// For object output it calls llvm-link, opt, llc, then lld steps.
void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, @@ -151,6 +182,9 @@ return HIP::constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this); + if (JA.getType() == types::TY_LLVM_BC) + return constructLlvmLinkCommand(C, JA, Inputs, Output, Args); + return constructLldCommand(C, JA, Inputs, Output, Args); } diff --git a/clang/test/Driver/hip-link-bc-to-bc.hip b/clang/test/Driver/hip-link-bc-to-bc.hip new file mode 100644 --- /dev/null +++ b/clang/test/Driver/hip-link-bc-to-bc.hip @@ -0,0 +1,34 @@ +// REQUIRES: clang-driver, x86-registered-target, amdgpu-registered-target + +// Check that clang unbundles the two bitcodes and links via llvm-link +// RUN: touch %T/bundle1.bc +// RUN: touch %T/bundle2.bc + +// RUN: %clang -### --offload-arch=gfx906 --hip-link \ +// RUN: -emit-llvm -fgpu-rdc --cuda-device-only \ +// RUN: %T/bundle1.bc %T/bundle2.bc \ +// RUN: 2>&1 | FileCheck -check-prefix=BITCODE %s + +// BITCODE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle1.bc" "-output=[[B1HOST:.*\.bc]]" "-output=[[B1DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" +// BITCODE: "{{.*}}clang-{{.*}}" "-o" "[[B1DEV2:.*bundle1-gfx906.bc]]" "-x" "ir" "[[B1DEV1]]" + +// BITCODE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle2.bc" "-output=[[B2HOST:.*\.bc]]" "-output=[[B2DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" +// BITCODE: "{{.*}}clang-{{.*}}" "-o" "[[B2DEV2:.*bundle2-gfx906.bc]]" "-x" "ir" "[[B2DEV1]]" + +// BITCODE: "{{.*}}llvm-link" "-o" "bundle1-hip-amdgcn-amd-amdhsa-gfx906.bc" "[[B1DEV2]]" "[[B2DEV2]]" + +// Check that clang unbundles the bitcode and archive and links via llvm-link +// RUN: touch %T/libhipbundle.a +// RUN: touch %T/bundle.bc + +// RUN: %clang -### --offload-arch=gfx906 --hip-link \ +// RUN: 
-emit-llvm -fgpu-rdc --cuda-device-only \ +// RUN: %T/bundle.bc -L%T -lhipbundle \ +// RUN: 2>&1 | FileCheck -check-prefix=ARCHIVE %s + +// ARCHIVE: "{{.*}}clang-offload-bundler" "-type=bc" "-targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa-gfx906" "-input={{.*}}bundle.bc" "-output=[[HOST:.*\.bc]]" "-output=[[DEV1:.*\.bc]]" "-unbundle" "-allow-missing-bundles" +// ARCHIVE: "{{.*}}clang-{{.*}}" "-o" "[[DEV2:.*\.bc]]" "-x" "ir" "[[DEV1]]" + +// ARCHIVE: "{{.*}}clang-offload-bundler" "-unbundle" "-type=a" "-input={{.*}}libhipbundle.a" "-targets=hip-amdgcn-amd-amdhsa-gfx906" "-output=[[AR:.*\.a]]" "-allow-missing-bundles" "-hip-openmp-compatible" + +// ARCHIVE: "{{.*}}llvm-link" "-o" "bundle-hip-amdgcn-amd-amdhsa-gfx906.bc" "[[DEV2]]" "[[AR]]" diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip --- a/clang/test/Driver/hip-phases.hip +++ b/clang/test/Driver/hip-phases.hip @@ -520,3 +520,25 @@ // MIXED2-DAG: input, "{{.*}}empty.cpp", hip, (device-hip, gfx803) // MIXED2-DAG: input, "{{.*}}empty.cpp", hip, (device-hip, gfx900) // MIXED2-NEG-NOT: input, "{{.*}}empty.cpp", c++ + +// Test HIP bitcode to bitcode linking. 
Input should be bundled or unbundled bitcode, and +// output should be unbundled linked bitcode + +// RUN: touch %T/bitcodeA.bc +// RUN: touch %T/bitcodeB.bc +// RUN: %clang -ccc-print-phases --hip-link -emit-llvm --cuda-device-only \ +// RUN: --offload-arch=gfx906 %T/bitcodeA.bc %T/bitcodeB.bc 2>&1 \ +// RUN: | FileCheck -check-prefixes=CHECK %s + +// CHECK: [[A0:[0-9]+]]: input, "{{.*}}bitcodeA.bc", ir +// CHECK: [[A1:[0-9]+]]: clang-offload-unbundler, {[[A0]]}, ir +// CHECK: [[A2:[0-9]+]]: compiler, {[[A1]]}, ir, (device-hip, [[ARCH:gfx906]]) +// CHECK: [[A3:[0-9]+]]: backend, {[[A2]]}, ir, (device-hip, [[ARCH]]) + +// CHECK: [[B0:[0-9]+]]: input, "{{.*}}bitcodeB.bc", ir +// CHECK: [[B1:[0-9]+]]: clang-offload-unbundler, {[[B0]]}, ir +// CHECK: [[B2:[0-9]+]]: compiler, {[[B1]]}, ir, (device-hip, [[ARCH]]) +// CHECK: [[B3:[0-9]+]]: backend, {[[B2]]}, ir, (device-hip, [[ARCH]]) + +// CHECK: [[L0:[0-9]+]]: linker, {[[A3]], [[B3]]}, ir, (device-hip, [[ARCH]]) +// CHECK: offload, "device-hip (amdgcn-amd-amdhsa:[[ARCH]])" {[[L0]]}, ir