diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -597,8 +597,10 @@ if (CudaGpuBinary) { // If fatbin is available from early finalization, create a string // literal containing the fat binary loaded from the given file. - FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), - "", FatbinConstantName, 8); + const unsigned HIPCodeObjectAlign = 4096; + FatBinStr = + makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", + FatbinConstantName, HIPCodeObjectAlign); } else { // If fatbin is not available, create an external symbol // __hip_fatbin in section .hip_fatbin. The external symbol is supposed diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -16,6 +16,7 @@ #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Options.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/TargetParser.h" @@ -33,6 +34,7 @@ #endif namespace { +const unsigned HIPCodeObjectAlign = 4096; static void addBCLib(const Driver &D, const ArgList &Args, ArgStringList &CmdArgs, ArgStringList LibraryPaths, @@ -108,6 +110,8 @@ // for different GPU archs. ArgStringList BundlerArgs; BundlerArgs.push_back(Args.MakeArgString("-type=o")); + BundlerArgs.push_back( + Args.MakeArgString("-bundle-align=" + Twine(HIPCodeObjectAlign))); // ToDo: Remove the dummy host binary entry which is required by // clang-offload-bundler. @@ -175,7 +179,8 @@ ObjStream << " .section .hip_fatbin,\"aMS\",@progbits,1\n"; ObjStream << " .data\n"; ObjStream << " .globl __hip_fatbin\n"; - ObjStream << " .p2align 3\n"; + ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) + << "\n"; ObjStream << "__hip_fatbin:\n"; ObjStream << " .incbin \"" << BundleFile << "\"\n"; ObjStream.flush(); diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -115,7 +115,7 @@ // ALL: @4 = private unnamed_addr constant [21 x i8] c"ext_constant_var_def\00" // * constant unnamed string with GPU binary // CUDA: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00", -// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00", +// HIPEF: @[[FATBIN:.*]] = private constant{{.*GPU binary would be here.*}}\00",{{.*}}align 4096 // HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" // CUDANORDC-SAME: section ".nv_fatbin", align 8 // CUDARDC-SAME: section "__nv_relfatbin", align 8 diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c --- a/clang/test/Driver/clang-offload-bundler.c +++ b/clang/test/Driver/clang-offload-bundler.c @@ -278,6 +278,16 @@ // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 +// +// Check -bundle-align option +// + +// RUN: clang-offload-bundler -bundle-align=4096 -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -inputs=%t.bc,%t.tgt1,%t.tgt2 -outputs=%t.bundle3.bc +// RUN: clang-offload-bundler -type=bc -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -outputs=%t.res.bc,%t.res.tgt1,%t.res.tgt2 -inputs=%t.bundle3.bc -unbundle +// RUN: diff %t.bc %t.res.bc +// RUN: diff %t.tgt1 %t.res.tgt1 +// RUN: diff %t.tgt2 %t.res.tgt2 + // Some code so that we can create a binary out of this file. int A = 0; void test_func(void) { diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -81,6 +81,7 @@ // // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-bundle-align=4096" // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_A_803]],[[IMG_DEV_A_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]" @@ -143,6 +144,7 @@ // // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-bundle-align=4096" // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV_B_803]],[[IMG_DEV_B_900]]" "-outputs=[[BUNDLE_A:.*hipfb]]" diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -8,10 +8,14 @@ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib2 \ // RUN: -fuse-ld=lld -fgpu-rdc -nogpuinc \ +// RUN: -fhip-dump-offload-linker-script \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck %s +// check code object alignment in dumped llvm-mc input +// CHECK: .p2align 12 + // emit objects for host side path // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa" @@ -87,6 +91,7 @@ // combine images generated into hip fat binary object // CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o" +// CHECK-SAME: "-bundle-align=4096" // CHECK-SAME: "-targets={{.*}},hip-amdgcn-amd-amdhsa-gfx803,hip-amdgcn-amd-amdhsa-gfx900" // CHECK-SAME: "-inputs={{.*}},[[IMG_DEV1]],[[IMG_DEV2]]" "-outputs=[[BUNDLE:.*hipfb]]" diff --git a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp --- a/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp +++ b/clang/tools/clang-offload-bundler/ClangOffloadBundler.cpp @@ -94,6 +94,11 @@ "instead of actually executing them - for testing purposes.\n"), cl::init(false), cl::cat(ClangOffloadBundlerCategory)); +static cl::opt + BundleAlignment("bundle-align", + cl::desc("Alignment of bundle for binary files"), + cl::init(1), cl::cat(ClangOffloadBundlerCategory)); + /// Magic string that marks the existence of offloading data. #define OFFLOAD_BUNDLER_MAGIC_STR "__CLANG_OFFLOAD_BUNDLE__" @@ -223,6 +228,9 @@ StringMap::iterator CurBundleInfo; StringMap::iterator NextBundleInfo; + /// Current bundle target to be written. + std::string CurWriteBundleTarget; + public: BinaryFileHandler() : FileHandler() {} @@ -337,10 +345,12 @@ unsigned Idx = 0; for (auto &T : TargetNames) { MemoryBuffer &MB = *Inputs[Idx++]; + HeaderSize = alignTo(HeaderSize, BundleAlignment); // Bundle offset. Write8byteIntegerToBuffer(OS, HeaderSize); // Size of the bundle (adds to the next bundle's offset) Write8byteIntegerToBuffer(OS, MB.getBufferSize()); + BundlesInfo[T] = BundleInfo(MB.getBufferSize(), HeaderSize); HeaderSize += MB.getBufferSize(); // Size of the triple Write8byteIntegerToBuffer(OS, T.size()); @@ -351,6 +361,7 @@ } Error WriteBundleStart(raw_fd_ostream &OS, StringRef TargetTriple) final { + CurWriteBundleTarget = TargetTriple.str(); return Error::success(); } @@ -359,6 +370,8 @@ } Error WriteBundle(raw_fd_ostream &OS, MemoryBuffer &Input) final { + auto BI = BundlesInfo[CurWriteBundleTarget]; + OS.seek(BI.Offset); OS.write(Input.getBufferStart(), Input.getBufferSize()); return Error::success(); }