Rework the BOLT automation for Clang: the boolean CLANG_BOLT_INSTRUMENT switch
is replaced by a CLANG_BOLT string option (Instrument, perf or LBR, compared
case-insensitively), the CLANG_BOLT_INSTRUMENT_* knobs are renamed to
CLANG_BOLT_*, and the new profiling modes are documented in AdvancedBuilds.rst.

Review notes:
- bolt-clang-clear is defined in the first hunk, but nothing in this patch
  depends on it; confirm whether bolt-profile is meant to.
- A CLANG_BOLT value other than Instrument/perf/LBR takes the perf path without
  --branch-filter and without -nl; consider rejecting it at configure time.

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -869,67 +869,106 @@
   endforeach()
 endif()
 
-if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED)
+if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
   set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
   set(CLANGXX_PATH ${CLANG_PATH}++)
-  set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
-  set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst)
   set(CLANG_OPTIMIZED ${CLANG_PATH}-bolt)
   set(CLANGXX_OPTIMIZED ${CLANGXX_PATH}-bolt)
 
-  # Instrument clang with BOLT
-  add_custom_target(clang-instrumented
-    DEPENDS ${CLANG_INSTRUMENTED}
-  )
-  add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
-    DEPENDS clang llvm-bolt
-    COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
-      -instrument --instrumentation-file-append-pid
-      --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
-    COMMENT "Instrumenting clang binary with BOLT"
-    VERBATIM
-  )
+  string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT)
+  if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+    set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
+    set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst)
 
-  # Make a symlink from clang-bolt.inst to clang++-bolt.inst
-  add_custom_target(clang++-instrumented
-    DEPENDS ${CLANGXX_INSTRUMENTED}
-  )
-  add_custom_command(OUTPUT ${CLANGXX_INSTRUMENTED}
-    DEPENDS clang-instrumented
-    COMMAND ${CMAKE_COMMAND} -E create_symlink
-      ${CLANG_INSTRUMENTED}
-      ${CLANGXX_INSTRUMENTED}
-    COMMENT "Creating symlink from BOLT instrumented clang to clang++"
-    VERBATIM
-  )
+    # Instrument clang with BOLT and symlink clang++-bolt.inst to the result
+    add_custom_target(clang-instrumented
+      DEPENDS ${CLANG_INSTRUMENTED}
+    )
+    add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
+      DEPENDS clang llvm-bolt
+      COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
+        -instrument --instrumentation-file-append-pid
+        --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+      COMMAND ${CMAKE_COMMAND} -E create_symlink
+        ${CLANG_INSTRUMENTED}
+        ${CLANGXX_INSTRUMENTED}
+      COMMENT "Instrumenting clang binary with BOLT"
+      VERBATIM
+    )
+  endif()
+
+  # Set variables for profile collection step
+  if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+    set(CLANG_BOLT_CC ${CLANG_INSTRUMENTED})
+    set(CLANG_BOLT_CXX ${CLANGXX_INSTRUMENTED})
+  else()
+    set(CLANG_BOLT_CC ${CLANG_PATH})
+    set(CLANG_BOLT_CXX ${CLANGXX_PATH})
+
+    # Perf sampling:
+    # - use maximum frequency to reduce training time
+    # - use cycle events instead of branches - empirically found to produce
+    #   better results
+    # - if available, enable taken branch stack/LBR sampling
+    #   (-j/--branch-filter)
+    set(PERF_CMDLINE
+      perf record --event=cycles:u
+      --output=${CMAKE_CURRENT_BINARY_DIR}/prof.data
+      --freq=max
+    )
+    if (uppercase_CLANG_BOLT STREQUAL "LBR")
+      list(APPEND PERF_CMDLINE --branch-filter=any,u)
+    endif()
+    list(APPEND PERF_CMDLINE --)
+  endif()
+
+  # Build specified targets to collect the profile
+  add_custom_target(bolt-profile-deps)
+  set(CLANG_BOLT_PROFILE ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata)
+  if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+    add_dependencies(bolt-profile-deps clang-instrumented)
+  else()
+    add_dependencies(bolt-profile-deps clang)
+  endif()
+  set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-stamps/)
+  set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-bins/)
+  add_custom_target(bolt-clang-clear
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-cleared
+  )
+  add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/bolt-clang-cleared
+    DEPENDS bolt-profile-deps
+    COMMAND ${CMAKE_COMMAND} -E remove_directory ${BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E remove_directory ${STAMP_DIR}
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${STAMP_DIR}
+    COMMENT "Clobbering bolt-clang build and stamp directories"
+  )
 
-  # Build specified targets with instrumented Clang to collect the profile
-  set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-stamps/)
-  set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-bins/)
   set(build_configuration "$<CONFIG>")
   include(ExternalProject)
-  ExternalProject_Add(bolt-instrumentation-profile
-    DEPENDS clang++-instrumented
-    PREFIX bolt-instrumentation-profile
+  ExternalProject_Add(bolt-profile
+    DEPENDS bolt-profile-deps
+    PREFIX bolt-profile
     SOURCE_DIR ${CMAKE_SOURCE_DIR}
     STAMP_DIR ${STAMP_DIR}
     BINARY_DIR ${BINARY_DIR}
     EXCLUDE_FROM_ALL 1
     CMAKE_ARGS
-      ${CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS}
+      ${CLANG_BOLT_EXTRA_CMAKE_FLAGS}
       # We shouldn't need to set this here, but INSTALL_DIR doesn't
       # seem to work, so instead I'm passing this through
       -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
-      -DCMAKE_C_COMPILER=${CLANG_INSTRUMENTED}
-      -DCMAKE_CXX_COMPILER=${CLANGXX_INSTRUMENTED}
-      -DCMAKE_ASM_COMPILER=${CLANG_INSTRUMENTED}
+      -DCMAKE_C_COMPILER=${CLANG_BOLT_CC}
+      -DCMAKE_CXX_COMPILER=${CLANG_BOLT_CXX}
+      -DCMAKE_ASM_COMPILER=${CLANG_BOLT_CC}
       -DCMAKE_ASM_COMPILER_ID=Clang
-      -DCMAKE_BUILD_TYPE=Release
-      -DLLVM_ENABLE_PROJECTS=${CLANG_BOLT_INSTRUMENT_PROJECTS}
+      -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
+      -DLLVM_ENABLE_PROJECTS=${CLANG_BOLT_PROJECTS}
       -DLLVM_TARGETS_TO_BUILD=${LLVM_TARGETS_TO_BUILD}
-    BUILD_COMMAND ${CMAKE_COMMAND} --build ${BINARY_DIR}
+    BUILD_COMMAND ${PERF_CMDLINE} ${CMAKE_COMMAND} --build ${BINARY_DIR}
       --config ${build_configuration}
-      --target ${CLANG_BOLT_INSTRUMENT_TARGETS}
+      --target ${CLANG_BOLT_TARGETS}
     INSTALL_COMMAND ""
     STEP_TARGETS configure build
     USES_TERMINAL_CONFIGURE 1
@@ -937,20 +976,37 @@
     USES_TERMINAL_INSTALL 1
   )
 
-  # Merge profiles into one using merge-fdata
+  # Plain perf mode records no branch stacks: pass -nl (no-LBR) to llvm-bolt
+  if (uppercase_CLANG_BOLT STREQUAL "PERF")
+    set(BOLT_NO_LBR "-nl")
+  endif()
   add_custom_target(clang-bolt-profile
-    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
-  )
-  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
-    DEPENDS merge-fdata bolt-instrumentation-profile-build
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-    COMMAND ${Python3_EXECUTABLE}
-      ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata
-      $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
-      ${CMAKE_CURRENT_BINARY_DIR}
-    COMMENT "Preparing BOLT profile"
-    VERBATIM
+    DEPENDS ${CLANG_BOLT_PROFILE}
   )
+  if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+    # Merge profiles into one using merge-fdata
+    add_custom_command(OUTPUT ${CLANG_BOLT_PROFILE}
+      DEPENDS bolt-profile-build merge-fdata
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      COMMAND ${Python3_EXECUTABLE}
+        ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata
+        $<TARGET_FILE:merge-fdata> prof.fdata .
+      COMMENT "Preparing BOLT profile"
+      VERBATIM
+    )
+  else() # perf with or without LBR
+    # perf profile is produced by running the build; convert it to fdata with llvm-bolt --aggregate-only
+    add_custom_command(OUTPUT ${CLANG_BOLT_PROFILE}
+      DEPENDS bolt-profile-build llvm-bolt
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      COMMAND llvm-bolt --aggregate-only ${CLANG_PATH}
+        -o ${CLANG_BOLT_PROFILE}
+        -p ${CMAKE_CURRENT_BINARY_DIR}/prof.data
+        ${BOLT_NO_LBR}
+      COMMENT "Converting perf profile to fdata"
+      VERBATIM
+    )
+  endif()
 
   # Optimize original (pre-bolt) Clang using the collected profile
   add_custom_target(clang-bolt
@@ -960,9 +1016,10 @@
     DEPENDS clang-bolt-profile
     COMMAND llvm-bolt ${CLANG_PATH}
       -o ${CLANG_OPTIMIZED}
-      -data ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+      -data ${CLANG_BOLT_PROFILE}
       -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions
-      -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack
+      -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack -plt=hot
+      ${BOLT_NO_LBR}
     COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} ${CLANG_PATH}-${CLANG_VERSION_MAJOR}
     COMMENT "Optimizing Clang with BOLT"
     VERBATIM
diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake
--- a/clang/cmake/caches/BOLT.cmake
+++ b/clang/cmake/caches/BOLT.cmake
@@ -1,15 +1,17 @@
 set(CMAKE_BUILD_TYPE Release CACHE STRING "")
-set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "")
-set(CLANG_BOLT_INSTRUMENT_PROJECTS "llvm" CACHE STRING "")
-set(CLANG_BOLT_INSTRUMENT_TARGETS "count" CACHE STRING "")
+set(CLANG_BOLT "INSTRUMENT" CACHE STRING "Apply BOLT optimization to Clang. \
+  May be specified as Instrument or Perf or LBR to use a particular profiling \
+  mechanism.")
+
+set(CLANG_BOLT_PROJECTS "llvm" CACHE STRING "")
+string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT)
+if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+  set(CLANG_BOLT_TARGETS "count" CACHE STRING "")
+else()
+  set(CLANG_BOLT_TARGETS "FileCheck" CACHE STRING "")
+endif()
 set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
-set(CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS "" CACHE STRING "")
+set(CLANG_BOLT_EXTRA_CMAKE_FLAGS "" CACHE STRING "")
 
 set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
 set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
-
-# Disable function splitting enabled by default in GCC8+
-if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-reorder-blocks-and-partition")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-reorder-blocks-and-partition")
-endif()
diff --git a/llvm/docs/AdvancedBuilds.rst b/llvm/docs/AdvancedBuilds.rst
--- a/llvm/docs/AdvancedBuilds.rst
+++ b/llvm/docs/AdvancedBuilds.rst
@@ -241,6 +241,62 @@
 
   $ ninja stage2-clang-bolt
 
+BOLT profile
+------------
+BOLT uses the profile collected by either Linux ``perf`` or via BOLT's own
+instrumentation. Both modes are supported by CMake automation, with
+instrumentation being the default (``-DCLANG_BOLT=INSTRUMENT``).
+
+It's strongly recommended to use ``perf`` if the host system supports it as it
+is a significantly faster and potentially more reliable method:
+
+.. code-block:: console
+
+  $ cmake <...> -DCLANG_BOLT=perf \
+      -C <path to source>/clang/cmake/caches/BOLT.cmake
+
+If the host system supports profiling branch stacks (e.g. AMD or Intel LBR
+(Last Branch Record), Armv9-A BRBE (Branch Record Buffer Extension)), it can be
+enabled with ``-DCLANG_BOLT=LBR`` to further improve the profile quality:
+
+.. code-block:: console
+
+  $ cmake <...> -DCLANG_BOLT=LBR \
+      -C <path to source>/clang/cmake/caches/BOLT.cmake
+
+The following matrix describes supported profiling methods. Note that Linux/ELF
+is the only supported platform.
+
+============ ============== ======== ============
+Architecture ``-DCLANG_BOLT`` value
+------------ ------------------------------------
+\            ``Instrument`` ``perf`` ``LBR``
+============ ============== ======== ============
+x86_64       Yes            Yes      Yes
+AArch64      No             Yes      No HW exists
+============ ============== ======== ============
+
+Profiling variables
+-------------------
+BOLT profile is collected from building one of in-tree projects/targets with
+Clang as a workload. The following configuration options can be used to change
+the profiling build and profiling mechanism:
+
+**CLANG_BOLT**
+  Profiling mechanism to be used. Supported values: ``Instrument`` (default),
+  ``perf`` (requires OS support), ``LBR`` (requires hardware support).
+
+**CLANG_BOLT_PROJECTS**
+  Projects to enable in profiling build. Defaults to ``llvm``.
+
+**CLANG_BOLT_TARGETS**
+  Targets to build in profiling build. Defaults to ``count`` in instrumentation
+  build and ``FileCheck`` in perf-build.
+
+**CLANG_BOLT_EXTRA_CMAKE_FLAGS**
+  Extra CMake flags to pass to profiling build at configuration time.
+
+
 3-Stage Non-Determinism
 =======================
 