Add perf-based profile collection (sampling, optionally with LBR) as an
alternative to BOLT instrumentation when optimizing Clang with BOLT.

The CLANG_BOLT cache string replaces the CLANG_BOLT_INSTRUMENT boolean and
selects the profiling mechanism: Instrument, Perf or LBR. It defaults to OFF
so that BOLT stays opt-in; clang/cmake/caches/BOLT.cmake selects Instrument.

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -851,23 +851,40 @@
   endforeach()
 endif()
 
-if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED)
+# Off by default; set to Instrument, Perf or LBR to enable BOLT.
+set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
+  May be specified as Instrument or Perf or LBR to use a particular profiling \
+  mechanism.")
+string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT)
+
+if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
   set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
   set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
+  # Merged profile read by the optimization step below in every mode
+  set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata)
 
-  # Instrument clang with BOLT
-  add_custom_target(clang-instrumented
-    DEPENDS ${CLANG_INSTRUMENTED}
-  )
-  set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata)
-  add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
-    DEPENDS clang llvm-bolt
-    COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
-      -instrument --instrumentation-file-append-pid
-      --instrumentation-file=${BOLT_FDATA}
-    COMMENT "Instrumenting clang binary with BOLT"
-    VERBATIM
-  )
+  # Pass extra flag in no-LBR mode
+  if (uppercase_CLANG_BOLT STREQUAL "PERF")
+    set(BOLT_NO_LBR "-nl")
+  endif()
+
+  if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+    # Instrument clang with BOLT
+    add_custom_target(clang-instrumented
+      DEPENDS ${CLANG_INSTRUMENTED}
+    )
+    add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
+      DEPENDS clang llvm-bolt
+      COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
+        -instrument --instrumentation-file-append-pid
+        --instrumentation-file=${BOLT_FDATA}
+      COMMENT "Instrumenting clang binary with BOLT"
+      VERBATIM
+    )
+    add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented)
+  else() # perf or LBR
+    add_custom_target(clang-bolt-training-deps DEPENDS clang)
+  endif()
 
   # Optimize original (pre-bolt) Clang using the collected profile
   set(CLANG_OPTIMIZED ${CMAKE_CURRENT_BINARY_DIR}/clang.bolt)
@@ -878,6 +895,7 @@
       -data ${BOLT_FDATA}
       -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions
       -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack
+      ${BOLT_NO_LBR}
     COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} $<TARGET_FILE:clang>
     COMMENT "Optimizing Clang with BOLT"
     VERBATIM
diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake
--- a/clang/cmake/caches/BOLT.cmake
+++ b/clang/cmake/caches/BOLT.cmake
@@ -1,5 +1,5 @@
 set(CMAKE_BUILD_TYPE Release CACHE STRING "")
-set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "")
+set(CLANG_BOLT "INSTRUMENT" CACHE STRING "")
 set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
 
 set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt
--- a/clang/utils/perf-training/CMakeLists.txt
+++ b/clang/utils/perf-training/CMakeLists.txt
@@ -60,7 +60,7 @@
     DEPENDS generate-dtrace-logs)
 endif()
 
-if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED)
+if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
   configure_lit_site_cfg(
     ${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in
     ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg
@@ -69,16 +69,38 @@
   add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang"
     ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/
     EXCLUDE_FROM_CHECK_ALL
-    DEPENDS clang-instrumented clear-bolt-fdata
+    DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data
     )
 
   add_custom_target(clear-bolt-fdata
     COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata
     COMMENT "Clearing old BOLT fdata")
 
+  add_custom_target(clear-perf-data
+    COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
+    COMMENT "Clearing old perf data")
+
+  string(TOUPPER "${CLANG_BOLT}" uppercase_CLANG_BOLT)
+  # Pass extra flag in no-LBR mode
+  if (uppercase_CLANG_BOLT STREQUAL "PERF")
+    set(BOLT_NO_LBR "--nolbr")
+  endif()
+
+  add_custom_target(merge-fdata-deps)
+  if (uppercase_CLANG_BOLT STREQUAL "INSTRUMENT")
+    add_dependencies(merge-fdata-deps generate-bolt-fdata)
+  else()
+    # Convert perf profiles into fdata
+    add_custom_target(convert-perf-fdata
+      COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $<TARGET_FILE:llvm-bolt> ${CMAKE_CURRENT_BINARY_DIR} $<TARGET_FILE:clang> ${BOLT_NO_LBR}
+      COMMENT "Converting perf files to BOLT fdata"
+      DEPENDS llvm-bolt generate-bolt-fdata)
+    add_dependencies(merge-fdata-deps convert-perf-fdata)
+  endif()
+
   # Merge profiles into one using merge-fdata
   add_custom_target(merge-bolt-fdata
     COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR}
     COMMENT "Merging BOLT fdata"
-    DEPENDS merge-fdata generate-bolt-fdata)
+    DEPENDS merge-fdata merge-fdata-deps)
 endif()
diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg
--- a/clang/utils/perf-training/bolt.lit.cfg
+++ b/clang/utils/perf-training/bolt.lit.cfg
@@ -6,15 +6,25 @@
 import os
 import subprocess
 
-config.clang = os.path.realpath(lit.util.which('clang-bolt.inst', config.clang_tools_dir)).replace('\\', '/')
+clang_binary = 'clang'
+perf_wrapper = ''
+if config.clang_bolt_mode.lower() == "instrument":
+  clang_binary = 'clang-bolt.inst'
+else: # perf or LBR
+  perf_wrapper = '%s %s/perf-helper.py perf' % (config.python_exe, config.perf_helper_dir)
+  if config.clang_bolt_mode.lower() == "lbr":
+    perf_wrapper += " --lbr"
+  perf_wrapper += " -- "
+
+config.clang = os.path.realpath(lit.util.which(clang_binary, config.clang_tools_dir)).replace('\\', '/')
 
 config.name = 'Clang Perf Training'
 config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test']
 
 use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL")
 config.test_format = lit.formats.ShTest(use_lit_shell == "0")
-config.substitutions.append( ('%clang_cpp_skip_driver', ' %s --driver-mode=g++ ' % (config.clang)))
-config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ ' % (config.clang)))
-config.substitutions.append( ('%clang_skip_driver', ' %s ' % (config.clang)))
-config.substitutions.append( ('%clang', ' %s ' % (config.clang) ) )
+config.substitutions.append( ('%clang_cpp_skip_driver', ' %s %s --driver-mode=g++ ' % (perf_wrapper, config.clang)))
+config.substitutions.append( ('%clang_cpp', ' %s %s --driver-mode=g++ ' % (perf_wrapper, config.clang)))
+config.substitutions.append( ('%clang_skip_driver', ' %s %s ' % (perf_wrapper, config.clang)))
+config.substitutions.append( ('%clang', ' %s %s ' % (perf_wrapper, config.clang) ) )
 config.substitutions.append( ('%test_root', config.test_exec_root ) )
diff --git a/clang/utils/perf-training/bolt.lit.site.cfg.in b/clang/utils/perf-training/bolt.lit.site.cfg.in
--- a/clang/utils/perf-training/bolt.lit.site.cfg.in
+++ b/clang/utils/perf-training/bolt.lit.site.cfg.in
@@ -9,6 +9,7 @@
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.python_exe = "@Python3_EXECUTABLE@"
 config.clang_obj_root = path(r"@CLANG_BINARY_DIR@")
+config.clang_bolt_mode = "@CLANG_BOLT@"
 
 # Let the main config do the real work.
 lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg")
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -56,6 +56,69 @@
   subprocess.check_call(cmd)
   return 0
 
+def perf(args):
+  """Run a command under 'perf record' to collect a profile for BOLT.
+
+  args: leading '--' options ('--lbr' is the only one recognized), then the
+  command line to profile. Samples are written to '<pid>.perf.data', named
+  after this helper's process id. Returns 0; a failing perf invocation
+  raises subprocess.CalledProcessError.
+  """
+  parser = argparse.ArgumentParser(prog='perf-helper perf',
+    description='perf wrapper for BOLT profile collection')
+  parser.add_argument('--lbr', required=False, action='store_true',
+    help='Use perf with branch stacks')
+  parser.add_argument('cmd', nargs='*', help='')
+
+  # Use python's arg parser to handle all leading option arguments, but pass
+  # everything else through to perf
+  first_cmd = next((arg for arg in args if not arg.startswith("--")), None)
+  if first_cmd is None:
+    parser.error('no command to profile was given')
+  last_arg_idx = args.index(first_cmd)
+
+  opts = parser.parse_args(args[:last_arg_idx])
+  cmd = args[last_arg_idx:]
+
+  perf_args = [
+    'perf', 'record', '--event=cycles:u', '--freq=max',
+    '--output=%d.perf.data' % os.getpid()]
+  if opts.lbr:
+    perf_args += ['--branch-filter=any,u']
+  perf_args.extend(cmd)
+
+  start_time = time.time()
+  subprocess.check_call(perf_args)
+
+  elapsed = time.time() - start_time
+  print("... data collection took %.4fs" % elapsed)
+  return 0
+
+def perf2bolt(args):
+  """Convert perf.data files into BOLT fdata profiles, one per input file.
+
+  args: path to llvm-bolt, path to search, input binary, and optionally
+  '--nolbr'. Each file returned by findFilesWithExtension(path, 'perf.data')
+  is aggregated into '<file>.fdata'. Returns 0; a failing conversion raises
+  subprocess.CalledProcessError.
+  """
+  parser = argparse.ArgumentParser(prog='perf-helper perf2bolt',
+    description='perf2bolt conversion wrapper for perf.data files')
+  parser.add_argument('p2b_path', help='Path to llvm-bolt')
+  parser.add_argument('path', help='Path containing perf.data files')
+  parser.add_argument('binary', help='Input binary')
+  parser.add_argument('--nolbr', required=False, action='store_true',
+    help='Use -nl perf2bolt mode')
+  opts = parser.parse_args(args)
+
+  p2b_args = [opts.p2b_path, opts.binary, '--aggregate-only',
+    '--profile-format=yaml']
+  if opts.nolbr:
+    p2b_args += ['-nl']
+  p2b_args += ['-p']
+  for filename in findFilesWithExtension(opts.path, 'perf.data'):
+    subprocess.check_call(p2b_args + [filename, '-o', filename+'.fdata'])
+  return 0
+
 def dtrace(args):
   parser = argparse.ArgumentParser(prog='perf-helper dtrace',
     description='dtrace wrapper for order file generation')
@@ -410,6 +473,8 @@
              'cc1' : cc1,
              'gen-order-file' : genOrderFile,
              'merge-fdata' : merge_fdata,
+             'perf' : perf,
+             'perf2bolt' : perf2bolt,
              }
 
 def main():