Index: packages/Python/lldbsuite/pre_kill_hook/README.md =================================================================== --- /dev/null +++ packages/Python/lldbsuite/pre_kill_hook/README.md @@ -0,0 +1,59 @@ +# pre\_kill\_hook package + +## Overview + +The pre\_kill\_hook package provides a per-platform method for running code +after a test process times out but before the concurrent test runner kills the +timed-out process. + +## Detailed Description of Usage + +If a platform defines the hook, then the hook gets called right after a timeout +is detected in a test run, but before the process is killed. + +The pre-kill-hook mechanism works as follows: + +* When a timeout is detected in the process\_control.ProcessDriver class that + runs the per-test lldb process, a new overridable on\_timeout\_pre\_kill() method + is called on the ProcessDriver instance. + +* The concurrent test driver's derived ProcessDriver overrides this method. It + looks to see if a module called + "lldbsuite.pre\_kill\_hook.{platform-system-name}" exists, where + platform-system-name is replaced with platform.system().lower() (e.g. + "Darwin" becomes the darwin.py module). + + * If that module doesn't exist, the rest of the new behavior is skipped. + + * If that module does exist, it is loaded, and the method + "do\_pre\_kill(process\_id, context\_dict, output\_stream)" is called. If + that method throws an exception, we log it and we ignore further processing + of the pre-killed process. + + * The process\_id argument of the do\_pre\_kill function is the process id as + returned by the ProcessDriver.pid property. + + * The context\_dict argument of the do\_pre\_kill function is a dictionary + describing the architectures and platform used for the test run. Its keys + are archs, platform\_name, platform\_url and platform\_working\_dir. + + * The output\_stream argument of the do\_pre\_kill function takes a file-like + object. Output to be collected from doing any processing on the + process-to-be-killed should be written into the file-like object. The + current impl uses a six.StringIO and then writes this output to + {TestFilename}-{pid}.sample in the session directory. 
+ +* Platforms where platform.system() is "Darwin" will get a pre-kill action that + runs the 'sample' program on the lldb that has timed out. That data will be + collected on CI and analyzed to determine what is happening during timeouts. + (This has an advantage over a core in that it is much smaller and that it + clearly demonstrates any liveness of the process, if there is any). + +## Running the tests + +To run the tests in the pre\_kill\_hook package, open a console, change into +this directory and run the following: + +``` +python -m unittest discover +``` Index: packages/Python/lldbsuite/pre_kill_hook/__init__.py =================================================================== --- /dev/null +++ packages/Python/lldbsuite/pre_kill_hook/__init__.py @@ -0,0 +1 @@ +"""Initialize the package.""" Index: packages/Python/lldbsuite/pre_kill_hook/darwin.py =================================================================== --- /dev/null +++ packages/Python/lldbsuite/pre_kill_hook/darwin.py @@ -0,0 +1,48 @@ +"""Provides a pre-kill method to run on macOS.""" +from __future__ import print_function + +# system imports +import subprocess +import sys + +# third-party module imports +import six + + +def do_pre_kill(process_id, runner_context, output_stream, sample_time=3): + """Samples the given process id, and puts the output to output_stream. + + @param process_id the local process to sample. + + @param runner_context a dictionary of details about the architectures + and platform on which the given process is running. Expected keys are + archs (array of architectures), platform_name, platform_url, and + platform_working_dir. + + @param output_stream file-like object that should be used to write the + results of sampling. + + @param sample_time specifies the time in seconds that should be captured. + """ + + # Validate args. 
+ if runner_context is None: + raise Exception("runner_context argument is required") + if not isinstance(runner_context, dict): + raise Exception("runner_context argument must be a dictionary") + + # We will try to run sample on the local host only if there is no URL + # to a remote. + if "platform_url" in runner_context and ( + runner_context["platform_url"] is not None): + import pprint + sys.stderr.write( + "warning: skipping timeout pre-kill sample invocation because we " + "don't know how to run on a remote yet. runner_context={}\n" + .format(pprint.pformat(runner_context))) + # Honor the warning above: nothing is sampled for a remote platform. + return + + output = subprocess.check_output(['sample', six.text_type(process_id), + str(sample_time)]) + output_stream.write(output) Index: packages/Python/lldbsuite/pre_kill_hook/tests/test_darwin.py =================================================================== --- /dev/null +++ packages/Python/lldbsuite/pre_kill_hook/tests/test_darwin.py @@ -0,0 +1,107 @@ +"""Test the pre-kill hook on Darwin.""" +from __future__ import print_function + +# system imports +from multiprocessing import Process, Queue +import platform +import re +from unittest import main, TestCase + +# third party +from six import StringIO + + +def do_child_process(child_work_queue, parent_work_queue, verbose): + import os + + pid = os.getpid() + if verbose: + print("child: pid {} started, sending to parent".format(pid)) + parent_work_queue.put(pid) + if verbose: + print("child: waiting for shut-down request from parent") + child_work_queue.get() + if verbose: + print("child: received shut-down request. 
Child exiting.") + + +class DarwinPreKillTestCase(TestCase): + + def __init__(self, methodName): + super(DarwinPreKillTestCase, self).__init__(methodName) + self.process = None + self.child_work_queue = None + self.verbose = False + + def tearDown(self): + if self.verbose: + print("parent: sending shut-down request to child") + if self.process: + self.child_work_queue.put("hello, child") + self.process.join() + if self.verbose: + print("parent: child is fully shut down") + + def test_sample(self): + # Ensure we're Darwin. + if platform.system() != 'Darwin': + self.skipTest("requires a Darwin-based OS") + + # Start the child process. + self.child_work_queue = Queue() + parent_work_queue = Queue() + self.process = Process(target=do_child_process, + args=(self.child_work_queue, parent_work_queue, + self.verbose)) + if self.verbose: + print("parent: starting child") + self.process.start() + + # Wait for the child to report its pid. Then we know we're running. + if self.verbose: + print("parent: waiting for child to start") + child_pid = parent_work_queue.get() + + # Sample the child process. + from darwin import do_pre_kill + context_dict = { + "archs": [platform.machine()], + "platform_name": None, + "platform_url": None, + "platform_working_dir": None + } + + if self.verbose: + print("parent: running pre-kill action on child") + output_io = StringIO() + do_pre_kill(child_pid, context_dict, output_io) + output = output_io.getvalue() + + if self.verbose: + print("parent: do_pre_kill() wrote the following output:", output) + self.assertIsNotNone(output) + + # We should have a line with: + # Process: .* [{pid}] + process_re = re.compile(r"Process:[^[]+\[([^]]+)\]") + match = process_re.search(output) + self.assertIsNotNone(match, "should have found process id for " + "sampled process") + self.assertEqual(1, len(match.groups())) + self.assertEqual(child_pid, int(match.group(1))) + + # We should see a Call graph: section. 
+ callgraph_re = re.compile(r"Call graph:") + match = callgraph_re.search(output) + self.assertIsNotNone(match, "should have found the Call graph section " + "in sample output") + + # We should see a Binary Images: section. + binary_images_re = re.compile(r"Binary Images:") + match = binary_images_re.search(output) + self.assertIsNotNone(match, "should have found the Binary Images " + "section in sample output") + + +if __name__ == "__main__": + main() Index: packages/Python/lldbsuite/test/dosep.py =================================================================== --- packages/Python/lldbsuite/test/dosep.py +++ packages/Python/lldbsuite/test/dosep.py @@ -46,6 +46,7 @@ import sys import threading +from six import StringIO from six.moves import queue # Our packages and modules @@ -64,6 +65,8 @@ # Status codes for running command with timeout. eTimedOut, ePassed, eFailed = 124, 0, 1 +g_session_dir = None +g_runner_context = None output_lock = None test_counter = None total_tests = None @@ -227,6 +230,39 @@ failures, unexpected_successes) + def on_timeout_pre_kill(self): + # We're just about to have a timeout take effect. Here's our chance + # to do a pre-kill action. + + # For now, we look to see if the lldbsuite.pre_kill_hook package has a + # module for our platform. + module_name = "lldbsuite.pre_kill_hook." + platform.system().lower() + import importlib + try: + module = importlib.import_module(module_name) + except ImportError: + # We don't have one for this platform. Skip. + sys.stderr.write("\nwarning: no timeout handler module: " + + module_name + "\n") + return + + # Try to run the pre-kill-hook method. + try: + # Run the pre-kill command. + output_io = StringIO() + module.do_pre_kill(self.pid, g_runner_context, output_io) + + # Write the output to a filename associated with the test file and + # pid. 
+ basename = "{}-{}.sample".format(self.file_name, self.pid) + sample_path = os.path.join(g_session_dir, basename) + with open(sample_path, "w") as output_file: + output_file.write(output_io.getvalue()) + except Exception as e: + sys.stderr.write("caught exception while running " + "pre-kill action: {}\n".format(e)) + return + def is_exceptional_exit(self): """Returns whether the process returned a timeout. @@ -635,12 +671,16 @@ found_func(root, tests) -def initialize_global_vars_common(num_threads, test_work_items): - global total_tests, test_counter, test_name_len +def initialize_global_vars_common(num_threads, test_work_items, session_dir, + runner_context): + global g_session_dir, g_runner_context, total_tests, test_counter + global test_name_len total_tests = sum([len(item[1]) for item in test_work_items]) test_counter = multiprocessing.Value('i', 0) test_name_len = multiprocessing.Value('i', 0) + g_session_dir = session_dir + g_runner_context = runner_context if not (RESULTS_FORMATTER and RESULTS_FORMATTER.is_using_terminal()): print( "Testing: %d test suites, %d thread%s" % @@ -652,20 +692,31 @@ update_progress() -def initialize_global_vars_multiprocessing(num_threads, test_work_items): +def initialize_global_vars_multiprocessing(num_threads, test_work_items, + session_dir, runner_context): # Initialize the global state we'll use to communicate with the # rest of the flat module. global output_lock output_lock = multiprocessing.RLock() - initialize_global_vars_common(num_threads, test_work_items) + initialize_global_vars_common(num_threads, test_work_items, session_dir, + runner_context) -def initialize_global_vars_threading(num_threads, test_work_items): +def initialize_global_vars_threading(num_threads, test_work_items, session_dir, + runner_context): """Initializes global variables used in threading mode. + @param num_threads specifies the number of workers used. + @param test_work_items specifies all the work items that will be processed. 
+ + @param session_dir the session directory where test-run-specific files are + written. + + @param runner_context a dictionary of platform-related data that is passed + to the timeout pre-kill hook. """ # Initialize the global state we'll use to communicate with the # rest of the flat module. @@ -686,7 +737,8 @@ global GET_WORKER_INDEX GET_WORKER_INDEX = get_worker_index_threading - initialize_global_vars_common(num_threads, test_work_items) + initialize_global_vars_common(num_threads, test_work_items, session_dir, + runner_context) def ctrl_c_loop(main_op_func, done_func, ctrl_c_handler): @@ -833,7 +885,8 @@ return True -def multiprocessing_test_runner(num_threads, test_work_items): +def multiprocessing_test_runner(num_threads, test_work_items, session_dir, + runner_context): """Provides hand-wrapped pooling test runner adapter with Ctrl-C support. This concurrent test runner is based on the multiprocessing @@ -847,10 +900,17 @@ @param test_work_items the iterable of test work item tuples to run. + + @param session_dir the session directory where test-run-specific files are + written. + + @param runner_context a dictionary of platform-related data that is passed + to the timeout pre-kill hook. """ # Initialize our global state. - initialize_global_vars_multiprocessing(num_threads, test_work_items) + initialize_global_vars_multiprocessing(num_threads, test_work_items, + session_dir, runner_context) # Create jobs. job_queue = multiprocessing.Queue(len(test_work_items)) @@ -955,9 +1015,11 @@ return map_results -def multiprocessing_test_runner_pool(num_threads, test_work_items): +def multiprocessing_test_runner_pool(num_threads, test_work_items, session_dir, + runner_context): # Initialize our global state. 
- initialize_global_vars_multiprocessing(num_threads, test_work_items) + initialize_global_vars_multiprocessing(num_threads, test_work_items, + session_dir, runner_context) manager = multiprocessing.Manager() worker_index_map = manager.dict() @@ -975,7 +1037,8 @@ map_future, RUNNER_PROCESS_ASYNC_MAP, RESULTS_LISTENER_CHANNEL) -def threading_test_runner(num_threads, test_work_items): +def threading_test_runner(num_threads, test_work_items, session_dir, + runner_context): """Provides hand-wrapped pooling threading-based test runner adapter with Ctrl-C support. @@ -987,10 +1050,17 @@ @param test_work_items the iterable of test work item tuples to run. - """ + + @param session_dir the session directory where test-run-specific files are + written. + + @param runner_context a dictionary of platform-related data that is passed + to the timeout pre-kill hook. + """ # Initialize our global state. - initialize_global_vars_threading(num_threads, test_work_items) + initialize_global_vars_threading(num_threads, test_work_items, session_dir, + runner_context) # Create jobs. job_queue = queue.Queue() @@ -1038,9 +1108,11 @@ return test_results -def threading_test_runner_pool(num_threads, test_work_items): +def threading_test_runner_pool(num_threads, test_work_items, session_dir, + runner_context): # Initialize our global state. - initialize_global_vars_threading(num_threads, test_work_items) + initialize_global_vars_threading(num_threads, test_work_items, session_dir, + runner_context) pool = multiprocessing.pool.ThreadPool(num_threads) map_future = pool.map_async( @@ -1060,9 +1132,10 @@ pass -def inprocess_exec_test_runner(test_work_items): +def inprocess_exec_test_runner(test_work_items, session_dir, runner_context): # Initialize our global state. 
- initialize_global_vars_multiprocessing(1, test_work_items) + initialize_global_vars_multiprocessing(1, test_work_items, session_dir, + runner_context) # We're always worker index 0 global GET_WORKER_INDEX @@ -1205,12 +1278,20 @@ return result -def get_test_runner_strategies(num_threads): +def get_test_runner_strategies(num_threads, session_dir, runner_context): """Returns the test runner strategies by name in a dictionary. @param num_threads specifies the number of threads/processes that will be used for concurrent test runners. + @param session_dir specifies the session dir to use for + auxiliary files. + + @param runner_context a dictionary of details on the architectures and + platform used to run the test suite. This is passed along verbatim to + the timeout pre-kill handler, allowing that decoupled component to do + process inspection in a platform-specific way. + @return dictionary with key as test runner strategy name and value set to a callable object that takes the test work item and returns a test result tuple. @@ -1220,32 +1301,34 @@ # multiprocessing.Pool. "multiprocessing": (lambda work_items: multiprocessing_test_runner( - num_threads, work_items)), + num_threads, work_items, session_dir, runner_context)), # multiprocessing-pool uses multiprocessing.Pool but # does not support Ctrl-C. "multiprocessing-pool": (lambda work_items: multiprocessing_test_runner_pool( - num_threads, work_items)), + num_threads, work_items, session_dir, runner_context)), # threading uses a hand-rolled worker pool much # like multiprocessing, but instead uses in-process # worker threads. This one supports Ctrl-C. "threading": - (lambda work_items: threading_test_runner(num_threads, work_items)), + (lambda work_items: threading_test_runner( + num_threads, work_items, session_dir, runner_context)), # threading-pool uses threading for the workers (in-process) # and uses the multiprocessing.pool thread-enabled pool. # This does not properly support Ctrl-C. 
"threading-pool": (lambda work_items: threading_test_runner_pool( - num_threads, work_items)), + num_threads, work_items, session_dir, runner_context)), # serial uses the subprocess-based, single process # test runner. This provides process isolation but # no concurrent test execution. "serial": - inprocess_exec_test_runner + (lambda work_items: inprocess_exec_test_runner( + work_items, session_dir, runner_context)) } @@ -1425,7 +1508,8 @@ return test_runner_name -def rerun_tests(test_subdir, tests_for_rerun, dotest_argv): +def rerun_tests(test_subdir, tests_for_rerun, dotest_argv, session_dir, + runner_context): # Build the list of test files to rerun. Some future time we'll # enable re-run by test method so we can constrain the rerun set # to just the method(s) that were in issued within a file. @@ -1465,7 +1549,8 @@ print("rerun will use the '{}' test runner strategy".format( rerun_runner_name)) - runner_strategies_by_name = get_test_runner_strategies(rerun_thread_count) + runner_strategies_by_name = get_test_runner_strategies( + rerun_thread_count, session_dir, runner_context) rerun_runner_func = runner_strategies_by_name[ rerun_runner_name] if rerun_runner_func is None: @@ -1546,8 +1631,19 @@ if results_formatter is not None: results_formatter.set_expected_timeouts_by_basename(expected_timeout) + # Setup the test runner context. This is a dictionary of information that + # will be passed along to the timeout pre-kill handler and allows for loose + # coupling of its implementation. + runner_context = { + "archs": configuration.archs, + "platform_name": configuration.lldb_platform_name, + "platform_url": configuration.lldb_platform_url, + "platform_working_dir": configuration.lldb_platform_working_dir, + } + # Figure out which testrunner strategy we'll use. 
- runner_strategies_by_name = get_test_runner_strategies(num_threads) + runner_strategies_by_name = get_test_runner_strategies( + num_threads, session_dir, runner_context) # If the user didn't specify a test runner strategy, determine # the default now based on number of threads and OS type. @@ -1594,7 +1690,8 @@ "exceeded".format( configuration.rerun_max_file_threshold)) else: - rerun_tests(test_subdir, tests_for_rerun, dotest_argv) + rerun_tests(test_subdir, tests_for_rerun, dotest_argv, + session_dir, runner_context) # The results formatter - if present - is done now. Tell it to # terminate. Index: packages/Python/lldbsuite/test/test_runner/process_control.py =================================================================== --- packages/Python/lldbsuite/test/test_runner/process_control.py +++ packages/Python/lldbsuite/test/test_runner/process_control.py @@ -483,6 +483,19 @@ def on_process_exited(self, command, output, was_timeout, exit_status): pass + def on_timeout_pre_kill(self): + """Called after the timeout interval elapses but before killing it. + + This method is added to enable derived classes the ability to do + something to the process prior to it being killed. For example, + this would be a good spot to run a program that samples the process + to see what it was doing (or not doing). + + Do not attempt to reap the process (i.e. use wait()) in this method. + That will interfere with the kill mechanism and return code processing. + """ + pass + def write(self, content): # pylint: disable=no-self-use # Intended - we want derived classes to be able to override @@ -640,6 +653,11 @@ # Reap the child process here. self.returncode = self.process.wait() else: + + # Allow derived classes to do some work after we detected + # a timeout but before we touch the timed-out process. + self.on_timeout_pre_kill() + # Prepare to stop the process process_terminated = completed_normally terminate_attempt_count = 0