diff --git a/clang/tools/include-mapping/cppreference_parser.py b/clang/tools/include-mapping/cppreference_parser.py --- a/clang/tools/include-mapping/cppreference_parser.py +++ b/clang/tools/include-mapping/cppreference_parser.py @@ -47,7 +47,7 @@ Returns a list of headers. """ - headers = set() + symbol_headers = set() all_headers = set() soup = BeautifulSoup(symbol_page_html, "html.parser") @@ -58,31 +58,39 @@ # Defined in header .t-dsc-header # decl2 .t-dcl for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'): - current_headers = [] - was_decl = False - for row in table.select('tr'): - if _HasClass(row, 't-dcl', 't-dsc'): - was_decl = True - # Symbols are in the first cell. - found_symbols = row.find('td').stripped_strings - if not symbol_name in found_symbols: - continue - headers.update(current_headers) - elif _HasClass(row, 't-dsc-header'): - # If we saw a decl since the last header, this is a new block of headers - # for a new block of decls. - if was_decl: - current_headers = [] - was_decl = False + rows = table.select('tr') + i = 0 + while i < len(rows): + start = i + current_headers = set() + while i < len(rows) and _HasClass(rows[i], 't-dsc-header'): + row = rows[i] # There are also .t-dsc-header for "defined in namespace". if not "Defined in header " in row.text: + i = i + 1 continue # The interesting header content (e.g. ) is wrapped in . for header_code in row.find_all("code"): - current_headers.append(header_code.text) all_headers.add(header_code.text) - # If the symbol was never named, consider all named headers. - return headers or all_headers + current_headers.add(header_code.text) + i = i + 1 + # some tables have header rows, skip them + while i < len(rows) and _HasClass(rows[i], 't-dsc-hitem'): + i = i + 1 + while i < len(rows) and (_HasClass(rows[i], 't-dcl', 't-dsc') or not rows[i].has_attr("class")): + row = rows[i] + # Symbols are in the first cell. + found_symbols = row.find('td').stripped_strings + if symbol_name in found_symbols: + for header in current_headers: + symbol_headers.add(header) + i = i + 1 + # no headers or symbols in this block + if i == start: + i = i + 1 + + # If the symbol was never named, consider all named headers. + return symbol_headers or all_headers def _ParseIndexPage(index_page_html):