diff options
Diffstat (limited to 'clang-tools-extra/clangd/include-mapping')
3 files changed, 218 insertions, 180 deletions
diff --git a/clang-tools-extra/clangd/include-mapping/cppreference_parser.py b/clang-tools-extra/clangd/include-mapping/cppreference_parser.py new file mode 100644 index 00000000000..fd3b8a6837d --- /dev/null +++ b/clang-tools-extra/clangd/include-mapping/cppreference_parser.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +#===- cppreference_parser.py - ------------------------------*- python -*--===# +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +#===------------------------------------------------------------------------===# + +from bs4 import BeautifulSoup, NavigableString + +import collections +import multiprocessing +import os +import re +import signal +import sys + + +class Symbol: + + def __init__(self, name, namespace, headers): + # unqualifed symbol name, e.g. "move" + self.name = name + # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope) + # None for C symbols. + self.namespace = namespace + # a list of corresponding headers + self.headers = headers + + +def _HasClass(tag, *classes): + for c in tag.get('class', []): + if c in classes: + return True + return False + + +def _ParseSymbolPage(symbol_page_html, symbol_name): + """Parse symbol page and retrieve the include header defined in this page. + The symbol page provides header for the symbol, specifically in + "Defined in header <header>" section. An example: + + <tr class="t-dsc-header"> + <td colspan="2"> <div>Defined in header <code><ratio></code> </div> + </td></tr> + + Returns a list of headers. + """ + headers = set() + all_headers = set() + + soup = BeautifulSoup(symbol_page_html, "html.parser") + # Rows in table are like: + # Defined in header <foo> .t-dsc-header + # Defined in header <bar> .t-dsc-header + # decl1 .t-dcl + # Defined in header <baz> .t-dsc-header + # decl2 .t-dcl + for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'): + current_headers = [] + was_decl = False + for row in table.select('tr'): + if _HasClass(row, 't-dcl', 't-dsc'): + was_decl = True + # Symbols are in the first cell. + found_symbols = row.find('td').stripped_strings + if not symbol_name in found_symbols: + continue + headers.update(current_headers) + elif _HasClass(row, 't-dsc-header'): + # If we saw a decl since the last header, this is a new block of headers + # for a new block of decls. + if was_decl: + current_headers = [] + was_decl = False + # There are also .t-dsc-header for "defined in namespace". + if not "Defined in header " in row.text: + continue + # The interesting header content (e.g. <cstdlib>) is wrapped in <code>. + for header_code in row.find_all("code"): + current_headers.append(header_code.text) + all_headers.add(header_code.text) + # If the symbol was never named, consider all named headers. + return headers or all_headers + + +def _ParseIndexPage(index_page_html): + """Parse index page. + The index page lists all std symbols and hrefs to their detailed pages + (which contain the defined header). An example: + + <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br> + <a href="acos.html" title="acos"><tt>acos()</tt></a> <br> + + Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant). + """ + symbols = [] + soup = BeautifulSoup(index_page_html, "html.parser") + for symbol_href in soup.select("a[title]"): + # Ignore annotated symbols like "acos<>() (std::complex)". + # These tend to be overloads, and we the primary is more useful. + # This accidentally accepts begin/end despite the (iterator) caption: the + # (since C++11) note is first. They are good symbols, so the bug is unfixed. + caption = symbol_href.next_sibling + variant = isinstance(caption, NavigableString) and "(" in caption + symbol_tt = symbol_href.find("tt") + if symbol_tt: + symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>() + symbol_href["href"], variant)) + return symbols + + +def _ReadSymbolPage(path, name): + with open(path) as f: + return _ParseSymbolPage(f.read(), name) + + +def _GetSymbols(pool, root_dir, index_page_name, namespace): + """Get all symbols listed in the index page. All symbols should be in the + given namespace. + + Returns a list of Symbols. + """ + + # Workflow steps: + # 1. Parse index page which lists all symbols to get symbol + # name (unqualified name) and its href link to the symbol page which + # contains the defined header. + # 2. Parse the symbol page to get the defined header. + index_page_path = os.path.join(root_dir, index_page_name) + with open(index_page_path, "r") as f: + # Read each symbol page in parallel. + results = [] # (symbol_name, promise of [header...]) + for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()): + # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity. + # FIXME: use these as a fallback rather than ignoring entirely. + if variant: + continue + path = os.path.join(root_dir, symbol_page_path) + results.append((symbol_name, + pool.apply_async(_ReadSymbolPage, (path, symbol_name)))) + + # Build map from symbol name to a set of headers. + symbol_headers = collections.defaultdict(set) + for symbol_name, lazy_headers in results: + symbol_headers[symbol_name].update(lazy_headers.get()) + + symbols = [] + for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]): + symbols.append(Symbol(name, namespace, list(headers))) + return symbols + + +def GetSymbols(parse_pages): + """Get all symbols by parsing the given pages. + + Args: + parse_pages: a list of tuples (page_root_dir, index_page_name, namespace) + """ + symbols = [] + # Run many workers to process individual symbol pages under the symbol index. + # Don't allow workers to capture Ctrl-C. + pool = multiprocessing.Pool( + initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN)) + try: + for root_dir, page_name, namespace in parse_pages: + symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace)) + finally: + pool.terminate() + pool.join() + return symbols diff --git a/clang-tools-extra/clangd/include-mapping/gen_std.py b/clang-tools-extra/clangd/include-mapping/gen_std.py index c5824a0c42e..1cfe2c026b4 100755 --- a/clang-tools-extra/clangd/include-mapping/gen_std.py +++ b/clang-tools-extra/clangd/include-mapping/gen_std.py @@ -8,7 +8,7 @@ #===------------------------------------------------------------------------===# """gen_std.py is a tool to generate a lookup table (from qualified names to -include headers) for C++ Standard Library symbols by parsing archieved HTML +include headers) for C/C++ Standard Library symbols by parsing archieved HTML files from cppreference. Caveats and FIXMEs: @@ -25,24 +25,23 @@ Usage: 3. Unzip the zip file from step 2 to directory </cppreference>, you should get a "reference" directory in </cppreference> 4. Run the command: - gen_std.py -cppreference </cppreference/reference> > StdSymbolMap.inc + // Generate C++ symbols + gen_std.py -cppreference </cppreference/reference> -language=cpp > StdSymbolMap.inc + // Generate C symbols + gen_std.py -cppreference </cppreference/reference> -language=c > CSymbolMap.inc """ -from bs4 import BeautifulSoup, NavigableString +import cppreference_parser import argparse -import collections import datetime -import multiprocessing import os -import re -import signal import sys -STDGEN_CODE_PREFIX = """\ +CODE_PREFIX = """\ //===-- gen_std.py generated file -------------------------------*- C++ -*-===// // -// Used to build a lookup table (qualified names => include headers) for C++ +// Used to build a lookup table (qualified names => include headers) for %s // Standard Library symbols. // // Automatically generated file, DO NOT EDIT! @@ -51,189 +50,56 @@ STDGEN_CODE_PREFIX = """\ //===----------------------------------------------------------------------===// """ -def HasClass(tag, *classes): - for c in tag.get('class', []): - if c in classes: - return True - return False - -def ParseSymbolPage(symbol_page_html, symbol_name): - """Parse symbol page and retrieve the include header defined in this page. - The symbol page provides header for the symbol, specifically in - "Defined in header <header>" section. An example: - - <tr class="t-dsc-header"> - <td colspan="2"> <div>Defined in header <code><ratio></code> </div> - </td></tr> - - Returns a list of headers. - """ - headers = set() - all_headers = set() - - soup = BeautifulSoup(symbol_page_html, "html.parser") - # Rows in table are like: - # Defined in header <foo> .t-dsc-header - # Defined in header <bar> .t-dsc-header - # decl1 .t-dcl - # Defined in header <baz> .t-dsc-header - # decl2 .t-dcl - for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'): - current_headers = [] - was_decl = False - for row in table.select('tr'): - if HasClass(row, 't-dcl', 't-dsc'): - was_decl = True - # Symbols are in the first cell. - found_symbols = row.find('td').stripped_strings - if not symbol_name in found_symbols: - continue - headers.update(current_headers) - elif HasClass(row, 't-dsc-header'): - # If we saw a decl since the last header, this is a new block of headers - # for a new block of decls. - if was_decl: - current_headers = [] - was_decl = False - # There are also .t-dsc-header for "defined in namespace". - if not "Defined in header " in row.text: - continue - # The interesting header content (e.g. <cstdlib>) is wrapped in <code>. - for header_code in row.find_all("code"): - current_headers.append(header_code.text) - all_headers.add(header_code.text) - # If the symbol was never named, consider all named headers. - return headers or all_headers - - -def ParseIndexPage(index_page_html): - """Parse index page. - The index page lists all std symbols and hrefs to their detailed pages - (which contain the defined header). An example: - - <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br> - <a href="acos.html" title="acos"><tt>acos()</tt></a> <br> - - Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant). - """ - symbols = [] - soup = BeautifulSoup(index_page_html, "html.parser") - for symbol_href in soup.select("a[title]"): - # Ignore annotated symbols like "acos<>() (std::complex)". - # These tend to be overloads, and we the primary is more useful. - # This accidentally accepts begin/end despite the (iterator) caption: the - # (since C++11) note is first. They are good symbols, so the bug is unfixed. - caption = symbol_href.next_sibling - variant = isinstance(caption, NavigableString) and "(" in caption - symbol_tt = symbol_href.find("tt") - if symbol_tt: - symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>() - symbol_href["href"], variant)) - return symbols - -class Symbol: - - def __init__(self, name, namespace, headers): - # unqualifed symbol name, e.g. "move" - self.name = name - # namespace of the symbol (with trailing "::"), e.g. "std::" - self.namespace = namespace - # a list of corresponding headers - self.headers = headers - - -def ReadSymbolPage(path, name): - with open(path) as f: - return ParseSymbolPage(f.read(), name) - - -def GetSymbols(pool, root_dir, index_page_name, namespace): - """Get all symbols listed in the index page. All symbols should be in the - given namespace. - - Returns a list of Symbols. - """ - - # Workflow steps: - # 1. Parse index page which lists all symbols to get symbol - # name (unqualified name) and its href link to the symbol page which - # contains the defined header. - # 2. Parse the symbol page to get the defined header. - index_page_path = os.path.join(root_dir, index_page_name) - with open(index_page_path, "r") as f: - # Read each symbol page in parallel. - results = [] # (symbol_name, promise of [header...]) - for symbol_name, symbol_page_path, variant in ParseIndexPage(f.read()): - # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity. - # FIXME: use these as a fallback rather than ignoring entirely. - if variant: - continue - path = os.path.join(root_dir, symbol_page_path) - results.append((symbol_name, - pool.apply_async(ReadSymbolPage, (path, symbol_name)))) - - # Build map from symbol name to a set of headers. - symbol_headers = collections.defaultdict(set) - for symbol_name, lazy_headers in results: - symbol_headers[symbol_name].update(lazy_headers.get()) - - symbols = [] - for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]): - symbols.append(Symbol(name, namespace, list(headers))) - return symbols - - def ParseArg(): parser = argparse.ArgumentParser(description='Generate StdGen file') parser.add_argument('-cppreference', metavar='PATH', default='', help='path to the cppreference offline HTML directory', required=True - ) + ) + parser.add_argument('-language', + default='cpp', + help='Generate c or cpp symbols', + required=True) return parser.parse_args() def main(): args = ParseArg() - cpp_root = os.path.join(args.cppreference, "en", "cpp") - symbol_index_root = os.path.join(cpp_root, "symbol_index") + if args.language == 'cpp': + page_root = os.path.join(args.cppreference, "en", "cpp") + symbol_index_root = os.path.join(page_root, "symbol_index") + parse_pages = [ + (page_root, "symbol_index.html", "std::"), + # std sub-namespace symbols have separated pages. + # We don't index std literal operators (e.g. + # std::literals::chrono_literals::operator""d), these symbols can't be + # accessed by std::<symbol_name>. + # FIXME: index std::placeholders symbols, placeholders.html page is + # different (which contains one entry for _1, _2, ..., _N), we need special + # handling. + (symbol_index_root, "chrono.html", "std::chrono::"), + (symbol_index_root, "filesystem.html", "std::filesystem::"), + (symbol_index_root, "pmr.html", "std::pmr::"), + (symbol_index_root, "regex_constants.html", "std::regex_constants::"), + (symbol_index_root, "this_thread.html", "std::this_thread::"), + ] + elif args.language == 'c': + page_root = os.path.join(args.cppreference, "en", "c") + symbol_index_root = page_root + parse_pages = [(page_root, "index.html", None)] + if not os.path.exists(symbol_index_root): exit("Path %s doesn't exist!" % symbol_index_root) - parse_pages = [ - (cpp_root, "symbol_index.html", "std::"), - # std sub-namespace symbols have separated pages. - # We don't index std literal operators (e.g. - # std::literals::chrono_literals::operator""d), these symbols can't be - # accessed by std::<symbol_name>. - # FIXME: index std::placeholders symbols, placeholders.html page is - # different (which contains one entry for _1, _2, ..., _N), we need special - # handling. - (symbol_index_root, "chrono.html", "std::chrono::"), - (symbol_index_root, "filesystem.html", "std::filesystem::"), - (symbol_index_root, "pmr.html", "std::pmr::"), - (symbol_index_root, "regex_constants.html", "std::regex_constants::"), - (symbol_index_root, "this_thread.html", "std::this_thread::"), - ] - - symbols = [] - # Run many workers to process individual symbol pages under the symbol index. - # Don't allow workers to capture Ctrl-C. - pool = multiprocessing.Pool( - initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN)) - try: - for root_dir, page_name, namespace in parse_pages: - symbols.extend(GetSymbols(pool, root_dir, page_name, namespace)) - finally: - pool.terminate() - pool.join() + symbols = cppreference_parser.GetSymbols(parse_pages) # We don't have version information from the unzipped offline HTML files. # so we use the modified time of the symbol_index.html as the version. - index_page_path = os.path.join(cpp_root, "symbol_index.html") + index_page_path = os.path.join(page_root, "index.html") cppreference_modified_date = datetime.datetime.fromtimestamp( os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d') - print STDGEN_CODE_PREFIX % cppreference_modified_date + print CODE_PREFIX % (args.language.upper(), cppreference_modified_date) for symbol in symbols: if len(symbol.headers) == 1: # SYMBOL(unqualified_name, namespace, header) diff --git a/clang-tools-extra/clangd/include-mapping/test.py b/clang-tools-extra/clangd/include-mapping/test.py index 3f17b53189c..9fad952b2e9 100755 --- a/clang-tools-extra/clangd/include-mapping/test.py +++ b/clang-tools-extra/clangd/include-mapping/test.py @@ -7,7 +7,7 @@ # #===------------------------------------------------------------------------===# -from gen_std import ParseSymbolPage, ParseIndexPage +from cppreference_parser import _ParseSymbolPage, _ParseIndexPage import unittest @@ -22,7 +22,7 @@ class TestStdGen(unittest.TestCase): <a href="as_bytes.html" title="as bytes"><tt>as_bytes<>()</tt></a> <span class="t-mark-rev t-since-cxx20">(since C++20)</span> <br> """ - actual = ParseIndexPage(html) + actual = _ParseIndexPage(html) expected = [ ("abs", "abs.html", True), ("abs", "complex/abs.html", True), @@ -53,7 +53,7 @@ class TestStdGen(unittest.TestCase): </tr> </tbody></table> """ - self.assertEqual(ParseSymbolPage(html, 'foo'), set(['<cmath>'])) + self.assertEqual(_ParseSymbolPage(html, 'foo'), set(['<cmath>'])) def testParseSymbolPage_MulHeaders(self): @@ -94,7 +94,7 @@ class TestStdGen(unittest.TestCase): </tr> </tbody></table> """ - self.assertEqual(ParseSymbolPage(html, "foo"), + self.assertEqual(_ParseSymbolPage(html, "foo"), set(['<cstdio>', '<cstdlib>'])) @@ -121,7 +121,7 @@ class TestStdGen(unittest.TestCase): </tr> </tbody></table> """ - self.assertEqual(ParseSymbolPage(html, "foo"), + self.assertEqual(_ParseSymbolPage(html, "foo"), set(['<algorithm>', '<utility>'])) def testParseSymbolPage_MulSymbolsInSameTd(self): @@ -145,9 +145,9 @@ class TestStdGen(unittest.TestCase): </tr> </tbody></table> """ - self.assertEqual(ParseSymbolPage(html, "int8_t"), + self.assertEqual(_ParseSymbolPage(html, "int8_t"), set(['<cstdint>'])) - self.assertEqual(ParseSymbolPage(html, "int16_t"), + self.assertEqual(_ParseSymbolPage(html, "int16_t"), set(['<cstdint>'])) |

