summary refs log tree commit diff stats
path: root/clang-tools-extra/clangd/include-mapping
diff options
context:
space:
mode:
Diffstat (limited to 'clang-tools-extra/clangd/include-mapping')
-rw-r--r-- clang-tools-extra/clangd/include-mapping/cppreference_parser.py 172
-rwxr-xr-x clang-tools-extra/clangd/include-mapping/gen_std.py 212
-rwxr-xr-x clang-tools-extra/clangd/include-mapping/test.py 14
3 files changed, 218 insertions, 180 deletions
diff --git a/clang-tools-extra/clangd/include-mapping/cppreference_parser.py b/clang-tools-extra/clangd/include-mapping/cppreference_parser.py
new file mode 100644
index 00000000000..fd3b8a6837d
--- /dev/null
+++ b/clang-tools-extra/clangd/include-mapping/cppreference_parser.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+#===- cppreference_parser.py - ------------------------------*- python -*--===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+from bs4 import BeautifulSoup, NavigableString
+
+import collections
+import multiprocessing
+import os
+import re
+import signal
+import sys
+
+
+class Symbol:
+
+ def __init__(self, name, namespace, headers):
+ # unqualified symbol name, e.g. "move"
+ self.name = name
+ # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope)
+ # None for C symbols.
+ self.namespace = namespace
+ # a list of corresponding headers
+ self.headers = headers
+
+
+def _HasClass(tag, *classes):
+ for c in tag.get('class', []):
+ if c in classes:
+ return True
+ return False
+
+
+def _ParseSymbolPage(symbol_page_html, symbol_name):
+ """Parse symbol page and retrieve the include header defined in this page.
+ The symbol page provides header for the symbol, specifically in
+ "Defined in header <header>" section. An example:
+
+ <tr class="t-dsc-header">
+ <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
+ </td></tr>
+
+ Returns a set of headers.
+ """
+ headers = set()
+ all_headers = set()
+
+ soup = BeautifulSoup(symbol_page_html, "html.parser")
+ # Rows in table are like:
+ # Defined in header <foo> .t-dsc-header
+ # Defined in header <bar> .t-dsc-header
+ # decl1 .t-dcl
+ # Defined in header <baz> .t-dsc-header
+ # decl2 .t-dcl
+ for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
+ current_headers = []
+ was_decl = False
+ for row in table.select('tr'):
+ if _HasClass(row, 't-dcl', 't-dsc'):
+ was_decl = True
+ # Symbols are in the first cell.
+ found_symbols = row.find('td').stripped_strings
+ if not symbol_name in found_symbols:
+ continue
+ headers.update(current_headers)
+ elif _HasClass(row, 't-dsc-header'):
+ # If we saw a decl since the last header, this is a new block of headers
+ # for a new block of decls.
+ if was_decl:
+ current_headers = []
+ was_decl = False
+ # There are also .t-dsc-header for "defined in namespace".
+ if not "Defined in header " in row.text:
+ continue
+ # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
+ for header_code in row.find_all("code"):
+ current_headers.append(header_code.text)
+ all_headers.add(header_code.text)
+ # If the symbol was never named, consider all named headers.
+ return headers or all_headers
+
+
+def _ParseIndexPage(index_page_html):
+ """Parse index page.
+ The index page lists all std symbols and hrefs to their detailed pages
+ (which contain the defined header). An example:
+
+ <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
+ <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>
+
+ Returns a list of tuples (symbol_name, relative_path_to_symbol_page, variant).
+ """
+ symbols = []
+ soup = BeautifulSoup(index_page_html, "html.parser")
+ for symbol_href in soup.select("a[title]"):
+ # Ignore annotated symbols like "acos<>() (std::complex)".
+ # These tend to be overloads, and the primary is more useful.
+ # This accidentally accepts begin/end despite the (iterator) caption: the
+ # (since C++11) note is first. They are good symbols, so the bug is unfixed.
+ caption = symbol_href.next_sibling
+ variant = isinstance(caption, NavigableString) and "(" in caption
+ symbol_tt = symbol_href.find("tt")
+ if symbol_tt:
+ symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>()
+ symbol_href["href"], variant))
+ return symbols
+
+
+def _ReadSymbolPage(path, name):
+ with open(path) as f:
+ return _ParseSymbolPage(f.read(), name)
+
+
+def _GetSymbols(pool, root_dir, index_page_name, namespace):
+ """Get all symbols listed in the index page. All symbols should be in the
+ given namespace.
+
+ Returns a list of Symbols.
+ """
+
+ # Workflow steps:
+ # 1. Parse index page which lists all symbols to get symbol
+ # name (unqualified name) and its href link to the symbol page which
+ # contains the defined header.
+ # 2. Parse the symbol page to get the defined header.
+ index_page_path = os.path.join(root_dir, index_page_name)
+ with open(index_page_path, "r") as f:
+ # Read each symbol page in parallel.
+ results = [] # (symbol_name, promise of [header...])
+ for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):
+ # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
+ # FIXME: use these as a fallback rather than ignoring entirely.
+ if variant:
+ continue
+ path = os.path.join(root_dir, symbol_page_path)
+ results.append((symbol_name,
+ pool.apply_async(_ReadSymbolPage, (path, symbol_name))))
+
+ # Build map from symbol name to a set of headers.
+ symbol_headers = collections.defaultdict(set)
+ for symbol_name, lazy_headers in results:
+ symbol_headers[symbol_name].update(lazy_headers.get())
+
+ symbols = []
+ for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]):
+ symbols.append(Symbol(name, namespace, list(headers)))
+ return symbols
+
+
+def GetSymbols(parse_pages):
+ """Get all symbols by parsing the given pages.
+
+ Args:
+ parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)
+ """
+ symbols = []
+ # Run many workers to process individual symbol pages under the symbol index.
+ # Don't allow workers to capture Ctrl-C.
+ pool = multiprocessing.Pool(
+ initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
+ try:
+ for root_dir, page_name, namespace in parse_pages:
+ symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
+ finally:
+ pool.terminate()
+ pool.join()
+ return symbols
diff --git a/clang-tools-extra/clangd/include-mapping/gen_std.py b/clang-tools-extra/clangd/include-mapping/gen_std.py
index c5824a0c42e..1cfe2c026b4 100755
--- a/clang-tools-extra/clangd/include-mapping/gen_std.py
+++ b/clang-tools-extra/clangd/include-mapping/gen_std.py
@@ -8,7 +8,7 @@
#===------------------------------------------------------------------------===#
"""gen_std.py is a tool to generate a lookup table (from qualified names to
-include headers) for C++ Standard Library symbols by parsing archieved HTML
+include headers) for C/C++ Standard Library symbols by parsing archived HTML
files from cppreference.
Caveats and FIXMEs:
@@ -25,24 +25,23 @@ Usage:
3. Unzip the zip file from step 2 to directory </cppreference>, you should
get a "reference" directory in </cppreference>
4. Run the command:
- gen_std.py -cppreference </cppreference/reference> > StdSymbolMap.inc
+ // Generate C++ symbols
+ gen_std.py -cppreference </cppreference/reference> -language=cpp > StdSymbolMap.inc
+ // Generate C symbols
+ gen_std.py -cppreference </cppreference/reference> -language=c > CSymbolMap.inc
"""
-from bs4 import BeautifulSoup, NavigableString
+import cppreference_parser
import argparse
-import collections
import datetime
-import multiprocessing
import os
-import re
-import signal
import sys
-STDGEN_CODE_PREFIX = """\
+CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
-// Used to build a lookup table (qualified names => include headers) for C++
+// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// Automatically generated file, DO NOT EDIT!
@@ -51,189 +50,56 @@ STDGEN_CODE_PREFIX = """\
//===----------------------------------------------------------------------===//
"""
-def HasClass(tag, *classes):
- for c in tag.get('class', []):
- if c in classes:
- return True
- return False
-
-def ParseSymbolPage(symbol_page_html, symbol_name):
- """Parse symbol page and retrieve the include header defined in this page.
- The symbol page provides header for the symbol, specifically in
- "Defined in header <header>" section. An example:
-
- <tr class="t-dsc-header">
- <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
- </td></tr>
-
- Returns a list of headers.
- """
- headers = set()
- all_headers = set()
-
- soup = BeautifulSoup(symbol_page_html, "html.parser")
- # Rows in table are like:
- # Defined in header <foo> .t-dsc-header
- # Defined in header <bar> .t-dsc-header
- # decl1 .t-dcl
- # Defined in header <baz> .t-dsc-header
- # decl2 .t-dcl
- for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
- current_headers = []
- was_decl = False
- for row in table.select('tr'):
- if HasClass(row, 't-dcl', 't-dsc'):
- was_decl = True
- # Symbols are in the first cell.
- found_symbols = row.find('td').stripped_strings
- if not symbol_name in found_symbols:
- continue
- headers.update(current_headers)
- elif HasClass(row, 't-dsc-header'):
- # If we saw a decl since the last header, this is a new block of headers
- # for a new block of decls.
- if was_decl:
- current_headers = []
- was_decl = False
- # There are also .t-dsc-header for "defined in namespace".
- if not "Defined in header " in row.text:
- continue
- # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
- for header_code in row.find_all("code"):
- current_headers.append(header_code.text)
- all_headers.add(header_code.text)
- # If the symbol was never named, consider all named headers.
- return headers or all_headers
-
-
-def ParseIndexPage(index_page_html):
- """Parse index page.
- The index page lists all std symbols and hrefs to their detailed pages
- (which contain the defined header). An example:
-
- <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
- <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>
-
- Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
- """
- symbols = []
- soup = BeautifulSoup(index_page_html, "html.parser")
- for symbol_href in soup.select("a[title]"):
- # Ignore annotated symbols like "acos<>() (std::complex)".
- # These tend to be overloads, and we the primary is more useful.
- # This accidentally accepts begin/end despite the (iterator) caption: the
- # (since C++11) note is first. They are good symbols, so the bug is unfixed.
- caption = symbol_href.next_sibling
- variant = isinstance(caption, NavigableString) and "(" in caption
- symbol_tt = symbol_href.find("tt")
- if symbol_tt:
- symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>()
- symbol_href["href"], variant))
- return symbols
-
-class Symbol:
-
- def __init__(self, name, namespace, headers):
- # unqualifed symbol name, e.g. "move"
- self.name = name
- # namespace of the symbol (with trailing "::"), e.g. "std::"
- self.namespace = namespace
- # a list of corresponding headers
- self.headers = headers
-
-
-def ReadSymbolPage(path, name):
- with open(path) as f:
- return ParseSymbolPage(f.read(), name)
-
-
-def GetSymbols(pool, root_dir, index_page_name, namespace):
- """Get all symbols listed in the index page. All symbols should be in the
- given namespace.
-
- Returns a list of Symbols.
- """
-
- # Workflow steps:
- # 1. Parse index page which lists all symbols to get symbol
- # name (unqualified name) and its href link to the symbol page which
- # contains the defined header.
- # 2. Parse the symbol page to get the defined header.
- index_page_path = os.path.join(root_dir, index_page_name)
- with open(index_page_path, "r") as f:
- # Read each symbol page in parallel.
- results = [] # (symbol_name, promise of [header...])
- for symbol_name, symbol_page_path, variant in ParseIndexPage(f.read()):
- # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
- # FIXME: use these as a fallback rather than ignoring entirely.
- if variant:
- continue
- path = os.path.join(root_dir, symbol_page_path)
- results.append((symbol_name,
- pool.apply_async(ReadSymbolPage, (path, symbol_name))))
-
- # Build map from symbol name to a set of headers.
- symbol_headers = collections.defaultdict(set)
- for symbol_name, lazy_headers in results:
- symbol_headers[symbol_name].update(lazy_headers.get())
-
- symbols = []
- for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]):
- symbols.append(Symbol(name, namespace, list(headers)))
- return symbols
-
-
def ParseArg():
parser = argparse.ArgumentParser(description='Generate StdGen file')
parser.add_argument('-cppreference', metavar='PATH',
default='',
help='path to the cppreference offline HTML directory',
required=True
- )
+ )
+ parser.add_argument('-language',
+ default='cpp',
+ help='Generate c or cpp symbols',
+ required=True)
return parser.parse_args()
def main():
args = ParseArg()
- cpp_root = os.path.join(args.cppreference, "en", "cpp")
- symbol_index_root = os.path.join(cpp_root, "symbol_index")
+ if args.language == 'cpp':
+ page_root = os.path.join(args.cppreference, "en", "cpp")
+ symbol_index_root = os.path.join(page_root, "symbol_index")
+ parse_pages = [
+ (page_root, "symbol_index.html", "std::"),
+ # std sub-namespace symbols have separate pages.
+ # We don't index std literal operators (e.g.
+ # std::literals::chrono_literals::operator""d), these symbols can't be
+ # accessed by std::<symbol_name>.
+ # FIXME: index std::placeholders symbols, placeholders.html page is
+ # different (which contains one entry for _1, _2, ..., _N), we need special
+ # handling.
+ (symbol_index_root, "chrono.html", "std::chrono::"),
+ (symbol_index_root, "filesystem.html", "std::filesystem::"),
+ (symbol_index_root, "pmr.html", "std::pmr::"),
+ (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
+ (symbol_index_root, "this_thread.html", "std::this_thread::"),
+ ]
+ elif args.language == 'c':
+ page_root = os.path.join(args.cppreference, "en", "c")
+ symbol_index_root = page_root
+ parse_pages = [(page_root, "index.html", None)]
+
if not os.path.exists(symbol_index_root):
exit("Path %s doesn't exist!" % symbol_index_root)
- parse_pages = [
- (cpp_root, "symbol_index.html", "std::"),
- # std sub-namespace symbols have separated pages.
- # We don't index std literal operators (e.g.
- # std::literals::chrono_literals::operator""d), these symbols can't be
- # accessed by std::<symbol_name>.
- # FIXME: index std::placeholders symbols, placeholders.html page is
- # different (which contains one entry for _1, _2, ..., _N), we need special
- # handling.
- (symbol_index_root, "chrono.html", "std::chrono::"),
- (symbol_index_root, "filesystem.html", "std::filesystem::"),
- (symbol_index_root, "pmr.html", "std::pmr::"),
- (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
- (symbol_index_root, "this_thread.html", "std::this_thread::"),
- ]
-
- symbols = []
- # Run many workers to process individual symbol pages under the symbol index.
- # Don't allow workers to capture Ctrl-C.
- pool = multiprocessing.Pool(
- initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
- try:
- for root_dir, page_name, namespace in parse_pages:
- symbols.extend(GetSymbols(pool, root_dir, page_name, namespace))
- finally:
- pool.terminate()
- pool.join()
+ symbols = cppreference_parser.GetSymbols(parse_pages)
# We don't have version information from the unzipped offline HTML files.
# so we use the modified time of the symbol_index.html as the version.
- index_page_path = os.path.join(cpp_root, "symbol_index.html")
+ index_page_path = os.path.join(page_root, "index.html")
cppreference_modified_date = datetime.datetime.fromtimestamp(
os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d')
- print STDGEN_CODE_PREFIX % cppreference_modified_date
+ print CODE_PREFIX % (args.language.upper(), cppreference_modified_date)
for symbol in symbols:
if len(symbol.headers) == 1:
# SYMBOL(unqualified_name, namespace, header)
diff --git a/clang-tools-extra/clangd/include-mapping/test.py b/clang-tools-extra/clangd/include-mapping/test.py
index 3f17b53189c..9fad952b2e9 100755
--- a/clang-tools-extra/clangd/include-mapping/test.py
+++ b/clang-tools-extra/clangd/include-mapping/test.py
@@ -7,7 +7,7 @@
#
#===------------------------------------------------------------------------===#
-from gen_std import ParseSymbolPage, ParseIndexPage
+from cppreference_parser import _ParseSymbolPage, _ParseIndexPage
import unittest
@@ -22,7 +22,7 @@ class TestStdGen(unittest.TestCase):
<a href="as_bytes.html" title="as bytes"><tt>as_bytes&lt;&gt;()</tt></a> <span class="t-mark-rev t-since-cxx20">(since C++20)</span> <br>
"""
- actual = ParseIndexPage(html)
+ actual = _ParseIndexPage(html)
expected = [
("abs", "abs.html", True),
("abs", "complex/abs.html", True),
@@ -53,7 +53,7 @@ class TestStdGen(unittest.TestCase):
</tr>
</tbody></table>
"""
- self.assertEqual(ParseSymbolPage(html, 'foo'), set(['<cmath>']))
+ self.assertEqual(_ParseSymbolPage(html, 'foo'), set(['<cmath>']))
def testParseSymbolPage_MulHeaders(self):
@@ -94,7 +94,7 @@ class TestStdGen(unittest.TestCase):
</tr>
</tbody></table>
"""
- self.assertEqual(ParseSymbolPage(html, "foo"),
+ self.assertEqual(_ParseSymbolPage(html, "foo"),
set(['<cstdio>', '<cstdlib>']))
@@ -121,7 +121,7 @@ class TestStdGen(unittest.TestCase):
</tr>
</tbody></table>
"""
- self.assertEqual(ParseSymbolPage(html, "foo"),
+ self.assertEqual(_ParseSymbolPage(html, "foo"),
set(['<algorithm>', '<utility>']))
def testParseSymbolPage_MulSymbolsInSameTd(self):
@@ -145,9 +145,9 @@ class TestStdGen(unittest.TestCase):
</tr>
</tbody></table>
"""
- self.assertEqual(ParseSymbolPage(html, "int8_t"),
+ self.assertEqual(_ParseSymbolPage(html, "int8_t"),
set(['<cstdint>']))
- self.assertEqual(ParseSymbolPage(html, "int16_t"),
+ self.assertEqual(_ParseSymbolPage(html, "int16_t"),
set(['<cstdint>']))
OpenPOWER on IntegriCloud