3 files changed, 218 insertions, 180 deletions
diff --git a/clang-tools-extra/clangd/include-mapping/cppreference_parser.py b/clang-tools-extra/clangd/include-mapping/cppreference_parser.py
new file mode 100644
index 00000000000..fd3b8a6837d
--- /dev/null
+++ b/clang-tools-extra/clangd/include-mapping/cppreference_parser.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+#===- cppreference_parser.py -  ------------------------------*- python -*--===#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+#===------------------------------------------------------------------------===#
+
+from bs4 import BeautifulSoup, NavigableString
+
+import collections
+import multiprocessing
+import os
+import re
+import signal
+import sys
+
+
+class Symbol:
+
+  def __init__(self, name, namespace, headers):
+    # unqualified symbol name, e.g. "move"
+    self.name = name
+    # namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope)
+    # None for C symbols.
+    self.namespace = namespace
+    # a list of corresponding headers
+    self.headers = headers
+
+
+def _HasClass(tag, *classes):
+  for c in tag.get('class', []):
+    if c in classes:
+      return True
+  return False
+
+
+def _ParseSymbolPage(symbol_page_html, symbol_name):
+  """Parse symbol page and retrieve the include header defined in this page.
+  The symbol page provides header for the symbol, specifically in
+  "Defined in header <header>" section. An example:
+
+  <tr class="t-dsc-header">
+    <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
+  </td></tr>
+
+  Returns a set of headers.
+  """
+  headers = set()
+  all_headers = set()
+
+  soup = BeautifulSoup(symbol_page_html, "html.parser")
+  # Rows in table are like:
+  #   Defined in header <foo>      .t-dsc-header
+  #   Defined in header <bar>      .t-dsc-header
+  #   decl1                        .t-dcl
+  #   Defined in header <baz>      .t-dsc-header
+  #   decl2                        .t-dcl
+  for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
+    current_headers = []
+    was_decl = False
+    for row in table.select('tr'):
+      if _HasClass(row, 't-dcl', 't-dsc'):
+        was_decl = True
+        # Symbols are in the first cell.
+        found_symbols = row.find('td').stripped_strings
+        if not symbol_name in found_symbols:
+          continue
+        headers.update(current_headers)
+      elif _HasClass(row, 't-dsc-header'):
+        # If we saw a decl since the last header, this is a new block of headers
+        # for a new block of decls.
+        if was_decl:
+          current_headers = []
+        was_decl = False
+        # There are also .t-dsc-header for "defined in namespace".
+        if not "Defined in header " in row.text:
+          continue
+        # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
+        for header_code in row.find_all("code"):
+          current_headers.append(header_code.text)
+          all_headers.add(header_code.text)
+  # If the symbol was never named, consider all named headers.
+  return headers or all_headers
+
+
+def _ParseIndexPage(index_page_html):
+  """Parse index page.
+  The index page lists all std symbols and hrefs to their detailed pages
+  (which contain the defined header). An example:
+
+  <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
+  <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>
+
+  Returns a list of tuples (symbol_name, relative_path_to_symbol_page, variant).
+  """
+  symbols = []
+  soup = BeautifulSoup(index_page_html, "html.parser")
+  for symbol_href in soup.select("a[title]"):
+    # Ignore annotated symbols like "acos<>() (std::complex)".
+    # These tend to be overloads, and the primary is more useful.
+    # This accidentally accepts begin/end despite the (iterator) caption: the
+    # (since C++11) note is first. They are good symbols, so the bug is unfixed.
+    caption = symbol_href.next_sibling
+    variant = isinstance(caption, NavigableString) and "(" in caption
+    symbol_tt = symbol_href.find("tt")
+    if symbol_tt:
+      symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>()
+                      symbol_href["href"], variant))
+  return symbols
+
+
+def _ReadSymbolPage(path, name):
+  with open(path) as f:
+    return _ParseSymbolPage(f.read(), name)
+
+
+def _GetSymbols(pool, root_dir, index_page_name, namespace):
+  """Get all symbols listed in the index page. All symbols should be in the
+  given namespace.
+
+  Returns a list of Symbols.
+  """
+
+  # Workflow steps:
+  #   1. Parse index page which lists all symbols to get symbol
+  #      name (unqualified name) and its href link to the symbol page which
+  #      contains the defined header.
+  #   2. Parse the symbol page to get the defined header.
+  index_page_path = os.path.join(root_dir, index_page_name)
+  with open(index_page_path, "r") as f:
+    # Read each symbol page in parallel.
+    results = [] # (symbol_name, promise of [header...])
+    for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):
+      # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
+      # FIXME: use these as a fallback rather than ignoring entirely.
+      if variant:
+        continue
+      path = os.path.join(root_dir, symbol_page_path)
+      results.append((symbol_name,
+                      pool.apply_async(_ReadSymbolPage, (path, symbol_name))))
+
+    # Build map from symbol name to a set of headers.
+    symbol_headers = collections.defaultdict(set)
+    for symbol_name, lazy_headers in results:
+      symbol_headers[symbol_name].update(lazy_headers.get())
+
+  symbols = []
+  for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]):
+    symbols.append(Symbol(name, namespace, list(headers)))
+  return symbols
+
+
+def GetSymbols(parse_pages):
+  """Get all symbols by parsing the given pages.
+
+  Args:
+    parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)
+  """
+  symbols = []
+  # Run many workers to process individual symbol pages under the symbol index.
+  # Don't allow workers to capture Ctrl-C.
+  pool = multiprocessing.Pool(
+      initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
+  try:
+    for root_dir, page_name, namespace in parse_pages:
+      symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
+  finally:
+    pool.terminate()
+    pool.join()
+  return symbols
diff --git a/clang-tools-extra/clangd/include-mapping/gen_std.py b/clang-tools-extra/clangd/include-mapping/gen_std.py
index c5824a0c42e..1cfe2c026b4 100755
--- a/clang-tools-extra/clangd/include-mapping/gen_std.py
+++ b/clang-tools-extra/clangd/include-mapping/gen_std.py
@@ -8,7 +8,7 @@
 #===------------------------------------------------------------------------===#
 
 """gen_std.py is a tool to generate a lookup table (from qualified names to
-include headers) for C++ Standard Library symbols by parsing archieved HTML
+include headers) for C/C++ Standard Library symbols by parsing archived HTML
 files from cppreference.
 
 Caveats and FIXMEs:
@@ -25,24 +25,23 @@ Usage:
   3. Unzip the zip file from step 2 to directory </cppreference>, you should
      get a "reference" directory in </cppreference>
   4. Run the command:
-       gen_std.py -cppreference </cppreference/reference> > StdSymbolMap.inc
+       // Generate C++ symbols
+       gen_std.py -cppreference </cppreference/reference> -language=cpp > StdSymbolMap.inc
+       // Generate C symbols
+       gen_std.py -cppreference </cppreference/reference> -language=c > CSymbolMap.inc
 """
 
-from bs4 import BeautifulSoup, NavigableString
 
+import cppreference_parser
 import argparse
-import collections
 import datetime
-import multiprocessing
 import os
-import re
-import signal
 import sys
 
-STDGEN_CODE_PREFIX = """\
+CODE_PREFIX = """\
 //===-- gen_std.py generated file -------------------------------*- C++ -*-===//
 //
-// Used to build a lookup table (qualified names => include headers) for C++
+// Used to build a lookup table (qualified names => include headers) for %s
 // Standard Library symbols.
 //
 // Automatically generated file, DO NOT EDIT!
@@ -51,189 +50,56 @@ STDGEN_CODE_PREFIX = """\
 //===----------------------------------------------------------------------===//
 """
 
-def HasClass(tag, *classes):
-  for c in tag.get('class', []):
-    if c in classes:
-      return True
-  return False
-
-def ParseSymbolPage(symbol_page_html, symbol_name):
-  """Parse symbol page and retrieve the include header defined in this page.
-  The symbol page provides header for the symbol, specifically in
-  "Defined in header <header>" section. An example:
-
-  <tr class="t-dsc-header">
-    <td colspan="2"> <div>Defined in header <code>&lt;ratio&gt;</code> </div>
-  </td></tr>
-
-  Returns a list of headers.
-  """
-  headers = set()
-  all_headers = set()
-
-  soup = BeautifulSoup(symbol_page_html, "html.parser")
-  # Rows in table are like:
-  #   Defined in header <foo>      .t-dsc-header
-  #   Defined in header <bar>      .t-dsc-header
-  #   decl1                        .t-dcl
-  #   Defined in header <baz>      .t-dsc-header
-  #   decl2                        .t-dcl
-  for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
-    current_headers = []
-    was_decl = False
-    for row in table.select('tr'):
-      if HasClass(row, 't-dcl', 't-dsc'):
-        was_decl = True
-        # Symbols are in the first cell.
-        found_symbols = row.find('td').stripped_strings
-        if not symbol_name in found_symbols:
-          continue
-        headers.update(current_headers)
-      elif HasClass(row, 't-dsc-header'):
-        # If we saw a decl since the last header, this is a new block of headers
-        # for a new block of decls.
-        if was_decl:
-          current_headers = []
-        was_decl = False
-        # There are also .t-dsc-header for "defined in namespace".
-        if not "Defined in header " in row.text:
-          continue
-        # The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
-        for header_code in row.find_all("code"):
-          current_headers.append(header_code.text)
-          all_headers.add(header_code.text)
-  # If the symbol was never named, consider all named headers.
-  return headers or all_headers
-
-
-def ParseIndexPage(index_page_html):
-  """Parse index page.
-  The index page lists all std symbols and hrefs to their detailed pages
-  (which contain the defined header). An example:
-
-  <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
-  <a href="acos.html" title="acos"><tt>acos()</tt></a> <br>
-
-  Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
-  """
-  symbols = []
-  soup = BeautifulSoup(index_page_html, "html.parser")
-  for symbol_href in soup.select("a[title]"):
-    # Ignore annotated symbols like "acos<>() (std::complex)".
-    # These tend to be overloads, and we the primary is more useful.
-    # This accidentally accepts begin/end despite the (iterator) caption: the
-    # (since C++11) note is first. They are good symbols, so the bug is unfixed.
-    caption = symbol_href.next_sibling
-    variant = isinstance(caption, NavigableString) and "(" in caption
-    symbol_tt = symbol_href.find("tt")
-    if symbol_tt:
-      symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>()
-                      symbol_href["href"], variant))
-  return symbols
-
-class Symbol:
-
-  def __init__(self, name, namespace, headers):
-    # unqualifed symbol name, e.g. "move"
-    self.name = name
-    # namespace of the symbol (with trailing "::"), e.g. "std::"
-    self.namespace = namespace
-    # a list of corresponding headers
-    self.headers = headers
-
-
-def ReadSymbolPage(path, name):
-  with open(path) as f:
-    return ParseSymbolPage(f.read(), name)
-
-
-def GetSymbols(pool, root_dir, index_page_name, namespace):
-  """Get all symbols listed in the index page. All symbols should be in the
-  given namespace.
-
-  Returns a list of Symbols.
-  """
-
-  # Workflow steps:
-  #   1. Parse index page which lists all symbols to get symbol
-  #      name (unqualified name) and its href link to the symbol page which
-  #      contains the defined header.
-  #   2. Parse the symbol page to get the defined header.
-  index_page_path = os.path.join(root_dir, index_page_name)
-  with open(index_page_path, "r") as f:
-    # Read each symbol page in parallel.
-    results = [] # (symbol_name, promise of [header...])
-    for symbol_name, symbol_page_path, variant in ParseIndexPage(f.read()):
-      # Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
-      # FIXME: use these as a fallback rather than ignoring entirely.
-      if variant:
-        continue
-      path = os.path.join(root_dir, symbol_page_path)
-      results.append((symbol_name,
-                      pool.apply_async(ReadSymbolPage, (path, symbol_name))))
-
-    # Build map from symbol name to a set of headers.
-    symbol_headers = collections.defaultdict(set)
-    for symbol_name, lazy_headers in results:
-      symbol_headers[symbol_name].update(lazy_headers.get())
-
-  symbols = []
-  for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]):
-    symbols.append(Symbol(name, namespace, list(headers)))
-  return symbols
-
-
 def ParseArg():
   parser = argparse.ArgumentParser(description='Generate StdGen file')
   parser.add_argument('-cppreference', metavar='PATH',
                       default='',
                       help='path to the cppreference offline HTML directory',
                       required=True
-                      )
+                     )
+  parser.add_argument('-language',
+                      default='cpp',
+                      help='Generate c or cpp symbols',
+                      required=True)
   return parser.parse_args()
 
 
 def main():
   args = ParseArg()
-  cpp_root = os.path.join(args.cppreference, "en", "cpp")
-  symbol_index_root = os.path.join(cpp_root, "symbol_index")
+  if args.language == 'cpp':
+    page_root = os.path.join(args.cppreference, "en", "cpp")
+    symbol_index_root = os.path.join(page_root, "symbol_index")
+    parse_pages =  [
+      (page_root, "symbol_index.html", "std::"),
+      # std sub-namespace symbols have separated pages.
+      # We don't index std literal operators (e.g.
+      # std::literals::chrono_literals::operator""d), these symbols can't be
+      # accessed by std::<symbol_name>.
+      # FIXME: index std::placeholders symbols, placeholders.html page is
+      # different (which contains one entry for _1, _2, ..., _N), we need special
+      # handling.
+      (symbol_index_root, "chrono.html", "std::chrono::"),
+      (symbol_index_root, "filesystem.html", "std::filesystem::"),
+      (symbol_index_root, "pmr.html", "std::pmr::"),
+      (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
+      (symbol_index_root, "this_thread.html", "std::this_thread::"),
+    ]
+  elif args.language == 'c':
+    page_root = os.path.join(args.cppreference, "en", "c")
+    symbol_index_root = page_root
+    parse_pages = [(page_root, "index.html", None)]
+
   if not os.path.exists(symbol_index_root):
     exit("Path %s doesn't exist!" % symbol_index_root)
 
-  parse_pages =  [
-    (cpp_root, "symbol_index.html", "std::"),
-    # std sub-namespace symbols have separated pages.
-    # We don't index std literal operators (e.g.
-    # std::literals::chrono_literals::operator""d), these symbols can't be
-    # accessed by std::<symbol_name>.
-    # FIXME: index std::placeholders symbols, placeholders.html page is
-    # different (which contains one entry for _1, _2, ..., _N), we need special
-    # handling.
-    (symbol_index_root, "chrono.html", "std::chrono::"),
-    (symbol_index_root, "filesystem.html", "std::filesystem::"),
-    (symbol_index_root, "pmr.html", "std::pmr::"),
-    (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
-    (symbol_index_root, "this_thread.html", "std::this_thread::"),
-  ]
-
-  symbols = []
-  # Run many workers to process individual symbol pages under the symbol index.
-  # Don't allow workers to capture Ctrl-C.
-  pool = multiprocessing.Pool(
-      initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
-  try:
-    for root_dir, page_name, namespace in parse_pages:
-      symbols.extend(GetSymbols(pool, root_dir, page_name, namespace))
-  finally:
-    pool.terminate()
-    pool.join()
+  symbols = cppreference_parser.GetSymbols(parse_pages)
 
   # We don't have version information from the unzipped offline HTML files.
   # so we use the modified time of the symbol_index.html as the version.
-  index_page_path = os.path.join(cpp_root, "symbol_index.html")
+  index_page_path = os.path.join(page_root, "index.html")
   cppreference_modified_date = datetime.datetime.fromtimestamp(
     os.stat(index_page_path).st_mtime).strftime('%Y-%m-%d')
-  print STDGEN_CODE_PREFIX % cppreference_modified_date
+  print CODE_PREFIX % (args.language.upper(), cppreference_modified_date)
   for symbol in symbols:
     if len(symbol.headers) == 1:
       # SYMBOL(unqualified_name, namespace, header)
diff --git a/clang-tools-extra/clangd/include-mapping/test.py b/clang-tools-extra/clangd/include-mapping/test.py
index 3f17b53189c..9fad952b2e9 100755
--- a/clang-tools-extra/clangd/include-mapping/test.py
+++ b/clang-tools-extra/clangd/include-mapping/test.py
@@ -7,7 +7,7 @@
 #
 #===------------------------------------------------------------------------===#
 
-from gen_std import ParseSymbolPage, ParseIndexPage
+from cppreference_parser import _ParseSymbolPage, _ParseIndexPage
 
 import unittest
 
@@ -22,7 +22,7 @@ class TestStdGen(unittest.TestCase):
  <a href="as_bytes.html" title="as bytes"><tt>as_bytes&lt;&gt;()</tt></a> <span class="t-mark-rev t-since-cxx20">(since C++20)</span> <br>
  """
 
-    actual = ParseIndexPage(html)
+    actual = _ParseIndexPage(html)
     expected = [
       ("abs", "abs.html", True),
       ("abs", "complex/abs.html", True),
@@ -53,7 +53,7 @@ class TestStdGen(unittest.TestCase):
   </tr>
 </tbody></table>
 """
-    self.assertEqual(ParseSymbolPage(html, 'foo'), set(['<cmath>']))
+    self.assertEqual(_ParseSymbolPage(html, 'foo'), set(['<cmath>']))
 
 
   def testParseSymbolPage_MulHeaders(self):
@@ -94,7 +94,7 @@ class TestStdGen(unittest.TestCase):
   </tr>
 </tbody></table>
 """
-    self.assertEqual(ParseSymbolPage(html, "foo"),
+    self.assertEqual(_ParseSymbolPage(html, "foo"),
                      set(['<cstdio>', '<cstdlib>']))
 
 
@@ -121,7 +121,7 @@ class TestStdGen(unittest.TestCase):
 </tr>
 </tbody></table>
 """
-    self.assertEqual(ParseSymbolPage(html, "foo"),
+    self.assertEqual(_ParseSymbolPage(html, "foo"),
                      set(['<algorithm>', '<utility>']))
 
   def testParseSymbolPage_MulSymbolsInSameTd(self):
@@ -145,9 +145,9 @@ class TestStdGen(unittest.TestCase):
 </tr>
 </tbody></table>
 """
-    self.assertEqual(ParseSymbolPage(html, "int8_t"),
+    self.assertEqual(_ParseSymbolPage(html, "int8_t"),
                      set(['<cstdint>']))
-    self.assertEqual(ParseSymbolPage(html, "int16_t"),
+    self.assertEqual(_ParseSymbolPage(html, "int16_t"),
                      set(['<cstdint>']))