[AST] Added a helper to extract the user-friendly text of a comment.

Summary: The helper is used in clangd for the documentation shown in code completion and for storing the docs in the symbols. See D45999. This patch reuses the code of the Doxygen comment lexer, disabling the bits that do command and HTML tag parsing. The new helper works on all comments, including non-Doxygen comments. However, it does not understand or transform any Doxygen directives; for example, it cannot extract the brief text.
Reviewers: sammccall, hokein, ioeric
Reviewed By: ioeric
Subscribers: mgorny, cfe-commits
Differential Revision: https://reviews.llvm.org/D46000
llvm-svn: 332458
author: Ilya Biryukov <ibiryukov@google.com> 2018-05-16 12:30:09 +0000
committer: Ilya Biryukov <ibiryukov@google.com> 2018-05-16 12:30:09 +0000
commit: 1ff7c32fc91c607b690d4bb9cf42f406be8dde68 (patch)
tree: d0381185f45ca8a75734da3f283df9e9432ed125 /clang/lib/AST/CommentLexer.cpp
parent: a3f955bddb9102eb9813103f76fa76f1db9d1d9d (diff)
download: bcm5719-llvm-1ff7c32fc91c607b690d4bb9cf42f406be8dde68.tar.gz
bcm5719-llvm-1ff7c32fc91c607b690d4bb9cf42f406be8dde68.zip
1 files changed, 129 insertions, 117 deletions
diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp
index 65d0f56f09a..6ff4d45a957 100644
--- a/clang/lib/AST/CommentLexer.cpp
+++ b/clang/lib/AST/CommentLexer.cpp
@@ -294,6 +294,39 @@ void Lexer::lexCommentText(Token &T) {
   assert(CommentState == LCS_InsideBCPLComment ||
          CommentState == LCS_InsideCComment);
 
+  // Handles lexing non-command text, i.e. text and newline.
+  auto HandleNonCommandToken = [&]() -> void {
+    assert(State == LS_Normal);
+
+    const char *TokenPtr = BufferPtr;
+    assert(TokenPtr < CommentEnd);
+    switch (*TokenPtr) {
+      case '\n':
+      case '\r':
+          TokenPtr = skipNewline(TokenPtr, CommentEnd);
+          formTokenWithChars(T, TokenPtr, tok::newline);
+
+          if (CommentState == LCS_InsideCComment)
+            skipLineStartingDecorations();
+          return;
+
+      default: {
+          StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
+          size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
+                           .find_first_of(TokStartSymbols);
+          if (End != StringRef::npos)
+            TokenPtr += End;
+          else
+            TokenPtr = CommentEnd;
+          formTextToken(T, TokenPtr);
+          return;
+      }
+    }
+  };
+
+  if (!ParseCommands)
+    return HandleNonCommandToken();
+
   switch (State) {
   case LS_Normal:
     break;
@@ -315,136 +348,116 @@ void Lexer::lexCommentText(Token &T) {
   }
 
   assert(State == LS_Normal);
-
   const char *TokenPtr = BufferPtr;
   assert(TokenPtr < CommentEnd);
-  while (TokenPtr != CommentEnd) {
-    switch(*TokenPtr) {
-      case '\\':
-      case '@': {
-        // Commands that start with a backslash and commands that start with
-        // 'at' have equivalent semantics.  But we keep information about the
-        // exact syntax in AST for comments.
-        tok::TokenKind CommandKind =
-            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+  switch(*TokenPtr) {
+    case '\\':
+    case '@': {
+      // Commands that start with a backslash and commands that start with
+      // 'at' have equivalent semantics.  But we keep information about the
+      // exact syntax in AST for comments.
+      tok::TokenKind CommandKind =
+          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+      TokenPtr++;
+      if (TokenPtr == CommentEnd) {
+        formTextToken(T, TokenPtr);
+        return;
+      }
+      char C = *TokenPtr;
+      switch (C) {
+      default:
+        break;
+
+      case '\\': case '@': case '&': case '$':
+      case '#':  case '<': case '>': case '%':
+      case '\"': case '.': case ':':
+        // This is one of \\ \@ \& \$ etc escape sequences.
         TokenPtr++;
-        if (TokenPtr == CommentEnd) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
-        char C = *TokenPtr;
-        switch (C) {
-        default:
-          break;
-
-        case '\\': case '@': case '&': case '$':
-        case '#':  case '<': case '>': case '%':
-        case '\"': case '.': case ':':
-          // This is one of \\ \@ \& \$ etc escape sequences.
+        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
+          // This is the \:: escape sequence.
           TokenPtr++;
-          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
-            // This is the \:: escape sequence.
-            TokenPtr++;
-          }
-          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(UnescapedText);
-          return;
         }
+        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
+        formTokenWithChars(T, TokenPtr, tok::text);
+        T.setText(UnescapedText);
+        return;
+      }
 
-        // Don't make zero-length commands.
-        if (!isCommandNameStartCharacter(*TokenPtr)) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
+      // Don't make zero-length commands.
+      if (!isCommandNameStartCharacter(*TokenPtr)) {
+        formTextToken(T, TokenPtr);
+        return;
+      }
 
-        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
-        unsigned Length = TokenPtr - (BufferPtr + 1);
-
-        // Hardcoded support for lexing LaTeX formula commands
-        // \f$ \f[ \f] \f{ \f} as a single command.
-        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
-          C = *TokenPtr;
-          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
-            TokenPtr++;
-            Length++;
-          }
-        }
+      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
+      unsigned Length = TokenPtr - (BufferPtr + 1);
 
-        StringRef CommandName(BufferPtr + 1, Length);
-
-        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
-        if (!Info) {
-          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
-            StringRef CorrectedName = Info->Name;
-            SourceLocation Loc = getSourceLocation(BufferPtr);
-            SourceLocation EndLoc = getSourceLocation(TokenPtr);
-            SourceRange FullRange = SourceRange(Loc, EndLoc);
-            SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
-            Diag(Loc, diag::warn_correct_comment_command_name)
-              << FullRange << CommandName << CorrectedName
-              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
-          } else {
-            formTokenWithChars(T, TokenPtr, tok::unknown_command);
-            T.setUnknownCommandName(CommandName);
-            Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
-                << SourceRange(T.getLocation(), T.getEndLocation());
-            return;
-          }
-        }
-        if (Info->IsVerbatimBlockCommand) {
-          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
-          return;
-        }
-        if (Info->IsVerbatimLineCommand) {
-          setupAndLexVerbatimLine(T, TokenPtr, Info);
-          return;
+      // Hardcoded support for lexing LaTeX formula commands
+      // \f$ \f[ \f] \f{ \f} as a single command.
+      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
+        C = *TokenPtr;
+        if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
+          TokenPtr++;
+          Length++;
         }
-        formTokenWithChars(T, TokenPtr, CommandKind);
-        T.setCommandID(Info->getID());
-        return;
       }
 
-      case '&':
-        lexHTMLCharacterReference(T);
-        return;
-
-      case '<': {
-        TokenPtr++;
-        if (TokenPtr == CommentEnd) {
-          formTextToken(T, TokenPtr);
+      StringRef CommandName(BufferPtr + 1, Length);
+
+      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
+      if (!Info) {
+        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
+          StringRef CorrectedName = Info->Name;
+          SourceLocation Loc = getSourceLocation(BufferPtr);
+          SourceLocation EndLoc = getSourceLocation(TokenPtr);
+          SourceRange FullRange = SourceRange(Loc, EndLoc);
+          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
+          Diag(Loc, diag::warn_correct_comment_command_name)
+            << FullRange << CommandName << CorrectedName
+            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
+        } else {
+          formTokenWithChars(T, TokenPtr, tok::unknown_command);
+          T.setUnknownCommandName(CommandName);
+          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
+              << SourceRange(T.getLocation(), T.getEndLocation());
           return;
         }
-        const char C = *TokenPtr;
-        if (isHTMLIdentifierStartingCharacter(C))
-          setupAndLexHTMLStartTag(T);
-        else if (C == '/')
-          setupAndLexHTMLEndTag(T);
-        else
-          formTextToken(T, TokenPtr);
+      }
+      if (Info->IsVerbatimBlockCommand) {
+        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
         return;
       }
-
-      case '\n':
-      case '\r':
-        TokenPtr = skipNewline(TokenPtr, CommentEnd);
-        formTokenWithChars(T, TokenPtr, tok::newline);
-
-        if (CommentState == LCS_InsideCComment)
-          skipLineStartingDecorations();
+      if (Info->IsVerbatimLineCommand) {
+        setupAndLexVerbatimLine(T, TokenPtr, Info);
         return;
+      }
+      formTokenWithChars(T, TokenPtr, CommandKind);
+      T.setCommandID(Info->getID());
+      return;
+    }
 
-      default: {
-        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
-                         find_first_of("\n\r\\@&<");
-        if (End != StringRef::npos)
-          TokenPtr += End;
-        else
-          TokenPtr = CommentEnd;
+    case '&':
+      lexHTMLCharacterReference(T);
+      return;
+
+    case '<': {
+      TokenPtr++;
+      if (TokenPtr == CommentEnd) {
         formTextToken(T, TokenPtr);
         return;
       }
+      const char C = *TokenPtr;
+      if (isHTMLIdentifierStartingCharacter(C))
+        setupAndLexHTMLStartTag(T);
+      else if (C == '/')
+        setupAndLexHTMLEndTag(T);
+      else
+        formTextToken(T, TokenPtr);
+      return;
     }
+
+    default:
+      return HandleNonCommandToken();
   }
 }
 
@@ -727,14 +740,13 @@ void Lexer::lexHTMLEndTag(Token &T) {
 }
 
 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-             const CommandTraits &Traits,
-             SourceLocation FileLoc,
-             const char *BufferStart, const char *BufferEnd):
-    Allocator(Allocator), Diags(Diags), Traits(Traits),
-    BufferStart(BufferStart), BufferEnd(BufferEnd),
-    FileLoc(FileLoc), BufferPtr(BufferStart),
-    CommentState(LCS_BeforeComment), State(LS_Normal) {
-}
+             const CommandTraits &Traits, SourceLocation FileLoc,
+             const char *BufferStart, const char *BufferEnd,
+             bool ParseCommands)
+    : Allocator(Allocator), Diags(Diags), Traits(Traits),
+      BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
+      BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
+      ParseCommands(ParseCommands) {}
 
 void Lexer::lex(Token &T) {
 again:
author	Ilya Biryukov <ibiryukov@google.com>	2018-05-16 12:30:09 +0000
committer	Ilya Biryukov <ibiryukov@google.com>	2018-05-16 12:30:09 +0000
commit	1ff7c32fc91c607b690d4bb9cf42f406be8dde68 (patch)
tree	d0381185f45ca8a75734da3f283df9e9432ed125 /clang/lib/AST/CommentLexer.cpp
parent	a3f955bddb9102eb9813103f76fa76f1db9d1d9d (diff)
download	bcm5719-llvm-1ff7c32fc91c607b690d4bb9cf42f406be8dde68.tar.gz bcm5719-llvm-1ff7c32fc91c607b690d4bb9cf42f406be8dde68.zip