From 4586df765e927de6eb2d21ee849f7b847b3e91ec Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Fri, 27 Jul 2012 20:37:06 +0000 Subject: Implement resolving of HTML character references (named: &amp;, decimal: &#42;, hex: &#x1a;) during comment parsing. Now internal representation of plain text in comment AST does not contain character references, but the characters themselves. llvm-svn: 160891 --- clang/include/clang/AST/CommentLexer.h | 25 ++- clang/lib/AST/ASTContext.cpp | 3 +- clang/lib/AST/CommentLexer.cpp | 198 +++++++++++++++++--- clang/lib/AST/RawCommentList.cpp | 8 +- clang/test/Index/annotate-comments.cpp | 30 ++- clang/unittests/AST/CommentLexer.cpp | 321 ++++++++++++++++++++++++++++++++- clang/unittests/AST/CommentParser.cpp | 2 +- 7 files changed, 553 insertions(+), 34 deletions(-) diff --git a/clang/include/clang/AST/CommentLexer.h b/clang/include/clang/AST/CommentLexer.h index dc014fdb250..5b69a95ee06 100644 --- a/clang/include/clang/AST/CommentLexer.h +++ b/clang/include/clang/AST/CommentLexer.h @@ -211,6 +211,10 @@ private: Lexer(const Lexer&); // DO NOT IMPLEMENT void operator=(const Lexer&); // DO NOT IMPLEMENT + /// Allocator for strings that are semantic values of tokens and have to be + /// computed (for example, resolved decimal character references). + llvm::BumpPtrAllocator &Allocator; + const char *const BufferStart; const char *const BufferEnd; SourceLocation FileLoc; @@ -289,6 +293,16 @@ private: bool isVerbatimLineCommand(StringRef Name) const; + /// Given a character reference name (e.g., "lt"), return the character that + /// it stands for (e.g., "<"). + StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; + + /// Given a Unicode codepoint as base-10 integer, return the character. + StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; + + /// Given a Unicode codepoint as base-16 integer, return the character. 
+ StringRef resolveHTMLHexCharacterReference(StringRef Name) const; + void formTokenWithChars(Token &Result, const char *TokEnd, tok::TokenKind Kind) { const unsigned TokLen = TokEnd - BufferPtr; @@ -302,6 +316,12 @@ private: BufferPtr = TokEnd; } + void formTextToken(Token &Result, const char *TokEnd) { + StringRef Text(BufferPtr, TokEnd - BufferPtr); + formTokenWithChars(Result, TokEnd, tok::text); + Result.setText(Text); + } + SourceLocation getSourceLocation(const char *Loc) const { assert(Loc >= BufferStart && Loc <= BufferEnd && "Location out of range for this buffer!"); @@ -328,6 +348,8 @@ private: void lexVerbatimLineText(Token &T); + void lexHTMLCharacterReference(Token &T); + void setupAndLexHTMLStartTag(Token &T); void lexHTMLStartTag(Token &T); @@ -337,7 +359,8 @@ private: void lexHTMLEndTag(Token &T); public: - Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, + Lexer(llvm::BumpPtrAllocator &Allocator, + SourceLocation FileLoc, const CommentOptions &CommOpts, const char *BufferStart, const char *BufferEnd); void lex(Token &T); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 5b57ce45577..46a4d87f9a9 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -216,7 +216,8 @@ comments::FullComment *ASTContext::getCommentForDecl(const Decl *D) const { return NULL; const StringRef RawText = RC->getRawText(SourceMgr); - comments::Lexer L(RC->getSourceRange().getBegin(), comments::CommentOptions(), + comments::Lexer L(getAllocator(), + RC->getSourceRange().getBegin(), comments::CommentOptions(), RawText.begin(), RawText.end()); comments::Sema S(getAllocator(), getSourceManager(), getDiagnostics()); diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp index 31468321cf4..dde484510ff 100644 --- a/clang/lib/AST/CommentLexer.cpp +++ b/clang/lib/AST/CommentLexer.cpp @@ -1,4 +1,5 @@ #include "clang/AST/CommentLexer.h" +#include "clang/Basic/ConvertUTF.h" #include 
"llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" @@ -87,6 +88,71 @@ bool Lexer::isVerbatimLineCommand(StringRef Name) const { return false; } +namespace { +bool isHTMLNamedCharacterReferenceCharacter(char C) { + return (C >= 'a' && C <= 'z') || + (C >= 'A' && C <= 'Z'); +} + +bool isHTMLDecimalCharacterReferenceCharacter(char C) { + return C >= '0' && C <= '9'; +} + +bool isHTMLHexCharacterReferenceCharacter(char C) { + return (C >= '0' && C <= '9') || + (C >= 'a' && C <= 'f') || + (C >= 'A' && C <= 'F'); +} +} // unnamed namespace + +StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { + return llvm::StringSwitch(Name) + .Case("amp", "&") + .Case("lt", "<") + .Case("gt", ">") + .Case("quot", "\"") + .Case("apos", "\'") + .Default(""); +} + +StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { + unsigned CodePoint = 0; + for (unsigned i = 0, e = Name.size(); i != e; ++i) { + assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); + CodePoint *= 10; + CodePoint += Name[i] - '0'; + } + + char *Resolved = Allocator.Allocate(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); + char *ResolvedPtr = Resolved; + if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) + return StringRef(Resolved, ResolvedPtr - Resolved); + else + return StringRef(); +} + +StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { + unsigned CodePoint = 0; + for (unsigned i = 0, e = Name.size(); i != e; ++i) { + CodePoint *= 16; + const char C = Name[i]; + assert(isHTMLHexCharacterReferenceCharacter(C)); + if (C >= '0' && C <= '9') + CodePoint += Name[i] - '0'; + else if (C >= 'a' && C <= 'f') + CodePoint += Name[i] - 'a' + 10; + else + CodePoint += Name[i] - 'A' + 10; + } + + char *Resolved = Allocator.Allocate(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); + char *ResolvedPtr = Resolved; + if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) + return StringRef(Resolved, ResolvedPtr - Resolved); + else + return StringRef(); +} + void 
Lexer::skipLineStartingDecorations() { // This function should be called only for C comments assert(CommentState == LCS_InsideCComment); @@ -147,6 +213,33 @@ const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { return BufferPtr; } +const char *skipNamedCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +const char *skipDecimalCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +const char *skipHexCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + bool isHTMLIdentifierStartingCharacter(char C) { return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z'); @@ -295,9 +388,7 @@ void Lexer::lexCommentText(Token &T) { case '@': { TokenPtr++; if (TokenPtr == CommentEnd) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } char C = *TokenPtr; @@ -322,9 +413,7 @@ void Lexer::lexCommentText(Token &T) { // Don't make zero-length commands. 
if (!isCommandNameCharacter(*TokenPtr)) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } @@ -357,12 +446,14 @@ void Lexer::lexCommentText(Token &T) { return; } + case '&': + lexHTMLCharacterReference(T); + return; + case '<': { TokenPtr++; if (TokenPtr == CommentEnd) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } const char C = *TokenPtr; @@ -370,11 +461,9 @@ void Lexer::lexCommentText(Token &T) { setupAndLexHTMLStartTag(T); else if (C == '/') setupAndLexHTMLEndTag(T); - else { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); - } + else + formTextToken(T, TokenPtr); + return; } @@ -394,12 +483,10 @@ void Lexer::lexCommentText(Token &T) { break; const char C = *TokenPtr; if(C == '\n' || C == '\r' || - C == '\\' || C == '@' || C == '<') + C == '\\' || C == '@' || C == '&' || C == '<') break; } - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } } @@ -506,6 +593,69 @@ void Lexer::lexVerbatimLineText(Token &T) { State = LS_Normal; } +void Lexer::lexHTMLCharacterReference(Token &T) { + const char *TokenPtr = BufferPtr; + assert(*TokenPtr == '&'); + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return; + } + const char *NamePtr; + bool isNamed = false; + bool isDecimal = false; + char C = *TokenPtr; + if (isHTMLNamedCharacterReferenceCharacter(C)) { + NamePtr = TokenPtr; + TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); + isNamed = true; + } else if (C == '#') { + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return; + } + C = *TokenPtr; + if (isHTMLDecimalCharacterReferenceCharacter(C)) { + NamePtr = TokenPtr; + 
TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); + isDecimal = true; + } else if (C == 'x' || C == 'X') { + TokenPtr++; + NamePtr = TokenPtr; + TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); + } else { + formTextToken(T, TokenPtr); + return; + } + } else { + formTextToken(T, TokenPtr); + return; + } + if (NamePtr == TokenPtr || TokenPtr == CommentEnd || + *TokenPtr != ';') { + formTextToken(T, TokenPtr); + return; + } + StringRef Name(NamePtr, TokenPtr - NamePtr); + TokenPtr++; // Skip semicolon. + StringRef Resolved; + if (isNamed) + Resolved = resolveHTMLNamedCharacterReference(Name); + else if (isDecimal) + Resolved = resolveHTMLDecimalCharacterReference(Name); + else + Resolved = resolveHTMLHexCharacterReference(Name); + + if (Resolved.empty()) { + formTextToken(T, TokenPtr); + return; + } + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(Resolved); + return; +} + void Lexer::setupAndLexHTMLStartTag(Token &T) { assert(BufferPtr[0] == '<' && isHTMLIdentifierStartingCharacter(BufferPtr[1])); @@ -561,11 +711,9 @@ void Lexer::lexHTMLStartTag(Token &T) { if (TokenPtr != CommentEnd && *TokenPtr == '>') { TokenPtr++; formTokenWithChars(T, TokenPtr, tok::html_slash_greater); - } else { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); - } + } else + formTextToken(T, TokenPtr); + State = LS_Normal; return; } @@ -609,8 +757,10 @@ void Lexer::lexHTMLEndTag(Token &T) { State = LS_Normal; } -Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, +Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, + SourceLocation FileLoc, const CommentOptions &CommOpts, const char *BufferStart, const char *BufferEnd): + Allocator(Allocator), BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal) { diff --git a/clang/lib/AST/RawCommentList.cpp b/clang/lib/AST/RawCommentList.cpp 
index 7e183e2f2d3..41866cf03f1 100644 --- a/clang/lib/AST/RawCommentList.cpp +++ b/clang/lib/AST/RawCommentList.cpp @@ -134,7 +134,13 @@ const char *RawComment::extractBriefText(const ASTContext &Context) const { // Make sure that RawText is valid. getRawText(Context.getSourceManager()); - comments::Lexer L(Range.getBegin(), comments::CommentOptions(), + // Since we will be copying the resulting text, all allocations made during + // parsing are garbage after resulting string is formed. Thus we can use + // a separate allocator for all temporary stuff. + llvm::BumpPtrAllocator Allocator; + + comments::Lexer L(Allocator, + Range.getBegin(), comments::CommentOptions(), RawText.begin(), RawText.end()); comments::BriefParser P(L); diff --git a/clang/test/Index/annotate-comments.cpp b/clang/test/Index/annotate-comments.cpp index 926e3d8baee..5aebb6dee0f 100644 --- a/clang/test/Index/annotate-comments.cpp +++ b/clang/test/Index/annotate-comments.cpp @@ -323,6 +323,9 @@ void comment_to_html_conversion_23(); /// & < > " void comment_to_html_conversion_24(); +/// 0<i +void comment_to_html_conversion_25(); + #endif // RUN: rm -rf %t @@ -642,9 +645,26 @@ void comment_to_html_conversion_24(); // CHECK-NEXT: (CXComment_Text Text=[.]) // CHECK-NEXT: (CXComment_Text Text=[ ] IsWhitespace) // CHECK-NEXT: (CXComment_Text Text=[::])))] -// CHECK: annotate-comments.cpp:324:6: FunctionDecl=comment_to_html_conversion_24:{{.*}} FullCommentAsHTML=[

&amp; &lt; &gt; &quot;

] -// CHECK: CommentAST=[ -// CHECK: (CXComment_FullComment -// CHECK: (CXComment_Paragraph -// CHECK: (CXComment_Text Text=[ & < > "])))] +// CHECK: annotate-comments.cpp:324:6: FunctionDecl=comment_to_html_conversion_24:{{.*}} FullCommentAsHTML=[

& < > "

] +// CHECK-NEXT: CommentAST=[ +// CHECK-NEXT: (CXComment_FullComment +// CHECK-NEXT: (CXComment_Paragraph +// CHECK-NEXT: (CXComment_Text Text=[ ] IsWhitespace) +// CHECK-NEXT: (CXComment_Text Text=[&]) +// CHECK-NEXT: (CXComment_Text Text=[ ] IsWhitespace) +// CHECK-NEXT: (CXComment_Text Text=[<]) +// CHECK-NEXT: (CXComment_Text Text=[ ] IsWhitespace) +// CHECK-NEXT: (CXComment_Text Text=[>]) +// CHECK-NEXT: (CXComment_Text Text=[ ] IsWhitespace) +// CHECK-NEXT: (CXComment_Text Text=["])))] +// CHECK: annotate-comments.cpp:327:6: FunctionDecl=comment_to_html_conversion_25:{{.*}} FullCommentAsHTML=[

0<i

] +// CHECK-NEXT: CommentAST=[ +// CHECK-NEXT: (CXComment_FullComment +// CHECK-NEXT: (CXComment_Paragraph +// CHECK-NEXT: (CXComment_Text Text=[ ] IsWhitespace) +// CHECK-NEXT: (CXComment_HTMLStartTag Name=[em]) +// CHECK-NEXT: (CXComment_Text Text=[0]) +// CHECK-NEXT: (CXComment_Text Text=[<]) +// CHECK-NEXT: (CXComment_Text Text=[i]) +// CHECK-NEXT: (CXComment_HTMLEndTag Name=[em])))] diff --git a/clang/unittests/AST/CommentLexer.cpp b/clang/unittests/AST/CommentLexer.cpp index dd92df421f2..8b5d0c8cf01 100644 --- a/clang/unittests/AST/CommentLexer.cpp +++ b/clang/unittests/AST/CommentLexer.cpp @@ -37,6 +37,7 @@ protected: IntrusiveRefCntPtr DiagID; DiagnosticsEngine Diags; SourceManager SourceMgr; + llvm::BumpPtrAllocator Allocator; void lexString(const char *Source, std::vector &Toks); }; @@ -47,7 +48,7 @@ void CommentLexerTest::lexString(const char *Source, FileID File = SourceMgr.createFileIDForMemBuffer(Buf); SourceLocation Begin = SourceMgr.getLocForStartOfFile(File); - comments::Lexer L(Begin, CommentOptions(), + comments::Lexer L(Allocator, Begin, CommentOptions(), Source, Source + strlen(Source)); while (1) { @@ -1272,6 +1273,324 @@ TEST_F(CommentLexerTest, HTML20) { } } +TEST_F(CommentLexerTest, HTMLCharacterReferences1) { + const char *Source = "// &"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(3U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&"), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences2) { + const char *Source = "// &!"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&"), Toks[1].getText()); + + ASSERT_EQ(tok::text, 
Toks[2].getKind()); + ASSERT_EQ(StringRef("!"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences3) { + const char *Source = "// &"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(3U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&"), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences4) { + const char *Source = "// &!"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&"), Toks[1].getText()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("!"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences5) { + const char *Source = "// &#"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(3U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&#"), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences6) { + const char *Source = "// &#a"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&#"), Toks[1].getText()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("a"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences7) { + 
const char *Source = "// *"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(3U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("*"), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences8) { + const char *Source = "// *a"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("*"), Toks[1].getText()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("a"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences9) { + const char *Source = "// &#x"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(3U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&#x"), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences10) { + const char *Source = "// &#xz"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&#x"), Toks[1].getText()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("z"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences11) { + const char *Source = "// «"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(3U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + 
ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("«"), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences12) { + const char *Source = "// «z"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("«"), Toks[1].getText()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("z"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences13) { + const char *Source = "// &"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(3U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&"), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences14) { + const char *Source = "// &<"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&"), Toks[1].getText()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("<"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences15) { + const char *Source = "// & meow"; + + std::vector Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("&"), Toks[1].getText()); + + ASSERT_EQ(tok::text, 
Toks[2].getKind()); + ASSERT_EQ(StringRef(" meow"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTMLCharacterReferences16) { + const char *Sources[] = { + "// =", + "// =", + "// =" + }; + + for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { + std::vector Toks; + + lexString(Sources[i], Toks); + + ASSERT_EQ(3U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("="), Toks[1].getText()); + + ASSERT_EQ(tok::newline, Toks[2].getKind()); + } +} + TEST_F(CommentLexerTest, MultipleComments) { const char *Source = "// Aaa\n" diff --git a/clang/unittests/AST/CommentParser.cpp b/clang/unittests/AST/CommentParser.cpp index 47433aee2be..ff931955595 100644 --- a/clang/unittests/AST/CommentParser.cpp +++ b/clang/unittests/AST/CommentParser.cpp @@ -54,7 +54,7 @@ FullComment *CommentParserTest::parseString(const char *Source) { FileID File = SourceMgr.createFileIDForMemBuffer(Buf); SourceLocation Begin = SourceMgr.getLocForStartOfFile(File); - comments::Lexer L(Begin, CommentOptions(), + comments::Lexer L(Allocator, Begin, CommentOptions(), Source, Source + strlen(Source)); comments::Sema S(Allocator, SourceMgr, Diags); -- cgit v1.2.1