diff options
| author | Richard Smith <richard-llvm@metafoo.co.uk> | 2014-02-17 21:52:30 +0000 |
|---|---|---|
| committer | Richard Smith <richard-llvm@metafoo.co.uk> | 2014-02-17 21:52:30 +0000 |
| commit | 8b7258bdb3d371a59e1e84e3c417b76a2c0dc003 (patch) | |
| tree | f5ff757b174428b944298f4ab3a8ca68f7ba7c1a /clang/lib/Lex | |
| parent | 6287371ce629196749925d73cd143cb2dd35b8e7 (diff) | |
| download | bcm5719-llvm-8b7258bdb3d371a59e1e84e3c417b76a2c0dc003.tar.gz bcm5719-llvm-8b7258bdb3d371a59e1e84e3c417b76a2c0dc003.zip | |
PR18855: Add support for UCNs and UTF-8 encoding within ud-suffixes.
llvm-svn: 201532
Diffstat (limited to 'clang/lib/Lex')
| -rw-r--r-- | clang/lib/Lex/Lexer.cpp | 150 | ||||
| -rw-r--r-- | clang/lib/Lex/LiteralSupport.cpp | 79 | ||||
| -rw-r--r-- | clang/lib/Lex/Preprocessor.cpp | 42 |
3 files changed, 157 insertions, 114 deletions
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index c7eab490ad6..cfa835d173d 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1445,7 +1445,50 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, << Range; } } - } +} + +bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, + Token &Result) { + const char *UCNPtr = CurPtr + Size; + uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0); + if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts)) + return false; + + if (!isLexingRawMode()) + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UCNPtr), + /*IsFirst=*/false); + + Result.setFlag(Token::HasUCN); + if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || + (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) + CurPtr = UCNPtr; + else + while (CurPtr != UCNPtr) + (void)getAndAdvanceChar(CurPtr, Result); + return true; +} + +bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { + const char *UnicodePtr = CurPtr; + UTF32 CodePoint; + ConversionResult Result = + llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr, + (const UTF8 *)BufferEnd, + &CodePoint, + strictConversion); + if (Result != conversionOK || + !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) + return false; + + if (!isLexingRawMode()) + maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr), + /*IsFirst=*/false); + + CurPtr = UnicodePtr; + return true; +} bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] @@ -1500,47 +1543,10 @@ FinishIdentifier: C = getCharAndSize(CurPtr, Size); continue; - } else if (C == '\\') { - const char *UCNPtr = CurPtr + Size; - uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0); - if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts)) - goto FinishIdentifier; - - if (!isLexingRawMode()) { - maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, - makeCharRange(*this, CurPtr, UCNPtr), - /*IsFirst=*/false); - } - - Result.setFlag(Token::HasUCN); - if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || - (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) - CurPtr = UCNPtr; - else - while (CurPtr != UCNPtr) - (void)getAndAdvanceChar(CurPtr, Result); - + } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { C = getCharAndSize(CurPtr, Size); continue; - } else if (!isASCII(C)) { - const char *UnicodePtr = CurPtr; - UTF32 CodePoint; - ConversionResult Result = - llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr, - (const UTF8 *)BufferEnd, - &CodePoint, - strictConversion); - if (Result != conversionOK || - !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) - goto FinishIdentifier; - - if (!isLexingRawMode()) { - maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, - makeCharRange(*this, CurPtr, UnicodePtr), - /*IsFirst=*/false); - } - - CurPtr = UnicodePtr; + } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { C = getCharAndSize(CurPtr, Size); continue; } else if (!isIdentifierBody(C)) { @@ -1576,7 +1582,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { unsigned Size; char C = getCharAndSize(CurPtr, Size); char PrevCh = 0; - while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix. + while (isPreprocessingNumberBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); PrevCh = C; C = getCharAndSize(CurPtr, Size); @@ -1618,6 +1624,12 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { } } + // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. + if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) + return LexNumericConstant(Result, CurPtr); + if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) + return LexNumericConstant(Result, CurPtr); + // Update the location of token as well as BufferPtr. const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, tok::numeric_constant); @@ -1631,23 +1643,35 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, bool IsStringLiteral) { assert(getLangOpts().CPlusPlus); - // Maximally munch an identifier. FIXME: UCNs. + // Maximally munch an identifier. unsigned Size; char C = getCharAndSize(CurPtr, Size); - if (isIdentifierHead(C)) { - if (!getLangOpts().CPlusPlus11) { - if (!isLexingRawMode()) - Diag(CurPtr, - C == '_' ? diag::warn_cxx11_compat_user_defined_literal - : diag::warn_cxx11_compat_reserved_user_defined_literal) - << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); + bool Consumed = false; + + if (!isIdentifierHead(C)) { + if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) + Consumed = true; + else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) + Consumed = true; + else return CurPtr; - } + } - // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix - // that does not start with an underscore is ill-formed. As a conforming - // extension, we treat all such suffixes as if they had whitespace before - // them. + if (!getLangOpts().CPlusPlus11) { + if (!isLexingRawMode()) + Diag(CurPtr, + C == '_' ? diag::warn_cxx11_compat_user_defined_literal + : diag::warn_cxx11_compat_reserved_user_defined_literal) + << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); + return CurPtr; + } + + // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix + // that does not start with an underscore is ill-formed. As a conforming + // extension, we treat all such suffixes as if they had whitespace before + // them. We assume a suffix beginning with a UCN or UTF-8 character is more + // likely to be a ud-suffix than a macro, however, and accept that. + if (!Consumed) { bool IsUDSuffix = false; if (C == '_') IsUDSuffix = true; @@ -1685,16 +1709,22 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, Diag(CurPtr, getLangOpts().MSVCCompat ? diag::ext_ms_reserved_user_defined_literal : diag::ext_reserved_user_defined_literal) - << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); + << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); return CurPtr; } - Result.setFlag(Token::HasUDSuffix); - do { - CurPtr = ConsumeChar(CurPtr, Size, Result); - C = getCharAndSize(CurPtr, Size); - } while (isIdentifierBody(C)); + CurPtr = ConsumeChar(CurPtr, Size, Result); } + + Result.setFlag(Token::HasUDSuffix); + while (true) { + C = getCharAndSize(CurPtr, Size); + if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } + else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {} + else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {} + else break; + } + return CurPtr; } diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 17c6bb3049b..a71518184ca 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -212,6 +212,48 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin, return ResultChar; } +static void appendCodePoint(unsigned Codepoint, + llvm::SmallVectorImpl<char> &Str) { + char ResultBuf[4]; + char *ResultPtr = ResultBuf; + bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); + (void)Res; + assert(Res && "Unexpected conversion failure"); + Str.append(ResultBuf, ResultPtr); +} + +void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { + for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { + if (*I != '\\') { + Buf.push_back(*I); + continue; + } + + ++I; + assert(*I == 'u' || *I == 'U'); + + unsigned NumHexDigits; + if (*I == 'u') + NumHexDigits = 4; + else + NumHexDigits = 8; + + assert(I + NumHexDigits <= E); + + uint32_t CodePoint = 0; + for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { + unsigned Value = llvm::hexDigitValue(*I); + assert(Value != -1U); + + CodePoint <<= 4; + CodePoint += Value; + } + + appendCodePoint(CodePoint, Buf); + --I; + } +} + /// ProcessUCNEscape - Read the Universal Character Name, check constraints and /// return the UTF32. static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, @@ -625,8 +667,9 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, } if (s != ThisTokEnd) { - if (isValidUDSuffix(PP.getLangOpts(), - StringRef(SuffixBegin, ThisTokEnd - SuffixBegin))) { + // FIXME: Don't bother expanding UCNs if !tok.hasUCN(). + expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)); + if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) { // Any suffix pieces we might have parsed are actually part of the // ud-suffix. isLong = false; @@ -992,7 +1035,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, do { --end; } while (end[-1] != '\''); - UDSuffixBuf.assign(end, UDSuffixEnd); + // FIXME: Don't bother with this if !tok.hasUCN(). + expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end)); UDSuffixOffset = end - TokBegin; } @@ -1311,23 +1355,34 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd); if (UDSuffixBuf.empty()) { - UDSuffixBuf.assign(UDSuffix); + if (StringToks[i].hasUCN()) + expandUCNs(UDSuffixBuf, UDSuffix); + else + UDSuffixBuf.assign(UDSuffix); UDSuffixToken = i; UDSuffixOffset = ThisTokEnd - ThisTokBuf; UDSuffixTokLoc = StringToks[i].getLocation(); - } else if (!UDSuffixBuf.equals(UDSuffix)) { + } else { + SmallString<32> ExpandedUDSuffix; + if (StringToks[i].hasUCN()) { + expandUCNs(ExpandedUDSuffix, UDSuffix); + UDSuffix = ExpandedUDSuffix; + } + // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the // result of a concatenation involving at least one user-defined-string- // literal, all the participating user-defined-string-literals shall // have the same ud-suffix. - if (Diags) { - SourceLocation TokLoc = StringToks[i].getLocation(); - Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix) - << UDSuffixBuf << UDSuffix - << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc) - << SourceRange(TokLoc, TokLoc); + if (!UDSuffixBuf.equals(UDSuffix)) { + if (Diags) { + SourceLocation TokLoc = StringToks[i].getLocation(); + Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix) + << UDSuffixBuf << UDSuffix + << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc) + << SourceRange(TokLoc, TokLoc); + } + hadError = true; } - hadError = true; } } diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 9ffc83ceffb..1e54ab37251 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -503,48 +503,6 @@ void Preprocessor::EndSourceFile() { // Lexer Event Handling. //===----------------------------------------------------------------------===// -static void appendCodePoint(unsigned Codepoint, - llvm::SmallVectorImpl<char> &Str) { - char ResultBuf[4]; - char *ResultPtr = ResultBuf; - bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); - (void)Res; - assert(Res && "Unexpected conversion failure"); - Str.append(ResultBuf, ResultPtr); -} - -static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { - for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { - if (*I != '\\') { - Buf.push_back(*I); - continue; - } - - ++I; - assert(*I == 'u' || *I == 'U'); - - unsigned NumHexDigits; - if (*I == 'u') - NumHexDigits = 4; - else - NumHexDigits = 8; - - assert(I + NumHexDigits <= E); - - uint32_t CodePoint = 0; - for (++I; NumHexDigits != 0; ++I, --NumHexDigits) { - unsigned Value = llvm::hexDigitValue(*I); - assert(Value != -1U); - - CodePoint <<= 4; - CodePoint += Value; - } - - appendCodePoint(CodePoint, Buf); - --I; - } -} - /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the /// identifier information for the token and install it into the token, /// updating the token kind accordingly. |

