Diffstat (limited to 'clang/lib/Lex')
 clang/lib/Lex/Lexer.cpp              |  77
 clang/lib/Lex/LiteralSupport.cpp     | 157
 clang/lib/Lex/MacroArgs.cpp          |   8
 clang/lib/Lex/PPDirectives.cpp       |   4
 clang/lib/Lex/PPExpressions.cpp      |  16
 clang/lib/Lex/Pragma.cpp             |   6
 clang/lib/Lex/TokenConcatenation.cpp |  64
 7 files changed, 209 insertions, 123 deletions
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 6c7169f89bd..44674a93d74 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1267,8 +1267,9 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
 }
 
 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
-/// either " or L".
-void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
+/// either " or L" or u8" or u" or U".
+void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
+                             tok::TokenKind Kind) {
   const char *NulCharacter = 0; // Does this string contain the \0 character?
 
   char C = getAndAdvanceChar(CurPtr, Result);
@@ -1299,8 +1300,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
 
   // Update the location of the token as well as the BufferPtr instance var.
   const char *TokStart = BufferPtr;
-  FormTokenWithChars(Result, CurPtr,
-                     Wide ? tok::wide_string_literal : tok::string_literal);
+  FormTokenWithChars(Result, CurPtr, Kind);
   Result.setLiteralData(TokStart);
 }
 
@@ -1339,8 +1339,9 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
 
 
 /// LexCharConstant - Lex the remainder of a character constant, after having
-/// lexed either ' or L'.
-void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
+/// lexed either ' or L' or u' or U'.
+void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
+                            tok::TokenKind Kind) {
   const char *NulCharacter = 0; // Does this character contain the \0 character?
 
   char C = getAndAdvanceChar(CurPtr, Result);
@@ -1377,7 +1378,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
 
   // Update the location of token as well as BufferPtr.
   const char *TokStart = BufferPtr;
-  FormTokenWithChars(Result, CurPtr, tok::char_constant);
+  FormTokenWithChars(Result, CurPtr, Kind);
   Result.setLiteralData(TokStart);
 }
 
@@ -2185,6 +2186,55 @@ LexNextToken:
     MIOpt.ReadToken();
     return LexNumericConstant(Result, CurPtr);
 
+  case 'u':   // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal
+    // Notify MIOpt that we read a non-whitespace/non-comment token.
+    MIOpt.ReadToken();
+
+    if (Features.CPlusPlus0x) {
+      Char = getCharAndSize(CurPtr, SizeTmp);
+
+      // UTF-16 string literal
+      if (Char == '"')
+        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                                tok::utf16_string_literal);
+
+      // UTF-16 character constant
+      if (Char == '\'')
+        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                               tok::utf16_char_constant);
+
+      // UTF-8 string literal
+      if (Char == '8' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+        return LexStringLiteral(Result,
+                             ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+                                         SizeTmp2, Result),
+                             tok::utf8_string_literal);
+    }
+
+    // treat u like the start of an identifier.
+    return LexIdentifier(Result, CurPtr);
+
+  case 'U':   // Identifier (Uber) or C++0x UTF-32 string literal
+    // Notify MIOpt that we read a non-whitespace/non-comment token.
+    MIOpt.ReadToken();
+
+    if (Features.CPlusPlus0x) {
+      Char = getCharAndSize(CurPtr, SizeTmp);
+
+      // UTF-32 string literal
+      if (Char == '"')
+        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                                tok::utf32_string_literal);
+
+      // UTF-32 character constant
+      if (Char == '\'')
+        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                               tok::utf32_char_constant);
+    }
+
+    // treat U like the start of an identifier.
+    return LexIdentifier(Result, CurPtr);
+
   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
@@ -2193,21 +2243,22 @@ LexNextToken:
     // Wide string literal.
     if (Char == '"')
       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
-                              true);
+                              tok::wide_string_literal);
 
     // Wide character constant.
     if (Char == '\'')
-      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
+      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+                             tok::wide_char_constant);
     // FALL THROUGH, treating L like the start of an identifier.
 
   // C99 6.4.2: Identifiers.
   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
-  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T':    /*'U'*/
   case 'V': case 'W': case 'X': case 'Y': case 'Z':
   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
-  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
   case 'v': case 'w': case 'x': case 'y': case 'z':
   case '_':
     // Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -2230,13 +2281,13 @@ LexNextToken:
   case '\'':
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
-    return LexCharConstant(Result, CurPtr);
+    return LexCharConstant(Result, CurPtr, tok::char_constant);
 
   // C99 6.4.5: String Literals.
   case '"':
     // Notify MIOpt that we read a non-whitespace/non-comment token.
     MIOpt.ReadToken();
-    return LexStringLiteral(Result, CurPtr, false);
+    return LexStringLiteral(Result, CurPtr, tok::string_literal);
 
   // C99 6.4.6: Punctuators.
   case '?':
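For orientation (not part of the commit), a minimal C++0x sketch of the source forms the new 'u' and 'U' cases above accept, with the token kind each form produces; outside C++0x mode the prefixes simply begin identifiers:

    // Illustrative only; token kinds taken from the lexer cases added above.
    const char     *a = u8"text";  // tok::utf8_string_literal
    const char16_t *b = u"text";   // tok::utf16_string_literal
    const char32_t *c = U"text";   // tok::utf32_string_literal
    const wchar_t  *d = L"text";   // tok::wide_string_literal (existing path)
    char16_t e = u'x';             // tok::utf16_char_constant
    char32_t f = U'x';             // tok::utf32_char_constant
    char     g = 'x';              // tok::char_constant (existing path)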
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index f8a2a55117c..82493408e61 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -28,12 +28,31 @@ static int HexDigitValue(char C) {
   return -1;
 }
 
+static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
+  switch (kind) {
+  default: assert(0 && "Unknown token type!");
+  case tok::char_constant:
+  case tok::string_literal:
+  case tok::utf8_string_literal:
+    return Target.getCharWidth();
+  case tok::wide_char_constant:
+  case tok::wide_string_literal:
+    return Target.getWCharWidth();
+  case tok::utf16_char_constant:
+  case tok::utf16_string_literal:
+    return Target.getChar16Width();
+  case tok::utf32_char_constant:
+  case tok::utf32_string_literal:
+    return Target.getChar32Width();
+  }
+}
+
 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
 /// either a character or a string literal.
 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
                                   const char *ThisTokEnd, bool &HadError,
-                                  FullSourceLoc Loc, bool IsWide,
-                                  Diagnostic *Diags, const TargetInfo &Target) {
+                                  FullSourceLoc Loc, unsigned CharWidth,
+                                  Diagnostic *Diags) {
   // Skip the '\' char.
   ++ThisTokBuf;
 
@@ -98,9 +117,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
   }
 
   // See if any bits will be truncated when evaluated as a character.
-  unsigned CharWidth =
-    IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
   if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
     Overflow = true;
     ResultChar &= ~0U >> (32-CharWidth);
@@ -128,9 +144,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
                  ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 
     // Check for overflow.  Reject '\777', but not L'\777'.
-    unsigned CharWidth =
-      IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       if (Diags)
         Diags->Report(Loc, diag::warn_octal_escape_too_large);
@@ -219,8 +232,8 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 /// we will likely rework our support for UCN's.
 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                             char *&ResultBuf, bool &HadError,
-                            FullSourceLoc Loc, bool wide, Diagnostic *Diags,
-                            const LangOptions &Features) {
+                            FullSourceLoc Loc, unsigned CharByteWidth,
+                            Diagnostic *Diags, const LangOptions &Features) {
   typedef uint32_t UTF32;
   UTF32 UcnVal = 0;
   unsigned short UcnLen = 0;
@@ -230,19 +243,22 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
     return;
   }
 
-  if (wide) {
-    (void)UcnLen;
-    assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
+  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
+         "only character widths of 1, 2, or 4 bytes supported");
 
-    if (!Features.ShortWChar) {
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultBuf++ = (UcnVal & 0x000000FF);
-      *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
-      *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
-      *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
-      return;
-    }
+  (void)UcnLen;
+  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
+
+  if (CharByteWidth == 4) {
+    // Note: our internal rep of wide char tokens is always little-endian.
+    *ResultBuf++ = (UcnVal & 0x000000FF);
+    *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+    *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
+    *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+    return;
+  }
 
+  if (CharByteWidth == 2) {
     // Convert to UTF16.
     if (UcnVal < (UTF32)0xFFFF) {
       *ResultBuf++ = (UcnVal & 0x000000FF);
@@ -261,6 +277,9 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
     return;
   }
+
+  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
+
   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
   // The conversion below was inspired by:
   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
@@ -695,13 +714,18 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
 
 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
-                                     SourceLocation Loc, Preprocessor &PP) {
+                                     SourceLocation Loc, Preprocessor &PP,
+                                     tok::TokenKind kind) {
   // At this point we know that the character matches the regex "L?'.*'".
   HadError = false;
 
-  // Determine if this is a wide character.
-  IsWide = begin[0] == 'L';
-  if (IsWide) ++begin;
+  Kind = kind;
+
+  // Determine if this is a wide or UTF character.
+  if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
+      Kind == tok::utf32_char_constant) {
+    ++begin;
+  }
 
   // Skip over the entry quote.
   assert(begin[0] == '\'' && "Invalid token lexed");
@@ -742,17 +766,17 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
       ResultChar = utf32;
     } else {
       // Otherwise, this is a non-UCN escape character.  Process it.
+      unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
       ResultChar = ProcessCharEscape(begin, end, HadError,
                                      FullSourceLoc(Loc,PP.getSourceManager()),
-                                     IsWide,
-                                     &PP.getDiagnostics(), PP.getTargetInfo());
+                                     CharWidth, &PP.getDiagnostics());
     }
   }
 
   // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
   // implementation defined (C99 6.4.4.4p10).
   if (NumCharsSoFar) {
-    if (IsWide) {
+    if (!isAscii()) {
       // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
       LitVal = 0;
     } else {
@@ -774,8 +798,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
   if (NumCharsSoFar > 1) {
     // Warn about discarding the top bits for multi-char wide-character
    // constants (L'abcd').
-    if (IsWide)
-      PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
+    if (!isAscii())
+      PP.Diag(Loc, diag::warn_extraneous_char_constant);
     else if (NumCharsSoFar != 4)
       PP.Diag(Loc, diag::ext_multichar_character_literal);
     else
@@ -787,14 +811,15 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
 
   // Transfer the value from APInt to uint64_t
   Value = LitVal.getZExtValue();
 
-  if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
+  if (((isWide() && PP.getLangOptions().ShortWChar) || isUTF16()) &&
+      Value > 0xFFFF)
     PP.Diag(Loc, diag::warn_ucn_escape_too_large);
 
   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
   // character constants are not sign extended in the this implementation:
   //   '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
-  if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
+  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
       PP.getLangOptions().CharIsSigned)
     Value = (signed char)Value;
 }
@@ -839,8 +864,8 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
                     Preprocessor &PP, bool Complain)
   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
-    MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
-    ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
+    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
   init(StringToks, NumStringToks);
 }
 
@@ -860,7 +885,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
   MaxTokenLength = StringToks[0].getLength();
   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
-  AnyWide = StringToks[0].is(tok::wide_string_literal);
+  Kind = StringToks[0].getKind();
 
   hadError = false;
 
@@ -881,8 +906,18 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
     if (StringToks[i].getLength() > MaxTokenLength)
       MaxTokenLength = StringToks[i].getLength();
 
-    // Remember if we see any wide strings.
-    AnyWide |= StringToks[i].is(tok::wide_string_literal);
+    // Remember if we see any wide or utf-8/16/32 strings.
+    // Also check for illegal concatenations.
+    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
+      if (isAscii()) {
+        Kind = StringToks[i].getKind();
+      } else {
+        if (Diags)
+          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+                        diag::err_unsupported_string_concat);
+        hadError = true;
+      }
+    }
   }
 
   // Include space for the null terminator.
@@ -890,19 +925,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 
   // TODO: K&R warning: "traditional C rejects string constant concatenation"
 
-  // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
-  // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
-  wchar_tByteWidth = ~0U;
-  if (AnyWide) {
-    wchar_tByteWidth = Target.getWCharWidth();
-    assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
-    wchar_tByteWidth /= 8;
-  }
+  // Get the width in bytes of char/wchar_t/char16_t/char32_t
+  CharByteWidth = getCharWidth(Kind, Target);
+  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
+  CharByteWidth /= 8;
 
   // The output buffer size needs to be large enough to hold wide characters.
   // This is a worst-case assumption which basically corresponds to L"" "long".
-  if (AnyWide)
-    SizeBound *= wchar_tByteWidth;
+  SizeBound *= CharByteWidth;
 
   // Size the temporary buffer to hold the result string data.
   ResultBuf.resize(SizeBound);
@@ -927,18 +957,19 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                          &StringInvalid);
     if (StringInvalid) {
-      hadError = 1;
+      hadError = true;
       continue;
     }
 
     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
-    bool wide = false;
 
     // TODO: Input character set mapping support.
 
     // Skip L marker for wide strings.
-    if (ThisTokBuf[0] == 'L') {
-      wide = true;
+    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
       ++ThisTokBuf;
+      // Skip 8 of u8 marker for utf8 strings.
+      if (ThisTokBuf[0] == '8')
+        ++ThisTokBuf;
     }
 
     assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
@@ -967,7 +998,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 
       // Copy the character span over.
       unsigned Len = ThisTokBuf-InStart;
-      if (!AnyWide) {
+      if (CharByteWidth == 1) {
        memcpy(ResultPtr, InStart, Len);
        ResultPtr += Len;
      } else {
@@ -975,7 +1006,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
        for (; Len; --Len, ++InStart) {
          *ResultPtr++ = InStart[0];
          // Add zeros at the end.
-          for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+          for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
            *ResultPtr++ = 0;
        }
      }
@@ -985,29 +1016,26 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
     if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
       EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, hadError,
                       FullSourceLoc(StringToks[i].getLocation(),SM),
-                      wide, Diags, Features);
+                      CharByteWidth, Diags, Features);
       continue;
     }
     // Otherwise, this is a non-UCN escape character.  Process it.
     unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
                               FullSourceLoc(StringToks[i].getLocation(), SM),
-                              AnyWide, Diags, Target);
+                              CharByteWidth*8, Diags);
 
     // Note: our internal rep of wide char tokens is always little-endian.
     *ResultPtr++ = ResultChar & 0xFF;
 
-    if (AnyWide) {
-      for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-        *ResultPtr++ = ResultChar >> i*8;
-    }
+    for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
+      *ResultPtr++ = ResultChar >> i*8;
   }
 
   if (Pascal) {
     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
-    if (AnyWide)
-      ResultBuf[0] /= wchar_tByteWidth;
+    ResultBuf[0] /= CharByteWidth;
 
     // Verify that pascal strings aren't too large.
     if (GetStringLength() > 256) {
@@ -1016,7 +1044,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
                       diag::err_pascal_string_too_long)
           << SourceRange(StringToks[0].getLocation(),
                          StringToks[NumStringToks-1].getLocation());
-      hadError = 1;
+      hadError = true;
       return;
     }
   } else if (Diags) {
@@ -1050,7 +1078,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
   if (StringInvalid)
     return 0;
 
-  assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
+  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
+         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
 
   const char *SpellingStart = SpellingPtr;
 
@@ -1075,7 +1104,7 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
       bool HadError = false;
       ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
                         FullSourceLoc(Tok.getLocation(), SM),
-                        false, Diags, Target);
+                        CharByteWidth*8, Diags);
       assert(!HadError && "This method isn't valid on erroneous strings");
       --ByteNo;
     }
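The CharByteWidth == 2 branch of EncodeUCNEscape above writes UTF-16 code units little-endian, falling back to a surrogate pair when the UCN value does not fit in 16 bits. A standalone sketch of that standard arithmetic, with names invented here rather than taken from the diff:

    #include <cstdint>

    // Split a code point into UTF-16 code units; mirrors the standard
    // surrogate math the CharByteWidth == 2 path relies on (illustrative,
    // not the committed helper).
    static unsigned encodeUTF16(uint32_t UcnVal, uint16_t Units[2]) {
      if (UcnVal <= 0xFFFF) {                 // fits in one code unit
        Units[0] = static_cast<uint16_t>(UcnVal);
        return 1;
      }
      uint32_t V = UcnVal - 0x10000;          // 20 significant bits remain
      Units[0] = static_cast<uint16_t>(0xD800 + (V >> 10));    // high surrogate
      Units[1] = static_cast<uint16_t>(0xDC00 + (V & 0x3FF));  // low surrogate
      return 2;
    }

Each unit is then stored a byte at a time, least-significant byte first, exactly like the `*ResultBuf++ = (... & 0x000000FF)` stores in the diff.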
diff --git a/clang/lib/Lex/MacroArgs.cpp b/clang/lib/Lex/MacroArgs.cpp
index 968c15e3c27..ccd0b705c8b 100644
--- a/clang/lib/Lex/MacroArgs.cpp
+++ b/clang/lib/Lex/MacroArgs.cpp
@@ -208,7 +208,13 @@ Token MacroArgs::StringifyArgument(const Token *ArgToks,
     // by 6.10.3.2p2.
     if (Tok.is(tok::string_literal) ||       // "foo"
         Tok.is(tok::wide_string_literal) ||  // L"foo"
-        Tok.is(tok::char_constant)) {        // 'x' and L'x'.
+        Tok.is(tok::utf8_string_literal) ||  // u8"foo"
+        Tok.is(tok::utf16_string_literal) || // u"foo"
+        Tok.is(tok::utf32_string_literal) || // U"foo"
+        Tok.is(tok::char_constant) ||        // 'x'
+        Tok.is(tok::wide_char_constant) ||   // L'x'.
+        Tok.is(tok::utf16_char_constant) ||  // u'x'.
+        Tok.is(tok::utf32_char_constant)) {  // U'x'.
       bool Invalid = false;
       std::string TokStr = PP.getSpelling(Tok, &Invalid);
       if (!Invalid) {
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index 212ffeef1b1..383c6f5aa16 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -777,7 +777,7 @@ void Preprocessor::HandleLineDirective(Token &Tok) {
   } else {
     // Parse and validate the string, converting it into a unique ID.
     StringLiteralParser Literal(&StrTok, 1, *this);
-    assert(!Literal.AnyWide && "Didn't allow wide strings in");
+    assert(Literal.isAscii() && "Didn't allow wide strings in");
     if (Literal.hadError)
       return DiscardUntilEndOfDirective();
     if (Literal.Pascal) {
@@ -910,7 +910,7 @@ void Preprocessor::HandleDigitDirective(Token &DigitTok) {
   } else {
     // Parse and validate the string, converting it into a unique ID.
     StringLiteralParser Literal(&StrTok, 1, *this);
-    assert(!Literal.AnyWide && "Didn't allow wide strings in");
+    assert(Literal.isAscii() && "Didn't allow wide strings in");
     if (Literal.hadError)
       return DiscardUntilEndOfDirective();
     if (Literal.Pascal) {
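The MacroArgs::StringifyArgument change above makes the # operator treat the new literal kinds the same way it already treated plain and wide literals: quotes and backslashes inside the spelling get escaped per C99 6.10.3.2p2. A small illustration (macro name invented for the example):

    #define STRINGIFY(x) #x
    const char *s1 = STRINGIFY(u8"hi");   // yields "u8\"hi\""
    const char *s2 = STRINGIFY(u'\n');    // yields "u'\\n'"
    const char *s3 = STRINGIFY(U"wide");  // yields "U\"wide\""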
diff --git a/clang/lib/Lex/PPExpressions.cpp b/clang/lib/Lex/PPExpressions.cpp
index 08e2705ef1b..25816923c80 100644
--- a/clang/lib/Lex/PPExpressions.cpp
+++ b/clang/lib/Lex/PPExpressions.cpp
@@ -236,7 +236,10 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
     PP.LexNonComment(PeekTok);
     return false;
   }
-  case tok::char_constant: {   // 'x'
+  case tok::char_constant:          // 'x'
+  case tok::wide_char_constant: {   // L'x'
+  case tok::utf16_char_constant:    // u'x'
+  case tok::utf32_char_constant:    // U'x'
     llvm::SmallString<32> CharBuffer;
     bool CharInvalid = false;
     StringRef ThisTok = PP.getSpelling(PeekTok, CharBuffer, &CharInvalid);
@@ -244,7 +247,7 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
       return true;
 
     CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(),
-                              PeekTok.getLocation(), PP);
+                              PeekTok.getLocation(), PP, PeekTok.getKind());
     if (Literal.hadError())
       return true;  // A diagnostic was already emitted.
 
@@ -255,6 +258,10 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
       NumBits = TI.getIntWidth();
     else if (Literal.isWide())
       NumBits = TI.getWCharWidth();
+    else if (Literal.isUTF16())
+      NumBits = TI.getChar16Width();
+    else if (Literal.isUTF32())
+      NumBits = TI.getChar32Width();
     else
       NumBits = TI.getCharWidth();
 
@@ -262,8 +269,9 @@ static bool EvaluateValue(PPValue &Result, Token &PeekTok, DefinedTracker &DT,
     llvm::APSInt Val(NumBits);
     // Set the value.
     Val = Literal.getValue();
-    // Set the signedness.
-    Val.setIsUnsigned(!PP.getLangOptions().CharIsSigned);
+    // Set the signedness. UTF-16 and UTF-32 are always unsigned
+    if (!Literal.isUTF16() && !Literal.isUTF32())
+      Val.setIsUnsigned(!PP.getLangOptions().CharIsSigned);
 
     if (Result.Val.getBitWidth() > Val.getBitWidth()) {
       Result.Val = Val.extend(Result.Val.getBitWidth());
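In #if evaluation the new character constants take their width from the target's char16_t/char32_t widths and, per the comment above, stay unsigned regardless of whether plain char is signed. An illustrative consequence (a sketch assuming a target with a signed 8-bit char):

    #if 'x' == 0x78            // plain constant; signedness follows the target
    #endif
    #if u'\xFFFF' == 0xFFFF    // UTF-16 constant: 16 bits, always unsigned
    #endif
    #if U'\x1F600' == 0x1F600  // UTF-32 constant: 32 bits, always unsigned
    #endif
    // By contrast, '\xFF' == 0xFF is false on such a target, because the
    // narrow constant is sign-extended to -1 before the comparison.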
diff --git a/clang/lib/Lex/Pragma.cpp b/clang/lib/Lex/Pragma.cpp
index d94e2e8305f..1d0b5e4f2d0 100644
--- a/clang/lib/Lex/Pragma.cpp
+++ b/clang/lib/Lex/Pragma.cpp
@@ -444,7 +444,7 @@ void Preprocessor::HandlePragmaComment(Token &Tok) {
 
   // Concatenate and parse the strings.
   StringLiteralParser Literal(&StrToks[0], StrToks.size(), *this);
-  assert(!Literal.AnyWide && "Didn't allow wide strings in");
+  assert(Literal.isAscii() && "Didn't allow wide strings in");
   if (Literal.hadError)
     return;
   if (Literal.Pascal) {
@@ -520,7 +520,7 @@ void Preprocessor::HandlePragmaMessage(Token &Tok) {
 
   // Concatenate and parse the strings.
   StringLiteralParser Literal(&StrToks[0], StrToks.size(), *this);
-  assert(!Literal.AnyWide && "Didn't allow wide strings in");
+  assert(Literal.isAscii() && "Didn't allow wide strings in");
   if (Literal.hadError)
     return;
   if (Literal.Pascal) {
@@ -902,7 +902,7 @@ public:
 
     // Concatenate and parse the strings.
     StringLiteralParser Literal(&StrToks[0], StrToks.size(), PP);
-    assert(!Literal.AnyWide && "Didn't allow wide strings in");
+    assert(Literal.isAscii() && "Didn't allow wide strings in");
     if (Literal.hadError)
       return;
     if (Literal.Pascal) {
diff --git a/clang/lib/Lex/TokenConcatenation.cpp b/clang/lib/Lex/TokenConcatenation.cpp
index 3e9e8550313..19baf80aad3 100644
--- a/clang/lib/Lex/TokenConcatenation.cpp
+++ b/clang/lib/Lex/TokenConcatenation.cpp
@@ -17,42 +17,39 @@
 using namespace clang;
 
-/// StartsWithL - Return true if the spelling of this token starts with 'L'.
-bool TokenConcatenation::StartsWithL(const Token &Tok) const {
-  if (!Tok.needsCleaning()) {
-    SourceManager &SM = PP.getSourceManager();
-    return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L';
-  }
-
-  if (Tok.getLength() < 256) {
-    char Buffer[256];
-    const char *TokPtr = Buffer;
-    PP.getSpelling(Tok, TokPtr);
-    return TokPtr[0] == 'L';
-  }
-
-  return PP.getSpelling(Tok)[0] == 'L';
-}
+/// IsIdentifierStringPrefix - Return true if the spelling of the token
+/// is literally 'L', 'u', 'U', or 'u8'.
+bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const {
+  const LangOptions &LangOpts = PP.getLangOptions();
 
-/// IsIdentifierL - Return true if the spelling of this token is literally
-/// 'L'.
-bool TokenConcatenation::IsIdentifierL(const Token &Tok) const {
   if (!Tok.needsCleaning()) {
-    if (Tok.getLength() != 1)
+    if (Tok.getLength() != 1 && Tok.getLength() != 2)
       return false;
     SourceManager &SM = PP.getSourceManager();
-    return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L';
+    const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
+    if (Tok.getLength() == 1)
+      return Ptr[0] == 'L' ||
+             (LangOpts.CPlusPlus0x && (Ptr[0] == 'u' || Ptr[0] == 'U'));
+    if (Tok.getLength() == 2)
+      return LangOpts.CPlusPlus0x && Ptr[0] == 'u' && Ptr[1] == '8';
   }
 
   if (Tok.getLength() < 256) {
     char Buffer[256];
     const char *TokPtr = Buffer;
-    if (PP.getSpelling(Tok, TokPtr) != 1)
-      return false;
-    return TokPtr[0] == 'L';
+    unsigned length = PP.getSpelling(Tok, TokPtr);
+    if (length == 1)
+      return TokPtr[0] == 'L' ||
+             (LangOpts.CPlusPlus0x && (TokPtr[0] == 'u' || TokPtr[0] == 'U'));
+    if (length == 2)
+      return LangOpts.CPlusPlus0x && TokPtr[0] == 'u' && TokPtr[1] == '8';
+    return false;
   }
 
-  return PP.getSpelling(Tok) == "L";
+  std::string TokStr = PP.getSpelling(Tok);
+  return TokStr == "L" || (LangOpts.CPlusPlus0x && (TokStr == "u8" ||
+                                                    TokStr == "u" ||
+                                                    TokStr == "U"));
 }
 
 TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) {
@@ -179,24 +176,19 @@ bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok,
     if (Tok.is(tok::numeric_constant))
       return GetFirstChar(PP, Tok) != '.';
 
-    if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) /* ||
-        Tok.is(tok::wide_char_literal)*/)
+    if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) ||
+        Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) ||
+        Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) ||
+        Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant))
      return true;
 
     // If this isn't identifier + string, we're done.
     if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
       return false;
 
-    // FIXME: need a wide_char_constant!
-
-    // If the string was a wide string L"foo" or wide char L'f', it would
-    // concat with the previous identifier into fooL"bar".  Avoid this.
-    if (StartsWithL(Tok))
-      return true;
-
     // Otherwise, this is a narrow character or string.  If the *identifier*
-    // is a literal 'L', avoid pasting L "foo" -> L"foo".
-    return IsIdentifierL(PrevTok);
+    // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo".
+    return IsIdentifierStringPrefix(PrevTok);
 
   case tok::numeric_constant:
     return isalnum(FirstChar) || Tok.is(tok::numeric_constant) ||
            FirstChar == '+' || FirstChar == '-' || FirstChar == '.';
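AvoidConcat decides whether the preprocessor's printed (-E) output needs a space so that re-lexing it preserves token boundaries; with the new prefixes, an identifier spelled u8, u, U, or L directly followed by a string would otherwise fuse into a single literal token. A hedged illustration of the problem (macro name invented; the snippet is meant for preprocessing only, not as a complete program):

    #define ID(x) x
    ID(u8) "text"
    // The expansion is the identifier 'u8' followed by a separate string
    // token.  With the check above, the printed output keeps the space:
    //     u8 "text"
    // Printing the tokens adjacent as u8"text" would re-lex in C++0x as one
    // UTF-8 string literal, changing what a consumer of the -E output sees.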