PR18855: Add support for UCNs and UTF-8 encoding within ud-suffixes.

llvm-svn: 201532
author: Richard Smith <richard-llvm@metafoo.co.uk> 2014-02-17 21:52:30 +0000
committer: Richard Smith <richard-llvm@metafoo.co.uk> 2014-02-17 21:52:30 +0000
commit: 8b7258bdb3d371a59e1e84e3c417b76a2c0dc003 (patch)
tree: f5ff757b174428b944298f4ab3a8ca68f7ba7c1a /clang/lib/Lex
parent: 6287371ce629196749925d73cd143cb2dd35b8e7 (diff)
download: bcm5719-llvm-8b7258bdb3d371a59e1e84e3c417b76a2c0dc003.tar.gz
bcm5719-llvm-8b7258bdb3d371a59e1e84e3c417b76a2c0dc003.zip
3 files changed, 157 insertions, 114 deletions
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index c7eab490ad6..cfa835d173d 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -1445,7 +1445,50 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
         << Range;
     }
   }
- }
+}
+
+bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
+                                    Token &Result) {
+  const char *UCNPtr = CurPtr + Size;
+  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
+  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
+    return false;
+
+  if (!isLexingRawMode())
+    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
+                              makeCharRange(*this, CurPtr, UCNPtr),
+                              /*IsFirst=*/false);
+
+  Result.setFlag(Token::HasUCN);
+  if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
+      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
+    CurPtr = UCNPtr;
+  else
+    while (CurPtr != UCNPtr)
+      (void)getAndAdvanceChar(CurPtr, Result);
+  return true;
+}
+
+bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
+  const char *UnicodePtr = CurPtr;
+  UTF32 CodePoint;
+  ConversionResult Result =
+      llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
+                                (const UTF8 *)BufferEnd,
+                                &CodePoint,
+                                strictConversion);
+  if (Result != conversionOK ||
+      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
+    return false;
+
+  if (!isLexingRawMode())
+    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
+                              makeCharRange(*this, CurPtr, UnicodePtr),
+                              /*IsFirst=*/false);
+
+  CurPtr = UnicodePtr;
+  return true;
+}
 
 bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
@@ -1500,47 +1543,10 @@ FinishIdentifier:
       C = getCharAndSize(CurPtr, Size);
       continue;
 
-    } else if (C == '\\') {
-      const char *UCNPtr = CurPtr + Size;
-      uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
-      if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
-        goto FinishIdentifier;
-
-      if (!isLexingRawMode()) {
-        maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
-                                  makeCharRange(*this, CurPtr, UCNPtr),
-                                  /*IsFirst=*/false);
-      }
-
-      Result.setFlag(Token::HasUCN);
-      if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
-          (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
-        CurPtr = UCNPtr;
-      else
-        while (CurPtr != UCNPtr)
-          (void)getAndAdvanceChar(CurPtr, Result);
-
+    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
       C = getCharAndSize(CurPtr, Size);
       continue;
-    } else if (!isASCII(C)) {
-      const char *UnicodePtr = CurPtr;
-      UTF32 CodePoint;
-      ConversionResult Result =
-          llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
-                                    (const UTF8 *)BufferEnd,
-                                    &CodePoint,
-                                    strictConversion);
-      if (Result != conversionOK ||
-          !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
-        goto FinishIdentifier;
-
-      if (!isLexingRawMode()) {
-        maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
-                                  makeCharRange(*this, CurPtr, UnicodePtr),
-                                  /*IsFirst=*/false);
-      }
-
-      CurPtr = UnicodePtr;
+    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
       C = getCharAndSize(CurPtr, Size);
       continue;
     } else if (!isIdentifierBody(C)) {
@@ -1576,7 +1582,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
   unsigned Size;
   char C = getCharAndSize(CurPtr, Size);
   char PrevCh = 0;
-  while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
+  while (isPreprocessingNumberBody(C)) {
     CurPtr = ConsumeChar(CurPtr, Size, Result);
     PrevCh = C;
     C = getCharAndSize(CurPtr, Size);
@@ -1618,6 +1624,12 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
     }
   }
 
+  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
+  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
+    return LexNumericConstant(Result, CurPtr);
+  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+    return LexNumericConstant(Result, CurPtr);
+
   // Update the location of token as well as BufferPtr.
   const char *TokStart = BufferPtr;
   FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
@@ -1631,23 +1643,35 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                                bool IsStringLiteral) {
   assert(getLangOpts().CPlusPlus);
 
-  // Maximally munch an identifier. FIXME: UCNs.
+  // Maximally munch an identifier.
   unsigned Size;
   char C = getCharAndSize(CurPtr, Size);
-  if (isIdentifierHead(C)) {
-    if (!getLangOpts().CPlusPlus11) {
-      if (!isLexingRawMode())
-        Diag(CurPtr,
-             C == '_' ? diag::warn_cxx11_compat_user_defined_literal
-                      : diag::warn_cxx11_compat_reserved_user_defined_literal)
-          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+  bool Consumed = false;
+
+  if (!isIdentifierHead(C)) {
+    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
+      Consumed = true;
+    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
+      Consumed = true;
+    else
       return CurPtr;
-    }
+  }
 
-    // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
-    // that does not start with an underscore is ill-formed. As a conforming
-    // extension, we treat all such suffixes as if they had whitespace before
-    // them.
+  if (!getLangOpts().CPlusPlus11) {
+    if (!isLexingRawMode())
+      Diag(CurPtr,
+           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
+                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
+        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+    return CurPtr;
+  }
+
+  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
+  // that does not start with an underscore is ill-formed. As a conforming
+  // extension, we treat all such suffixes as if they had whitespace before
+  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
+  // likely to be a ud-suffix than a macro, however, and accept that.
+  if (!Consumed) {
     bool IsUDSuffix = false;
     if (C == '_')
       IsUDSuffix = true;
@@ -1685,16 +1709,22 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
         Diag(CurPtr, getLangOpts().MSVCCompat
                          ? diag::ext_ms_reserved_user_defined_literal
                          : diag::ext_reserved_user_defined_literal)
-            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
+          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
       return CurPtr;
     }
 
-    Result.setFlag(Token::HasUDSuffix);
-    do {
-      CurPtr = ConsumeChar(CurPtr, Size, Result);
-      C = getCharAndSize(CurPtr, Size);
-    } while (isIdentifierBody(C));
+    CurPtr = ConsumeChar(CurPtr, Size, Result);
   }
+
+  Result.setFlag(Token::HasUDSuffix);
+  while (true) {
+    C = getCharAndSize(CurPtr, Size);
+    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
+    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
+    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
+    else break;
+  }
+
   return CurPtr;
 }
 
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 17c6bb3049b..a71518184ca 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -212,6 +212,48 @@ static unsigned ProcessCharEscape(const char *ThisTokBegin,
   return ResultChar;
 }
 
+static void appendCodePoint(unsigned Codepoint,
+                            llvm::SmallVectorImpl<char> &Str) {
+  char ResultBuf[4];
+  char *ResultPtr = ResultBuf;
+  bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
+  (void)Res;
+  assert(Res && "Unexpected conversion failure");
+  Str.append(ResultBuf, ResultPtr);
+}
+
+void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
+  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
+    if (*I != '\\') {
+      Buf.push_back(*I);
+      continue;
+    }
+
+    ++I;
+    assert(*I == 'u' || *I == 'U');
+
+    unsigned NumHexDigits;
+    if (*I == 'u')
+      NumHexDigits = 4;
+    else
+      NumHexDigits = 8;
+
+    assert(I + NumHexDigits <= E);
+
+    uint32_t CodePoint = 0;
+    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
+      unsigned Value = llvm::hexDigitValue(*I);
+      assert(Value != -1U);
+
+      CodePoint <<= 4;
+      CodePoint += Value;
+    }
+
+    appendCodePoint(CodePoint, Buf);
+    --I;
+  }
+}
+
 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 /// return the UTF32.
 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
@@ -625,8 +667,9 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
   }
 
   if (s != ThisTokEnd) {
-    if (isValidUDSuffix(PP.getLangOpts(),
-                        StringRef(SuffixBegin, ThisTokEnd - SuffixBegin))) {
+    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
+    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
+    if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
       // Any suffix pieces we might have parsed are actually part of the
       // ud-suffix.
       isLong = false;
@@ -992,7 +1035,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
     do {
       --end;
     } while (end[-1] != '\'');
-    UDSuffixBuf.assign(end, UDSuffixEnd);
+    // FIXME: Don't bother with this if !tok.hasUCN().
+    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
     UDSuffixOffset = end - TokBegin;
   }
 
@@ -1311,23 +1355,34 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
 
       if (UDSuffixBuf.empty()) {
-        UDSuffixBuf.assign(UDSuffix);
+        if (StringToks[i].hasUCN())
+          expandUCNs(UDSuffixBuf, UDSuffix);
+        else
+          UDSuffixBuf.assign(UDSuffix);
         UDSuffixToken = i;
         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
         UDSuffixTokLoc = StringToks[i].getLocation();
-      } else if (!UDSuffixBuf.equals(UDSuffix)) {
+      } else {
+        SmallString<32> ExpandedUDSuffix;
+        if (StringToks[i].hasUCN()) {
+          expandUCNs(ExpandedUDSuffix, UDSuffix);
+          UDSuffix = ExpandedUDSuffix;
+        }
+
         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
         // result of a concatenation involving at least one user-defined-string-
         // literal, all the participating user-defined-string-literals shall
         // have the same ud-suffix.
-        if (Diags) {
-          SourceLocation TokLoc = StringToks[i].getLocation();
-          Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
-            << UDSuffixBuf << UDSuffix
-            << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
-            << SourceRange(TokLoc, TokLoc);
+        if (!UDSuffixBuf.equals(UDSuffix)) {
+          if (Diags) {
+            SourceLocation TokLoc = StringToks[i].getLocation();
+            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
+              << UDSuffixBuf << UDSuffix
+              << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
+              << SourceRange(TokLoc, TokLoc);
+          }
+          hadError = true;
         }
-        hadError = true;
       }
     }
 
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 9ffc83ceffb..1e54ab37251 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -503,48 +503,6 @@ void Preprocessor::EndSourceFile() {
 // Lexer Event Handling.
 //===----------------------------------------------------------------------===//
 
-static void appendCodePoint(unsigned Codepoint,
-                            llvm::SmallVectorImpl<char> &Str) {
-  char ResultBuf[4];
-  char *ResultPtr = ResultBuf;
-  bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
-  (void)Res;
-  assert(Res && "Unexpected conversion failure");
-  Str.append(ResultBuf, ResultPtr);
-}
-
-static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
-  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
-    if (*I != '\\') {
-      Buf.push_back(*I);
-      continue;
-    }
-
-    ++I;
-    assert(*I == 'u' || *I == 'U');
-
-    unsigned NumHexDigits;
-    if (*I == 'u')
-      NumHexDigits = 4;
-    else
-      NumHexDigits = 8;
-
-    assert(I + NumHexDigits <= E);
-
-    uint32_t CodePoint = 0;
-    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
-      unsigned Value = llvm::hexDigitValue(*I);
-      assert(Value != -1U);
-
-      CodePoint <<= 4;
-      CodePoint += Value;
-    }
-
-    appendCodePoint(CodePoint, Buf);
-    --I;
-  }
-}
-
 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
 /// identifier information for the token and install it into the token,
 /// updating the token kind accordingly.
author	Richard Smith <richard-llvm@metafoo.co.uk>	2014-02-17 21:52:30 +0000
committer	Richard Smith <richard-llvm@metafoo.co.uk>	2014-02-17 21:52:30 +0000
commit	8b7258bdb3d371a59e1e84e3c417b76a2c0dc003 (patch)
tree	f5ff757b174428b944298f4ab3a8ca68f7ba7c1a /clang/lib/Lex
parent	6287371ce629196749925d73cd143cb2dd35b8e7 (diff)
download	bcm5719-llvm-8b7258bdb3d371a59e1e84e3c417b76a2c0dc003.tar.gz bcm5719-llvm-8b7258bdb3d371a59e1e84e3c417b76a2c0dc003.zip