Handle universal character names and Unicode characters outside of literals.

This is a missing piece for C99 conformance. This patch handles UCNs by adding a '\\' case to LexTokenInternal and LexIdentifier -- if we see a backslash, we tentatively try to read in a UCN. If the UCN is not syntactically well-formed, we fall back to the old treatment: a backslash followed by an identifier beginning with 'u' (or 'U'). Because the spelling of an identifier with UCNs still has the UCN in it, we need to convert that to UTF-8 in Preprocessor::LookUpIdentifierInfo. Of course, valid code that does *not* use UCNs will see only a very minimal performance hit (checks after each identifier for non-ASCII characters, checks when converting raw_identifiers to identifiers that they do not contain UCNs, and checks when getting the spelling of an identifier that it does not contain a UCN). This patch also adds basic support for actual UTF-8 in the source. This is treated almost exactly the same as UCNs except that we consider stray Unicode characters to be mistakes and offer a fixit to remove them. llvm-svn: 173369
author: Jordan Rose <jordan_rose@apple.com> 2013-01-24 20:50:46 +0000
committer: Jordan Rose <jordan_rose@apple.com> 2013-01-24 20:50:46 +0000
commit: 7f43dddae0669f0700cf04bf520c12d3301cd809 (patch)
tree: c431698192bbab6064242887616f7e36916aa973 /clang/lib/Lex/Preprocessor.cpp
parent: aa89cf1a6611ce1a35b85418b0434620b4064c83 (diff)
download: bcm5719-llvm-7f43dddae0669f0700cf04bf520c12d3301cd809.tar.gz
bcm5719-llvm-7f43dddae0669f0700cf04bf520c12d3301cd809.zip
1 files changed, 56 insertions, 4 deletions
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index c01019cf430..b933a5fd750 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -27,6 +27,7 @@
 
 #include "clang/Lex/Preprocessor.h"
 #include "MacroArgs.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
@@ -43,6 +44,8 @@
 #include "clang/Lex/ScratchBuffer.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
@@ -396,7 +399,7 @@ StringRef Preprocessor::getSpelling(const Token &Tok,
                                           SmallVectorImpl<char> &Buffer,
                                           bool *Invalid) const {
   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
-  if (Tok.isNot(tok::raw_identifier)) {
+  if (Tok.isNot(tok::raw_identifier) && !Tok.hasUCN()) {
     // Try the fast path.
     if (const IdentifierInfo *II = Tok.getIdentifierInfo())
       return II->getName();
@@ -494,6 +497,48 @@ void Preprocessor::EndSourceFile() {
 // Lexer Event Handling.
 //===----------------------------------------------------------------------===//
 
+static void appendCodePoint(unsigned Codepoint,
+                            llvm::SmallVectorImpl<char> &Str) {
+  char ResultBuf[4];
+  char *ResultPtr = ResultBuf;
+  bool Res = ConvertCodePointToUTF8(Codepoint, ResultPtr);
+  (void)Res;
+  assert(Res && "Unexpected conversion failure");
+  Str.append(ResultBuf, ResultPtr);
+}
+
+static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
+  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
+    if (*I != '\\') {
+      Buf.push_back(*I);
+      continue;
+    }
+
+    ++I;
+    assert(*I == 'u' || *I == 'U');
+
+    unsigned NumHexDigits;
+    if (*I == 'u')
+      NumHexDigits = 4;
+    else
+      NumHexDigits = 8;
+
+    assert(I + NumHexDigits <= E);
+
+    uint32_t CodePoint = 0;
+    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
+      unsigned Value = llvm::hexDigitValue(*I);
+      assert(Value != -1U);
+
+      CodePoint <<= 4;
+      CodePoint += Value;
+    }
+
+    appendCodePoint(CodePoint, Buf);
+    --I;
+  }
+}
+
 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
 /// identifier information for the token and install it into the token,
 /// updating the token kind accordingly.
@@ -502,15 +547,22 @@ IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier) const {
 
   // Look up this token, see if it is a macro, or if it is a language keyword.
   IdentifierInfo *II;
-  if (!Identifier.needsCleaning()) {
+  if (!Identifier.needsCleaning() && !Identifier.hasUCN()) {
     // No cleaning needed, just use the characters from the lexed buffer.
     II = getIdentifierInfo(StringRef(Identifier.getRawIdentifierData(),
-                                           Identifier.getLength()));
+                                     Identifier.getLength()));
   } else {
     // Cleaning needed, alloca a buffer, clean into it, then use the buffer.
     SmallString<64> IdentifierBuffer;
     StringRef CleanedStr = getSpelling(Identifier, IdentifierBuffer);
-    II = getIdentifierInfo(CleanedStr);
+
+    if (Identifier.hasUCN()) {
+      SmallString<64> UCNIdentifierBuffer;
+      expandUCNs(UCNIdentifierBuffer, CleanedStr);
+      II = getIdentifierInfo(UCNIdentifierBuffer);
+    } else {
+      II = getIdentifierInfo(CleanedStr);
+    }
   }
 
   // Update the token info (identifier info and appropriate token kind).
author	Jordan Rose <jordan_rose@apple.com>	2013-01-24 20:50:46 +0000
committer	Jordan Rose <jordan_rose@apple.com>	2013-01-24 20:50:46 +0000
commit	7f43dddae0669f0700cf04bf520c12d3301cd809 (patch)
tree	c431698192bbab6064242887616f7e36916aa973 /clang/lib/Lex/Preprocessor.cpp
parent	aa89cf1a6611ce1a35b85418b0434620b4064c83 (diff)
download	bcm5719-llvm-7f43dddae0669f0700cf04bf520c12d3301cd809.tar.gz bcm5719-llvm-7f43dddae0669f0700cf04bf520c12d3301cd809.zip