10 files changed, 95 insertions, 40 deletions
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index f623fd1d525..9e4c0f09153 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -1112,29 +1112,39 @@ public:
 };
 
 class CharacterLiteral : public Expr {
+public:
+  enum CharacterKind {
+    Ascii, // 'a'
+    Wide,  // L'b'
+    UTF16, // u'a'
+    UTF32  // U'a'
+  };
+
+private:
   unsigned Value;
   SourceLocation Loc;
-  bool IsWide;
+  unsigned Kind : 2;
 public:
   // type should be IntTy
-  CharacterLiteral(unsigned value, bool iswide, QualType type, SourceLocation l)
+  CharacterLiteral(unsigned value, CharacterKind kind, QualType type,
+                   SourceLocation l)
     : Expr(CharacterLiteralClass, type, VK_RValue, OK_Ordinary, false, false,
            false, false),
-      Value(value), Loc(l), IsWide(iswide) {
+      Value(value), Loc(l), Kind(kind) {
   }
 
   /// \brief Construct an empty character literal.
   CharacterLiteral(EmptyShell Empty) : Expr(CharacterLiteralClass, Empty) { }
 
   SourceLocation getLocation() const { return Loc; }
-  bool isWide() const { return IsWide; }
+  CharacterKind getKind() const { return static_cast<CharacterKind>(Kind); }
 
   SourceRange getSourceRange() const { return SourceRange(Loc); }
 
   unsigned getValue() const { return Value; }
 
   void setLocation(SourceLocation Location) { Loc = Location; }
-  void setWide(bool W) { IsWide = W; }
+  void setKind(CharacterKind kind) { Kind = kind; }
   void setValue(unsigned Val) { Value = Val; }
 
   static bool classof(const Stmt *T) {
@@ -1243,13 +1253,23 @@ public:
 /// In this case, getByteLength() will return 6, but the string literal will
 /// have type "char[2]".
 class StringLiteral : public Expr {
+public:
+  enum StringKind {
+    Ascii, // "foo"
+    Wide,  // L"foo"
+    UTF8,  // u8"foo"
+    UTF16, // u"foo"
+    UTF32  // U"foo"
+  };
+
+private:
   friend class ASTStmtReader;
 
   const char *StrData;
   unsigned ByteLength;
-  bool IsWide;
-  bool IsPascal;
   unsigned NumConcatenated;
+  unsigned Kind : 3;     // A StringKind value; see getKind().
+  unsigned IsPascal : 1; // Same base type as Kind so MSVC packs them together.
   SourceLocation TokLocs[1];
 
   StringLiteral(QualType Ty) :
@@ -1259,14 +1279,15 @@ class StringLiteral : public Expr {
 public:
   /// This is the "fully general" constructor that allows representation of
   /// strings formed from multiple concatenated tokens.
-  static StringLiteral *Create(ASTContext &C, StringRef Str, bool Wide,
+  static StringLiteral *Create(ASTContext &C, StringRef Str, StringKind Kind,
                                bool Pascal, QualType Ty,
                                const SourceLocation *Loc, unsigned NumStrs);
 
   /// Simple constructor for string literals made from one token.
-  static StringLiteral *Create(ASTContext &C, StringRef Str, bool Wide, 
-                               bool Pascal, QualType Ty, SourceLocation Loc) {
-    return Create(C, Str, Wide, Pascal, Ty, &Loc, 1);
+  static StringLiteral *Create(ASTContext &C, StringRef Str, StringKind Kind,
+                               bool Pascal, QualType Ty,
+                               SourceLocation Loc) {
+    return Create(C, Str, Kind, Pascal, Ty, &Loc, 1);
   }
 
   /// \brief Construct an empty string literal.
@@ -1281,9 +1302,14 @@ public:
   /// \brief Sets the string data to the given string data.
   void setString(ASTContext &C, StringRef Str);
 
-  bool isWide() const { return IsWide; }
+  StringKind getKind() const { return static_cast<StringKind>(Kind); }
+  bool isAscii() const { return Kind == Ascii; }
+  bool isWide() const { return Kind == Wide; }
+  bool isUTF8() const { return Kind == UTF8; }
+  bool isUTF16() const { return Kind == UTF16; }
+  bool isUTF32() const { return Kind == UTF32; }
   bool isPascal() const { return IsPascal; }
-  
+
   bool containsNonAsciiOrNull() const {
     StringRef Str = getString();
     for (unsigned i = 0, e = Str.size(); i != e; ++i)
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 8a842da4404..2b726102267 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -1368,6 +1368,8 @@ public:
   bool isBooleanType() const;
   bool isCharType() const;
   bool isWideCharType() const;
+  bool isChar16Type() const;
+  bool isChar32Type() const;
   bool isAnyCharacterType() const;
   bool isIntegralType(ASTContext &Ctx) const;
   
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 9e431a2d21c..e23921be0bf 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -77,8 +77,8 @@ def err_invalid_suffix_integer_constant : Error<
   "invalid suffix '%0' on integer constant">;
 def err_invalid_suffix_float_constant : Error<
   "invalid suffix '%0' on floating constant">;
-def warn_extraneous_wide_char_constant : Warning<
-  "extraneous characters in wide character constant ignored">;
+def warn_extraneous_char_constant : Warning<
+  "extraneous characters in character constant ignored">;
 def warn_char_constant_too_large : Warning<
   "character constant too long for its type">;
 def err_exponent_has_no_digits : Error<"exponent has no digits">;
@@ -102,6 +102,8 @@ def warn_ucn_escape_too_large : ExtWarn<
   "character unicode escape sequence too long for its type">;
 def warn_ucn_not_valid_in_c89 : ExtWarn<
   "unicode escape sequences are only valid in C99 or C++">;
+def err_unsupported_string_concat : Error<
+  "unsupported non-standard concatenation of string literals">;
   
 //===----------------------------------------------------------------------===//
 // PTH Diagnostics
diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h
index be1fa196c0d..3390f7809d0 100644
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@@ -50,8 +50,8 @@ namespace clang {
 /// set, and all tok::identifier tokens have a pointer to one of these.
 class IdentifierInfo {
   // Note: DON'T make TokenID a 'tok::TokenKind'; MSVC will treat it as a
-  //       signed char and TokenKinds > 127 won't be handled correctly.
-  unsigned TokenID            : 8; // Front-end token ID or tok::identifier.
+  //       signed bit-field and TokenKinds > 255 won't be handled correctly.
+  unsigned TokenID            : 9; // Front-end token ID or tok::identifier.
   // Objective-C keyword ('protocol' in '@protocol') or builtin (__builtin_inf).
   // First NUM_OBJC_KEYWORDS values are for Objective-C, the remaining values
   // are for builtins.
@@ -65,7 +65,7 @@ class IdentifierInfo {
                                    // file and wasn't modified since.
   bool RevertedTokenID        : 1; // True if RevertTokenIDToIdentifier was
                                    // called.
-  // 6 bits left in 32-bit word.
+  // 5 bits left in 32-bit word.
   void *FETokenInfo;               // Managed by the language front-end.
   llvm::StringMapEntry<IdentifierInfo*> *Entry;
 
@@ -409,6 +409,7 @@ public:
   IdentifierInfo &get(StringRef Name, tok::TokenKind TokenCode) {
     IdentifierInfo &II = get(Name);
     II.TokenID = TokenCode;
+    assert(II.TokenID == TokenCode && "TokenCode too large");
     return II;
   }
 
diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def
index 86172b83ff4..d057559889a 100644
--- a/clang/include/clang/Basic/TokenKinds.def
+++ b/clang/include/clang/Basic/TokenKinds.def
@@ -114,13 +114,23 @@ TOK(raw_identifier)      // Used only in raw lexing mode.
 TOK(numeric_constant)    // 0x123
 
 // C99 6.4.4: Character Constants
-TOK(char_constant)       // 'a'   L'b'
+TOK(char_constant)       // 'a'
+TOK(wide_char_constant)  // L'b'
+
+// C++0x Character Constants
+TOK(utf16_char_constant) // u'a'
+TOK(utf32_char_constant) // U'a'
 
 // C99 6.4.5: String Literals.
 TOK(string_literal)      // "foo"
 TOK(wide_string_literal) // L"foo"
 TOK(angle_string_literal)// <foo>
 
+// C++0x String Literals.
+TOK(utf8_string_literal) // u8"foo"
+TOK(utf16_string_literal)// u"foo"
+TOK(utf32_string_literal)// U"foo"
+
 // C99 6.4.6: Punctuators.
 PUNCTUATOR(l_square,            "[")
 PUNCTUATOR(r_square,            "]")
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index 990c1eedbb2..2c25597433e 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -471,9 +471,11 @@ private:
   // Helper functions to lex the remainder of a token of the specific type.
   void LexIdentifier         (Token &Result, const char *CurPtr);
   void LexNumericConstant    (Token &Result, const char *CurPtr);
-  void LexStringLiteral      (Token &Result, const char *CurPtr,bool Wide);
+  void LexStringLiteral      (Token &Result, const char *CurPtr,
+                              tok::TokenKind Kind);
   void LexAngledStringLiteral(Token &Result, const char *CurPtr);
-  void LexCharConstant       (Token &Result, const char *CurPtr);
+  void LexCharConstant       (Token &Result, const char *CurPtr,
+                              tok::TokenKind Kind);
   bool LexEndOfFile          (Token &Result, const char *CurPtr);
 
   bool SkipWhitespace        (Token &Result, const char *CurPtr);
diff --git a/clang/include/clang/Lex/LiteralSupport.h b/clang/include/clang/Lex/LiteralSupport.h
index 6486c38a406..15057299b2a 100644
--- a/clang/include/clang/Lex/LiteralSupport.h
+++ b/clang/include/clang/Lex/LiteralSupport.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/DataTypes.h"
+#include "clang/Basic/TokenKinds.h"
 #include <cctype>
 
 namespace clang {
@@ -124,15 +125,19 @@ private:
 /// character literal.
 class CharLiteralParser {
   uint64_t Value;
-  bool IsWide;
+  tok::TokenKind Kind;
   bool IsMultiChar;
   bool HadError;
 public:
   CharLiteralParser(const char *begin, const char *end,
-                    SourceLocation Loc, Preprocessor &PP);
+                    SourceLocation Loc, Preprocessor &PP,
+                    tok::TokenKind kind);
 
   bool hadError() const { return HadError; }
-  bool isWide() const { return IsWide; }
+  bool isAscii() const { return Kind == tok::char_constant; }
+  bool isWide() const { return Kind == tok::wide_char_constant; }
+  bool isUTF16() const { return Kind == tok::utf16_char_constant; }
+  bool isUTF32() const { return Kind == tok::utf32_char_constant; }
   bool isMultiChar() const { return IsMultiChar; }
   uint64_t getValue() const { return Value; }
 };
@@ -148,7 +153,8 @@ class StringLiteralParser {
   
   unsigned MaxTokenLength;
   unsigned SizeBound;
-  unsigned wchar_tByteWidth;
+  unsigned CharByteWidth;
+  tok::TokenKind Kind;
   llvm::SmallString<512> ResultBuf;
   char *ResultPtr; // cursor
 public:
@@ -158,14 +164,13 @@ public:
                       const SourceManager &sm, const LangOptions &features,
                       const TargetInfo &target, Diagnostic *diags = 0)
     : SM(sm), Features(features), Target(target), Diags(diags),
-      MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
-      ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
+      MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+      ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
     init(StringToks, NumStringToks);
   }
     
 
   bool hadError;
-  bool AnyWide;
   bool Pascal;
 
   StringRef GetString() const {
@@ -174,9 +179,7 @@ public:
   unsigned GetStringLength() const { return ResultPtr-ResultBuf.data(); }
 
   unsigned GetNumStringChars() const {
-    if (AnyWide)
-      return GetStringLength() / wchar_tByteWidth;
-    return GetStringLength();
+    return GetStringLength() / CharByteWidth;
   }
   /// getOffsetOfStringByte - This function returns the offset of the
   /// specified byte of the string data represented by Token.  This handles
@@ -185,7 +188,13 @@ public:
   /// If the Diagnostics pointer is non-null, then this will do semantic
   /// checking of the string literal and emit errors and warnings.
   unsigned getOffsetOfStringByte(const Token &TheTok, unsigned ByteNo) const;
-  
+
+  bool isAscii() const { return Kind == tok::string_literal; } // "foo"
+  bool isWide() const { return Kind == tok::wide_string_literal; } // L"foo"
+  bool isUTF8() const { return Kind == tok::utf8_string_literal; } // u8"foo"
+  bool isUTF16() const { return Kind == tok::utf16_string_literal; } // u"foo"
+  bool isUTF32() const { return Kind == tok::utf32_string_literal; } // U"foo"
+
 private:
   void init(const Token *StringToks, unsigned NumStringToks);
 };
diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h
index 9cf11d9a64c..e6dd1607e88 100644
--- a/clang/include/clang/Lex/Token.h
+++ b/clang/include/clang/Lex/Token.h
@@ -96,7 +96,10 @@ public:
   /// constant, string, etc.
   bool isLiteral() const {
     return is(tok::numeric_constant) || is(tok::char_constant) ||
-           is(tok::string_literal) || is(tok::wide_string_literal) ||
+           is(tok::wide_char_constant) || is(tok::utf16_char_constant) ||
+           is(tok::utf32_char_constant) || is(tok::string_literal) ||
+           is(tok::wide_string_literal) || is(tok::utf8_string_literal) ||
+           is(tok::utf16_string_literal) || is(tok::utf32_string_literal) ||
            is(tok::angle_string_literal);
   }
 
diff --git a/clang/include/clang/Lex/TokenConcatenation.h b/clang/include/clang/Lex/TokenConcatenation.h
index 094990a6e31..551300f402c 100644
--- a/clang/include/clang/Lex/TokenConcatenation.h
+++ b/clang/include/clang/Lex/TokenConcatenation.h
@@ -63,12 +63,9 @@ namespace clang {
                      const Token &Tok) const;
 
   private:
-    /// StartsWithL - Return true if the spelling of this token starts with 'L'.
-    bool StartsWithL(const Token &Tok) const;
-
-    /// IsIdentifierL - Return true if the spelling of this token is literally
-    /// 'L'.
-    bool IsIdentifierL(const Token &Tok) const;
+    /// IsIdentifierStringPrefix - Return true if the spelling of the token
+    /// is literally 'L', 'u', 'U', or 'u8'.
+    bool IsIdentifierStringPrefix(const Token &Tok) const;
   };
   } // end clang namespace
 
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 5d9376c1f7e..83b0cd455e9 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -265,7 +265,10 @@ private:
   ///
   bool isTokenStringLiteral() const {
     return Tok.getKind() == tok::string_literal ||
-           Tok.getKind() == tok::wide_string_literal;
+           Tok.getKind() == tok::wide_string_literal ||
+           Tok.getKind() == tok::utf8_string_literal ||
+           Tok.getKind() == tok::utf16_string_literal ||
+           Tok.getKind() == tok::utf32_string_literal;
   }
 
   /// \brief Returns true if the current token is a '=' or '==' and