UTF-8 support for clang-format.

Summary: Detect whether the file is valid UTF-8 and, if so, count code points instead of bytes in (hopefully) all places where the number of columns is needed. In particular, use the new FormatToken.CodePointCount instead of TokenLength where appropriate. Changed the BreakableToken implementations to respect UTF-8 character boundaries when in UTF-8 mode.
Reviewers: klimek, djasper
Reviewed By: djasper
CC: cfe-commits, rsmith, gribozavr
Differential Revision: http://llvm-reviews.chandlerc.com/D918
llvm-svn: 183312
author: Alexander Kornienko <alexfh@google.com> 2013-06-05 14:09:10 +0000
committer: Alexander Kornienko <alexfh@google.com> 2013-06-05 14:09:10 +0000
commit: ffcc010767573c657ee0e6c0c9ea82ca124003ab (patch)
tree: 945819aeda9957c1232c7e2f7329e0c7a147b3e2 /clang/lib/Format/TokenAnnotator.cpp
parent: 218f6d8f59f55c848d335d89cbdd84706f7e096c (diff)
download: bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.tar.gz
bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.zip
1 files changed, 16 insertions, 23 deletions
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 83dea841b5e..62177b3efd7 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -15,7 +15,6 @@
 
 #include "TokenAnnotator.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Lex/Lexer.h"
 #include "llvm/Support/Debug.h"
 
 namespace clang {
@@ -28,10 +27,9 @@ namespace format {
 /// into template parameter lists.
 class AnnotatingParser {
 public:
-  AnnotatingParser(SourceManager &SourceMgr, Lexer &Lex, AnnotatedLine &Line,
-                   IdentifierInfo &Ident_in)
-      : SourceMgr(SourceMgr), Lex(Lex), Line(Line), CurrentToken(Line.First),
-        KeywordVirtualFound(false), NameFound(false), Ident_in(Ident_in) {
+  AnnotatingParser(AnnotatedLine &Line, IdentifierInfo &Ident_in)
+      : Line(Line), CurrentToken(Line.First), KeywordVirtualFound(false),
+        NameFound(false), Ident_in(Ident_in) {
     Contexts.push_back(Context(tok::unknown, 1, /*IsExpression=*/ false));
   }
 
@@ -295,9 +293,11 @@ private:
                  Line.First->Type == TT_ObjCMethodSpecifier) {
         Tok->Type = TT_ObjCMethodExpr;
         Tok->Previous->Type = TT_ObjCSelectorName;
-        if (Tok->Previous->TokenLength >
-            Contexts.back().LongestObjCSelectorName)
-          Contexts.back().LongestObjCSelectorName = Tok->Previous->TokenLength;
+        if (Tok->Previous->CodePointCount >
+            Contexts.back().LongestObjCSelectorName) {
+          Contexts.back().LongestObjCSelectorName =
+              Tok->Previous->CodePointCount;
+        }
         if (Contexts.back().FirstObjCSelectorName == NULL)
           Contexts.back().FirstObjCSelectorName = Tok->Previous;
       } else if (Contexts.back().ColonIsForRangeExpr) {
@@ -602,9 +602,7 @@ private:
       } else if (Current.isBinaryOperator()) {
         Current.Type = TT_BinaryOperator;
       } else if (Current.is(tok::comment)) {
-        std::string Data(
-            Lexer::getSpelling(Current.Tok, SourceMgr, Lex.getLangOpts()));
-        if (StringRef(Data).startswith("//"))
+        if (Current.TokenText.startswith("//"))
           Current.Type = TT_LineComment;
         else
           Current.Type = TT_BlockComment;
@@ -748,23 +746,19 @@ private:
     case tok::kw_wchar_t:
     case tok::kw_bool:
     case tok::kw___underlying_type:
-      return true;
     case tok::annot_typename:
     case tok::kw_char16_t:
     case tok::kw_char32_t:
     case tok::kw_typeof:
     case tok::kw_decltype:
-      return Lex.getLangOpts().CPlusPlus;
+      return true;
     default:
-      break;
+      return false;
     }
-    return false;
   }
 
   SmallVector<Context, 8> Contexts;
 
-  SourceManager &SourceMgr;
-  Lexer &Lex;
   AnnotatedLine &Line;
   FormatToken *CurrentToken;
   bool KeywordVirtualFound;
@@ -866,7 +860,7 @@ private:
 };
 
 void TokenAnnotator::annotate(AnnotatedLine &Line) {
-  AnnotatingParser Parser(SourceMgr, Lex, Line, Ident_in);
+  AnnotatingParser Parser(Line, Ident_in);
   Line.Type = Parser.parseLine();
   if (Line.Type == LT_Invalid)
     return;
@@ -886,7 +880,7 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) {
 }
 
 void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
-  Line.First->TotalLength = Line.First->TokenLength;
+  Line.First->TotalLength = Line.First->CodePointCount;
   if (!Line.First->Next)
     return;
   FormatToken *Current = Line.First->Next;
@@ -920,7 +914,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
       Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit;
     else
       Current->TotalLength =
-          Current->Previous->TotalLength + Current->TokenLength +
+          Current->Previous->TotalLength + Current->CodePointCount +
           Current->SpacesRequiredBefore;
     // FIXME: Only calculate this if CanBreakBefore is true once static
     // initializers etc. are sorted out.
@@ -947,7 +941,7 @@ void TokenAnnotator::calculateUnbreakableTailLengths(AnnotatedLine &Line) {
       UnbreakableTailLength = 0;
     } else {
       UnbreakableTailLength +=
-          Current->TokenLength + Current->SpacesRequiredBefore;
+          Current->CodePointCount + Current->SpacesRequiredBefore;
     }
     Current = Current->Previous;
   }
@@ -1015,8 +1009,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
 
   if (Right.is(tok::lessless)) {
     if (Left.is(tok::string_literal)) {
-      StringRef Content =
-          StringRef(Left.Tok.getLiteralData(), Left.TokenLength);
+      StringRef Content = Left.TokenText;
       Content = Content.drop_back(1).drop_front(1).trim();
       if (Content.size() > 1 &&
           (Content.back() == ':' || Content.back() == '='))
author	Alexander Kornienko <alexfh@google.com>	2013-06-05 14:09:10 +0000
committer	Alexander Kornienko <alexfh@google.com>	2013-06-05 14:09:10 +0000
commit	ffcc010767573c657ee0e6c0c9ea82ca124003ab (patch)
tree	945819aeda9957c1232c7e2f7329e0c7a147b3e2 /clang/lib/Format/TokenAnnotator.cpp
parent	218f6d8f59f55c848d335d89cbdd84706f7e096c (diff)
download	bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.tar.gz bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.zip