UTF-8 support for clang-format.

Summary: Detect whether the file is valid UTF-8 and, if so, count code points instead of bytes in (hopefully) all places where the number of columns is needed. In particular, use the new FormatToken.CodePointCount instead of TokenLength where appropriate. Changed the BreakableToken implementations to respect UTF-8 character boundaries when in UTF-8 mode. Reviewers: klimek, djasper Reviewed By: djasper CC: cfe-commits, rsmith, gribozavr Differential Revision: http://llvm-reviews.chandlerc.com/D918 llvm-svn: 183312
author: Alexander Kornienko <alexfh@google.com> 2013-06-05 14:09:10 +0000
committer: Alexander Kornienko <alexfh@google.com> 2013-06-05 14:09:10 +0000
commit: ffcc010767573c657ee0e6c0c9ea82ca124003ab (patch)
tree: 945819aeda9957c1232c7e2f7329e0c7a147b3e2 /clang/lib/Format/Format.cpp
parent: 218f6d8f59f55c848d335d89cbdd84706f7e096c (diff)
download: bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.tar.gz
bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.zip
1 file changed, 56 insertions, 37 deletions
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 63bf09317e3..9dd5e4a0f21 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -243,10 +243,11 @@ public:
   UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr,
                          const AnnotatedLine &Line, unsigned FirstIndent,
                          const FormatToken *RootToken,
-                         WhitespaceManager &Whitespaces)
+                         WhitespaceManager &Whitespaces,
+                         encoding::Encoding Encoding)
       : Style(Style), SourceMgr(SourceMgr), Line(Line),
         FirstIndent(FirstIndent), RootToken(RootToken),
-        Whitespaces(Whitespaces), Count(0) {}
+        Whitespaces(Whitespaces), Count(0), Encoding(Encoding) {}
 
   /// \brief Formats an \c UnwrappedLine.
   void format(const AnnotatedLine *NextLine) {
@@ -484,7 +485,7 @@ private:
                                  State.NextToken->WhitespaceRange.getEnd()) -
                              SourceMgr.getSpellingColumnNumber(
                                  State.NextToken->WhitespaceRange.getBegin());
-      State.Column += WhitespaceLength + State.NextToken->TokenLength;
+      State.Column += WhitespaceLength + State.NextToken->CodePointCount;
       State.NextToken = State.NextToken->Next;
       return 0;
     }
@@ -520,11 +521,11 @@ private:
                   Line.StartsDefinition)) {
         State.Column = State.Stack.back().Indent;
       } else if (Current.Type == TT_ObjCSelectorName) {
-        if (State.Stack.back().ColonPos > Current.TokenLength) {
-          State.Column = State.Stack.back().ColonPos - Current.TokenLength;
+        if (State.Stack.back().ColonPos > Current.CodePointCount) {
+          State.Column = State.Stack.back().ColonPos - Current.CodePointCount;
         } else {
           State.Column = State.Stack.back().Indent;
-          State.Stack.back().ColonPos = State.Column + Current.TokenLength;
+          State.Stack.back().ColonPos = State.Column + Current.CodePointCount;
         }
       } else if (Current.Type == TT_StartOfName ||
                  Previous.isOneOf(tok::coloncolon, tok::equal) ||
@@ -560,7 +561,7 @@ private:
       State.Stack.back().LastSpace = State.Column;
       if (Current.isOneOf(tok::arrow, tok::period) &&
           Current.Type != TT_DesignatedInitializerPeriod)
-        State.Stack.back().LastSpace += Current.TokenLength;
+        State.Stack.back().LastSpace += Current.CodePointCount;
       State.StartOfLineLevel = State.ParenLevel;
       State.LowestCallLevel = State.ParenLevel;
 
@@ -595,8 +596,8 @@ private:
         State.Stack.back().VariablePos = State.Column;
         // Move over * and & if they are bound to the variable name.
         const FormatToken *Tok = &Previous;
-        while (Tok && State.Stack.back().VariablePos >= Tok->TokenLength) {
-          State.Stack.back().VariablePos -= Tok->TokenLength;
+        while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) {
+          State.Stack.back().VariablePos -= Tok->CodePointCount;
           if (Tok->SpacesRequiredBefore != 0)
             break;
           Tok = Tok->Previous;
@@ -614,12 +615,12 @@ private:
       if (Current.Type == TT_ObjCSelectorName &&
           State.Stack.back().ColonPos == 0) {
         if (State.Stack.back().Indent + Current.LongestObjCSelectorName >
-            State.Column + Spaces + Current.TokenLength)
+            State.Column + Spaces + Current.CodePointCount)
           State.Stack.back().ColonPos =
               State.Stack.back().Indent + Current.LongestObjCSelectorName;
         else
           State.Stack.back().ColonPos =
-              State.Column + Spaces + Current.TokenLength;
+              State.Column + Spaces + Current.CodePointCount;
       }
 
       if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr &&
@@ -671,7 +672,8 @@ private:
       State.LowestCallLevel = std::min(State.LowestCallLevel, State.ParenLevel);
       if (Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0)
         State.Stack.back().StartOfFunctionCall =
-            Current.LastInChainOfCalls ? 0 : State.Column + Current.TokenLength;
+            Current.LastInChainOfCalls ? 0
+                                       : State.Column + Current.CodePointCount;
     }
     if (Current.Type == TT_CtorInitializerColon) {
       // Indent 2 from the column, so:
@@ -779,7 +781,7 @@ private:
       State.StartOfStringLiteral = 0;
     }
 
-    State.Column += Current.TokenLength;
+    State.Column += Current.CodePointCount;
 
     State.NextToken = State.NextToken->Next;
 
@@ -798,7 +800,7 @@ private:
                                 bool DryRun) {
     unsigned UnbreakableTailLength = Current.UnbreakableTailLength;
     llvm::OwningPtr<BreakableToken> Token;
-    unsigned StartColumn = State.Column - Current.TokenLength;
+    unsigned StartColumn = State.Column - Current.CodePointCount;
     unsigned OriginalStartColumn =
         SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
         1;
@@ -811,15 +813,16 @@ private:
       if (!LiteralData || *LiteralData != '"')
         return 0;
 
-      Token.reset(new BreakableStringLiteral(Current, StartColumn));
+      Token.reset(new BreakableStringLiteral(Current, StartColumn, Encoding));
     } else if (Current.Type == TT_BlockComment) {
       BreakableBlockComment *BBC = new BreakableBlockComment(
-          Style, Current, StartColumn, OriginalStartColumn, !Current.Previous);
+          Style, Current, StartColumn, OriginalStartColumn, !Current.Previous,
+          Encoding);
       Token.reset(BBC);
     } else if (Current.Type == TT_LineComment &&
                (Current.Previous == NULL ||
                 Current.Previous->Type != TT_ImplicitStringLiteral)) {
-      Token.reset(new BreakableLineComment(Current, StartColumn));
+      Token.reset(new BreakableLineComment(Current, StartColumn, Encoding));
     } else {
       return 0;
     }
@@ -837,27 +840,27 @@ private:
                                        Whitespaces);
       }
       unsigned TailOffset = 0;
-      unsigned RemainingTokenLength =
+      unsigned RemainingTokenColumns =
           Token->getLineLengthAfterSplit(LineIndex, TailOffset);
-      while (RemainingTokenLength > RemainingSpace) {
+      while (RemainingTokenColumns > RemainingSpace) {
         BreakableToken::Split Split =
             Token->getSplit(LineIndex, TailOffset, getColumnLimit());
         if (Split.first == StringRef::npos)
           break;
         assert(Split.first != 0);
-        unsigned NewRemainingTokenLength = Token->getLineLengthAfterSplit(
+        unsigned NewRemainingTokenColumns = Token->getLineLengthAfterSplit(
             LineIndex, TailOffset + Split.first + Split.second);
-        assert(NewRemainingTokenLength < RemainingTokenLength);
+        assert(NewRemainingTokenColumns < RemainingTokenColumns);
         if (!DryRun) {
           Token->insertBreak(LineIndex, TailOffset, Split, Line.InPPDirective,
                              Whitespaces);
         }
         TailOffset += Split.first + Split.second;
-        RemainingTokenLength = NewRemainingTokenLength;
+        RemainingTokenColumns = NewRemainingTokenColumns;
         Penalty += Style.PenaltyExcessCharacter;
         BreakInserted = true;
       }
-      PositionAfterLastLineInToken = RemainingTokenLength;
+      PositionAfterLastLineInToken = RemainingTokenColumns;
     }
 
     if (BreakInserted) {
@@ -1080,13 +1083,16 @@ private:
   // Increasing count of \c StateNode items we have created. This is used
   // to create a deterministic order independent of the container.
   unsigned Count;
+  encoding::Encoding Encoding;
 };
 
 class FormatTokenLexer {
 public:
-  FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr)
+  FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr,
+                   encoding::Encoding Encoding)
       : FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex),
-        SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()) {
+        SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()),
+        Encoding(Encoding) {
     Lex.SetKeepWhitespaceMode(true);
   }
 
@@ -1111,7 +1117,8 @@ private:
           FormatTok->Tok.getLocation().getLocWithOffset(1);
       FormatTok->WhitespaceRange =
           SourceRange(GreaterLocation, GreaterLocation);
-      FormatTok->TokenLength = 1;
+      FormatTok->ByteCount = 1;
+      FormatTok->CodePointCount = 1;
       GreaterStashed = false;
       return FormatTok;
     }
@@ -1146,12 +1153,12 @@ private:
     }
 
     // Now FormatTok is the next non-whitespace token.
-    FormatTok->TokenLength = Text.size();
+    FormatTok->ByteCount = Text.size();
 
     TrailingWhitespace = 0;
     if (FormatTok->Tok.is(tok::comment)) {
       TrailingWhitespace = Text.size() - Text.rtrim().size();
-      FormatTok->TokenLength -= TrailingWhitespace;
+      FormatTok->ByteCount -= TrailingWhitespace;
     }
 
     // In case the token starts with escaped newlines, we want to
@@ -1164,7 +1171,7 @@ private:
     while (i + 1 < Text.size() && Text[i] == '\\' && Text[i + 1] == '\n') {
       // FIXME: ++FormatTok->NewlinesBefore is missing...
       WhitespaceLength += 2;
-      FormatTok->TokenLength -= 2;
+      FormatTok->ByteCount -= 2;
       i += 2;
     }
 
@@ -1176,15 +1183,19 @@ private:
 
     if (FormatTok->Tok.is(tok::greatergreater)) {
       FormatTok->Tok.setKind(tok::greater);
-      FormatTok->TokenLength = 1;
+      FormatTok->ByteCount = 1;
       GreaterStashed = true;
     }
 
+    unsigned EncodingExtraBytes =
+        Text.size() - encoding::getCodePointCount(Text, Encoding);
+    FormatTok->CodePointCount = FormatTok->ByteCount - EncodingExtraBytes;
+
     FormatTok->WhitespaceRange = SourceRange(
         WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
     FormatTok->TokenText = StringRef(
         SourceMgr.getCharacterData(FormatTok->getStartOfNonWhitespace()),
-        FormatTok->TokenLength);
+        FormatTok->ByteCount);
     return FormatTok;
   }
 
@@ -1194,6 +1205,7 @@ private:
   Lexer &Lex;
   SourceManager &SourceMgr;
   IdentifierTable IdentTable;
+  encoding::Encoding Encoding;
   llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
   SmallVector<FormatToken *, 16> Tokens;
 
@@ -1209,17 +1221,22 @@ public:
   Formatter(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr,
             const std::vector<CharSourceRange> &Ranges)
       : Style(Style), Lex(Lex), SourceMgr(SourceMgr),
-        Whitespaces(SourceMgr, Style), Ranges(Ranges) {}
+        Whitespaces(SourceMgr, Style), Ranges(Ranges),
+        Encoding(encoding::detectEncoding(Lex.getBuffer())) {
+    DEBUG(llvm::dbgs()
+          << "File encoding: "
+          << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
+          << "\n");
+  }
 
   virtual ~Formatter() {}
 
   tooling::Replacements format() {
-    FormatTokenLexer Tokens(Lex, SourceMgr);
+    FormatTokenLexer Tokens(Lex, SourceMgr, Encoding);
 
     UnwrappedLineParser Parser(Style, Tokens.lex(), *this);
     bool StructuralError = Parser.parse();
-    TokenAnnotator Annotator(Style, SourceMgr, Lex,
-                             Tokens.getIdentTable().get("in"));
+    TokenAnnotator Annotator(Style, Tokens.getIdentTable().get("in"));
     for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
       Annotator.annotate(AnnotatedLines[i]);
     }
@@ -1290,7 +1307,7 @@ public:
               1;
         }
         UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent,
-                                         TheLine.First, Whitespaces);
+                                         TheLine.First, Whitespaces, Encoding);
         Formatter.format(I + 1 != E ? &*(I + 1) : NULL);
         IndentForLevel[TheLine.Level] = LevelIndent;
         PreviousLineWasTouched = true;
@@ -1556,7 +1573,7 @@ private:
     CharSourceRange LineRange = CharSourceRange::getCharRange(
         First->WhitespaceRange.getBegin().getLocWithOffset(
             First->LastNewlineOffset),
-        Last->Tok.getLocation().getLocWithOffset(Last->TokenLength - 1));
+        Last->Tok.getLocation().getLocWithOffset(Last->ByteCount - 1));
     return touchesRanges(LineRange);
   }
 
@@ -1616,6 +1633,8 @@ private:
   WhitespaceManager Whitespaces;
   std::vector<CharSourceRange> Ranges;
   std::vector<AnnotatedLine> AnnotatedLines;
+
+  encoding::Encoding Encoding;
 };
 
 tooling::Replacements reformat(const FormatStyle &Style, Lexer &Lex,
author	Alexander Kornienko <alexfh@google.com>	2013-06-05 14:09:10 +0000
committer	Alexander Kornienko <alexfh@google.com>	2013-06-05 14:09:10 +0000
commit	ffcc010767573c657ee0e6c0c9ea82ca124003ab (patch)
tree	945819aeda9957c1232c7e2f7329e0c7a147b3e2 /clang/lib/Format/Format.cpp
parent	218f6d8f59f55c848d335d89cbdd84706f7e096c (diff)
download	bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.tar.gz bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.zip