diff options
author | Alexander Kornienko <alexfh@google.com> | 2013-06-05 14:09:10 +0000 |
---|---|---|
committer | Alexander Kornienko <alexfh@google.com> | 2013-06-05 14:09:10 +0000 |
commit | ffcc010767573c657ee0e6c0c9ea82ca124003ab (patch) | |
tree | 945819aeda9957c1232c7e2f7329e0c7a147b3e2 /clang/lib/Format/Format.cpp | |
parent | 218f6d8f59f55c848d335d89cbdd84706f7e096c (diff) | |
download | bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.tar.gz bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.zip |
UTF-8 support for clang-format.
Summary:
Detect whether the file is valid UTF-8 and, if so, count code points
instead of bytes in (hopefully) all places where the number of columns
is needed. In particular, use the new
FormatToken.CodePointCount instead of TokenLength where appropriate.
Changed the BreakableToken implementations to respect UTF-8 character
boundaries when in UTF-8 mode.
Reviewers: klimek, djasper
Reviewed By: djasper
CC: cfe-commits, rsmith, gribozavr
Differential Revision: http://llvm-reviews.chandlerc.com/D918
llvm-svn: 183312
Diffstat (limited to 'clang/lib/Format/Format.cpp')
-rw-r--r-- | clang/lib/Format/Format.cpp | 93 |
1 files changed, 56 insertions, 37 deletions
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 63bf09317e3..9dd5e4a0f21 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -243,10 +243,11 @@ public: UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr, const AnnotatedLine &Line, unsigned FirstIndent, const FormatToken *RootToken, - WhitespaceManager &Whitespaces) + WhitespaceManager &Whitespaces, + encoding::Encoding Encoding) : Style(Style), SourceMgr(SourceMgr), Line(Line), FirstIndent(FirstIndent), RootToken(RootToken), - Whitespaces(Whitespaces), Count(0) {} + Whitespaces(Whitespaces), Count(0), Encoding(Encoding) {} /// \brief Formats an \c UnwrappedLine. void format(const AnnotatedLine *NextLine) { @@ -484,7 +485,7 @@ private: State.NextToken->WhitespaceRange.getEnd()) - SourceMgr.getSpellingColumnNumber( State.NextToken->WhitespaceRange.getBegin()); - State.Column += WhitespaceLength + State.NextToken->TokenLength; + State.Column += WhitespaceLength + State.NextToken->CodePointCount; State.NextToken = State.NextToken->Next; return 0; } @@ -520,11 +521,11 @@ private: Line.StartsDefinition)) { State.Column = State.Stack.back().Indent; } else if (Current.Type == TT_ObjCSelectorName) { - if (State.Stack.back().ColonPos > Current.TokenLength) { - State.Column = State.Stack.back().ColonPos - Current.TokenLength; + if (State.Stack.back().ColonPos > Current.CodePointCount) { + State.Column = State.Stack.back().ColonPos - Current.CodePointCount; } else { State.Column = State.Stack.back().Indent; - State.Stack.back().ColonPos = State.Column + Current.TokenLength; + State.Stack.back().ColonPos = State.Column + Current.CodePointCount; } } else if (Current.Type == TT_StartOfName || Previous.isOneOf(tok::coloncolon, tok::equal) || @@ -560,7 +561,7 @@ private: State.Stack.back().LastSpace = State.Column; if (Current.isOneOf(tok::arrow, tok::period) && Current.Type != TT_DesignatedInitializerPeriod) - State.Stack.back().LastSpace += 
Current.TokenLength; + State.Stack.back().LastSpace += Current.CodePointCount; State.StartOfLineLevel = State.ParenLevel; State.LowestCallLevel = State.ParenLevel; @@ -595,8 +596,8 @@ private: State.Stack.back().VariablePos = State.Column; // Move over * and & if they are bound to the variable name. const FormatToken *Tok = &Previous; - while (Tok && State.Stack.back().VariablePos >= Tok->TokenLength) { - State.Stack.back().VariablePos -= Tok->TokenLength; + while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) { + State.Stack.back().VariablePos -= Tok->CodePointCount; if (Tok->SpacesRequiredBefore != 0) break; Tok = Tok->Previous; @@ -614,12 +615,12 @@ private: if (Current.Type == TT_ObjCSelectorName && State.Stack.back().ColonPos == 0) { if (State.Stack.back().Indent + Current.LongestObjCSelectorName > - State.Column + Spaces + Current.TokenLength) + State.Column + Spaces + Current.CodePointCount) State.Stack.back().ColonPos = State.Stack.back().Indent + Current.LongestObjCSelectorName; else State.Stack.back().ColonPos = - State.Column + Spaces + Current.TokenLength; + State.Column + Spaces + Current.CodePointCount; } if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr && @@ -671,7 +672,8 @@ private: State.LowestCallLevel = std::min(State.LowestCallLevel, State.ParenLevel); if (Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0) State.Stack.back().StartOfFunctionCall = - Current.LastInChainOfCalls ? 0 : State.Column + Current.TokenLength; + Current.LastInChainOfCalls ? 
0 + : State.Column + Current.CodePointCount; } if (Current.Type == TT_CtorInitializerColon) { // Indent 2 from the column, so: @@ -779,7 +781,7 @@ private: State.StartOfStringLiteral = 0; } - State.Column += Current.TokenLength; + State.Column += Current.CodePointCount; State.NextToken = State.NextToken->Next; @@ -798,7 +800,7 @@ private: bool DryRun) { unsigned UnbreakableTailLength = Current.UnbreakableTailLength; llvm::OwningPtr<BreakableToken> Token; - unsigned StartColumn = State.Column - Current.TokenLength; + unsigned StartColumn = State.Column - Current.CodePointCount; unsigned OriginalStartColumn = SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) - 1; @@ -811,15 +813,16 @@ private: if (!LiteralData || *LiteralData != '"') return 0; - Token.reset(new BreakableStringLiteral(Current, StartColumn)); + Token.reset(new BreakableStringLiteral(Current, StartColumn, Encoding)); } else if (Current.Type == TT_BlockComment) { BreakableBlockComment *BBC = new BreakableBlockComment( - Style, Current, StartColumn, OriginalStartColumn, !Current.Previous); + Style, Current, StartColumn, OriginalStartColumn, !Current.Previous, + Encoding); Token.reset(BBC); } else if (Current.Type == TT_LineComment && (Current.Previous == NULL || Current.Previous->Type != TT_ImplicitStringLiteral)) { - Token.reset(new BreakableLineComment(Current, StartColumn)); + Token.reset(new BreakableLineComment(Current, StartColumn, Encoding)); } else { return 0; } @@ -837,27 +840,27 @@ private: Whitespaces); } unsigned TailOffset = 0; - unsigned RemainingTokenLength = + unsigned RemainingTokenColumns = Token->getLineLengthAfterSplit(LineIndex, TailOffset); - while (RemainingTokenLength > RemainingSpace) { + while (RemainingTokenColumns > RemainingSpace) { BreakableToken::Split Split = Token->getSplit(LineIndex, TailOffset, getColumnLimit()); if (Split.first == StringRef::npos) break; assert(Split.first != 0); - unsigned NewRemainingTokenLength = Token->getLineLengthAfterSplit( + 
unsigned NewRemainingTokenColumns = Token->getLineLengthAfterSplit( LineIndex, TailOffset + Split.first + Split.second); - assert(NewRemainingTokenLength < RemainingTokenLength); + assert(NewRemainingTokenColumns < RemainingTokenColumns); if (!DryRun) { Token->insertBreak(LineIndex, TailOffset, Split, Line.InPPDirective, Whitespaces); } TailOffset += Split.first + Split.second; - RemainingTokenLength = NewRemainingTokenLength; + RemainingTokenColumns = NewRemainingTokenColumns; Penalty += Style.PenaltyExcessCharacter; BreakInserted = true; } - PositionAfterLastLineInToken = RemainingTokenLength; + PositionAfterLastLineInToken = RemainingTokenColumns; } if (BreakInserted) { @@ -1080,13 +1083,16 @@ private: // Increasing count of \c StateNode items we have created. This is used // to create a deterministic order independent of the container. unsigned Count; + encoding::Encoding Encoding; }; class FormatTokenLexer { public: - FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr) + FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr, + encoding::Encoding Encoding) : FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex), - SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()) { + SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()), + Encoding(Encoding) { Lex.SetKeepWhitespaceMode(true); } @@ -1111,7 +1117,8 @@ private: FormatTok->Tok.getLocation().getLocWithOffset(1); FormatTok->WhitespaceRange = SourceRange(GreaterLocation, GreaterLocation); - FormatTok->TokenLength = 1; + FormatTok->ByteCount = 1; + FormatTok->CodePointCount = 1; GreaterStashed = false; return FormatTok; } @@ -1146,12 +1153,12 @@ private: } // Now FormatTok is the next non-whitespace token. 
- FormatTok->TokenLength = Text.size(); + FormatTok->ByteCount = Text.size(); TrailingWhitespace = 0; if (FormatTok->Tok.is(tok::comment)) { TrailingWhitespace = Text.size() - Text.rtrim().size(); - FormatTok->TokenLength -= TrailingWhitespace; + FormatTok->ByteCount -= TrailingWhitespace; } // In case the token starts with escaped newlines, we want to @@ -1164,7 +1171,7 @@ private: while (i + 1 < Text.size() && Text[i] == '\\' && Text[i + 1] == '\n') { // FIXME: ++FormatTok->NewlinesBefore is missing... WhitespaceLength += 2; - FormatTok->TokenLength -= 2; + FormatTok->ByteCount -= 2; i += 2; } @@ -1176,15 +1183,19 @@ private: if (FormatTok->Tok.is(tok::greatergreater)) { FormatTok->Tok.setKind(tok::greater); - FormatTok->TokenLength = 1; + FormatTok->ByteCount = 1; GreaterStashed = true; } + unsigned EncodingExtraBytes = + Text.size() - encoding::getCodePointCount(Text, Encoding); + FormatTok->CodePointCount = FormatTok->ByteCount - EncodingExtraBytes; + FormatTok->WhitespaceRange = SourceRange( WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); FormatTok->TokenText = StringRef( SourceMgr.getCharacterData(FormatTok->getStartOfNonWhitespace()), - FormatTok->TokenLength); + FormatTok->ByteCount); return FormatTok; } @@ -1194,6 +1205,7 @@ private: Lexer &Lex; SourceManager &SourceMgr; IdentifierTable IdentTable; + encoding::Encoding Encoding; llvm::SpecificBumpPtrAllocator<FormatToken> Allocator; SmallVector<FormatToken *, 16> Tokens; @@ -1209,17 +1221,22 @@ public: Formatter(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr, const std::vector<CharSourceRange> &Ranges) : Style(Style), Lex(Lex), SourceMgr(SourceMgr), - Whitespaces(SourceMgr, Style), Ranges(Ranges) {} + Whitespaces(SourceMgr, Style), Ranges(Ranges), + Encoding(encoding::detectEncoding(Lex.getBuffer())) { + DEBUG(llvm::dbgs() + << "File encoding: " + << (Encoding == encoding::Encoding_UTF8 ? 
"UTF8" : "unknown") + << "\n"); + } virtual ~Formatter() {} tooling::Replacements format() { - FormatTokenLexer Tokens(Lex, SourceMgr); + FormatTokenLexer Tokens(Lex, SourceMgr, Encoding); UnwrappedLineParser Parser(Style, Tokens.lex(), *this); bool StructuralError = Parser.parse(); - TokenAnnotator Annotator(Style, SourceMgr, Lex, - Tokens.getIdentTable().get("in")); + TokenAnnotator Annotator(Style, Tokens.getIdentTable().get("in")); for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) { Annotator.annotate(AnnotatedLines[i]); } @@ -1290,7 +1307,7 @@ public: 1; } UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent, - TheLine.First, Whitespaces); + TheLine.First, Whitespaces, Encoding); Formatter.format(I + 1 != E ? &*(I + 1) : NULL); IndentForLevel[TheLine.Level] = LevelIndent; PreviousLineWasTouched = true; @@ -1556,7 +1573,7 @@ private: CharSourceRange LineRange = CharSourceRange::getCharRange( First->WhitespaceRange.getBegin().getLocWithOffset( First->LastNewlineOffset), - Last->Tok.getLocation().getLocWithOffset(Last->TokenLength - 1)); + Last->Tok.getLocation().getLocWithOffset(Last->ByteCount - 1)); return touchesRanges(LineRange); } @@ -1616,6 +1633,8 @@ private: WhitespaceManager Whitespaces; std::vector<CharSourceRange> Ranges; std::vector<AnnotatedLine> AnnotatedLines; + + encoding::Encoding Encoding; }; tooling::Replacements reformat(const FormatStyle &Style, Lexer &Lex, |