diff options
Diffstat (limited to 'clang/lib/Format')
-rw-r--r-- | clang/lib/Format/BreakableToken.cpp | 157 | ||||
-rw-r--r-- | clang/lib/Format/BreakableToken.h | 16 | ||||
-rw-r--r-- | clang/lib/Format/Encoding.h | 114 | ||||
-rw-r--r-- | clang/lib/Format/Format.cpp | 93 | ||||
-rw-r--r-- | clang/lib/Format/FormatToken.h | 21 | ||||
-rw-r--r-- | clang/lib/Format/TokenAnnotator.cpp | 39 | ||||
-rw-r--r-- | clang/lib/Format/TokenAnnotator.h | 9 |
7 files changed, 282 insertions, 167 deletions
diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index 7d7fe3f032e..5e5604c597f 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -25,66 +25,22 @@ namespace clang { namespace format { namespace { -// FIXME: Move helper string functions to where it makes sense. - -unsigned getOctalLength(StringRef Text) { - unsigned I = 1; - while (I < Text.size() && I < 4 && (Text[I] >= '0' && Text[I] <= '7')) { - ++I; - } - return I; -} - -unsigned getHexLength(StringRef Text) { - unsigned I = 2; // Point after '\x'. - while (I < Text.size() && ((Text[I] >= '0' && Text[I] <= '9') || - (Text[I] >= 'a' && Text[I] <= 'f') || - (Text[I] >= 'A' && Text[I] <= 'F'))) { - ++I; - } - return I; -} - -unsigned getEscapeSequenceLength(StringRef Text) { - assert(Text[0] == '\\'); - if (Text.size() < 2) - return 1; - - switch (Text[1]) { - case 'u': - return 6; - case 'U': - return 10; - case 'x': - return getHexLength(Text); - default: - if (Text[1] >= '0' && Text[1] <= '7') - return getOctalLength(Text); - return 2; - } -} - -StringRef::size_type getStartOfCharacter(StringRef Text, - StringRef::size_type Offset) { - StringRef::size_type NextEscape = Text.find('\\'); - while (NextEscape != StringRef::npos && NextEscape < Offset) { - StringRef::size_type SequenceLength = - getEscapeSequenceLength(Text.substr(NextEscape)); - if (Offset < NextEscape + SequenceLength) - return NextEscape; - NextEscape = Text.find('\\', NextEscape + SequenceLength); - } - return Offset; -} - BreakableToken::Split getCommentSplit(StringRef Text, unsigned ContentStartColumn, - unsigned ColumnLimit) { + unsigned ColumnLimit, + encoding::Encoding Encoding) { if (ColumnLimit <= ContentStartColumn + 1) return BreakableToken::Split(StringRef::npos, 0); unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1; - StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit); + unsigned MaxSplitBytes = 0; + + for (unsigned NumChars = 0; + NumChars < MaxSplit && MaxSplitBytes < Text.size(); ++NumChars) + MaxSplitBytes += + encoding::getCodePointNumBytes(Text[MaxSplitBytes], Encoding); + + StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplitBytes); if (SpaceOffset == StringRef::npos || // Don't break at leading whitespace. Text.find_last_not_of(' ', SpaceOffset) == StringRef::npos) { @@ -95,7 +51,7 @@ BreakableToken::Split getCommentSplit(StringRef Text, // If the comment is only whitespace, we cannot split. return BreakableToken::Split(StringRef::npos, 0); SpaceOffset = - Text.find(' ', std::max<unsigned>(MaxSplit, FirstNonWhitespace)); + Text.find(' ', std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace)); } if (SpaceOffset != StringRef::npos && SpaceOffset != 0) { StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim(); @@ -108,25 +64,48 @@ BreakableToken::Split getCommentSplit(StringRef Text, BreakableToken::Split getStringSplit(StringRef Text, unsigned ContentStartColumn, - unsigned ColumnLimit) { - - if (ColumnLimit <= ContentStartColumn) - return BreakableToken::Split(StringRef::npos, 0); - unsigned MaxSplit = ColumnLimit - ContentStartColumn; + unsigned ColumnLimit, + encoding::Encoding Encoding) { // FIXME: Reduce unit test case. if (Text.empty()) return BreakableToken::Split(StringRef::npos, 0); - MaxSplit = std::min<unsigned>(MaxSplit, Text.size() - 1); - StringRef::size_type SpaceOffset = Text.rfind(' ', MaxSplit); - if (SpaceOffset != StringRef::npos && SpaceOffset != 0) + if (ColumnLimit <= ContentStartColumn) + return BreakableToken::Split(StringRef::npos, 0); + unsigned MaxSplit = + std::min<unsigned>(ColumnLimit - ContentStartColumn, + encoding::getCodePointCount(Text, Encoding) - 1); + StringRef::size_type SpaceOffset = 0; + StringRef::size_type SlashOffset = 0; + StringRef::size_type SplitPoint = 0; + for (unsigned Chars = 0;;) { + unsigned Advance; + if (Text[0] == '\\') { + Advance = encoding::getEscapeSequenceLength(Text); + Chars += Advance; + } else { + Advance = encoding::getCodePointNumBytes(Text[0], Encoding); + Chars += 1; + } + + if (Chars > MaxSplit) + break; + + if (Text[0] == ' ') + SpaceOffset = SplitPoint; + if (Text[0] == '/') + SlashOffset = SplitPoint; + + SplitPoint += Advance; + Text = Text.substr(Advance); + } + + if (SpaceOffset != 0) return BreakableToken::Split(SpaceOffset + 1, 0); - StringRef::size_type SlashOffset = Text.rfind('/', MaxSplit); - if (SlashOffset != StringRef::npos && SlashOffset != 0) + if (SlashOffset != 0) return BreakableToken::Split(SlashOffset + 1, 0); - StringRef::size_type SplitPoint = getStartOfCharacter(Text, MaxSplit); - if (SplitPoint == StringRef::npos || SplitPoint == 0) - return BreakableToken::Split(StringRef::npos, 0); - return BreakableToken::Split(SplitPoint, 0); + if (SplitPoint != 0) + return BreakableToken::Split(SplitPoint, 0); + return BreakableToken::Split(StringRef::npos, 0); } } // namespace @@ -136,8 +115,8 @@ unsigned BreakableSingleLineToken::getLineCount() const { return 1; } unsigned BreakableSingleLineToken::getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset) const { - return StartColumn + Prefix.size() + Postfix.size() + Line.size() - - TailOffset; + return StartColumn + Prefix.size() + Postfix.size() + + encoding::getCodePointCount(Line.substr(TailOffset), Encoding); } void BreakableSingleLineToken::insertBreak(unsigned LineIndex, @@ -152,8 +131,9 @@ void BreakableSingleLineToken::insertBreak(unsigned LineIndex, BreakableSingleLineToken::BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, - StringRef Postfix) - : BreakableToken(Tok), StartColumn(StartColumn), Prefix(Prefix), + StringRef Postfix, + encoding::Encoding Encoding) + : BreakableToken(Tok, Encoding), StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix) { assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix)); Line = Tok.TokenText.substr( @@ -161,13 +141,15 @@ BreakableSingleLineToken::BreakableSingleLineToken(const FormatToken &Tok, } BreakableStringLiteral::BreakableStringLiteral(const FormatToken &Tok, - unsigned StartColumn) - : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"") {} + unsigned StartColumn, + encoding::Encoding Encoding) + : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"", Encoding) {} BreakableToken::Split BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit) const { - return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit); + return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit, + Encoding); } static StringRef getLineCommentPrefix(StringRef Comment) { @@ -179,23 +161,23 @@ static StringRef getLineCommentPrefix(StringRef Comment) { } BreakableLineComment::BreakableLineComment(const FormatToken &Token, - unsigned StartColumn) + unsigned StartColumn, + encoding::Encoding Encoding) : BreakableSingleLineToken(Token, StartColumn, - getLineCommentPrefix(Token.TokenText), "") {} + getLineCommentPrefix(Token.TokenText), "", + Encoding) {} BreakableToken::Split BreakableLineComment::getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit) const { return getCommentSplit(Line.substr(TailOffset), StartColumn + Prefix.size(), - ColumnLimit); + ColumnLimit, Encoding); } -BreakableBlockComment::BreakableBlockComment(const FormatStyle &Style, - const FormatToken &Token, - unsigned StartColumn, - unsigned OriginalStartColumn, - bool FirstInLine) - : BreakableToken(Token) { +BreakableBlockComment::BreakableBlockComment( + const FormatStyle &Style, const FormatToken &Token, unsigned StartColumn, + unsigned OriginalStartColumn, bool FirstInLine, encoding::Encoding Encoding) + : BreakableToken(Token, Encoding) { StringRef TokenText(Token.TokenText); assert(TokenText.startswith("/*") && TokenText.endswith("*/")); TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n"); @@ -290,7 +272,8 @@ unsigned BreakableBlockComment::getLineLengthAfterSplit(unsigned LineIndex, unsigned TailOffset) const { return getContentStartColumn(LineIndex, TailOffset) + - (Lines[LineIndex].size() - TailOffset) + + encoding::getCodePointCount(Lines[LineIndex].substr(TailOffset), + Encoding) + // The last line gets a "*/" postfix. (LineIndex + 1 == Lines.size() ? 2 : 0); } @@ -300,7 +283,7 @@ BreakableBlockComment::getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit) const { return getCommentSplit(Lines[LineIndex].substr(TailOffset), getContentStartColumn(LineIndex, TailOffset), - ColumnLimit); + ColumnLimit, Encoding); } void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset, diff --git a/clang/lib/Format/BreakableToken.h b/clang/lib/Format/BreakableToken.h index 03904c2a468..157bff4c42f 100644 --- a/clang/lib/Format/BreakableToken.h +++ b/clang/lib/Format/BreakableToken.h @@ -17,6 +17,7 @@ #ifndef LLVM_CLANG_FORMAT_BREAKABLETOKEN_H #define LLVM_CLANG_FORMAT_BREAKABLETOKEN_H +#include "Encoding.h" #include "TokenAnnotator.h" #include "WhitespaceManager.h" #include <utility> @@ -65,9 +66,11 @@ public: WhitespaceManager &Whitespaces) {} protected: - BreakableToken(const FormatToken &Tok) : Tok(Tok) {} + BreakableToken(const FormatToken &Tok, encoding::Encoding Encoding) + : Tok(Tok), Encoding(Encoding) {} const FormatToken &Tok; + encoding::Encoding Encoding; }; /// \brief Base class for single line tokens that can be broken. @@ -83,7 +86,8 @@ public: protected: BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn, - StringRef Prefix, StringRef Postfix); + StringRef Prefix, StringRef Postfix, + encoding::Encoding Encoding); // The column in which the token starts. unsigned StartColumn; @@ -101,7 +105,8 @@ public: /// /// \p StartColumn specifies the column in which the token will start /// after formatting. - BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn); + BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn, + encoding::Encoding Encoding); virtual Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit) const; @@ -113,7 +118,8 @@ public: /// /// \p StartColumn specifies the column in which the comment will start /// after formatting. - BreakableLineComment(const FormatToken &Token, unsigned StartColumn); + BreakableLineComment(const FormatToken &Token, unsigned StartColumn, + encoding::Encoding Encoding); virtual Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit) const; @@ -129,7 +135,7 @@ public: /// If the comment starts a line after formatting, set \p FirstInLine to true. BreakableBlockComment(const FormatStyle &Style, const FormatToken &Token, unsigned StartColumn, unsigned OriginaStartColumn, - bool FirstInLine); + bool FirstInLine, encoding::Encoding Encoding); virtual unsigned getLineCount() const; virtual unsigned getLineLengthAfterSplit(unsigned LineIndex, diff --git a/clang/lib/Format/Encoding.h b/clang/lib/Format/Encoding.h new file mode 100644 index 00000000000..a44f4590a24 --- /dev/null +++ b/clang/lib/Format/Encoding.h @@ -0,0 +1,114 @@ +//===--- Encoding.h - Format C++ code -------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief Contains functions for text encoding manipulation. Supports UTF-8, +/// 8-bit encodings and escape sequences in C++ string literals. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_FORMAT_ENCODING_H +#define LLVM_CLANG_FORMAT_ENCODING_H + +#include "clang/Basic/LLVM.h" +#include "llvm/Support/ConvertUTF.h" + +namespace clang { +namespace format { +namespace encoding { + +enum Encoding { + Encoding_UTF8, + Encoding_Unknown // We treat all other encodings as 8-bit encodings. +}; + +/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8, +/// it is considered UTF8, otherwise we treat it as some 8-bit encoding. +inline Encoding detectEncoding(StringRef Text) { + const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin()); + const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end()); + if (::isLegalUTF8String(&Ptr, BufEnd)) + return Encoding_UTF8; + return Encoding_Unknown; +} + +inline unsigned getCodePointCountUTF8(StringRef Text) { + unsigned CodePoints = 0; + for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { + ++CodePoints; + } + return CodePoints; +} + +/// \brief Gets the number of code points in the Text using the specified +/// Encoding. +inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) { + switch (Encoding) { + case Encoding_UTF8: + return getCodePointCountUTF8(Text); + default: + return Text.size(); + } +} + +/// \brief Gets the number of bytes in a sequence representing a single +/// codepoint and starting with FirstChar in the specified Encoding. +inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { + switch (Encoding) { + case Encoding_UTF8: + return getNumBytesForUTF8(FirstChar); + default: + return 1; + } +} + +inline bool isOctDigit(char c) { + return '0' <= c && c <= '7'; +} + +inline bool isHexDigit(char c) { + return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || + ('A' <= c && c <= 'F'); +} + +/// \brief Gets the length of an escape sequence inside a C++ string literal. +/// Text should span from the beginning of the escape sequence (starting with a +/// backslash) to the end of the string literal. +inline unsigned getEscapeSequenceLength(StringRef Text) { + assert(Text[0] == '\\'); + if (Text.size() < 2) + return 1; + + switch (Text[1]) { + case 'u': + return 6; + case 'U': + return 10; + case 'x': { + unsigned I = 2; // Point after '\x'. + while (I < Text.size() && isHexDigit(Text[I])) + ++I; + return I; + } + default: + if (isOctDigit(Text[1])) { + unsigned I = 1; + while (I < Text.size() && I < 4 && isOctDigit(Text[I])) + ++I; + return I; + } + return 2; + } +} + +} // namespace encoding +} // namespace format +} // namespace clang + +#endif // LLVM_CLANG_FORMAT_ENCODING_H diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 63bf09317e3..9dd5e4a0f21 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -243,10 +243,11 @@ public: UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr, const AnnotatedLine &Line, unsigned FirstIndent, const FormatToken *RootToken, - WhitespaceManager &Whitespaces) + WhitespaceManager &Whitespaces, + encoding::Encoding Encoding) : Style(Style), SourceMgr(SourceMgr), Line(Line), FirstIndent(FirstIndent), RootToken(RootToken), - Whitespaces(Whitespaces), Count(0) {} + Whitespaces(Whitespaces), Count(0), Encoding(Encoding) {} /// \brief Formats an \c UnwrappedLine. void format(const AnnotatedLine *NextLine) { @@ -484,7 +485,7 @@ private: State.NextToken->WhitespaceRange.getEnd()) - SourceMgr.getSpellingColumnNumber( State.NextToken->WhitespaceRange.getBegin()); - State.Column += WhitespaceLength + State.NextToken->TokenLength; + State.Column += WhitespaceLength + State.NextToken->CodePointCount; State.NextToken = State.NextToken->Next; return 0; } @@ -520,11 +521,11 @@ private: Line.StartsDefinition)) { State.Column = State.Stack.back().Indent; } else if (Current.Type == TT_ObjCSelectorName) { - if (State.Stack.back().ColonPos > Current.TokenLength) { - State.Column = State.Stack.back().ColonPos - Current.TokenLength; + if (State.Stack.back().ColonPos > Current.CodePointCount) { + State.Column = State.Stack.back().ColonPos - Current.CodePointCount; } else { State.Column = State.Stack.back().Indent; - State.Stack.back().ColonPos = State.Column + Current.TokenLength; + State.Stack.back().ColonPos = State.Column + Current.CodePointCount; } } else if (Current.Type == TT_StartOfName || Previous.isOneOf(tok::coloncolon, tok::equal) || @@ -560,7 +561,7 @@ private: State.Stack.back().LastSpace = State.Column; if (Current.isOneOf(tok::arrow, tok::period) && Current.Type != TT_DesignatedInitializerPeriod) - State.Stack.back().LastSpace += Current.TokenLength; + State.Stack.back().LastSpace += Current.CodePointCount; State.StartOfLineLevel = State.ParenLevel; State.LowestCallLevel = State.ParenLevel; @@ -595,8 +596,8 @@ private: State.Stack.back().VariablePos = State.Column; // Move over * and & if they are bound to the variable name. const FormatToken *Tok = &Previous; - while (Tok && State.Stack.back().VariablePos >= Tok->TokenLength) { - State.Stack.back().VariablePos -= Tok->TokenLength; + while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) { + State.Stack.back().VariablePos -= Tok->CodePointCount; if (Tok->SpacesRequiredBefore != 0) break; Tok = Tok->Previous; @@ -614,12 +615,12 @@ private: if (Current.Type == TT_ObjCSelectorName && State.Stack.back().ColonPos == 0) { if (State.Stack.back().Indent + Current.LongestObjCSelectorName > - State.Column + Spaces + Current.TokenLength) + State.Column + Spaces + Current.CodePointCount) State.Stack.back().ColonPos = State.Stack.back().Indent + Current.LongestObjCSelectorName; else State.Stack.back().ColonPos = - State.Column + Spaces + Current.TokenLength; + State.Column + Spaces + Current.CodePointCount; } if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr && @@ -671,7 +672,8 @@ private: State.LowestCallLevel = std::min(State.LowestCallLevel, State.ParenLevel); if (Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0) State.Stack.back().StartOfFunctionCall = - Current.LastInChainOfCalls ? 0 : State.Column + Current.TokenLength; + Current.LastInChainOfCalls ? 0 + : State.Column + Current.CodePointCount; } if (Current.Type == TT_CtorInitializerColon) { // Indent 2 from the column, so: @@ -779,7 +781,7 @@ private: State.StartOfStringLiteral = 0; } - State.Column += Current.TokenLength; + State.Column += Current.CodePointCount; State.NextToken = State.NextToken->Next; @@ -798,7 +800,7 @@ private: bool DryRun) { unsigned UnbreakableTailLength = Current.UnbreakableTailLength; llvm::OwningPtr<BreakableToken> Token; - unsigned StartColumn = State.Column - Current.TokenLength; + unsigned StartColumn = State.Column - Current.CodePointCount; unsigned OriginalStartColumn = SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) - 1; @@ -811,15 +813,16 @@ private: if (!LiteralData || *LiteralData != '"') return 0; - Token.reset(new BreakableStringLiteral(Current, StartColumn)); + Token.reset(new BreakableStringLiteral(Current, StartColumn, Encoding)); } else if (Current.Type == TT_BlockComment) { BreakableBlockComment *BBC = new BreakableBlockComment( - Style, Current, StartColumn, OriginalStartColumn, !Current.Previous); + Style, Current, StartColumn, OriginalStartColumn, !Current.Previous, + Encoding); Token.reset(BBC); } else if (Current.Type == TT_LineComment && (Current.Previous == NULL || Current.Previous->Type != TT_ImplicitStringLiteral)) { - Token.reset(new BreakableLineComment(Current, StartColumn)); + Token.reset(new BreakableLineComment(Current, StartColumn, Encoding)); } else { return 0; } @@ -837,27 +840,27 @@ private: Whitespaces); } unsigned TailOffset = 0; - unsigned RemainingTokenLength = + unsigned RemainingTokenColumns = Token->getLineLengthAfterSplit(LineIndex, TailOffset); - while (RemainingTokenLength > RemainingSpace) { + while (RemainingTokenColumns > RemainingSpace) { BreakableToken::Split Split = Token->getSplit(LineIndex, TailOffset, getColumnLimit()); if (Split.first == StringRef::npos) break; assert(Split.first != 0); - unsigned NewRemainingTokenLength = Token->getLineLengthAfterSplit( + unsigned NewRemainingTokenColumns = Token->getLineLengthAfterSplit( LineIndex, TailOffset + Split.first + Split.second); - assert(NewRemainingTokenLength < RemainingTokenLength); + assert(NewRemainingTokenColumns < RemainingTokenColumns); if (!DryRun) { Token->insertBreak(LineIndex, TailOffset, Split, Line.InPPDirective, Whitespaces); } TailOffset += Split.first + Split.second; - RemainingTokenLength = NewRemainingTokenLength; + RemainingTokenColumns = NewRemainingTokenColumns; Penalty += Style.PenaltyExcessCharacter; BreakInserted = true; } - PositionAfterLastLineInToken = RemainingTokenLength; + PositionAfterLastLineInToken = RemainingTokenColumns; } if (BreakInserted) { @@ -1080,13 +1083,16 @@ private: // Increasing count of \c StateNode items we have created. This is used // to create a deterministic order independent of the container. unsigned Count; + encoding::Encoding Encoding; }; class FormatTokenLexer { public: - FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr) + FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr, + encoding::Encoding Encoding) : FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex), - SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()) { + SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()), + Encoding(Encoding) { Lex.SetKeepWhitespaceMode(true); } @@ -1111,7 +1117,8 @@ private: FormatTok->Tok.getLocation().getLocWithOffset(1); FormatTok->WhitespaceRange = SourceRange(GreaterLocation, GreaterLocation); - FormatTok->TokenLength = 1; + FormatTok->ByteCount = 1; + FormatTok->CodePointCount = 1; GreaterStashed = false; return FormatTok; } @@ -1146,12 +1153,12 @@ private: } // Now FormatTok is the next non-whitespace token. - FormatTok->TokenLength = Text.size(); + FormatTok->ByteCount = Text.size(); TrailingWhitespace = 0; if (FormatTok->Tok.is(tok::comment)) { TrailingWhitespace = Text.size() - Text.rtrim().size(); - FormatTok->TokenLength -= TrailingWhitespace; + FormatTok->ByteCount -= TrailingWhitespace; } // In case the token starts with escaped newlines, we want to @@ -1164,7 +1171,7 @@ private: while (i + 1 < Text.size() && Text[i] == '\\' && Text[i + 1] == '\n') { // FIXME: ++FormatTok->NewlinesBefore is missing... WhitespaceLength += 2; - FormatTok->TokenLength -= 2; + FormatTok->ByteCount -= 2; i += 2; } @@ -1176,15 +1183,19 @@ private: if (FormatTok->Tok.is(tok::greatergreater)) { FormatTok->Tok.setKind(tok::greater); - FormatTok->TokenLength = 1; + FormatTok->ByteCount = 1; GreaterStashed = true; } + unsigned EncodingExtraBytes = + Text.size() - encoding::getCodePointCount(Text, Encoding); + FormatTok->CodePointCount = FormatTok->ByteCount - EncodingExtraBytes; + FormatTok->WhitespaceRange = SourceRange( WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); FormatTok->TokenText = StringRef( SourceMgr.getCharacterData(FormatTok->getStartOfNonWhitespace()), - FormatTok->TokenLength); + FormatTok->ByteCount); return FormatTok; } @@ -1194,6 +1205,7 @@ private: Lexer &Lex; SourceManager &SourceMgr; IdentifierTable IdentTable; + encoding::Encoding Encoding; llvm::SpecificBumpPtrAllocator<FormatToken> Allocator; SmallVector<FormatToken *, 16> Tokens; @@ -1209,17 +1221,22 @@ public: Formatter(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr, const std::vector<CharSourceRange> &Ranges) : Style(Style), Lex(Lex), SourceMgr(SourceMgr), - Whitespaces(SourceMgr, Style), Ranges(Ranges) {} + Whitespaces(SourceMgr, Style), Ranges(Ranges), + Encoding(encoding::detectEncoding(Lex.getBuffer())) { + DEBUG(llvm::dbgs() + << "File encoding: " + << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown") + << "\n"); + } virtual ~Formatter() {} tooling::Replacements format() { - FormatTokenLexer Tokens(Lex, SourceMgr); + FormatTokenLexer Tokens(Lex, SourceMgr, Encoding); UnwrappedLineParser Parser(Style, Tokens.lex(), *this); bool StructuralError = Parser.parse(); - TokenAnnotator Annotator(Style, SourceMgr, Lex, - Tokens.getIdentTable().get("in")); + TokenAnnotator Annotator(Style, Tokens.getIdentTable().get("in")); for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) { Annotator.annotate(AnnotatedLines[i]); } @@ -1290,7 +1307,7 @@ public: 1; } UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent, - TheLine.First, Whitespaces); + TheLine.First, Whitespaces, Encoding); Formatter.format(I + 1 != E ? &*(I + 1) : NULL); IndentForLevel[TheLine.Level] = LevelIndent; PreviousLineWasTouched = true; @@ -1556,7 +1573,7 @@ private: CharSourceRange LineRange = CharSourceRange::getCharRange( First->WhitespaceRange.getBegin().getLocWithOffset( First->LastNewlineOffset), - Last->Tok.getLocation().getLocWithOffset(Last->TokenLength - 1)); + Last->Tok.getLocation().getLocWithOffset(Last->ByteCount - 1)); return touchesRanges(LineRange); } @@ -1616,6 +1633,8 @@ private: WhitespaceManager Whitespaces; std::vector<CharSourceRange> Ranges; std::vector<AnnotatedLine> AnnotatedLines; + + encoding::Encoding Encoding; }; tooling::Replacements reformat(const FormatStyle &Style, Lexer &Lex, diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 4a5e20dd4c6..fd1bd7e1cf8 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -61,11 +61,12 @@ enum TokenType { struct FormatToken { FormatToken() : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0), - TokenLength(0), IsFirst(false), MustBreakBefore(false), - Type(TT_Unknown), SpacesRequiredBefore(0), CanBreakBefore(false), - ClosesTemplateDeclaration(false), ParameterCount(0), TotalLength(0), - UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0), - LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false), + ByteCount(0), CodePointCount(0), IsFirst(false), + MustBreakBefore(false), Type(TT_Unknown), SpacesRequiredBefore(0), + CanBreakBefore(false), ClosesTemplateDeclaration(false), + ParameterCount(0), TotalLength(0), UnbreakableTailLength(0), + BindingStrength(0), SplitPenalty(0), LongestObjCSelectorName(0), + FakeRParens(0), LastInChainOfCalls(false), PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL), Next(NULL) {} @@ -89,10 +90,14 @@ struct FormatToken { /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'. unsigned LastNewlineOffset; - /// \brief The length of the non-whitespace parts of the token. This is - /// necessary because we need to handle escaped newlines that are stored + /// \brief The number of bytes of the non-whitespace parts of the token. This + /// is necessary because we need to handle escaped newlines that are stored /// with the token. - unsigned TokenLength; + unsigned ByteCount; + + /// \brief The length of the non-whitespace parts of the token in CodePoints. + /// We need this to correctly measure number of columns a token spans. + unsigned CodePointCount; /// \brief Indicates that this is the first token. bool IsFirst; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 83dea841b5e..62177b3efd7 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -15,7 +15,6 @@ #include "TokenAnnotator.h" #include "clang/Basic/SourceManager.h" -#include "clang/Lex/Lexer.h" #include "llvm/Support/Debug.h" namespace clang { @@ -28,10 +27,9 @@ namespace format { /// into template parameter lists. class AnnotatingParser { public: - AnnotatingParser(SourceManager &SourceMgr, Lexer &Lex, AnnotatedLine &Line, - IdentifierInfo &Ident_in) - : SourceMgr(SourceMgr), Lex(Lex), Line(Line), CurrentToken(Line.First), - KeywordVirtualFound(false), NameFound(false), Ident_in(Ident_in) { + AnnotatingParser(AnnotatedLine &Line, IdentifierInfo &Ident_in) + : Line(Line), CurrentToken(Line.First), KeywordVirtualFound(false), + NameFound(false), Ident_in(Ident_in) { Contexts.push_back(Context(tok::unknown, 1, /*IsExpression=*/ false)); } @@ -295,9 +293,11 @@ private: Line.First->Type == TT_ObjCMethodSpecifier) { Tok->Type = TT_ObjCMethodExpr; Tok->Previous->Type = TT_ObjCSelectorName; - if (Tok->Previous->TokenLength > - Contexts.back().LongestObjCSelectorName) - Contexts.back().LongestObjCSelectorName = Tok->Previous->TokenLength; + if (Tok->Previous->CodePointCount > + Contexts.back().LongestObjCSelectorName) { + Contexts.back().LongestObjCSelectorName = + Tok->Previous->CodePointCount; + } if (Contexts.back().FirstObjCSelectorName == NULL) Contexts.back().FirstObjCSelectorName = Tok->Previous; } else if (Contexts.back().ColonIsForRangeExpr) { @@ -602,9 +602,7 @@ private: } else if (Current.isBinaryOperator()) { Current.Type = TT_BinaryOperator; } else if (Current.is(tok::comment)) { - std::string Data( - Lexer::getSpelling(Current.Tok, SourceMgr, Lex.getLangOpts())); - if (StringRef(Data).startswith("//")) + if (Current.TokenText.startswith("//")) Current.Type = TT_LineComment; else Current.Type = TT_BlockComment; @@ -748,23 +746,19 @@ private: case tok::kw_wchar_t: case tok::kw_bool: case tok::kw___underlying_type: - return true; case tok::annot_typename: case tok::kw_char16_t: case tok::kw_char32_t: case tok::kw_typeof: case tok::kw_decltype: - return Lex.getLangOpts().CPlusPlus; + return true; default: - break; + return false; } - return false; } SmallVector<Context, 8> Contexts; - SourceManager &SourceMgr; - Lexer &Lex; AnnotatedLine &Line; FormatToken *CurrentToken; bool KeywordVirtualFound; @@ -866,7 +860,7 @@ private: }; void TokenAnnotator::annotate(AnnotatedLine &Line) { - AnnotatingParser Parser(SourceMgr, Lex, Line, Ident_in); + AnnotatingParser Parser(Line, Ident_in); Line.Type = Parser.parseLine(); if (Line.Type == LT_Invalid) return; @@ -886,7 +880,7 @@ void TokenAnnotator::annotate(AnnotatedLine &Line) { } void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) { - Line.First->TotalLength = Line.First->TokenLength; + Line.First->TotalLength = Line.First->CodePointCount; if (!Line.First->Next) return; FormatToken *Current = Line.First->Next; @@ -920,7 +914,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) { Current->TotalLength = Current->Previous->TotalLength + Style.ColumnLimit; else Current->TotalLength = - Current->Previous->TotalLength + Current->TokenLength + + Current->Previous->TotalLength + Current->CodePointCount + Current->SpacesRequiredBefore; // FIXME: Only calculate this if CanBreakBefore is true once static // initializers etc. are sorted out. @@ -947,7 +941,7 @@ void TokenAnnotator::calculateUnbreakableTailLengths(AnnotatedLine &Line) { UnbreakableTailLength = 0; } else { UnbreakableTailLength += - Current->TokenLength + Current->SpacesRequiredBefore; + Current->CodePointCount + Current->SpacesRequiredBefore; } Current = Current->Previous; } @@ -1015,8 +1009,7 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, if (Right.is(tok::lessless)) { if (Left.is(tok::string_literal)) { - StringRef Content = - StringRef(Left.Tok.getLiteralData(), Left.TokenLength); + StringRef Content = Left.TokenText; Content = Content.drop_back(1).drop_front(1).trim(); if (Content.size() > 1 && (Content.back() == ':' || Content.back() == '=')) diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h index a0d680c91df..28d55a007c2 100644 --- a/clang/lib/Format/TokenAnnotator.h +++ b/clang/lib/Format/TokenAnnotator.h @@ -21,7 +21,6 @@ #include <string> namespace clang { -class Lexer; class SourceManager; namespace format { @@ -71,10 +70,8 @@ public: /// \c UnwrappedLine. class TokenAnnotator { public: - TokenAnnotator(const FormatStyle &Style, SourceManager &SourceMgr, Lexer &Lex, - IdentifierInfo &Ident_in) - : Style(Style), SourceMgr(SourceMgr), Lex(Lex), Ident_in(Ident_in) { - } + TokenAnnotator(const FormatStyle &Style, IdentifierInfo &Ident_in) + : Style(Style), Ident_in(Ident_in) {} void annotate(AnnotatedLine &Line); void calculateFormattingInformation(AnnotatedLine &Line); @@ -95,8 +92,6 @@ private: void calculateUnbreakableTailLengths(AnnotatedLine &Line); const FormatStyle &Style; - SourceManager &SourceMgr; - Lexer &Lex; // Contextual keywords: IdentifierInfo &Ident_in; |