summaryrefslogtreecommitdiffstats
path: root/clang/lib/Format/Format.cpp
diff options
context:
space:
mode:
authorAlexander Kornienko <alexfh@google.com>2013-06-05 14:09:10 +0000
committerAlexander Kornienko <alexfh@google.com>2013-06-05 14:09:10 +0000
commitffcc010767573c657ee0e6c0c9ea82ca124003ab (patch)
tree945819aeda9957c1232c7e2f7329e0c7a147b3e2 /clang/lib/Format/Format.cpp
parent218f6d8f59f55c848d335d89cbdd84706f7e096c (diff)
downloadbcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.tar.gz
bcm5719-llvm-ffcc010767573c657ee0e6c0c9ea82ca124003ab.zip
UTF-8 support for clang-format.
Summary: Detect whether the file is valid UTF-8 and, if it is, count code points instead of just using the number of bytes in (hopefully) all places where the number of columns is needed. In particular, use the new FormatToken.CodePointCount instead of TokenLength where appropriate. Changed the BreakableToken implementations to respect UTF-8 character boundaries when in UTF-8 mode. Reviewers: klimek, djasper Reviewed By: djasper CC: cfe-commits, rsmith, gribozavr Differential Revision: http://llvm-reviews.chandlerc.com/D918 llvm-svn: 183312
Diffstat (limited to 'clang/lib/Format/Format.cpp')
-rw-r--r--clang/lib/Format/Format.cpp93
1 files changed, 56 insertions, 37 deletions
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 63bf09317e3..9dd5e4a0f21 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -243,10 +243,11 @@ public:
UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr,
const AnnotatedLine &Line, unsigned FirstIndent,
const FormatToken *RootToken,
- WhitespaceManager &Whitespaces)
+ WhitespaceManager &Whitespaces,
+ encoding::Encoding Encoding)
: Style(Style), SourceMgr(SourceMgr), Line(Line),
FirstIndent(FirstIndent), RootToken(RootToken),
- Whitespaces(Whitespaces), Count(0) {}
+ Whitespaces(Whitespaces), Count(0), Encoding(Encoding) {}
/// \brief Formats an \c UnwrappedLine.
void format(const AnnotatedLine *NextLine) {
@@ -484,7 +485,7 @@ private:
State.NextToken->WhitespaceRange.getEnd()) -
SourceMgr.getSpellingColumnNumber(
State.NextToken->WhitespaceRange.getBegin());
- State.Column += WhitespaceLength + State.NextToken->TokenLength;
+ State.Column += WhitespaceLength + State.NextToken->CodePointCount;
State.NextToken = State.NextToken->Next;
return 0;
}
@@ -520,11 +521,11 @@ private:
Line.StartsDefinition)) {
State.Column = State.Stack.back().Indent;
} else if (Current.Type == TT_ObjCSelectorName) {
- if (State.Stack.back().ColonPos > Current.TokenLength) {
- State.Column = State.Stack.back().ColonPos - Current.TokenLength;
+ if (State.Stack.back().ColonPos > Current.CodePointCount) {
+ State.Column = State.Stack.back().ColonPos - Current.CodePointCount;
} else {
State.Column = State.Stack.back().Indent;
- State.Stack.back().ColonPos = State.Column + Current.TokenLength;
+ State.Stack.back().ColonPos = State.Column + Current.CodePointCount;
}
} else if (Current.Type == TT_StartOfName ||
Previous.isOneOf(tok::coloncolon, tok::equal) ||
@@ -560,7 +561,7 @@ private:
State.Stack.back().LastSpace = State.Column;
if (Current.isOneOf(tok::arrow, tok::period) &&
Current.Type != TT_DesignatedInitializerPeriod)
- State.Stack.back().LastSpace += Current.TokenLength;
+ State.Stack.back().LastSpace += Current.CodePointCount;
State.StartOfLineLevel = State.ParenLevel;
State.LowestCallLevel = State.ParenLevel;
@@ -595,8 +596,8 @@ private:
State.Stack.back().VariablePos = State.Column;
// Move over * and & if they are bound to the variable name.
const FormatToken *Tok = &Previous;
- while (Tok && State.Stack.back().VariablePos >= Tok->TokenLength) {
- State.Stack.back().VariablePos -= Tok->TokenLength;
+ while (Tok && State.Stack.back().VariablePos >= Tok->CodePointCount) {
+ State.Stack.back().VariablePos -= Tok->CodePointCount;
if (Tok->SpacesRequiredBefore != 0)
break;
Tok = Tok->Previous;
@@ -614,12 +615,12 @@ private:
if (Current.Type == TT_ObjCSelectorName &&
State.Stack.back().ColonPos == 0) {
if (State.Stack.back().Indent + Current.LongestObjCSelectorName >
- State.Column + Spaces + Current.TokenLength)
+ State.Column + Spaces + Current.CodePointCount)
State.Stack.back().ColonPos =
State.Stack.back().Indent + Current.LongestObjCSelectorName;
else
State.Stack.back().ColonPos =
- State.Column + Spaces + Current.TokenLength;
+ State.Column + Spaces + Current.CodePointCount;
}
if (Previous.opensScope() && Previous.Type != TT_ObjCMethodExpr &&
@@ -671,7 +672,8 @@ private:
State.LowestCallLevel = std::min(State.LowestCallLevel, State.ParenLevel);
if (Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0)
State.Stack.back().StartOfFunctionCall =
- Current.LastInChainOfCalls ? 0 : State.Column + Current.TokenLength;
+ Current.LastInChainOfCalls ? 0
+ : State.Column + Current.CodePointCount;
}
if (Current.Type == TT_CtorInitializerColon) {
// Indent 2 from the column, so:
@@ -779,7 +781,7 @@ private:
State.StartOfStringLiteral = 0;
}
- State.Column += Current.TokenLength;
+ State.Column += Current.CodePointCount;
State.NextToken = State.NextToken->Next;
@@ -798,7 +800,7 @@ private:
bool DryRun) {
unsigned UnbreakableTailLength = Current.UnbreakableTailLength;
llvm::OwningPtr<BreakableToken> Token;
- unsigned StartColumn = State.Column - Current.TokenLength;
+ unsigned StartColumn = State.Column - Current.CodePointCount;
unsigned OriginalStartColumn =
SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
1;
@@ -811,15 +813,16 @@ private:
if (!LiteralData || *LiteralData != '"')
return 0;
- Token.reset(new BreakableStringLiteral(Current, StartColumn));
+ Token.reset(new BreakableStringLiteral(Current, StartColumn, Encoding));
} else if (Current.Type == TT_BlockComment) {
BreakableBlockComment *BBC = new BreakableBlockComment(
- Style, Current, StartColumn, OriginalStartColumn, !Current.Previous);
+ Style, Current, StartColumn, OriginalStartColumn, !Current.Previous,
+ Encoding);
Token.reset(BBC);
} else if (Current.Type == TT_LineComment &&
(Current.Previous == NULL ||
Current.Previous->Type != TT_ImplicitStringLiteral)) {
- Token.reset(new BreakableLineComment(Current, StartColumn));
+ Token.reset(new BreakableLineComment(Current, StartColumn, Encoding));
} else {
return 0;
}
@@ -837,27 +840,27 @@ private:
Whitespaces);
}
unsigned TailOffset = 0;
- unsigned RemainingTokenLength =
+ unsigned RemainingTokenColumns =
Token->getLineLengthAfterSplit(LineIndex, TailOffset);
- while (RemainingTokenLength > RemainingSpace) {
+ while (RemainingTokenColumns > RemainingSpace) {
BreakableToken::Split Split =
Token->getSplit(LineIndex, TailOffset, getColumnLimit());
if (Split.first == StringRef::npos)
break;
assert(Split.first != 0);
- unsigned NewRemainingTokenLength = Token->getLineLengthAfterSplit(
+ unsigned NewRemainingTokenColumns = Token->getLineLengthAfterSplit(
LineIndex, TailOffset + Split.first + Split.second);
- assert(NewRemainingTokenLength < RemainingTokenLength);
+ assert(NewRemainingTokenColumns < RemainingTokenColumns);
if (!DryRun) {
Token->insertBreak(LineIndex, TailOffset, Split, Line.InPPDirective,
Whitespaces);
}
TailOffset += Split.first + Split.second;
- RemainingTokenLength = NewRemainingTokenLength;
+ RemainingTokenColumns = NewRemainingTokenColumns;
Penalty += Style.PenaltyExcessCharacter;
BreakInserted = true;
}
- PositionAfterLastLineInToken = RemainingTokenLength;
+ PositionAfterLastLineInToken = RemainingTokenColumns;
}
if (BreakInserted) {
@@ -1080,13 +1083,16 @@ private:
// Increasing count of \c StateNode items we have created. This is used
// to create a deterministic order independent of the container.
unsigned Count;
+ encoding::Encoding Encoding;
};
class FormatTokenLexer {
public:
- FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr)
+ FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr,
+ encoding::Encoding Encoding)
: FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex),
- SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()) {
+ SourceMgr(SourceMgr), IdentTable(Lex.getLangOpts()),
+ Encoding(Encoding) {
Lex.SetKeepWhitespaceMode(true);
}
@@ -1111,7 +1117,8 @@ private:
FormatTok->Tok.getLocation().getLocWithOffset(1);
FormatTok->WhitespaceRange =
SourceRange(GreaterLocation, GreaterLocation);
- FormatTok->TokenLength = 1;
+ FormatTok->ByteCount = 1;
+ FormatTok->CodePointCount = 1;
GreaterStashed = false;
return FormatTok;
}
@@ -1146,12 +1153,12 @@ private:
}
// Now FormatTok is the next non-whitespace token.
- FormatTok->TokenLength = Text.size();
+ FormatTok->ByteCount = Text.size();
TrailingWhitespace = 0;
if (FormatTok->Tok.is(tok::comment)) {
TrailingWhitespace = Text.size() - Text.rtrim().size();
- FormatTok->TokenLength -= TrailingWhitespace;
+ FormatTok->ByteCount -= TrailingWhitespace;
}
// In case the token starts with escaped newlines, we want to
@@ -1164,7 +1171,7 @@ private:
while (i + 1 < Text.size() && Text[i] == '\\' && Text[i + 1] == '\n') {
// FIXME: ++FormatTok->NewlinesBefore is missing...
WhitespaceLength += 2;
- FormatTok->TokenLength -= 2;
+ FormatTok->ByteCount -= 2;
i += 2;
}
@@ -1176,15 +1183,19 @@ private:
if (FormatTok->Tok.is(tok::greatergreater)) {
FormatTok->Tok.setKind(tok::greater);
- FormatTok->TokenLength = 1;
+ FormatTok->ByteCount = 1;
GreaterStashed = true;
}
+ unsigned EncodingExtraBytes =
+ Text.size() - encoding::getCodePointCount(Text, Encoding);
+ FormatTok->CodePointCount = FormatTok->ByteCount - EncodingExtraBytes;
+
FormatTok->WhitespaceRange = SourceRange(
WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
FormatTok->TokenText = StringRef(
SourceMgr.getCharacterData(FormatTok->getStartOfNonWhitespace()),
- FormatTok->TokenLength);
+ FormatTok->ByteCount);
return FormatTok;
}
@@ -1194,6 +1205,7 @@ private:
Lexer &Lex;
SourceManager &SourceMgr;
IdentifierTable IdentTable;
+ encoding::Encoding Encoding;
llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
SmallVector<FormatToken *, 16> Tokens;
@@ -1209,17 +1221,22 @@ public:
Formatter(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr,
const std::vector<CharSourceRange> &Ranges)
: Style(Style), Lex(Lex), SourceMgr(SourceMgr),
- Whitespaces(SourceMgr, Style), Ranges(Ranges) {}
+ Whitespaces(SourceMgr, Style), Ranges(Ranges),
+ Encoding(encoding::detectEncoding(Lex.getBuffer())) {
+ DEBUG(llvm::dbgs()
+ << "File encoding: "
+ << (Encoding == encoding::Encoding_UTF8 ? "UTF8" : "unknown")
+ << "\n");
+ }
virtual ~Formatter() {}
tooling::Replacements format() {
- FormatTokenLexer Tokens(Lex, SourceMgr);
+ FormatTokenLexer Tokens(Lex, SourceMgr, Encoding);
UnwrappedLineParser Parser(Style, Tokens.lex(), *this);
bool StructuralError = Parser.parse();
- TokenAnnotator Annotator(Style, SourceMgr, Lex,
- Tokens.getIdentTable().get("in"));
+ TokenAnnotator Annotator(Style, Tokens.getIdentTable().get("in"));
for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) {
Annotator.annotate(AnnotatedLines[i]);
}
@@ -1290,7 +1307,7 @@ public:
1;
}
UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent,
- TheLine.First, Whitespaces);
+ TheLine.First, Whitespaces, Encoding);
Formatter.format(I + 1 != E ? &*(I + 1) : NULL);
IndentForLevel[TheLine.Level] = LevelIndent;
PreviousLineWasTouched = true;
@@ -1556,7 +1573,7 @@ private:
CharSourceRange LineRange = CharSourceRange::getCharRange(
First->WhitespaceRange.getBegin().getLocWithOffset(
First->LastNewlineOffset),
- Last->Tok.getLocation().getLocWithOffset(Last->TokenLength - 1));
+ Last->Tok.getLocation().getLocWithOffset(Last->ByteCount - 1));
return touchesRanges(LineRange);
}
@@ -1616,6 +1633,8 @@ private:
WhitespaceManager Whitespaces;
std::vector<CharSourceRange> Ranges;
std::vector<AnnotatedLine> AnnotatedLines;
+
+ encoding::Encoding Encoding;
};
tooling::Replacements reformat(const FormatStyle &Style, Lexer &Lex,
OpenPOWER on IntegriCloud