improve highlighting of invalid string encodings

Limit the highlight to exactly the bytes of the bad encoding, and highlight every bad encoding in a string rather than only the first.

llvm-svn: 166900
author: Seth Cantrell <seth.cantrell@gmail.com> 2012-10-28 18:24:46 +0000
committer: Seth Cantrell <seth.cantrell@gmail.com> 2012-10-28 18:24:46 +0000
commit: 4cfc817a9ad9f30f3380f7f5cd816f65cd1d4c49 (patch)
tree: 891af0cbde1af7e4f0470cbbdc73c0ba9cb709af /clang/lib/Lex
parent: 59a8e49f01ee7ef465deced6f86ef6d0ee9b9319 (diff)
download: bcm5719-llvm-4cfc817a9ad9f30f3380f7f5cd816f65cd1d4c49.tar.gz
bcm5719-llvm-4cfc817a9ad9f30f3380f7f5cd816f65cd1d4c49.zip
1 files changed, 49 insertions, 11 deletions
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index 9171449968c..2896dc3bf7d 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -49,6 +49,20 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
   }
 }
 
+static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
+                                           FullSourceLoc TokLoc,
+                                           const char *TokBegin,
+                                           const char *TokRangeBegin,
+                                           const char *TokRangeEnd) {
+  SourceLocation Begin =
+    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
+                                   TokLoc.getManager(), Features);
+  SourceLocation End =
+    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
+                                   TokLoc.getManager(), Features);
+  return CharSourceRange::getCharRange(Begin, End);
+}
+
 /// \brief Produce a diagnostic highlighting some portion of a literal.
 ///
 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
@@ -61,11 +75,8 @@ static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
   SourceLocation Begin =
     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
                                    TokLoc.getManager(), Features);
-  SourceLocation End =
-    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
-                                   TokLoc.getManager(), Features);
-  return Diags->Report(Begin, DiagID)
-      << CharSourceRange::getCharRange(Begin, End);
+  return Diags->Report(Begin, DiagID) <<
+    MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
 }
 
 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
@@ -1372,6 +1383,15 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
   }
 }
 
+static const char *resync_utf8(const char *err, const char *end) {
+    if (err==end)
+        return end;
+    end = err + std::min<unsigned>(getNumBytesForUTF8(*err), end-err);
+    while (++err!=end && (*err&0xC0)==0x80)
+      ;
+    return err;
+}
+
 /// \brief This function copies from Fragment, which is a sequence of bytes
 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
 /// Performs widening for multi-byte characters.
@@ -1381,7 +1401,6 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok,
   const UTF8 *ErrorPtrTmp;
   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
     return false;
-  const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
 
   // If we see bad encoding for unprefixed string literals, warn and
   // simply copy the byte values, for compatibility with gcc and older
@@ -1391,12 +1410,31 @@ bool StringLiteralParser::CopyStringFragment(const Token &Tok,
     memcpy(ResultPtr, Fragment.data(), Fragment.size());
     ResultPtr += Fragment.size();
   }
+
   if (Diags) {
-    Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
-         ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
-                                                 Fragment.end() - ErrorPtr),
-         NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
-                              : diag::err_bad_string_encoding);
+    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
+
+    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
+    const DiagnosticBuilder &Builder =
+      Diag(Diags, Features, SourceLoc, TokBegin,
+           ErrorPtr, resync_utf8(ErrorPtr, Fragment.end()),
+           NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
+                                : diag::err_bad_string_encoding);
+
+    char *SavedResultPtr = ResultPtr;
+    const char *NextStart = resync_utf8(ErrorPtr, Fragment.end());
+    StringRef NextFragment(NextStart, Fragment.end()-NextStart);
+
+    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, ResultPtr,
+                              ErrorPtrTmp)) {
+      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
+      NextStart = resync_utf8(ErrorPtr, Fragment.end());
+      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
+                                     ErrorPtr, NextStart);
+      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
+    }
+
+    ResultPtr = SavedResultPtr;
   }
   return !NoErrorOnBadEncoding;
 }
author	Seth Cantrell <seth.cantrell@gmail.com>	2012-10-28 18:24:46 +0000
committer	Seth Cantrell <seth.cantrell@gmail.com>	2012-10-28 18:24:46 +0000
commit	4cfc817a9ad9f30f3380f7f5cd816f65cd1d4c49 (patch)
tree	891af0cbde1af7e4f0470cbbdc73c0ba9cb709af /clang/lib/Lex
parent	59a8e49f01ee7ef465deced6f86ef6d0ee9b9319 (diff)
download	bcm5719-llvm-4cfc817a9ad9f30f3380f7f5cd816f65cd1d4c49.tar.gz bcm5719-llvm-4cfc817a9ad9f30f3380f7f5cd816f65cd1d4c49.zip