summaryrefslogtreecommitdiffstats
path: root/clang
diff options
context:
space:
mode:
authorRichard Smith <richard-llvm@metafoo.co.uk>2017-12-14 13:15:08 +0000
committerRichard Smith <richard-llvm@metafoo.co.uk>2017-12-14 13:15:08 +0000
commit77091b167fd959e1ee0c4dad4ec44de43b6c95db (patch)
tree9e068262e88c9ae489457581de5bd31c914d2a78 /clang
parentbf484aa0f137c88729136f5a477e0a0594a58769 (diff)
downloadbcm5719-llvm-77091b167fd959e1ee0c4dad4ec44de43b6c95db.tar.gz
bcm5719-llvm-77091b167fd959e1ee0c4dad4ec44de43b6c95db.zip
Warn if we find a Unicode homoglyph for a symbol in an identifier.
Specifically, warn if: * we find a character that the language standard says we must treat as an identifier, and * that character is not reasonably an identifier character (it's a punctuation character or similar), and * it renders identically to a valid non-identifier character in common fixed-width fonts. Some tools "helpfully" substitute the surprising characters for the expected characters, and replacing semicolons with Greek question marks is a common "prank". llvm-svn: 320697
Diffstat (limited to 'clang')
-rw-r--r--clang/include/clang/Basic/DiagnosticLexKinds.td3
-rw-r--r--clang/lib/Lex/Lexer.cpp79
-rw-r--r--clang/test/Lexer/unicode.c5
3 files changed, 86 insertions, 1 deletions
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 89874b50f68..c664281ffcd 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -119,6 +119,9 @@ def err_non_ascii : Error<
def ext_unicode_whitespace : ExtWarn<
"treating Unicode character as whitespace">,
InGroup<DiagGroup<"unicode-whitespace">>;
+def warn_utf8_symbol_homoglyph : Warning<
+ "treating Unicode character <U+%0> as identifier character rather than "
+ "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>;
def err_hex_escape_no_digits : Error<
"\\%0 used with no following hex digits">;
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index aeb123a3603..830354ab23f 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -37,6 +37,7 @@
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/UnicodeCharRanges.h"
#include <algorithm>
#include <cassert>
@@ -1500,6 +1501,75 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
}
}
+/// After encountering UTF-8 character C and interpreting it as an identifier
+/// character, check whether it's a homoglyph for a common non-identifier
+/// source character that is unlikely to be an intentional identifier
+/// character and warn if so.
+static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
+ CharSourceRange Range) {
+ // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
+ struct HomoglyphPair {
+ uint32_t Character;
+ char LooksLike;
+ bool operator<(HomoglyphPair R) const { return Character < R.Character; } // ordered by code point only, as std::lower_bound below requires
+ };
+ static constexpr HomoglyphPair SortedHomoglyphs[] = {
+ {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
+ {U'\u037e', ';'}, // GREEK QUESTION MARK
+ {U'\u2212', '-'}, // MINUS SIGN
+ {U'\u2215', '/'}, // DIVISION SLASH
+ {U'\u2216', '\\'}, // SET MINUS
+ {U'\u2217', '*'}, // ASTERISK OPERATOR
+ {U'\u2223', '|'}, // DIVIDES
+ {U'\u2227', '^'}, // LOGICAL AND
+ {U'\u2236', ':'}, // RATIO
+ {U'\u223c', '~'}, // TILDE OPERATOR
+ {U'\ua789', ':'}, // MODIFIER LETTER COLON
+ {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
+ {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
+ {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
+ {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
+ {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
+ {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
+ {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
+ {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
+ {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
+ {U'\uff0c', ','}, // FULLWIDTH COMMA
+ {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
+ {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
+ {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
+ {U'\uff1a', ':'}, // FULLWIDTH COLON
+ {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
+ {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
+ {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
+ {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
+ {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
+ {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
+ {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
+ {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
+ {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
+ {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
+ {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
+ {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
+ {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
+ {U'\uff5e', '~'}, // FULLWIDTH TILDE
+ {0, 0} // sentinel: out of sort order, excluded from the search range below
+ };
+ auto Homoglyph =
+ std::lower_bound(std::begin(SortedHomoglyphs),
+ std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); // "- 1" keeps the sentinel out of the sorted range
+ if (Homoglyph->Character == C) { // on a miss this is a larger entry or the {0, 0} sentinel, so the dereference is always valid
+ llvm::SmallString<5> CharBuf;
+ {
+ llvm::raw_svector_ostream CharOS(CharBuf);
+ llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
+ }
+ const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; // one-character NUL-terminated string for the '%1' diagnostic argument
+ Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
+ << Range << CharBuf << LooksLikeStr;
+ }
+}
+
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
Token &Result) {
const char *UCNPtr = CurPtr + Size;
@@ -1534,10 +1604,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
return false;
- if (!isLexingRawMode())
+ if (!isLexingRawMode()) {
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr),
/*IsFirst=*/false);
+ maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, CurPtr, UnicodePtr));
+ }
CurPtr = UnicodePtr;
return true;
@@ -3737,6 +3810,7 @@ LexNextToken:
// We can't just reset CurPtr to BufferPtr because BufferPtr may point to
// an escaped newline.
--CurPtr;
+ const char *UTF8StartPtr = CurPtr;
llvm::ConversionResult Status =
llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
(const llvm::UTF8 *)BufferEnd,
@@ -3751,6 +3825,9 @@ LexNextToken:
// (We manually eliminate the tail call to avoid recursion.)
goto LexNextToken;
}
+ if (!isLexingRawMode())
+ maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
+ makeCharRange(*this, UTF8StartPtr, CurPtr));
return LexUnicode(Result, CodePoint, CurPtr);
}
diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c
index 30805d1acb2..30e353fa797 100644
--- a/clang/test/Lexer/unicode.c
+++ b/clang/test/Lexer/unicode.c
@@ -33,3 +33,8 @@ int main () {
int 🌷 = 🌵(🌹);
return 🌷;
}
+
+int n; = 3; // expected-warning {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
+int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}}
+ // expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}}
+int v=[=](auto){return~x;}(); // expected-warning 12{{treating Unicode character}}
OpenPOWER on IntegriCloud