diff options
author | Richard Smith <richard-llvm@metafoo.co.uk> | 2017-12-14 13:15:08 +0000 |
---|---|---|
committer | Richard Smith <richard-llvm@metafoo.co.uk> | 2017-12-14 13:15:08 +0000 |
commit | 77091b167fd959e1ee0c4dad4ec44de43b6c95db (patch) | |
tree | 9e068262e88c9ae489457581de5bd31c914d2a78 | |
parent | bf484aa0f137c88729136f5a477e0a0594a58769 (diff) | |
download | bcm5719-llvm-77091b167fd959e1ee0c4dad4ec44de43b6c95db.tar.gz bcm5719-llvm-77091b167fd959e1ee0c4dad4ec44de43b6c95db.zip |
Warn if we find a Unicode homoglyph for a symbol in an identifier.
Specifically, warn if:
* we find a character that the language standard says we must treat as an
identifier, and
* that character is not reasonably an identifier character (it's a punctuation
character or similar), and
* it renders identically to a valid non-identifier character in common
fixed-width fonts.
Some tools "helpfully" substitute the surprising characters for the expected
characters, and replacing semicolons with Greek question marks is a common
"prank".
llvm-svn: 320697
-rw-r--r-- | clang/include/clang/Basic/DiagnosticLexKinds.td | 3 | ||||
-rw-r--r-- | clang/lib/Lex/Lexer.cpp | 79 | ||||
-rw-r--r-- | clang/test/Lexer/unicode.c | 5 |
3 files changed, 86 insertions, 1 deletions
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 89874b50f68..c664281ffcd 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -119,6 +119,9 @@ def err_non_ascii : Error< def ext_unicode_whitespace : ExtWarn< "treating Unicode character as whitespace">, InGroup<DiagGroup<"unicode-whitespace">>; +def warn_utf8_symbol_homoglyph : Warning< + "treating Unicode character <U+%0> as identifier character rather than " + "as '%1' symbol">, InGroup<DiagGroup<"unicode-homoglyph">>; def err_hex_escape_no_digits : Error< "\\%0 used with no following hex digits">; diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index aeb123a3603..830354ab23f 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -37,6 +37,7 @@ #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/NativeFormatting.h" #include "llvm/Support/UnicodeCharRanges.h" #include <algorithm> #include <cassert> @@ -1500,6 +1501,75 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, } } +/// After encountering UTF-8 character C and interpreting it as an identifier +/// character, check whether it's a homoglyph for a common non-identifier +/// source character that is unlikely to be an intentional identifier +/// character and warn if so. +static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, + CharSourceRange Range) { + // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 
+ struct HomoglyphPair { + uint32_t Character; + char LooksLike; + bool operator<(HomoglyphPair R) const { return Character < R.Character; } + }; + static constexpr HomoglyphPair SortedHomoglyphs[] = { + {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK + {U'\u037e', ';'}, // GREEK QUESTION MARK + {U'\u2212', '-'}, // MINUS SIGN + {U'\u2215', '/'}, // DIVISION SLASH + {U'\u2216', '\\'}, // SET MINUS + {U'\u2217', '*'}, // ASTERISK OPERATOR + {U'\u2223', '|'}, // DIVIDES + {U'\u2227', '^'}, // LOGICAL AND + {U'\u2236', ':'}, // RATIO + {U'\u223c', '~'}, // TILDE OPERATOR + {U'\ua789', ':'}, // MODIFIER LETTER COLON + {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK + {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN + {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN + {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN + {U'\uff06', '&'}, // FULLWIDTH AMPERSAND + {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS + {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS + {U'\uff0a', '*'}, // FULLWIDTH ASTERISK + {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN + {U'\uff0c', ','}, // FULLWIDTH COMMA + {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS + {U'\uff0e', '.'}, // FULLWIDTH FULL STOP + {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS + {U'\uff1a', ':'}, // FULLWIDTH COLON + {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON + {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN + {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN + {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN + {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK + {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT + {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET + {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS + {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET + {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT + {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET + {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE + {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET + {U'\uff5e', '~'}, // FULLWIDTH TILDE + {0, 0} + }; + auto Homoglyph = 
std::lower_bound(std::begin(SortedHomoglyphs), + std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); + if (Homoglyph->Character == C) { + llvm::SmallString<5> CharBuf; + { + llvm::raw_svector_ostream CharOS(CharBuf); + llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); + } + const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; + Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) + << Range << CharBuf << LooksLikeStr; + } +} + bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, Token &Result) { const char *UCNPtr = CurPtr + Size; @@ -1534,10 +1604,13 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) return false; - if (!isLexingRawMode()) + if (!isLexingRawMode()) { maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr)); + } CurPtr = UnicodePtr; return true; @@ -3737,6 +3810,7 @@ LexNextToken: // We can't just reset CurPtr to BufferPtr because BufferPtr may point to // an escaped newline. --CurPtr; + const char *UTF8StartPtr = CurPtr; llvm::ConversionResult Status = llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, (const llvm::UTF8 *)BufferEnd, @@ -3751,6 +3825,9 @@ LexNextToken: // (We manually eliminate the tail call to avoid recursion.) 
goto LexNextToken; } + if (!isLexingRawMode()) + maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, + makeCharRange(*this, UTF8StartPtr, CurPtr)); return LexUnicode(Result, CodePoint, CurPtr); } diff --git a/clang/test/Lexer/unicode.c b/clang/test/Lexer/unicode.c index 30805d1acb2..30e353fa797 100644 --- a/clang/test/Lexer/unicode.c +++ b/clang/test/Lexer/unicode.c @@ -33,3 +33,8 @@ int main () { int 🌷 = 🌵(🌹); return 🌷; } + +int n; = 3; // expected-warning {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}} +int *n꞉꞉v = &n;; // expected-warning 2{{treating Unicode character <U+A789> as identifier character rather than as ':' symbol}} + // expected-warning@-1 {{treating Unicode character <U+037E> as identifier character rather than as ';' symbol}} +int v＝［＝］（auto）｛return～x；｝（）; // expected-warning 12{{treating Unicode character}} |