diff options
author | Richard Smith <richard-llvm@metafoo.co.uk> | 2012-03-08 21:59:28 +0000 |
---|---|---|
committer | Richard Smith <richard-llvm@metafoo.co.uk> | 2012-03-08 21:59:28 +0000 |
commit | 812924502bb7fbe0525757576aa2d16072ab5a87 (patch) | |
tree | cf817c7931b543509af5f99d86be261a19e92b85 /clang | |
parent | 0ef86b0ea3392c672dd3ce69e32aa6d3d33603dd (diff) | |
download | bcm5719-llvm-812924502bb7fbe0525757576aa2d16072ab5a87.tar.gz bcm5719-llvm-812924502bb7fbe0525757576aa2d16072ab5a87.zip |
When checking the encoding of an 8-bit string literal, don't just check the
first codepoint! Also, don't reject empty raw string literals for spurious
"encoding" issues. Also, don't rely on undefined behavior in ConvertUTF.c.
llvm-svn: 152344
Diffstat (limited to 'clang')
-rw-r--r-- | clang/include/clang/Basic/ConvertUTF.h | 4 | ||||
-rw-r--r-- | clang/lib/Basic/ConvertUTF.c | 22 | ||||
-rw-r--r-- | clang/lib/Lex/LiteralSupport.cpp | 21 | ||||
-rw-r--r-- | clang/test/Lexer/cxx0x_raw_string_delim_length.cpp | 8 | ||||
-rw-r--r-- | clang/test/Lexer/string-literal-encoding.c | 15 |
5 files changed, 51 insertions, 19 deletions
diff --git a/clang/include/clang/Basic/ConvertUTF.h b/clang/include/clang/Basic/ConvertUTF.h index e376b7d7973..ec6b973e6a7 100644 --- a/clang/include/clang/Basic/ConvertUTF.h +++ b/clang/include/clang/Basic/ConvertUTF.h @@ -151,9 +151,11 @@ ConversionResult ConvertUTF16toUTF32 ( ConversionResult ConvertUTF32toUTF16 ( const UTF32** sourceStart, const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); -#endif Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); +#endif + +Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd); #ifdef __cplusplus } diff --git a/clang/lib/Basic/ConvertUTF.c b/clang/lib/Basic/ConvertUTF.c index b3fa9169344..e1970039e16 100644 --- a/clang/lib/Basic/ConvertUTF.c +++ b/clang/lib/Basic/ConvertUTF.c @@ -387,7 +387,7 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { */ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { int length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { + if (length > sourceEnd - source) { return false; } return isLegalUTF8(source, length); @@ -395,6 +395,22 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { /* --------------------------------------------------------------------- */ +/* + * Exported function to return whether a UTF-8 string is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) { + while (source != sourceEnd) { + int length = trailingBytesForUTF8[*source] + 1; + if (length > sourceEnd - source || !isLegalUTF8(source, length)) + return false; + source += length; + } + return true; +} + +/* --------------------------------------------------------------------- */ + ConversionResult ConvertUTF8toUTF16 ( const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { @@ -404,7 +420,7 @@ ConversionResult ConvertUTF8toUTF16 ( while (source < sourceEnd) { UTF32 ch = 0; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { + if (extraBytesToRead >= sourceEnd - source) { result = sourceExhausted; break; } /* Do this check whether lenient or strict */ @@ -477,7 +493,7 @@ ConversionResult ConvertUTF8toUTF32 ( while (source < sourceEnd) { UTF32 ch = 0; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { + if (extraBytesToRead >= sourceEnd - source) { result = sourceExhausted; break; } /* Do this check whether lenient or strict */ diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 901d96d21ae..e0a5ba39d0f 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -333,7 +333,7 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, /// decimal-constant integer-suffix /// octal-constant integer-suffix /// hexadecimal-constant integer-suffix -/// user-defiend-integer-literal: [C++11 lex.ext] +/// user-defined-integer-literal: [C++11 lex.ext] /// decimal-literal ud-suffix /// octal-literal ud-suffix /// hexadecimal-literal ud-suffix @@ -1167,17 +1167,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ ++ThisTokBuf; ++ThisTokBuf; // skip '(' - // remove same number of characters from the end - if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix)) - ThisTokEnd -= (ThisTokBuf - Prefix); + // Remove same number of characters from the end + ThisTokEnd -= ThisTokBuf - Prefix; + assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal"); // Copy the string over - if (CopyStringFragment(StringRef(ThisTokBuf,ThisTokEnd-ThisTokBuf))) - { + if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf))) if (DiagnoseBadString(StringToks[i])) hadError = true; - } - } else { assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); ++ThisTokBuf; // skip " @@ -1204,11 +1201,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); // Copy the character span over. - if (CopyStringFragment(StringRef(InStart,ThisTokBuf-InStart))) - { + if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart))) if (DiagnoseBadString(StringToks[i])) hadError = true; - } continue; } // Is this a Universal Character Name escape? @@ -1292,8 +1287,8 @@ bool StringLiteralParser::CopyStringFragment(StringRef Fragment) { ConversionResult result = conversionOK; // Copy the character span over. if (CharByteWidth == 1) { - if (!isLegalUTF8Sequence(reinterpret_cast<const UTF8*>(Fragment.begin()), - reinterpret_cast<const UTF8*>(Fragment.end()))) + if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Fragment.begin()), + reinterpret_cast<const UTF8*>(Fragment.end()))) result = sourceIllegal; memcpy(ResultPtr, Fragment.data(), Fragment.size()); ResultPtr += Fragment.size(); diff --git a/clang/test/Lexer/cxx0x_raw_string_delim_length.cpp b/clang/test/Lexer/cxx0x_raw_string_delim_length.cpp index e7d5c6f8cd2..b9f6d13ab74 100644 --- a/clang/test/Lexer/cxx0x_raw_string_delim_length.cpp +++ b/clang/test/Lexer/cxx0x_raw_string_delim_length.cpp @@ -1,3 +1,7 @@ -// RUN: %clang_cc1 -std=c++11 -E %s 2>&1 | grep 'error: raw string delimiter longer than 16 characters' +// RUN: %clang_cc1 -std=c++11 -verify %s -const char *str = R"abcdefghijkmnopqrstuvwxyz(abcdef)abcdefghijkmnopqrstuvwxyz"; +const char *str1 = R"(abcdef)"; // ok +const char *str2 = R"foo()foo"; // ok +const char *str3 = R"()"; // ok +// FIXME: recover better than this. +const char *str4 = R"abcdefghijkmnopqrstuvwxyz(abcdef)abcdefghijkmnopqrstuvwxyz"; // expected-error {{raw string delimiter longer than 16 characters}} expected-error {{expected expression}} diff --git a/clang/test/Lexer/string-literal-encoding.c b/clang/test/Lexer/string-literal-encoding.c index aa7cb73f626..57097dca88a 100644 --- a/clang/test/Lexer/string-literal-encoding.c +++ b/clang/test/Lexer/string-literal-encoding.c @@ -15,4 +15,19 @@ void f() { char const *g = "Àéîõü"; // expected-warning {{illegal character encoding in string literal}} char const *h = u8"Àéîõü"; // expected-error {{illegal character encoding in string literal}} + char const *i = R"(Àéîõü)"; // expected-warning {{illegal character encoding in string literal}} +} + +void g() { + wchar_t const *a = L"foo Àéîõü"; // expected-error {{illegal character encoding in string literal}} + + char16_t const *b = u"foo Àéîõü"; // expected-error {{illegal character encoding in string literal}} + char32_t const *c = U"foo Àéîõü"; // expected-error {{illegal character encoding in string literal}} + wchar_t const *d = LR"(foo Àéîõü)"; // expected-error {{illegal character encoding in string literal}} + char16_t const *e = uR"(foo Àéîõü)"; // expected-error {{illegal character encoding in string literal}} + char32_t const *f = UR"(foo Àéîõü)"; // expected-error {{illegal character encoding in string literal}} + + char const *g = "foo Àéîõü"; // expected-warning {{illegal character encoding in string literal}} + char const *h = u8"foo Àéîõü"; // expected-error {{illegal character encoding in string literal}} + char const *i = R"(foo Àéîõü)"; // expected-warning {{illegal character encoding in string literal}} } |