diff options
Diffstat (limited to 'llvm/tools/llvm-rc/ResourceFileWriter.cpp')
-rw-r--r-- | llvm/tools/llvm-rc/ResourceFileWriter.cpp | 159 |
1 files changed, 155 insertions, 4 deletions
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp index 1d23fb966f6..8b234ae96d6 100644 --- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp +++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp @@ -122,24 +122,175 @@ enum class NullHandlingMethod { CutAtDoubleNull // Terminate string on '\0\0'; strip final '\0'. }; -// Parses an identifier or string and returns a processed version of it. -// For now, it only strips the string boundaries, but TODO: +// Parses an identifier or string and returns a processed version of it: +// * String the string boundary quotes. // * Squash "" to a single ". // * Replace the escape sequences with their processed version. // For identifiers, this is no-op. static Error processString(StringRef Str, NullHandlingMethod NullHandler, bool &IsLongString, SmallVectorImpl<UTF16> &Result) { bool IsString = stripQuotes(Str, IsLongString); - convertUTF8ToUTF16String(Str, Result); + SmallVector<UTF16, 128> Chars; + convertUTF8ToUTF16String(Str, Chars); if (!IsString) { // It's an identifier if it's not a string. Make all characters uppercase. - for (UTF16 &Ch : Result) { + for (UTF16 &Ch : Chars) { assert(Ch <= 0x7F && "We didn't allow identifiers to be non-ASCII"); Ch = toupper(Ch); } + Result.swap(Chars); return Error::success(); } + Result.reserve(Chars.size()); + size_t Pos = 0; + + auto AddRes = [&Result, NullHandler, IsLongString](UTF16 Char) -> Error { + if (!IsLongString) { + if (NullHandler == NullHandlingMethod::UserResource) { + // Narrow strings in user-defined resources are *not* output in + // UTF-16 format. + if (Char > 0xFF) + return createError("Non-8-bit codepoint (" + Twine(Char) + + ") can't occur in a user-defined narrow string"); + + } else { + // In case of narrow non-user strings, Windows RC converts + // [0x80, 0xFF] chars according to the current codepage. + // There is no 'codepage' concept settled in every supported platform, + // so we should reject such inputs. + if (Char > 0x7F && Char <= 0xFF) + return createError("Non-ASCII 8-bit codepoint (" + Twine(Char) + + ") can't " + "occur in a non-Unicode string"); + } + } + + Result.push_back(Char); + return Error::success(); + }; + + while (Pos < Chars.size()) { + UTF16 CurChar = Chars[Pos]; + ++Pos; + + // Strip double "". + if (CurChar == '"') { + if (Pos == Chars.size() || Chars[Pos] != '"') + return createError("Expected \"\""); + ++Pos; + RETURN_IF_ERROR(AddRes('"')); + continue; + } + + if (CurChar == '\\') { + UTF16 TypeChar = Chars[Pos]; + ++Pos; + + if (TypeChar == 'x' || TypeChar == 'X') { + // Read a hex number. Max number of characters to read differs between + // narrow and wide strings. + UTF16 ReadInt = 0; + size_t RemainingChars = IsLongString ? 4 : 2; + // We don't want to read non-ASCII hex digits. std:: functions past + // 0xFF invoke UB. + // + // FIXME: actually, Microsoft version probably doesn't check this + // condition and uses their Unicode version of 'isxdigit'. However, + // there are some hex-digit Unicode character outside of ASCII, and + // some of these are actually accepted by rc.exe, the notable example + // being fullwidth forms (U+FF10..U+FF19 etc.) These can be written + // instead of ASCII digits in \x... escape sequence and get accepted. + // However, the resulting hexcodes seem totally unpredictable. + // We think it's infeasible to try to reproduce this behavior, nor to + // put effort in order to detect it. + while (RemainingChars && Pos < Chars.size() && Chars[Pos] < 0x80) { + if (!isxdigit(Chars[Pos])) + break; + char Digit = tolower(Chars[Pos]); + ++Pos; + + ReadInt <<= 4; + if (isdigit(Digit)) + ReadInt |= Digit - '0'; + else + ReadInt |= Digit - 'a' + 10; + + --RemainingChars; + } + + RETURN_IF_ERROR(AddRes(ReadInt)); + continue; + } + + if (TypeChar >= '0' && TypeChar < '8') { + // Read an octal number. Note that we've already read the first digit. + UTF16 ReadInt = TypeChar - '0'; + size_t RemainingChars = IsLongString ? 6 : 2; + + while (RemainingChars && Pos < Chars.size() && Chars[Pos] >= '0' && + Chars[Pos] < '8') { + ReadInt <<= 3; + ReadInt |= Chars[Pos] - '0'; + --RemainingChars; + ++Pos; + } + + RETURN_IF_ERROR(AddRes(ReadInt)); + + continue; + } + + switch (TypeChar) { + case 'A': + case 'a': + // Windows '\a' translates into '\b' (Backspace). + RETURN_IF_ERROR(AddRes('\b')); + break; + + case 'n': // Somehow, RC doesn't recognize '\N' and '\R'. + RETURN_IF_ERROR(AddRes('\n')); + break; + + case 'r': + RETURN_IF_ERROR(AddRes('\r')); + break; + + case 'T': + case 't': + RETURN_IF_ERROR(AddRes('\t')); + break; + + case '\\': + RETURN_IF_ERROR(AddRes('\\')); + break; + + case '"': + // RC accepts \" only if another " comes afterwards; then, \"" means + // a single ". + if (Pos == Chars.size() || Chars[Pos] != '"') + return createError("Expected \\\"\""); + ++Pos; + RETURN_IF_ERROR(AddRes('"')); + break; + + default: + // If TypeChar means nothing, \ is should be output to stdout with + // following char. However, rc.exe consumes these characters when + // dealing with wide strings. + if (!IsLongString) { + RETURN_IF_ERROR(AddRes('\\')); + RETURN_IF_ERROR(AddRes(TypeChar)); + } + break; + } + + continue; + } + + // If nothing interesting happens, just output the character. + RETURN_IF_ERROR(AddRes(CurChar)); + } switch (NullHandler) { case NullHandlingMethod::CutAtNull: |