[llvm-rc] Implement escape sequences in .rc files.

This allows the escape sequences (\a, \n, \r, \t, \\, \x[0-9a-f]*, \[0-7]*, "") to appear in .rc scripts. These are parsed and output in the same way as it's done in original MS implementation. The way these sequences are processed depends on the type of the resource it resides in, and on whether the user declared the string to be "wide" or "narrow". I tried to maintain the maximum compatibility with the original tool (and fail in some erroneous situations that are accepted by .rc). However, there are some (extremely rare) cases where Microsoft tool outputs nonsense. I found it infeasible to detect such casses. Patch by Marek Sokolowski Differential Revision: https://reviews.llvm.org/D38426 llvm-svn: 315118
author: Zachary Turner <zturner@google.com> 2017-10-06 22:05:15 +0000
committer: Zachary Turner <zturner@google.com> 2017-10-06 22:05:15 +0000
commit: a92eb33ad59e3d01bd832f9c19a03d011f393a92 (patch)
tree: f22d4e1130854f98ad16609ddff84856c380c599 /llvm/tools/llvm-rc/ResourceFileWriter.cpp
parent: 9d8b358a49bec7cbd225062962d7959854d04338 (diff)
download: bcm5719-llvm-a92eb33ad59e3d01bd832f9c19a03d011f393a92.tar.gz
bcm5719-llvm-a92eb33ad59e3d01bd832f9c19a03d011f393a92.zip
1 files changed, 155 insertions, 4 deletions
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
index 1d23fb966f6..8b234ae96d6 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
@@ -122,24 +122,175 @@ enum class NullHandlingMethod {
   CutAtDoubleNull // Terminate string on '\0\0'; strip final '\0'.
 };
 
-// Parses an identifier or string and returns a processed version of it.
-// For now, it only strips the string boundaries, but TODO:
+// Parses an identifier or string and returns a processed version of it:
+//   * String the string boundary quotes.
 //   * Squash "" to a single ".
 //   * Replace the escape sequences with their processed version.
 // For identifiers, this is no-op.
 static Error processString(StringRef Str, NullHandlingMethod NullHandler,
                            bool &IsLongString, SmallVectorImpl<UTF16> &Result) {
   bool IsString = stripQuotes(Str, IsLongString);
-  convertUTF8ToUTF16String(Str, Result);
+  SmallVector<UTF16, 128> Chars;
+  convertUTF8ToUTF16String(Str, Chars);
 
   if (!IsString) {
     // It's an identifier if it's not a string. Make all characters uppercase.
-    for (UTF16 &Ch : Result) {
+    for (UTF16 &Ch : Chars) {
       assert(Ch <= 0x7F && "We didn't allow identifiers to be non-ASCII");
       Ch = toupper(Ch);
     }
+    Result.swap(Chars);
     return Error::success();
   }
+  Result.reserve(Chars.size());
+  size_t Pos = 0;
+
+  auto AddRes = [&Result, NullHandler, IsLongString](UTF16 Char) -> Error {
+    if (!IsLongString) {
+      if (NullHandler == NullHandlingMethod::UserResource) {
+        // Narrow strings in user-defined resources are *not* output in
+        // UTF-16 format.
+        if (Char > 0xFF)
+          return createError("Non-8-bit codepoint (" + Twine(Char) +
+                             ") can't occur in a user-defined narrow string");
+
+      } else {
+        // In case of narrow non-user strings, Windows RC converts
+        // [0x80, 0xFF] chars according to the current codepage.
+        // There is no 'codepage' concept settled in every supported platform,
+        // so we should reject such inputs.
+        if (Char > 0x7F && Char <= 0xFF)
+          return createError("Non-ASCII 8-bit codepoint (" + Twine(Char) +
+                             ") can't "
+                             "occur in a non-Unicode string");
+      }
+    }
+
+    Result.push_back(Char);
+    return Error::success();
+  };
+
+  while (Pos < Chars.size()) {
+    UTF16 CurChar = Chars[Pos];
+    ++Pos;
+
+    // Strip double "".
+    if (CurChar == '"') {
+      if (Pos == Chars.size() || Chars[Pos] != '"')
+        return createError("Expected \"\"");
+      ++Pos;
+      RETURN_IF_ERROR(AddRes('"'));
+      continue;
+    }
+
+    if (CurChar == '\\') {
+      UTF16 TypeChar = Chars[Pos];
+      ++Pos;
+
+      if (TypeChar == 'x' || TypeChar == 'X') {
+        // Read a hex number. Max number of characters to read differs between
+        // narrow and wide strings.
+        UTF16 ReadInt = 0;
+        size_t RemainingChars = IsLongString ? 4 : 2;
+        // We don't want to read non-ASCII hex digits. std:: functions past
+        // 0xFF invoke UB.
+        //
+        // FIXME: actually, Microsoft version probably doesn't check this
+        // condition and uses their Unicode version of 'isxdigit'. However,
+        // there are some hex-digit Unicode character outside of ASCII, and
+        // some of these are actually accepted by rc.exe, the notable example
+        // being fullwidth forms (U+FF10..U+FF19 etc.) These can be written
+        // instead of ASCII digits in \x... escape sequence and get accepted.
+        // However, the resulting hexcodes seem totally unpredictable.
+        // We think it's infeasible to try to reproduce this behavior, nor to
+        // put effort in order to detect it.
+        while (RemainingChars && Pos < Chars.size() && Chars[Pos] < 0x80) {
+          if (!isxdigit(Chars[Pos]))
+            break;
+          char Digit = tolower(Chars[Pos]);
+          ++Pos;
+
+          ReadInt <<= 4;
+          if (isdigit(Digit))
+            ReadInt |= Digit - '0';
+          else
+            ReadInt |= Digit - 'a' + 10;
+
+          --RemainingChars;
+        }
+
+        RETURN_IF_ERROR(AddRes(ReadInt));
+        continue;
+      }
+
+      if (TypeChar >= '0' && TypeChar < '8') {
+        // Read an octal number. Note that we've already read the first digit.
+        UTF16 ReadInt = TypeChar - '0';
+        size_t RemainingChars = IsLongString ? 6 : 2;
+
+        while (RemainingChars && Pos < Chars.size() && Chars[Pos] >= '0' &&
+               Chars[Pos] < '8') {
+          ReadInt <<= 3;
+          ReadInt |= Chars[Pos] - '0';
+          --RemainingChars;
+          ++Pos;
+        }
+
+        RETURN_IF_ERROR(AddRes(ReadInt));
+
+        continue;
+      }
+
+      switch (TypeChar) {
+      case 'A':
+      case 'a':
+        // Windows '\a' translates into '\b' (Backspace).
+        RETURN_IF_ERROR(AddRes('\b'));
+        break;
+
+      case 'n': // Somehow, RC doesn't recognize '\N' and '\R'.
+        RETURN_IF_ERROR(AddRes('\n'));
+        break;
+
+      case 'r':
+        RETURN_IF_ERROR(AddRes('\r'));
+        break;
+
+      case 'T':
+      case 't':
+        RETURN_IF_ERROR(AddRes('\t'));
+        break;
+
+      case '\\':
+        RETURN_IF_ERROR(AddRes('\\'));
+        break;
+
+      case '"':
+        // RC accepts \" only if another " comes afterwards; then, \"" means
+        // a single ".
+        if (Pos == Chars.size() || Chars[Pos] != '"')
+          return createError("Expected \\\"\"");
+        ++Pos;
+        RETURN_IF_ERROR(AddRes('"'));
+        break;
+
+      default:
+        // If TypeChar means nothing, \ is should be output to stdout with
+        // following char. However, rc.exe consumes these characters when
+        // dealing with wide strings.
+        if (!IsLongString) {
+          RETURN_IF_ERROR(AddRes('\\'));
+          RETURN_IF_ERROR(AddRes(TypeChar));
+        }
+        break;
+      }
+
+      continue;
+    }
+
+    // If nothing interesting happens, just output the character.
+    RETURN_IF_ERROR(AddRes(CurChar));
+  }
 
   switch (NullHandler) {
   case NullHandlingMethod::CutAtNull:
author	Zachary Turner <zturner@google.com>	2017-10-06 22:05:15 +0000
committer	Zachary Turner <zturner@google.com>	2017-10-06 22:05:15 +0000
commit	a92eb33ad59e3d01bd832f9c19a03d011f393a92 (patch)
tree	f22d4e1130854f98ad16609ddff84856c380c599 /llvm/tools/llvm-rc/ResourceFileWriter.cpp
parent	9d8b358a49bec7cbd225062962d7959854d04338 (diff)
download	bcm5719-llvm-a92eb33ad59e3d01bd832f9c19a03d011f393a92.tar.gz bcm5719-llvm-a92eb33ad59e3d01bd832f9c19a03d011f393a92.zip