9 files changed, 153 insertions, 21 deletions
diff --git a/llvm/test/tools/llvm-rc/Inputs/cp1252.rc b/llvm/test/tools/llvm-rc/Inputs/cp1252.rc
new file mode 100644
index 00000000000..f1dd948aead
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/Inputs/cp1252.rc
@@ -0,0 +1,4 @@
+STRINGTABLE {
+  1 "åäö © ƒ \xe5\xe4\366 \251 \x83"
+  2 L"åäö © ƒ \xe5\xe4\366 \251 \x0192"
+}
diff --git a/llvm/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc b/llvm/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc
new file mode 100644
index 00000000000..311968c4d71
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc
@@ -0,0 +1,5 @@
+STRINGTABLE {
+  // One can't pass UTF-8 sequences via multiple escaped chars - in narrow
+  // strings in UTF-8 mode, only ASCII chars can be entered via escapes.
+  1 "Ã¥Ã¤Ã¶ \xc3\xa5"
+}
diff --git a/llvm/test/tools/llvm-rc/Inputs/utf8.rc b/llvm/test/tools/llvm-rc/Inputs/utf8.rc
new file mode 100644
index 00000000000..20ef99116c9
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/Inputs/utf8.rc
@@ -0,0 +1,6 @@
+STRINGTABLE {
+  // One can't pass UTF-8 sequences via multiple escaped chars - in narrow
+  // strings in UTF-8 mode, only ASCII chars can be entered via escapes.
+  1 "Ã¥Ã¤Ã¶ Â© \x61"
+  2 L"Ã¥Ã¤Ã¶ Â© \xe5\xe4\366 \251"
+}
diff --git a/llvm/test/tools/llvm-rc/codepage.test b/llvm/test/tools/llvm-rc/codepage.test
new file mode 100644
index 00000000000..ce17e0a6b2b
--- /dev/null
+++ b/llvm/test/tools/llvm-rc/codepage.test
@@ -0,0 +1,44 @@
+; RUN: llvm-rc /C 65001 /FO %t.utf8.res %p/Inputs/utf8.rc
+; RUN: llvm-readobj %t.utf8.res | FileCheck %s --check-prefix=UTF8
+
+; UTF8:      Resource type (int): 6
+; UTF8-NEXT: Resource name (int): 1
+; UTF8-NEXT: Data version: 0
+; UTF8-NEXT: Memory flags: 0x1030
+; UTF8-NEXT: Language ID: 1033
+; UTF8-NEXT: Version (major): 0
+; UTF8-NEXT: Version (minor): 0
+; UTF8-NEXT: Characteristics: 0
+; UTF8-NEXT: Data size: 68
+; UTF8-NEXT: Data: (
+; UTF8-NEXT:   0000: 00000700 E500E400 F6002000 A9002000  |.......... ... .|
+; UTF8-NEXT:   0010: 61000B00 E500E400 F6002000 A9002000  |a......... ... .|
+; UTF8-NEXT:   0020: E500E400 F6002000 A9000000 00000000  |...... .........|
+; UTF8-NEXT:   0030: 00000000 00000000 00000000 00000000  |................|
+; UTF8-NEXT:   0040: 00000000                             |....|
+; UTF8-NEXT: )
+
+; RUN: not llvm-rc /C 65001 /FO %t.utf8-escape-narrow.res %p/Inputs/utf8-escape-narrow.rc 2>&1 | FileCheck %s --check-prefix UTF8_ESCAPE
+; UTF8_ESCAPE: llvm-rc: Error in STRINGTABLE statement (ID 1):
+; UTF8_ESCAPE-NEXT: Unable to interpret single byte (195) as UTF-8
+
+; RUN: llvm-rc /C 1252 /FO %t.cp1252.res %p/Inputs/cp1252.rc
+; RUN: llvm-readobj %t.cp1252.res | FileCheck %s --check-prefix=CP1252
+
+; CP1252:      Resource type (int): 6
+; CP1252-NEXT: Resource name (int): 1
+; CP1252-NEXT: Data version: 0
+; CP1252-NEXT: Memory flags: 0x1030
+; CP1252-NEXT: Language ID: 1033
+; CP1252-NEXT: Version (major): 0
+; CP1252-NEXT: Version (minor): 0
+; CP1252-NEXT: Characteristics: 0
+; CP1252-NEXT: Data size: 92
+; CP1252-NEXT: Data: (
+; CP1252-NEXT:   0000: 00000F00 E500E400 F6002000 A9002000  |.......... ... .|
+; CP1252-NEXT:   0010: 92012000 E500E400 F6002000 A9002000  |.. ....... ... .|
+; CP1252-NEXT:   0020: 92010F00 E500E400 F6002000 A9002000  |.......... ... .|
+; CP1252-NEXT:   0030: 92012000 E500E400 F6002000 A9002000  |.. ....... ... .|
+; CP1252-NEXT:   0040: 92010000 00000000 00000000 00000000  |................|
+; CP1252-NEXT:   0050: 00000000 00000000 00000000           |............|
+; CP1252-NEXT: )
diff --git a/llvm/test/tools/llvm-rc/helpmsg.test b/llvm/test/tools/llvm-rc/helpmsg.test
index 2c2814abc66..e91d6369c62 100644
--- a/llvm/test/tools/llvm-rc/helpmsg.test
+++ b/llvm/test/tools/llvm-rc/helpmsg.test
@@ -7,6 +7,7 @@
 ; CHECK-DAG:  USAGE: rc [options] <inputs>
 ; CHECK-DAG:  OPTIONS:
 ; CHECK-NEXT:    /?          Display this help and exit.
+; CHECK-NEXT:    /C <value>  Set the codepage used for input strings.
 ; CHECK-NEXT:    /dry-run    Don't compile the input; only try to parse it.
 ; CHECK-NEXT:    /D <value>  Define a symbol for the C preprocessor.
 ; CHECK-NEXT:    /FO <value> Change the output file location.
diff --git a/llvm/tools/llvm-rc/Opts.td b/llvm/tools/llvm-rc/Opts.td
index 9792aa582cb..11f40f57103 100644
--- a/llvm/tools/llvm-rc/Opts.td
+++ b/llvm/tools/llvm-rc/Opts.td
@@ -35,6 +35,9 @@ def H : Flag<[ "/", "-" ], "H">,
 def DRY_RUN : Flag<[ "/", "-" ], "dry-run">,
               HelpText<"Don't compile the input; only try to parse it.">;
 
+def CODEPAGE : JoinedOrSeparate<[ "/", "-" ], "C">,
+               HelpText<"Set the codepage used for input strings.">;
+
 // Unused switches (at least for now). These will stay unimplemented
 // in an early stage of development and can be ignored. However, we need to
 // parse them in order to preserve the compatibility with the original tool.
@@ -44,7 +47,6 @@ def R : Flag<[ "/", "-" ], "R">;
 def SL : Flag<[ "/", "-" ], "SL">;
 
 // (Codepages support.)
-def C : Flag<[ "/", "-" ], "C">;
 def W : Flag<[ "/", "-" ], "W">;
 
 // (Support of MUI and similar.)
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.cpp b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
index f141dc7e356..dadb7d691f7 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.cpp
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.cpp
@@ -110,6 +110,18 @@ static bool stripQuotes(StringRef &Str, bool &IsLongString) {
   return true;
 }
 
+static UTF16 cp1252ToUnicode(unsigned char C) {
+  // Bytes 0x80-0x9F go through this table; any other byte maps to itself.
+  static const UTF16 HighMap[] = {
+      0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
+      0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
+      0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
+      0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178,
+  };
+  const bool InTable = C >= 0x80 && C <= 0x9F;
+  return InTable ? HighMap[C - 0x80] : static_cast<UTF16>(C);
+}
+
 // Describes a way to handle '\0' characters when processing the string.
 // rc.exe tool sometimes behaves in a weird way in postprocessing.
 // If the string to be output is equivalent to a C-string (e.g. in MENU
@@ -132,10 +144,26 @@ enum class NullHandlingMethod {
 //   * Replace the escape sequences with their processed version.
 // For identifiers, this is no-op.
 static Error processString(StringRef Str, NullHandlingMethod NullHandler,
-                           bool &IsLongString, SmallVectorImpl<UTF16> &Result) {
+                           bool &IsLongString, SmallVectorImpl<UTF16> &Result,
+                           int CodePage) {
   bool IsString = stripQuotes(Str, IsLongString);
   SmallVector<UTF16, 128> Chars;
-  convertUTF8ToUTF16String(Str, Chars);
+
+  // Convert the input bytes according to the chosen codepage.
+  if (CodePage == CpUtf8) {
+    convertUTF8ToUTF16String(Str, Chars);
+  } else if (CodePage == CpWin1252) {
+    for (char C : Str)
+      Chars.push_back(cp1252ToUnicode((unsigned char)C));
+  } else {
+    // Unknown codepage: ASCII only. unsigned(C) makes Twine print a number.
+    for (unsigned char C : Str) {
+      if (C > 0x7F)
+        return createError("Non-ASCII 8-bit codepoint (" + Twine(unsigned(C)) +
+                           ") can't be interpreted in the current codepage");
+      Chars.push_back(C);
+    }
+  }
 
   if (!IsString) {
     // It's an identifier if it's not a string. Make all characters uppercase.
@@ -157,21 +185,35 @@ static Error processString(StringRef Str, NullHandlingMethod NullHandler,
         if (Char > 0xFF)
           return createError("Non-8-bit codepoint (" + Twine(Char) +
                              ") can't occur in a user-defined narrow string");
+      }
+    }
 
+    Result.push_back(Char);
+    return Error::success();
+  };
+  auto AddEscapedChar = [AddRes, IsLongString, CodePage](UTF16 Char) -> Error {
+    if (!IsLongString) {
+      // Escaped chars in narrow strings have to be interpreted according to
+      // the chosen code page.
+      if (Char > 0xFF)
+        return createError("Non-8-bit escaped char (" + Twine(Char) +
+                           ") can't occur in narrow string");
+      if (CodePage == CpUtf8) {
+        if (Char >= 0x80)
+          return createError("Unable to interpret single byte (" + Twine(Char) +
+                             ") as UTF-8");
+      } else if (CodePage == CpWin1252) {
+        Char = cp1252ToUnicode(Char);
       } else {
-        // In case of narrow non-user strings, Windows RC converts
-        // [0x80, 0xFF] chars according to the current codepage.
-        // There is no 'codepage' concept settled in every supported platform,
-        // so we should reject such inputs.
-        if (Char > 0x7F && Char <= 0xFF)
+        // Unknown/unsupported codepage, only allow ASCII input.
+        if (Char > 0x7F)
           return createError("Non-ASCII 8-bit codepoint (" + Twine(Char) +
                              ") can't "
                              "occur in a non-Unicode string");
       }
     }
 
-    Result.push_back(Char);
-    return Error::success();
+    return AddRes(Char);
   };
 
   while (Pos < Chars.size()) {
@@ -223,7 +265,7 @@ static Error processString(StringRef Str, NullHandlingMethod NullHandler,
           --RemainingChars;
         }
 
-        RETURN_IF_ERROR(AddRes(ReadInt));
+        RETURN_IF_ERROR(AddEscapedChar(ReadInt));
         continue;
       }
 
@@ -240,7 +282,7 @@ static Error processString(StringRef Str, NullHandlingMethod NullHandler,
           ++Pos;
         }
 
-        RETURN_IF_ERROR(AddRes(ReadInt));
+        RETURN_IF_ERROR(AddEscapedChar(ReadInt));
 
         continue;
       }
@@ -328,7 +370,8 @@ Error ResourceFileWriter::writeCString(StringRef Str, bool WriteTerminator) {
   SmallVector<UTF16, 128> ProcessedString;
   bool IsLongString;
   RETURN_IF_ERROR(processString(Str, NullHandlingMethod::CutAtNull,
-                                IsLongString, ProcessedString));
+                                IsLongString, ProcessedString,
+                                Params.CodePage));
   for (auto Ch : ProcessedString)
     writeInt<uint16_t>(Ch);
   if (WriteTerminator)
@@ -1142,6 +1185,7 @@ public:
   static bool classof(const RCResource *Res) {
     return Res->getKind() == RkStringTableBundle;
   }
+  Twine getResourceTypeName() const override { return "STRINGTABLE"; }
 };
 
 Error ResourceFileWriter::visitStringTableBundle(const RCResource *Res) {
@@ -1168,7 +1212,7 @@ Error ResourceFileWriter::writeStringTableBundleBody(const RCResource *Base) {
     SmallVector<UTF16, 128> Data;
     RETURN_IF_ERROR(processString(Res->Bundle.Data[ID].getValueOr(StringRef()),
                                   NullHandlingMethod::CutAtDoubleNull,
-                                  IsLongString, Data));
+                                  IsLongString, Data, Params.CodePage));
     if (AppendNull && Res->Bundle.Data[ID])
       Data.push_back('\0');
     RETURN_IF_ERROR(
@@ -1215,9 +1259,9 @@ Error ResourceFileWriter::writeUserDefinedBody(const RCResource *Base) {
 
     SmallVector<UTF16, 128> ProcessedString;
     bool IsLongString;
-    RETURN_IF_ERROR(processString(Elem.getString(),
-                                  NullHandlingMethod::UserResource,
-                                  IsLongString, ProcessedString));
+    RETURN_IF_ERROR(
+        processString(Elem.getString(), NullHandlingMethod::UserResource,
+                      IsLongString, ProcessedString, Params.CodePage));
 
     for (auto Ch : ProcessedString) {
       if (IsLongString) {
diff --git a/llvm/tools/llvm-rc/ResourceFileWriter.h b/llvm/tools/llvm-rc/ResourceFileWriter.h
index dcdebbf47fd..aef3bfa3c71 100644
--- a/llvm/tools/llvm-rc/ResourceFileWriter.h
+++ b/llvm/tools/llvm-rc/ResourceFileWriter.h
@@ -25,15 +25,25 @@ class MemoryBuffer;
 
 namespace rc {
 
-struct SearchParams {
+enum CodePage {
+  CpAcp = 0,        // The currently used codepage. LLVM has no notion of
+                    // which codepage that actually is, so only ASCII
+                    // input is accepted in this mode.
+  CpWin1252 = 1252, // Windows-1252: most 8-bit values correspond to the
+                    // Unicode code point with the same value.
+  CpUtf8 = 65001,   // UTF-8.
+};
+
+struct WriterParams {
   std::vector<std::string> Include;   // Additional folders to search for files.
   std::vector<std::string> NoInclude; // Folders to exclude from file search.
   StringRef InputFilePath;            // The full path of the input file.
+  int CodePage = CpAcp;               // Codepage used for input strings.
 };
 
 class ResourceFileWriter : public Visitor {
 public:
-  ResourceFileWriter(const SearchParams &Params,
+  ResourceFileWriter(const WriterParams &Params,
                      std::unique_ptr<raw_fd_ostream> Stream)
       : Params(Params), FS(std::move(Stream)), IconCursorID(1) {
     assert(FS && "Output stream needs to be provided to the serializator");
@@ -146,7 +156,7 @@ private:
   Error writeVersionInfoBlock(const VersionInfoBlock &);
   Error writeVersionInfoValue(const VersionInfoValue &);
 
-  const SearchParams &Params;
+  const WriterParams &Params;
 
   // Output stream handling.
   std::unique_ptr<raw_fd_ostream> FS;
diff --git a/llvm/tools/llvm-rc/llvm-rc.cpp b/llvm/tools/llvm-rc/llvm-rc.cpp
index 6491473de32..1f0f16f1685 100644
--- a/llvm/tools/llvm-rc/llvm-rc.cpp
+++ b/llvm/tools/llvm-rc/llvm-rc.cpp
@@ -129,13 +129,29 @@ int main(int Argc, const char **Argv) {
     }
   }
 
-  SearchParams Params;
+  WriterParams Params;
   SmallString<128> InputFile(InArgsInfo[0]);
   llvm::sys::fs::make_absolute(InputFile);
   Params.InputFilePath = InputFile;
   Params.Include = InputArgs.getAllArgValues(OPT_INCLUDE);
   Params.NoInclude = InputArgs.getAllArgValues(OPT_NOINCLUDE);
 
+  if (InputArgs.hasArg(OPT_CODEPAGE)) {
+    // Parse the value of the last codepage option as a decimal integer;
+    // text that is not a number is reported as a fatal error.
+    StringRef CodePageArg = InputArgs.getLastArgValue(OPT_CODEPAGE);
+    if (CodePageArg.getAsInteger(10, Params.CodePage))
+      fatalError("Invalid code page: " + CodePageArg);
+    // Only the codepages listed in the CodePage enum are accepted; any
+    // other number is a fatal error.
+    const bool IsSupported = Params.CodePage == CpAcp ||
+                             Params.CodePage == CpWin1252 ||
+                             Params.CodePage == CpUtf8;
+    if (!IsSupported)
+      fatalError(
+          "Unsupported code page, only 0, 1252 and 65001 are supported!");
+  }
+
   std::unique_ptr<ResourceFileWriter> Visitor;
   bool IsDryRun = InputArgs.hasArg(OPT_DRY_RUN);