From a8cf495a1592f9d2deddd1c39ce57e81dcc426e5 Mon Sep 17 00:00:00 2001
From: Yunzhong Gao <Yunzhong_Gao@playstation.sony.com>
Date: Sat, 24 Jan 2015 04:23:08 +0000
Subject: If we see UTF-8 BOM sequence at the beginning of a response file, we
 shall remove these bytes before parsing.

Phabricator Revision: http://reviews.llvm.org/D7156

llvm-svn: 226988
---
 llvm/lib/Support/CommandLine.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'llvm/lib/Support/CommandLine.cpp')
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index a774421b26c..b4e32257a01 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -655,6 +655,13 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
     NewArgv.push_back(nullptr);
 }
 
+// It is called byte order marker but the UTF-8 BOM is actually not affected
+// by the host system's endianness.
+static bool hasUTF8ByteOrderMark(ArrayRef<char> S) {
+  return (S.size() >= 3 &&
+          S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf');
+}
+
 static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
                                TokenizerCallback Tokenizer,
                                SmallVectorImpl<const char *> &NewArgv,
@@ -674,6 +681,11 @@ static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
       return false;
     Str = StringRef(UTF8Buf);
   }
+  // If we see UTF-8 BOM sequence at the beginning of a file, we shall remove
+  // these bytes before parsing.
+  // Reference: http://en.wikipedia.org/wiki/UTF-8#Byte_order_mark
+  else if (hasUTF8ByteOrderMark(BufRef))
+    Str = StringRef(BufRef.data() + 3, BufRef.size() - 3);
 
   // Tokenize the contents into NewArgv.
   Tokenizer(Str, Saver, NewArgv, MarkEOLs);
-- 
cgit v1.2.3