summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Support/JSON.cpp
diff options
context:
space:
mode:
authorSam McCall <sam.mccall@gmail.com>2018-07-10 11:51:26 +0000
committerSam McCall <sam.mccall@gmail.com>2018-07-10 11:51:26 +0000
commite6057bc689f380c245512b6809c3767d964407ed (patch)
treee0f7d83dc3c4a456ffe3a0376f85b741779465ae /llvm/lib/Support/JSON.cpp
parentce5c19b623bc677c6e2eb0f7ab21c128e000a982 (diff)
downloadbcm5719-llvm-e6057bc689f380c245512b6809c3767d964407ed.tar.gz
bcm5719-llvm-e6057bc689f380c245512b6809c3767d964407ed.zip
[Support] Harden JSON against invalid UTF-8.
Parsing invalid UTF-8 input is now a parse error. Creating JSON values from invalid UTF-8 now triggers an assertion, and (in no-assert builds) substitutes the unicode replacement character. Strings retrieved from json::Value are always valid UTF-8. llvm-svn: 336657
Diffstat (limited to 'llvm/lib/Support/JSON.cpp')
-rw-r--r--llvm/lib/Support/JSON.cpp49
1 files changed, 45 insertions, 4 deletions
diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp
index c2025bb2299..a5dae7a7c2e 100644
--- a/llvm/lib/Support/JSON.cpp
+++ b/llvm/lib/Support/JSON.cpp
@@ -8,6 +8,7 @@
//===---------------------------------------------------------------------===//
#include "llvm/Support/JSON.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
@@ -199,6 +200,14 @@ public:
Parser(StringRef JSON)
: Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
+ bool checkUTF8() { // Validates the whole [Start, End) buffer as UTF-8; true if valid.
+ size_t ErrOffset; // Left uninitialized; only read after isUTF8 has reported a failure.
+ if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
+ return true;
+ P = Start + ErrOffset; // Move P to the reported error offset so parseError's line/column calculation refers to it.
+ return parseError("Invalid UTF-8 sequence"); // NOTE(review): presumably parseError always returns false — confirm.
+ }
+
bool parseValue(Value &Out);
bool assertEnd() {
@@ -458,7 +467,7 @@ bool Parser::parseUnicode(std::string &Out) {
// Case 3: it's a leading surrogate. We expect a trailing one next.
// Case 3a: there's no trailing \u escape. Don't advance in the stream.
- if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
+ if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
Invalid(); // Leading surrogate was unpaired.
return true;
}
@@ -496,9 +505,10 @@ bool Parser::parseError(const char *Msg) {
Expected<Value> parse(StringRef JSON) { // Returns the parsed Value, or the Parser's error if any step fails.
Parser P(JSON);
Value E = nullptr;
- if (P.parseValue(E))
- if (P.assertEnd())
- return std::move(E);
+ if (P.checkUTF8()) // New first step: validate the UTF-8 of the whole input before parsing any of it.
+ if (P.parseValue(E))
+ if (P.assertEnd()) // NOTE(review): presumably rejects trailing content after the value — confirm.
+ return std::move(E);
return P.takeError(); // Reached whenever one of the chained checks above returned false.
}
char ParseError::ID = 0;
@@ -514,6 +524,37 @@ static std::vector<const Object::value_type *> sortedElements(const Object &O) {
return Elements;
}
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset) { // ErrOffset may be null; it is written only when false is returned.
+ // Fast path: pure-ASCII input is always valid UTF-8, so skip full validation.
+ if (LLVM_LIKELY(isASCII(S)))
+ return true;
+
+ const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
+ if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size()))) // Rest is passed by address so the callee can move it.
+ return true;
+
+ if (ErrOffset)
+ *ErrOffset = Rest - Data; // Byte offset of Rest within S; presumably where validation stopped — confirm in ConvertUTF.h.
+ return false;
+}
+
+std::string fixUTF8(llvm::StringRef S) { // Round-trips S through UTF-32 (lenient decode, strict encode) and returns the result.
+ // This isn't particularly efficient, but it is only used for error recovery.
+ std::vector<UTF32> Codepoints(S.size()); // Upper bound: at most one codepoint per input byte.
+ const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
+ UTF32 *Out32 = Codepoints.data();
+ ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
+ lenientConversion); // NOTE(review): result code ignored; lenient mode presumably substitutes U+FFFD for bad sequences — confirm.
+ Codepoints.resize(Out32 - Codepoints.data()); // Shrink to the span ending at Out32, the converter's output cursor.
+ std::string Res(4 * Codepoints.size(), 0); // Upper bound: at most 4 UTF-8 bytes per codepoint.
+ const UTF32 *In32 = Codepoints.data();
+ UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
+ ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
+ strictConversion); // NOTE(review): result code ignored here as well.
+ Res.resize(reinterpret_cast<char *>(Out8) - Res.data()); // Shrink to the span ending at Out8, the converter's output cursor.
+ return Res;
+}
+
} // namespace json
} // namespace llvm
OpenPOWER on IntegriCloud