diff options
author | Sam McCall <sam.mccall@gmail.com> | 2018-07-10 11:51:26 +0000 |
---|---|---|
committer | Sam McCall <sam.mccall@gmail.com> | 2018-07-10 11:51:26 +0000 |
commit | e6057bc689f380c245512b6809c3767d964407ed (patch) | |
tree | e0f7d83dc3c4a456ffe3a0376f85b741779465ae /llvm/lib/Support/JSON.cpp | |
parent | ce5c19b623bc677c6e2eb0f7ab21c128e000a982 (diff) | |
download | bcm5719-llvm-e6057bc689f380c245512b6809c3767d964407ed.tar.gz bcm5719-llvm-e6057bc689f380c245512b6809c3767d964407ed.zip |
[Support] Harded JSON against invalid UTF-8.
Parsing invalid UTF-8 input is now a parse error.
Creating JSON values from invalid UTF-8 now triggers an assertion, and
(in no-assert builds) substitutes the unicode replacement character.
Strings retrieved from json::Value are always valid UTF-8.
llvm-svn: 336657
Diffstat (limited to 'llvm/lib/Support/JSON.cpp')
-rw-r--r-- | llvm/lib/Support/JSON.cpp | 49 |
1 files changed, 45 insertions, 4 deletions
diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp index c2025bb2299..a5dae7a7c2e 100644 --- a/llvm/lib/Support/JSON.cpp +++ b/llvm/lib/Support/JSON.cpp @@ -8,6 +8,7 @@ //===---------------------------------------------------------------------===// #include "llvm/Support/JSON.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Format.h" #include <cctype> @@ -199,6 +200,14 @@ public: Parser(StringRef JSON) : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {} + bool checkUTF8() { + size_t ErrOffset; + if (isUTF8(StringRef(Start, End - Start), &ErrOffset)) + return true; + P = Start + ErrOffset; // For line/column calculation. + return parseError("Invalid UTF-8 sequence"); + } + bool parseValue(Value &Out); bool assertEnd() { @@ -458,7 +467,7 @@ bool Parser::parseUnicode(std::string &Out) { // Case 3: it's a leading surrogate. We expect a trailing one next. // Case 3a: there's no trailing \u escape. Don't advance in the stream. - if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) { + if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) { Invalid(); // Leading surrogate was unpaired. return true; } @@ -496,9 +505,10 @@ bool Parser::parseError(const char *Msg) { Expected<Value> parse(StringRef JSON) { Parser P(JSON); Value E = nullptr; - if (P.parseValue(E)) - if (P.assertEnd()) - return std::move(E); + if (P.checkUTF8()) + if (P.parseValue(E)) + if (P.assertEnd()) + return std::move(E); return P.takeError(); } char ParseError::ID = 0; @@ -514,6 +524,37 @@ static std::vector<const Object::value_type *> sortedElements(const Object &O) { return Elements; } +bool isUTF8(llvm::StringRef S, size_t *ErrOffset) { + // Fast-path for ASCII, which is valid UTF-8. + if (LLVM_LIKELY(isASCII(S))) + return true; + + const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data; + if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size()))) + return true; + + if (ErrOffset) + *ErrOffset = Rest - Data; + return false; +} + +std::string fixUTF8(llvm::StringRef S) { + // This isn't particularly efficient, but is only for error-recovery. + std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices. + const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data()); + UTF32 *Out32 = Codepoints.data(); + ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(), + lenientConversion); + Codepoints.resize(Out32 - Codepoints.data()); + std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice + const UTF32 *In32 = Codepoints.data(); + UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]); + ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(), + strictConversion); + Res.resize(reinterpret_cast<char *>(Out8) - Res.data()); + return Res; +} + } // namespace json } // namespace llvm |