From e6057bc689f380c245512b6809c3767d964407ed Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Tue, 10 Jul 2018 11:51:26 +0000 Subject: [Support] Harded JSON against invalid UTF-8. Parsing invalid UTF-8 input is now a parse error. Creating JSON values from invalid UTF-8 now triggers an assertion, and (in no-assert builds) substitutes the unicode replacement character. Strings retrieved from json::Value are always valid UTF-8. llvm-svn: 336657 --- llvm/lib/Support/JSON.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'llvm/lib/Support/JSON.cpp') diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp index c2025bb2299..a5dae7a7c2e 100644 --- a/llvm/lib/Support/JSON.cpp +++ b/llvm/lib/Support/JSON.cpp @@ -8,6 +8,7 @@ //===---------------------------------------------------------------------===// #include "llvm/Support/JSON.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Format.h" #include @@ -199,6 +200,14 @@ public: Parser(StringRef JSON) : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {} + bool checkUTF8() { + size_t ErrOffset; + if (isUTF8(StringRef(Start, End - Start), &ErrOffset)) + return true; + P = Start + ErrOffset; // For line/column calculation. + return parseError("Invalid UTF-8 sequence"); + } + bool parseValue(Value &Out); bool assertEnd() { @@ -458,7 +467,7 @@ bool Parser::parseUnicode(std::string &Out) { // Case 3: it's a leading surrogate. We expect a trailing one next. // Case 3a: there's no trailing \u escape. Don't advance in the stream. - if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) { + if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) { Invalid(); // Leading surrogate was unpaired. return true; } @@ -496,9 +505,10 @@ bool Parser::parseError(const char *Msg) { Expected parse(StringRef JSON) { Parser P(JSON); Value E = nullptr; - if (P.parseValue(E)) - if (P.assertEnd()) - return std::move(E); + if (P.checkUTF8()) + if (P.parseValue(E)) + if (P.assertEnd()) + return std::move(E); return P.takeError(); } char ParseError::ID = 0; @@ -514,6 +524,37 @@ static std::vector sortedElements(const Object &O) { return Elements; } +bool isUTF8(llvm::StringRef S, size_t *ErrOffset) { + // Fast-path for ASCII, which is valid UTF-8. + if (LLVM_LIKELY(isASCII(S))) + return true; + + const UTF8 *Data = reinterpret_cast(S.data()), *Rest = Data; + if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size()))) + return true; + + if (ErrOffset) + *ErrOffset = Rest - Data; + return false; +} + +std::string fixUTF8(llvm::StringRef S) { + // This isn't particularly efficient, but is only for error-recovery. + std::vector Codepoints(S.size()); // 1 codepoint per byte suffices. + const UTF8 *In8 = reinterpret_cast(S.data()); + UTF32 *Out32 = Codepoints.data(); + ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(), + lenientConversion); + Codepoints.resize(Out32 - Codepoints.data()); + std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice + const UTF32 *In32 = Codepoints.data(); + UTF8 *Out8 = reinterpret_cast(&Res[0]); + ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(), + strictConversion); + Res.resize(reinterpret_cast(Out8) - Res.data()); + return Res; +} + } // namespace json } // namespace llvm -- cgit v1.2.3