diff options
Diffstat (limited to 'clang-tools-extra/clangd/SourceCode.cpp')
| -rw-r--r-- | clang-tools-extra/clangd/SourceCode.cpp | 113 |
1 files changed, 66 insertions, 47 deletions
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp index cdafaf9636c..656ab1dc3c1 100644 --- a/clang-tools-extra/clangd/SourceCode.cpp +++ b/clang-tools-extra/clangd/SourceCode.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Path.h" namespace clang { @@ -30,6 +31,8 @@ namespace clangd { // Returns true if CB returned true, false if we hit the end of string. template <typename Callback> static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { + // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). + // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. for (size_t I = 0; I < U8.size();) { unsigned char C = static_cast<unsigned char>(U8[I]); if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. @@ -53,46 +56,75 @@ static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { return false; } -// Returns the offset into the string that matches \p Units UTF-16 code units. -// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back -// to UTF-8, and returns the length in bytes. -static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) { +// Returns the byte offset into the string that is an offset of \p Units in +// the specified encoding. +// Conceptually, this converts to the encoding, truncates to CodeUnits, +// converts back to UTF-8, and returns the length in bytes. +static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc, + bool &Valid) { + Valid = Units >= 0; + if (Units <= 0) + return 0; size_t Result = 0; - Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) { - Result += U8Len; - U16Units -= U16Len; - return U16Units <= 0; - }); - if (U16Units < 0) // Offset was into the middle of a surrogate pair. - Valid = false; + switch (Enc) { + case OffsetEncoding::UTF8: + Result = Units; + break; + case OffsetEncoding::UTF16: + Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { + Result += U8Len; + Units -= U16Len; + return Units <= 0; + }); + if (Units < 0) // Offset in the middle of a surrogate pair. + Valid = false; + break; + case OffsetEncoding::UTF32: + Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { + Result += U8Len; + Units--; + return Units <= 0; + }); + break; + case OffsetEncoding::UnsupportedEncoding: + llvm_unreachable("unsupported encoding"); + } // Don't return an out-of-range index if we overran. - return std::min(Result, U8.size()); + if (Result > U8.size()) { + Valid = false; + return U8.size(); + } + return Result; } Key<OffsetEncoding> kCurrentOffsetEncoding; -static bool useUTF16ForLSP() { +static OffsetEncoding lspEncoding() { auto *Enc = Context::current().get(kCurrentOffsetEncoding); - switch (Enc ? *Enc : OffsetEncoding::UTF16) { - case OffsetEncoding::UTF16: - return true; - case OffsetEncoding::UTF8: - return false; - case OffsetEncoding::UnsupportedEncoding: - llvm_unreachable("cannot use an unsupported encoding"); - } + return Enc ? *Enc : OffsetEncoding::UTF16; } // Like most strings in clangd, the input is UTF-8 encoded. size_t lspLength(llvm::StringRef Code) { - if (!useUTF16ForLSP()) - return Code.size(); - // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). - // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. size_t Count = 0; - iterateCodepoints(Code, [&](int U8Len, int U16Len) { - Count += U16Len; - return false; - }); + switch (lspEncoding()) { + case OffsetEncoding::UTF8: + Count = Code.size(); + break; + case OffsetEncoding::UTF16: + iterateCodepoints(Code, [&](int U8Len, int U16Len) { + Count += U16Len; + return false; + }); + break; + case OffsetEncoding::UTF32: + iterateCodepoints(Code, [&](int U8Len, int U16Len) { + ++Count; + return false; + }); + break; + case OffsetEncoding::UnsupportedEncoding: + llvm_unreachable("unsupported encoding"); + } return Count; } @@ -118,28 +150,15 @@ llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, StringRef Line = Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); - if (!useUTF16ForLSP()) { - // Bounds-checking only. - if (P.character > int(Line.size())) { - if (AllowColumnsBeyondLineLength) - return StartOfLine + Line.size(); - else - return llvm::make_error<llvm::StringError>( - llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character, - P.line), - llvm::errc::invalid_argument); - } - return StartOfLine + P.character; - } - // P.character is in UTF-16 code units, so we have to transcode. + // P.character may be in UTF-16, transcode if necessary. bool Valid; - size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid); + size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid); if (!Valid && !AllowColumnsBeyondLineLength) return llvm::make_error<llvm::StringError>( - llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character, - P.line), + llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), + P.character, P.line), llvm::errc::invalid_argument); - return StartOfLine + ByteOffsetInLine; + return StartOfLine + ByteInLine; } Position offsetToPosition(llvm::StringRef Code, size_t Offset) { |

