diff options
author | Sam McCall <sam.mccall@gmail.com> | 2019-03-27 17:47:49 +0000 |
---|---|---|
committer | Sam McCall <sam.mccall@gmail.com> | 2019-03-27 17:47:49 +0000 |
commit | a69698f45fadefd2cabba8d5aa1a8e93463a6d49 (patch) | |
tree | 4052f9f6efd3a86a95ca6f019328d6b8ac0f50f5 /clang-tools-extra/clangd/SourceCode.cpp | |
parent | 381188f1f39ea8980c23aecb3a0695425aaa99cc (diff) | |
download | bcm5719-llvm-a69698f45fadefd2cabba8d5aa1a8e93463a6d49.tar.gz bcm5719-llvm-a69698f45fadefd2cabba8d5aa1a8e93463a6d49.zip |
[clangd] Support utf-8 offsets (rather than utf-16) as a protocol extension
Summary:
Still some pieces to go here: unit tests for new SourceCode functionality and
a command-line flag to force utf-8 mode. But wanted to get early feedback.
Reviewers: hokein
Subscribers: ilya-biryukov, ioeric, MaskRay, jkorous, arphaman, kadircet, jdoerfert, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D58275
llvm-svn: 357102
Diffstat (limited to 'clang-tools-extra/clangd/SourceCode.cpp')
-rw-r--r-- | clang-tools-extra/clangd/SourceCode.cpp | 42 |
1 files changed, 35 insertions, 7 deletions
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp index 4366f36072e..cdafaf9636c 100644 --- a/clang-tools-extra/clangd/SourceCode.cpp +++ b/clang-tools-extra/clangd/SourceCode.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "SourceCode.h" +#include "Context.h" #include "Logger.h" +#include "Protocol.h" #include "clang/AST/ASTContext.h" #include "clang/Basic/SourceManager.h" #include "clang/Lex/Lexer.h" @@ -67,8 +69,23 @@ static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) { return std::min(Result, U8.size()); } +Key<OffsetEncoding> kCurrentOffsetEncoding; +static bool useUTF16ForLSP() { + auto *Enc = Context::current().get(kCurrentOffsetEncoding); + switch (Enc ? *Enc : OffsetEncoding::UTF16) { + case OffsetEncoding::UTF16: + return true; + case OffsetEncoding::UTF8: + return false; + case OffsetEncoding::UnsupportedEncoding: + llvm_unreachable("cannot use an unsupported encoding"); + } +} + // Like most strings in clangd, the input is UTF-8 encoded. size_t lspLength(llvm::StringRef Code) { + if (!useUTF16ForLSP()) + return Code.size(); // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. size_t Count = 0; @@ -98,14 +115,25 @@ llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, llvm::errc::invalid_argument); StartOfLine = NextNL + 1; } - - size_t NextNL = Code.find('\n', StartOfLine); - if (NextNL == llvm::StringRef::npos) - NextNL = Code.size(); - + StringRef Line = + Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); + + if (!useUTF16ForLSP()) { + // Bounds-checking only. + if (P.character > int(Line.size())) { + if (AllowColumnsBeyondLineLength) + return StartOfLine + Line.size(); + else + return llvm::make_error<llvm::StringError>( + llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character, + P.line), + llvm::errc::invalid_argument); + } + return StartOfLine + P.character; + } + // P.character is in UTF-16 code units, so we have to transcode. bool Valid; - size_t ByteOffsetInLine = measureUTF16( - Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid); + size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid); if (!Valid && !AllowColumnsBeyondLineLength) return llvm::make_error<llvm::StringError>( llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character, |