diff options
| author | Sam McCall <sam.mccall@gmail.com> | 2019-03-28 14:37:51 +0000 | 
|---|---|---|
| committer | Sam McCall <sam.mccall@gmail.com> | 2019-03-28 14:37:51 +0000 | 
| commit | 8b25d228804f4494eb22bfad0e5ccbaf95d49a91 (patch) | |
| tree | 8e38792297e92a8d07d1fb66b640415f6b278f96 /clang-tools-extra/clangd | |
| parent | 5bbf6f0bd8966c32a62527d32501d984df1b800e (diff) | |
| download | bcm5719-llvm-8b25d228804f4494eb22bfad0e5ccbaf95d49a91.tar.gz bcm5719-llvm-8b25d228804f4494eb22bfad0e5ccbaf95d49a91.zip  | |
[clangd] Support UTF-32 (i.e. codepoint) offsets.
Summary:
(Changes to UTF-8/UTF-16 here are NFC, i.e. no functional change: they only
move things around to make the cases more symmetrical.)
Reviewers: ilya-biryukov
Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits
Tags: #clang
Differential Revision: https://reviews.llvm.org/D59927
llvm-svn: 357173
Diffstat (limited to 'clang-tools-extra/clangd')
| -rw-r--r-- | clang-tools-extra/clangd/Protocol.cpp | 21 | ||||
| -rw-r--r-- | clang-tools-extra/clangd/Protocol.h | 4 | ||||
| -rw-r--r-- | clang-tools-extra/clangd/SourceCode.cpp | 113 | 
3 files changed, 84 insertions, 54 deletions
diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp index e11c621aaa3..a7b36307b82 100644 --- a/clang-tools-extra/clangd/Protocol.cpp +++ b/clang-tools-extra/clangd/Protocol.cpp @@ -938,16 +938,19 @@ bool fromJSON(const llvm::json::Value &Params, ReferenceParams &R) {    return fromJSON(Params, Base);  } -llvm::json::Value toJSON(const OffsetEncoding &OE) { +static const char *toString(OffsetEncoding OE) {    switch (OE) { -    case OffsetEncoding::UTF8: -      return "utf-8"; -    case OffsetEncoding::UTF16: -      return "utf-16"; -    case OffsetEncoding::UnsupportedEncoding: -      return "unknown"; +  case OffsetEncoding::UTF8: +    return "utf-8"; +  case OffsetEncoding::UTF16: +    return "utf-16"; +  case OffsetEncoding::UTF32: +    return "utf-32"; +  case OffsetEncoding::UnsupportedEncoding: +    return "unknown";    }  } +llvm::json::Value toJSON(const OffsetEncoding &OE) { return toString(OE); }  bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {    auto Str = V.getAsString();    if (!Str) @@ -955,9 +958,13 @@ bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {    OE = llvm::StringSwitch<OffsetEncoding>(*Str)             .Case("utf-8", OffsetEncoding::UTF8)             .Case("utf-16", OffsetEncoding::UTF16) +           .Case("utf-32", OffsetEncoding::UTF32)             .Default(OffsetEncoding::UnsupportedEncoding);    return true;  } +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, OffsetEncoding Enc) { +  return OS << toString(Enc); +}  } // namespace clangd  } // namespace clang diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h index 7cf414123f1..0659c93df51 100644 --- a/clang-tools-extra/clangd/Protocol.h +++ b/clang-tools-extra/clangd/Protocol.h @@ -28,6 +28,7 @@  #include "clang/Index/IndexSymbol.h"  #include "llvm/ADT/Optional.h"  #include "llvm/Support/JSON.h" +#include "llvm/Support/raw_ostream.h"  #include <bitset>  #include <string>  
#include <vector> @@ -346,9 +347,12 @@ enum class OffsetEncoding {    UTF16,    // Length counts bytes of UTF-8 encoded text. (Clangd extension).    UTF8, +  // Length counts codepoints in unicode text. (Clangd extension). +  UTF32,  };  llvm::json::Value toJSON(const OffsetEncoding &);  bool fromJSON(const llvm::json::Value &, OffsetEncoding &); +llvm::raw_ostream &operator<<(llvm::raw_ostream &, OffsetEncoding OS);  // This struct doesn't mirror LSP!  // The protocol defines deeply nested structures for client capabilities. diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp index cdafaf9636c..656ab1dc3c1 100644 --- a/clang-tools-extra/clangd/SourceCode.cpp +++ b/clang-tools-extra/clangd/SourceCode.cpp @@ -17,6 +17,7 @@  #include "llvm/ADT/StringRef.h"  #include "llvm/Support/Errc.h"  #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h"  #include "llvm/Support/Path.h"  namespace clang { @@ -30,6 +31,8 @@ namespace clangd {  // Returns true if CB returned true, false if we hit the end of string.  template <typename Callback>  static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { +  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). +  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.    for (size_t I = 0; I < U8.size();) {      unsigned char C = static_cast<unsigned char>(U8[I]);      if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. @@ -53,46 +56,75 @@ static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {    return false;  } -// Returns the offset into the string that matches \p Units UTF-16 code units. -// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back -// to UTF-8, and returns the length in bytes. -static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) { +// Returns the byte offset into the string that is an offset of \p Units in +// the specified encoding. 
+// Conceptually, this converts to the encoding, truncates to CodeUnits, +// converts back to UTF-8, and returns the length in bytes. +static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc, +                           bool &Valid) { +  Valid = Units >= 0; +  if (Units <= 0) +    return 0;    size_t Result = 0; -  Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) { -            Result += U8Len; -            U16Units -= U16Len; -            return U16Units <= 0; -          }); -  if (U16Units < 0) // Offset was into the middle of a surrogate pair. -    Valid = false; +  switch (Enc) { +  case OffsetEncoding::UTF8: +    Result = Units; +    break; +  case OffsetEncoding::UTF16: +    Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { +      Result += U8Len; +      Units -= U16Len; +      return Units <= 0; +    }); +    if (Units < 0) // Offset in the middle of a surrogate pair. +      Valid = false; +    break; +  case OffsetEncoding::UTF32: +    Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { +      Result += U8Len; +      Units--; +      return Units <= 0; +    }); +    break; +  case OffsetEncoding::UnsupportedEncoding: +    llvm_unreachable("unsupported encoding"); +  }    // Don't return an out-of-range index if we overran. -  return std::min(Result, U8.size()); +  if (Result > U8.size()) { +    Valid = false; +    return U8.size(); +  } +  return Result;  }  Key<OffsetEncoding> kCurrentOffsetEncoding; -static bool useUTF16ForLSP() { +static OffsetEncoding lspEncoding() {    auto *Enc = Context::current().get(kCurrentOffsetEncoding); -  switch (Enc ? *Enc : OffsetEncoding::UTF16) { -    case OffsetEncoding::UTF16: -      return true; -    case OffsetEncoding::UTF8: -      return false; -    case OffsetEncoding::UnsupportedEncoding: -      llvm_unreachable("cannot use an unsupported encoding"); -  } +  return Enc ? 
*Enc : OffsetEncoding::UTF16;  }  // Like most strings in clangd, the input is UTF-8 encoded.  size_t lspLength(llvm::StringRef Code) { -  if (!useUTF16ForLSP()) -    return Code.size(); -  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). -  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.    size_t Count = 0; -  iterateCodepoints(Code, [&](int U8Len, int U16Len) { -    Count += U16Len; -    return false; -  }); +  switch (lspEncoding()) { +  case OffsetEncoding::UTF8: +    Count = Code.size(); +    break; +  case OffsetEncoding::UTF16: +    iterateCodepoints(Code, [&](int U8Len, int U16Len) { +      Count += U16Len; +      return false; +    }); +    break; +  case OffsetEncoding::UTF32: +    iterateCodepoints(Code, [&](int U8Len, int U16Len) { +      ++Count; +      return false; +    }); +    break; +  case OffsetEncoding::UnsupportedEncoding: +    llvm_unreachable("unsupported encoding"); +  }    return Count;  } @@ -118,28 +150,15 @@ llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P,    StringRef Line =        Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); -  if (!useUTF16ForLSP()) { -    // Bounds-checking only. -    if (P.character > int(Line.size())) { -      if (AllowColumnsBeyondLineLength) -        return StartOfLine + Line.size(); -      else -        return llvm::make_error<llvm::StringError>( -            llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character, -                          P.line), -            llvm::errc::invalid_argument); -    } -    return StartOfLine + P.character; -  } -  // P.character is in UTF-16 code units, so we have to transcode. +  // P.character may be in UTF-16, transcode if necessary.    
bool Valid; -  size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid); +  size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid);    if (!Valid && !AllowColumnsBeyondLineLength)      return llvm::make_error<llvm::StringError>( -        llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character, -                      P.line), +        llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), +                      P.character, P.line),          llvm::errc::invalid_argument); -  return StartOfLine + ByteOffsetInLine; +  return StartOfLine + ByteInLine;  }  Position offsetToPosition(llvm::StringRef Code, size_t Offset) {  | 

