[clangd] Support UTF-32 (i.e. codepoint) offsets.

Summary: (Changes to UTF-8/UTF-16 here are NFC, i.e. no functional change: they only move things around to make the cases more symmetrical.) Reviewers: ilya-biryukov Subscribers: ioeric, MaskRay, jkorous, arphaman, kadircet, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D59927 llvm-svn: 357173
author: Sam McCall <sam.mccall@gmail.com> 2019-03-28 14:37:51 +0000
committer: Sam McCall <sam.mccall@gmail.com> 2019-03-28 14:37:51 +0000
commit: 8b25d228804f4494eb22bfad0e5ccbaf95d49a91 (patch)
tree: 8e38792297e92a8d07d1fb66b640415f6b278f96 /clang-tools-extra/clangd
parent: 5bbf6f0bd8966c32a62527d32501d984df1b800e (diff)
download: bcm5719-llvm-8b25d228804f4494eb22bfad0e5ccbaf95d49a91.tar.gz
bcm5719-llvm-8b25d228804f4494eb22bfad0e5ccbaf95d49a91.zip
3 files changed, 84 insertions, 54 deletions
diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp
index e11c621aaa3..a7b36307b82 100644
--- a/clang-tools-extra/clangd/Protocol.cpp
+++ b/clang-tools-extra/clangd/Protocol.cpp
@@ -938,16 +938,19 @@ bool fromJSON(const llvm::json::Value &Params, ReferenceParams &R) {
   return fromJSON(Params, Base);
 }
 
-llvm::json::Value toJSON(const OffsetEncoding &OE) {
+static const char *toString(OffsetEncoding OE) {
   switch (OE) {
-    case OffsetEncoding::UTF8:
-      return "utf-8";
-    case OffsetEncoding::UTF16:
-      return "utf-16";
-    case OffsetEncoding::UnsupportedEncoding:
-      return "unknown";
+  case OffsetEncoding::UTF8:
+    return "utf-8";
+  case OffsetEncoding::UTF16:
+    return "utf-16";
+  case OffsetEncoding::UTF32:
+    return "utf-32";
+  case OffsetEncoding::UnsupportedEncoding:
+    return "unknown";
   }
 }
+llvm::json::Value toJSON(const OffsetEncoding &OE) { return toString(OE); }
 bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {
   auto Str = V.getAsString();
   if (!Str)
@@ -955,9 +958,13 @@ bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {
   OE = llvm::StringSwitch<OffsetEncoding>(*Str)
            .Case("utf-8", OffsetEncoding::UTF8)
            .Case("utf-16", OffsetEncoding::UTF16)
+           .Case("utf-32", OffsetEncoding::UTF32)
            .Default(OffsetEncoding::UnsupportedEncoding);
   return true;
 }
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, OffsetEncoding Enc) {
+  return OS << toString(Enc);
+}
 
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h
index 7cf414123f1..0659c93df51 100644
--- a/clang-tools-extra/clangd/Protocol.h
+++ b/clang-tools-extra/clangd/Protocol.h
@@ -28,6 +28,7 @@
 #include "clang/Index/IndexSymbol.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/JSON.h"
+#include "llvm/Support/raw_ostream.h"
 #include <bitset>
 #include <string>
 #include <vector>
@@ -346,9 +347,12 @@ enum class OffsetEncoding {
   UTF16,
   // Length counts bytes of UTF-8 encoded text. (Clangd extension).
   UTF8,
+  // Length counts codepoints in Unicode text. (Clangd extension).
+  UTF32,
 };
 llvm::json::Value toJSON(const OffsetEncoding &);
 bool fromJSON(const llvm::json::Value &, OffsetEncoding &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, OffsetEncoding OS);
 
 // This struct doesn't mirror LSP!
 // The protocol defines deeply nested structures for client capabilities.
diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp
index cdafaf9636c..656ab1dc3c1 100644
--- a/clang-tools-extra/clangd/SourceCode.cpp
+++ b/clang-tools-extra/clangd/SourceCode.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Path.h"
 
 namespace clang {
@@ -30,6 +31,8 @@ namespace clangd {
 // Returns true if CB returned true, false if we hit the end of string.
 template <typename Callback>
 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
+  // A codepoint takes two UTF-16 code units if it's astral (outside BMP).
+  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   for (size_t I = 0; I < U8.size();) {
     unsigned char C = static_cast<unsigned char>(U8[I]);
     if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
@@ -53,46 +56,75 @@ static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
   return false;
 }
 
-// Returns the offset into the string that matches \p Units UTF-16 code units.
-// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back
-// to UTF-8, and returns the length in bytes.
-static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) {
+// Returns the byte offset into \p U8 that corresponds to an offset of
+// \p Units code units in the encoding \p Enc.
+// Conceptually, this converts to that encoding, truncates to \p Units code
+// units, converts back to UTF-8, and returns the length in bytes.
+static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc,
+                           bool &Valid) {
+  Valid = Units >= 0;
+  if (Units <= 0)
+    return 0;
   size_t Result = 0;
-  Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) {
-            Result += U8Len;
-            U16Units -= U16Len;
-            return U16Units <= 0;
-          });
-  if (U16Units < 0) // Offset was into the middle of a surrogate pair.
-    Valid = false;
+  switch (Enc) {
+  case OffsetEncoding::UTF8:
+    Result = Units;
+    break;
+  case OffsetEncoding::UTF16:
+    Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+      Result += U8Len;
+      Units -= U16Len;
+      return Units <= 0;
+    });
+    if (Units < 0) // Offset in the middle of a surrogate pair.
+      Valid = false;
+    break;
+  case OffsetEncoding::UTF32:
+    Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+      Result += U8Len;
+      Units--;
+      return Units <= 0;
+    });
+    break;
+  case OffsetEncoding::UnsupportedEncoding:
+    llvm_unreachable("unsupported encoding");
+  }
   // Don't return an out-of-range index if we overran.
-  return std::min(Result, U8.size());
+  if (Result > U8.size()) {
+    Valid = false;
+    return U8.size();
+  }
+  return Result;
 }
 
 Key<OffsetEncoding> kCurrentOffsetEncoding;
-static bool useUTF16ForLSP() {
+static OffsetEncoding lspEncoding() {
   auto *Enc = Context::current().get(kCurrentOffsetEncoding);
-  switch (Enc ? *Enc : OffsetEncoding::UTF16) {
-    case OffsetEncoding::UTF16:
-      return true;
-    case OffsetEncoding::UTF8:
-      return false;
-    case OffsetEncoding::UnsupportedEncoding:
-      llvm_unreachable("cannot use an unsupported encoding");
-  }
+  return Enc ? *Enc : OffsetEncoding::UTF16;
 }
 
 // Like most strings in clangd, the input is UTF-8 encoded.
 size_t lspLength(llvm::StringRef Code) {
-  if (!useUTF16ForLSP())
-    return Code.size();
-  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
-  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   size_t Count = 0;
-  iterateCodepoints(Code, [&](int U8Len, int U16Len) {
-    Count += U16Len;
-    return false;
-  });
+  switch (lspEncoding()) {
+  case OffsetEncoding::UTF8:
+    Count = Code.size();
+    break;
+  case OffsetEncoding::UTF16:
+    iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+      Count += U16Len;
+      return false;
+    });
+    break;
+  case OffsetEncoding::UTF32:
+    iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+      ++Count;
+      return false;
+    });
+    break;
+  case OffsetEncoding::UnsupportedEncoding:
+    llvm_unreachable("unsupported encoding");
+  }
   return Count;
 }
 
@@ -118,28 +150,15 @@ llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P,
   StringRef Line =
       Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
 
-  if (!useUTF16ForLSP()) {
-    // Bounds-checking only.
-    if (P.character > int(Line.size())) {
-      if (AllowColumnsBeyondLineLength)
-        return StartOfLine + Line.size();
-      else
-        return llvm::make_error<llvm::StringError>(
-            llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character,
-                          P.line),
-            llvm::errc::invalid_argument);
-    }
-    return StartOfLine + P.character;
-  }
-  // P.character is in UTF-16 code units, so we have to transcode.
+  // P.character is in code units of the LSP offset encoding; convert to bytes.
   bool Valid;
-  size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid);
+  size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid);
   if (!Valid && !AllowColumnsBeyondLineLength)
     return llvm::make_error<llvm::StringError>(
-        llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
-                      P.line),
+        llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(),
+                      P.character, P.line),
         llvm::errc::invalid_argument);
-  return StartOfLine + ByteOffsetInLine;
+  return StartOfLine + ByteInLine;
 }
 
 Position offsetToPosition(llvm::StringRef Code, size_t Offset) {
author	Sam McCall <sam.mccall@gmail.com>	2019-03-28 14:37:51 +0000
committer	Sam McCall <sam.mccall@gmail.com>	2019-03-28 14:37:51 +0000
commit	8b25d228804f4494eb22bfad0e5ccbaf95d49a91 (patch)
tree	8e38792297e92a8d07d1fb66b640415f6b278f96 /clang-tools-extra/clangd
parent	5bbf6f0bd8966c32a62527d32501d984df1b800e (diff)
download	bcm5719-llvm-8b25d228804f4494eb22bfad0e5ccbaf95d49a91.tar.gz bcm5719-llvm-8b25d228804f4494eb22bfad0e5ccbaf95d49a91.zip