summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Support/DJB.cpp
diff options
context:
space:
mode:
authorPavel Labath <labath@google.com>2018-02-14 10:05:09 +0000
committerPavel Labath <labath@google.com>2018-02-14 10:05:09 +0000
commitf1440978a114e6da6d44d9583efa914cb57a6019 (patch)
tree4a6715c75166459a8a749ea1d5d623d79d926edd /llvm/lib/Support/DJB.cpp
parenteb18d772a1d4c317f1f6f84795e8f75b1d19e510 (diff)
downloadbcm5719-llvm-f1440978a114e6da6d44d9583efa914cb57a6019.tar.gz
bcm5719-llvm-f1440978a114e6da6d44d9583efa914cb57a6019.zip
Implement a case-folding version of DJB hash
Summary: This patch implements a variant of the DJB hash function which folds the input according to the algorithm in the Dwarf 5 specification (Section 6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18, "Case Mappings"). To achieve this, I have added a llvm::sys::unicode::foldCharSimple function, which performs this mapping. The implementation of this function was generated from the CaseMatching.txt file from the Unicode spec using a python script (which is also included in this patch). The script tries to optimize the function by coalescing adjecant mappings with the same shift and stride (terms I made up). Theoretically, it could be made a bit smarter and merge adjecant blocks that were interrupted by only one or two characters with exceptional mapping, but this would save only a couple of branches, while it would greatly complicate the implementation, so I deemed it was not worth it. Since we assume that the vast majority of the input characters will be US-ASCII, the folding hash function has a fast-path for handling these, and only whips out the full decode+fold+encode logic if we encounter a character outside of this range. It might be possible to implement the folding directly on utf8 sequences, but this would also bring a lot of complexity for the few cases where we will actually need to process non-ascii characters. Reviewers: JDevlieghere, aprantl, probinson, dblaikie Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits Differential Revision: https://reviews.llvm.org/D42740 llvm-svn: 325107
Diffstat (limited to 'llvm/lib/Support/DJB.cpp')
-rw-r--r--llvm/lib/Support/DJB.cpp78
1 files changed, 77 insertions, 1 deletions
diff --git a/llvm/lib/Support/DJB.cpp b/llvm/lib/Support/DJB.cpp
index c696f1e6a0b..3190c7ffe5a 100644
--- a/llvm/lib/Support/DJB.cpp
+++ b/llvm/lib/Support/DJB.cpp
@@ -12,9 +12,85 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/DJB.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Unicode.h"
+
+using namespace llvm;
+
+static inline uint32_t djbHashChar(char C, uint32_t H) {
+ return (H << 5) + H + C;
+}
uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
for (char C : Buffer.bytes())
- H = ((H << 5) + H) + C;
+ H = djbHashChar(C, H);
+ return H;
+}
+
+static UTF32 chopOneUTF32(StringRef &Buffer) {
+ UTF32 C;
+ const UTF8 *const Begin8Const =
+ reinterpret_cast<const UTF8 *>(Buffer.begin());
+ const UTF8 *Begin8 = Begin8Const;
+ UTF32 *Begin32 = &C;
+
+ // In lenient mode we will always end up with a "reasonable" value in C for
+ // non-empty input.
+ assert(!Buffer.empty());
+ ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()),
+ &Begin32, &C + 1, lenientConversion);
+ Buffer = Buffer.drop_front(Begin8 - Begin8Const);
+ return C;
+}
+
+static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
+ const UTF32 *Begin32 = &C;
+ UTF8 *Begin8 = Storage.begin();
+
+ // The case-folded output should always be a valid unicode character, so use
+ // strict mode here.
+ ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
+ Storage.end(), strictConversion);
+ assert(CR == conversionOK && "Case folding produced invalid char?");
+ (void)CR;
+ return StringRef(reinterpret_cast<char *>(Storage.begin()),
+ Begin8 - Storage.begin());
+}
+
+static UTF32 foldCharDwarf(UTF32 C) {
+ // DWARF v5 addition to the unicode folding rules.
+ // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
+ // Above" into "i".
+ if (C == 0x130 || C == 0x131)
+ return 'i';
+ return sys::unicode::foldCharSimple(C);
+}
+
+static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
+ UTF32 C = chopOneUTF32(Buffer);
+
+ C = foldCharDwarf(C);
+
+ std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage;
+ StringRef Folded = toUTF8(C, Storage);
+ return djbHash(Folded, H);
+}
+
+uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
+ while (!Buffer.empty()) {
+ unsigned char C = Buffer.front();
+ if (LLVM_LIKELY(C <= 0x7f)) {
+ // US-ASCII, encoded as one character in utf-8.
+ // This is by far the most common case, so handle this specially.
+ if (C >= 'A' && C <= 'Z')
+ C = 'a' + (C - 'A'); // fold uppercase into lowercase
+ H = djbHashChar(C, H);
+ Buffer = Buffer.drop_front();
+ continue;
+ }
+ H = caseFoldingDjbHashCharSlow(Buffer, H);
+ }
return H;
}
OpenPOWER on IntegriCloud