diff options
author | Pavel Labath <labath@google.com> | 2018-02-14 10:05:09 +0000 |
---|---|---|
committer | Pavel Labath <labath@google.com> | 2018-02-14 10:05:09 +0000 |
commit | f1440978a114e6da6d44d9583efa914cb57a6019 (patch) | |
tree | 4a6715c75166459a8a749ea1d5d623d79d926edd /llvm/lib/Support/DJB.cpp | |
parent | eb18d772a1d4c317f1f6f84795e8f75b1d19e510 (diff) | |
download | bcm5719-llvm-f1440978a114e6da6d44d9583efa914cb57a6019.tar.gz bcm5719-llvm-f1440978a114e6da6d44d9583efa914cb57a6019.zip |
Implement a case-folding version of DJB hash
Summary:
This patch implements a variant of the DJB hash function which folds the
input according to the algorithm in the Dwarf 5 specification (Section
6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18,
"Case Mappings").
To achieve this, I have added a llvm::sys::unicode::foldCharSimple
function, which performs this mapping. The implementation of this
function was generated from the CaseMatching.txt file from the Unicode
spec using a python script (which is also included in this patch). The
script tries to optimize the function by coalescing adjecant mappings
with the same shift and stride (terms I made up). Theoretically, it
could be made a bit smarter and merge adjecant blocks that were
interrupted by only one or two characters with exceptional mapping, but
this would save only a couple of branches, while it would greatly
complicate the implementation, so I deemed it was not worth it.
Since we assume that the vast majority of the input characters will be
US-ASCII, the folding hash function has a fast-path for handling these,
and only whips out the full decode+fold+encode logic if we encounter a
character outside of this range. It might be possible to implement the
folding directly on utf8 sequences, but this would also bring a lot of
complexity for the few cases where we will actually need to process
non-ascii characters.
Reviewers: JDevlieghere, aprantl, probinson, dblaikie
Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits
Differential Revision: https://reviews.llvm.org/D42740
llvm-svn: 325107
Diffstat (limited to 'llvm/lib/Support/DJB.cpp')
-rw-r--r-- | llvm/lib/Support/DJB.cpp | 78 |
1 files changed, 77 insertions, 1 deletions
diff --git a/llvm/lib/Support/DJB.cpp b/llvm/lib/Support/DJB.cpp index c696f1e6a0b..3190c7ffe5a 100644 --- a/llvm/lib/Support/DJB.cpp +++ b/llvm/lib/Support/DJB.cpp @@ -12,9 +12,85 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/DJB.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Unicode.h" + +using namespace llvm; + +static inline uint32_t djbHashChar(char C, uint32_t H) { + return (H << 5) + H + C; +} uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) { for (char C : Buffer.bytes()) - H = ((H << 5) + H) + C; + H = djbHashChar(C, H); + return H; +} + +static UTF32 chopOneUTF32(StringRef &Buffer) { + UTF32 C; + const UTF8 *const Begin8Const = + reinterpret_cast<const UTF8 *>(Buffer.begin()); + const UTF8 *Begin8 = Begin8Const; + UTF32 *Begin32 = &C; + + // In lenient mode we will always end up with a "reasonable" value in C for + // non-empty input. + assert(!Buffer.empty()); + ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()), + &Begin32, &C + 1, lenientConversion); + Buffer = Buffer.drop_front(Begin8 - Begin8Const); + return C; +} + +static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) { + const UTF32 *Begin32 = &C; + UTF8 *Begin8 = Storage.begin(); + + // The case-folded output should always be a valid unicode character, so use + // strict mode here. + ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8, + Storage.end(), strictConversion); + assert(CR == conversionOK && "Case folding produced invalid char?"); + (void)CR; + return StringRef(reinterpret_cast<char *>(Storage.begin()), + Begin8 - Storage.begin()); +} + +static UTF32 foldCharDwarf(UTF32 C) { + // DWARF v5 addition to the unicode folding rules. + // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot + // Above" into "i". + if (C == 0x130 || C == 0x131) + return 'i'; + return sys::unicode::foldCharSimple(C); +} + +static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) { + UTF32 C = chopOneUTF32(Buffer); + + C = foldCharDwarf(C); + + std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage; + StringRef Folded = toUTF8(C, Storage); + return djbHash(Folded, H); +} + +uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) { + while (!Buffer.empty()) { + unsigned char C = Buffer.front(); + if (LLVM_LIKELY(C <= 0x7f)) { + // US-ASCII, encoded as one character in utf-8. + // This is by far the most common case, so handle this specially. + if (C >= 'A' && C <= 'Z') + C = 'a' + (C - 'A'); // fold uppercase into lowercase + H = djbHashChar(C, H); + Buffer = Buffer.drop_front(); + continue; + } + H = caseFoldingDjbHashCharSlow(Buffer, H); + } return H; } |