summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Support/DJB.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Support/DJB.cpp')
-rw-r--r--llvm/lib/Support/DJB.cpp80
1 files changed, 78 insertions, 2 deletions
diff --git a/llvm/lib/Support/DJB.cpp b/llvm/lib/Support/DJB.cpp
index c696f1e6a0b..b7bfbbf7bd8 100644
--- a/llvm/lib/Support/DJB.cpp
+++ b/llvm/lib/Support/DJB.cpp
@@ -12,9 +12,85 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/DJB.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Unicode.h"
+
+using namespace llvm;
+
+static inline uint32_t djbHashChar(unsigned char C, uint32_t H) {
+ return (H << 5) + H + C;
+}
uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
- for (char C : Buffer.bytes())
- H = ((H << 5) + H) + C;
+ for (unsigned char C : Buffer.bytes())
+ H = djbHashChar(C, H);
+ return H;
+}
+
+static UTF32 chopOneUTF32(StringRef &Buffer) {
+ UTF32 C;
+ const UTF8 *const Begin8Const =
+ reinterpret_cast<const UTF8 *>(Buffer.begin());
+ const UTF8 *Begin8 = Begin8Const;
+ UTF32 *Begin32 = &C;
+
+ // In lenient mode we will always end up with a "reasonable" value in C for
+ // non-empty input.
+ assert(!Buffer.empty());
+ ConvertUTF8toUTF32(&Begin8, reinterpret_cast<const UTF8 *>(Buffer.end()),
+ &Begin32, &C + 1, lenientConversion);
+ Buffer = Buffer.drop_front(Begin8 - Begin8Const);
+ return C;
+}
+
+static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
+ const UTF32 *Begin32 = &C;
+ UTF8 *Begin8 = Storage.begin();
+
+ // The case-folded output should always be a valid unicode character, so use
+ // strict mode here.
+ ConversionResult CR = ConvertUTF32toUTF8(&Begin32, &C + 1, &Begin8,
+ Storage.end(), strictConversion);
+ assert(CR == conversionOK && "Case folding produced invalid char?");
+ (void)CR;
+ return StringRef(reinterpret_cast<char *>(Storage.begin()),
+ Begin8 - Storage.begin());
+}
+
+static UTF32 foldCharDwarf(UTF32 C) {
+ // DWARF v5 addition to the unicode folding rules.
+ // Fold "Latin Small Letter Dotless I" and "Latin Capital Letter I With Dot
+ // Above" into "i".
+ if (C == 0x130 || C == 0x131)
+ return 'i';
+ return sys::unicode::foldCharSimple(C);
+}
+
+static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
+ UTF32 C = chopOneUTF32(Buffer);
+
+ C = foldCharDwarf(C);
+
+ std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Storage;
+ StringRef Folded = toUTF8(C, Storage);
+ return djbHash(Folded, H);
+}
+
+uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
+ while (!Buffer.empty()) {
+ unsigned char C = Buffer.front();
+ if (LLVM_LIKELY(C <= 0x7f)) {
+ // US-ASCII, encoded as one character in utf-8.
+ // This is by far the most common case, so handle this specially.
+ if (C >= 'A' && C <= 'Z')
+ C = 'a' + (C - 'A'); // fold uppercase into lowercase
+ H = djbHashChar(C, H);
+ Buffer = Buffer.drop_front();
+ continue;
+ }
+ H = caseFoldingDjbHashCharSlow(Buffer, H);
+ }
return H;
}
OpenPOWER on IntegriCloud