summaryrefslogtreecommitdiffstats
path: root/llvm/unittests/Support/DJBTest.cpp
diff options
context:
space:
mode:
authorPavel Labath <labath@google.com>2018-02-14 10:05:09 +0000
committerPavel Labath <labath@google.com>2018-02-14 10:05:09 +0000
commitf1440978a114e6da6d44d9583efa914cb57a6019 (patch)
tree4a6715c75166459a8a749ea1d5d623d79d926edd /llvm/unittests/Support/DJBTest.cpp
parenteb18d772a1d4c317f1f6f84795e8f75b1d19e510 (diff)
downloadbcm5719-llvm-f1440978a114e6da6d44d9583efa914cb57a6019.tar.gz
bcm5719-llvm-f1440978a114e6da6d44d9583efa914cb57a6019.zip
Implement a case-folding version of DJB hash
Summary: This patch implements a variant of the DJB hash function which folds the input according to the algorithm in the Dwarf 5 specification (Section 6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18, "Case Mappings"). To achieve this, I have added a llvm::sys::unicode::foldCharSimple function, which performs this mapping. The implementation of this function was generated from the CaseMatching.txt file from the Unicode spec using a python script (which is also included in this patch). The script tries to optimize the function by coalescing adjecant mappings with the same shift and stride (terms I made up). Theoretically, it could be made a bit smarter and merge adjecant blocks that were interrupted by only one or two characters with exceptional mapping, but this would save only a couple of branches, while it would greatly complicate the implementation, so I deemed it was not worth it. Since we assume that the vast majority of the input characters will be US-ASCII, the folding hash function has a fast-path for handling these, and only whips out the full decode+fold+encode logic if we encounter a character outside of this range. It might be possible to implement the folding directly on utf8 sequences, but this would also bring a lot of complexity for the few cases where we will actually need to process non-ascii characters. Reviewers: JDevlieghere, aprantl, probinson, dblaikie Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits Differential Revision: https://reviews.llvm.org/D42740 llvm-svn: 325107
Diffstat (limited to 'llvm/unittests/Support/DJBTest.cpp')
-rw-r--r--llvm/unittests/Support/DJBTest.cpp89
1 files changed, 89 insertions, 0 deletions
diff --git a/llvm/unittests/Support/DJBTest.cpp b/llvm/unittests/Support/DJBTest.cpp
new file mode 100644
index 00000000000..c9e970c8480
--- /dev/null
+++ b/llvm/unittests/Support/DJBTest.cpp
@@ -0,0 +1,89 @@
+//===---------- llvm/unittest/Support/DJBTest.cpp -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DJB.h"
+#include "llvm/ADT/Twine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+TEST(DJBTest, caseFolding) {
+ struct TestCase {
+ StringLiteral One;
+ StringLiteral Two;
+ };
+
+ static constexpr TestCase Tests[] = {
+ {"ASDF", "asdf"},
+ {"qWeR", "QwEr"},
+ {"qqqqqqqqqqqqqqqqqqqq", "QQQQQQQQQQQQQQQQQQQQ"},
+
+ {"I", "i"},
+ // Latin Small Letter Dotless I
+ {u8"\u0130", "i"},
+ // Latin Capital Letter I With Dot Above
+ {u8"\u0131", "i"},
+
+ // Latin Capital Letter A With Grave
+ {u8"\u00c0", u8"\u00e0"},
+ // Latin Capital Letter A With Macron
+ {u8"\u0100", u8"\u0101"},
+ // Latin Capital Letter L With Acute
+ {u8"\u0139", u8"\u013a"},
+ // Cyrillic Capital Letter Ie
+ {u8"\u0415", u8"\u0435"},
+ // Latin Capital Letter A With Circumflex And Grave
+ {u8"\u1ea6", u8"\u1ea7"},
+ // Kelvin Sign
+ {u8"\u212a", u8"\u006b"},
+ // Glagolitic Capital Letter Chrivi
+ {u8"\u2c1d", u8"\u2c4d"},
+ // Fullwidth Latin Capital Letter M
+ {u8"\uff2d", u8"\uff4d"},
+ // Old Hungarian Capital Letter Ej
+ {u8"\U00010c92", u8"\U00010cd2"},
+ };
+
+ for (const TestCase &T : Tests) {
+ SCOPED_TRACE("Comparing '" + T.One + "' and '" + T.Two + "'");
+ EXPECT_EQ(caseFoldingDjbHash(T.One), caseFoldingDjbHash(T.Two));
+ }
+}
+
+TEST(DJBTest, knownValuesLowerCase) {
+ struct TestCase {
+ StringLiteral Text;
+ uint32_t Hash;
+ };
+ static constexpr TestCase Tests[] = {
+ {"", 5381u},
+ {"f", 177675u},
+ {"fo", 5863386u},
+ {"foo", 193491849u},
+ {"foob", 2090263819u},
+ {"fooba", 259229388u},
+ {"foobar", 4259602622u},
+ {"pneumonoultramicroscopicsilicovolcanoconiosis", 3999417781u},
+ };
+
+ for (const TestCase &T : Tests) {
+ SCOPED_TRACE("Text: '" + T.Text + "'");
+ EXPECT_EQ(T.Hash, djbHash(T.Text));
+ EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text));
+ EXPECT_EQ(T.Hash, caseFoldingDjbHash(T.Text.upper()));
+ }
+}
+
+TEST(DJBTest, knownValuesUnicode) {
+ EXPECT_EQ(
+ 2326183139u,
+ caseFoldingDjbHash(
+ u8"\u0130\u0131\u00c0\u00e0\u0100\u0101\u0139\u013a\u0415\u0435\u1ea6"
+ u8"\u1ea7\u212a\u006b\u2c1d\u2c4d\uff2d\uff4d\U00010c92\U00010cd2"));
+}
OpenPOWER on IntegriCloud