summaryrefslogtreecommitdiffstats
path: root/llvm/include
diff options
context:
space:
mode:
authorKirill Bobyrev <kbobyrev.opensource@gmail.com>2018-08-20 07:00:36 +0000
committerKirill Bobyrev <kbobyrev.opensource@gmail.com>2018-08-20 07:00:36 +0000
commit5f26a642e61e6245862e62861c7b4e75a00a372b (patch)
tree93bf8f3ebcdadd60c7bd199e08ffc9f983675716 /llvm/include
parent6f1740d52f1b85ea30e05941e8e35e162eed621a (diff)
downloadbcm5719-llvm-5f26a642e61e6245862e62861c7b4e75a00a372b.tar.gz
bcm5719-llvm-5f26a642e61e6245862e62861c7b4e75a00a372b.zip
[llvm] Make YAML serialization up to 2.5 times faster
This patch significantly improves performance of the YAML serializer by optimizing `YAML::isNumeric` function. This function is called on the most strings and is highly inefficient for two reasons: * It uses `Regex`, which is parsed and compiled each time this function is called * It uses multiple passes which are not necessary This patch introduces stateful ad hoc YAML number parser which does not rely on `Regex`. It also fixes YAML number format inconsistency: current implementation supports C-stile octal number format (`01234567`) which was present in YAML 1.0 specialization (http://yaml.org/spec/1.0/), [Section 2.4. Tags, Example 2.19] but was deprecated and is no longer present in latest YAML 1.2 specification (http://yaml.org/spec/1.2/spec.html), see [Section 10.3.2. Tag Resolution]. Since the rest of the rest of the implementation does not support other deprecated YAML 1.0 numeric features such as sexagecimal numbers, commas as delimiters it is treated as inconsistency and not longer supported. This patch also adds unit tests to ensure the validity of proposed implementation. This performance bottleneck was identified while profiling Clangd's global-symbol-builder tool with my colleague @ilya-biryukov. The substantial part of the runtime was spent during a single-thread Reduce phase, which concludes with YAML serialization of collected symbol collection. Regex matching was accountable for approximately 45% of the whole runtime (which involves sharded Map phase), now it is reduced to 18% (which is spent in `clang::clangd::CanonicalIncludes` and can be also optimized because all used regexes are in fact either suffix matches or exact matches). `llvm-yaml-numeric-parser-fuzzer` was used to ensure the validity of the proposed regex replacement. Fuzzing for ~60 hours using 10 threads did not expose any bugs. Benchmarking `global-symbol-builder` (using `hyperfine --warmup 2 --min-runs 5 'command 1' 'command 2'`) tool by processing a reasonable amount of code (26 source files matched by `clang-tools-extra/clangd/*.cpp` with all transitive includes) confirmed our understanding of the performance bottleneck nature as it speeds up the command by the factor of 1.6x: | Command | Mean [s] | Min…Max [s] | | this patch (D50839) | 84.7 ± 0.6 | 83.3…84.7 | | master (rL339849) | 133.1 ± 0.8 | 132.4…134.6 | Using smaller samples (e.g. by collecting symbols from `clang-tools-extra/clangd/AST.cpp` only) yields even better performance improvement, which is expected because Map phase takes less time compared to Reduce and is 2.05x faster and therefore would significantly improve the performance of standalone YAML serializations. | Command | Mean [ms] | Min…Max [ms] | | this patch (D50839) | 3702.2 ± 48.7 | 3635.1…3752.3 | | master (rL339849) | 7607.6 ± 109.5 | 7533.3…7796.4 | Reviewed by: zturner, ilya-biryukov Differential revision: https://reviews.llvm.org/D50839 llvm-svn: 340154
Diffstat (limited to 'llvm/include')
-rw-r--r--llvm/include/llvm/Support/YAMLTraits.h109
1 files changed, 82 insertions, 27 deletions
diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h
index 4b8c4e95828..fa8caed1a9c 100644
--- a/llvm/include/llvm/Support/YAMLTraits.h
+++ b/llvm/include/llvm/Support/YAMLTraits.h
@@ -27,6 +27,7 @@
#include <cctype>
#include <cstddef>
#include <cstdint>
+#include <iterator>
#include <map>
#include <memory>
#include <new>
@@ -449,46 +450,100 @@ public:
static bool const value = (sizeof(test<DocumentListTraits<T>>(nullptr))==1);
};
-inline bool isNumber(StringRef S) {
- static const char OctalChars[] = "01234567";
- if (S.startswith("0") &&
- S.drop_front().find_first_not_of(OctalChars) == StringRef::npos)
- return true;
+inline bool isNumeric(StringRef S) {
+ const static auto skipDigits = [](StringRef Input) {
+ return Input.drop_front(
+ std::min(Input.find_first_not_of("0123456789"), Input.size()));
+ };
- if (S.startswith("0o") &&
- S.drop_front(2).find_first_not_of(OctalChars) == StringRef::npos)
- return true;
+ // Make S.front() and S.drop_front().front() (if S.front() is [+-]) calls
+ // safe.
+ if (S.empty() || S.equals("+") || S.equals("-"))
+ return false;
- static const char HexChars[] = "0123456789abcdefABCDEF";
- if (S.startswith("0x") &&
- S.drop_front(2).find_first_not_of(HexChars) == StringRef::npos)
+ if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
return true;
- static const char DecChars[] = "0123456789";
- if (S.find_first_not_of(DecChars) == StringRef::npos)
- return true;
+ // Infinity and decimal numbers can be prefixed with sign.
+ StringRef Tail = (S.front() == '-' || S.front() == '+') ? S.drop_front() : S;
- if (S.equals(".inf") || S.equals(".Inf") || S.equals(".INF"))
+ // Check for infinity first, because checking for hex and oct numbers is more
+ // expensive.
+ if (Tail.equals(".inf") || Tail.equals(".Inf") || Tail.equals(".INF"))
return true;
- Regex FloatMatcher("^(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?$");
- if (FloatMatcher.match(S))
- return true;
+ // Section 10.3.2 Tag Resolution
+ // YAML 1.2 Specification prohibits Base 8 and Base 16 numbers prefixed with
+ // [-+], so S should be used instead of Tail.
+ if (S.startswith("0o"))
+ return S.size() > 2 &&
+ S.drop_front(2).find_first_not_of("01234567") == StringRef::npos;
+
+ if (S.startswith("0x"))
+ return S.size() > 2 && S.drop_front(2).find_first_not_of(
+ "0123456789abcdefABCDEF") == StringRef::npos;
+
+ // Parse float: [-+]? (\. [0-9]+ | [0-9]+ (\. [0-9]* )?) ([eE] [-+]? [0-9]+)?
+ S = Tail;
+
+ // Handle cases when the number starts with '.' and hence needs at least one
+ // digit after dot (as opposed by number which has digits before the dot), but
+ // doesn't have one.
+ if (S.startswith(".") &&
+ (S.equals(".") ||
+ (S.size() > 1 && std::strchr("0123456789", S[1]) == nullptr)))
+ return false;
+
+ if (S.startswith("E") || S.startswith("e"))
+ return false;
+
+ enum ParseState {
+ Default,
+ FoundDot,
+ FoundExponent,
+ };
+ ParseState State = Default;
- return false;
-}
+ S = skipDigits(S);
-inline bool isNumeric(StringRef S) {
- if ((S.front() == '-' || S.front() == '+') && isNumber(S.drop_front()))
+ // Accept decimal integer.
+ if (S.empty())
return true;
- if (isNumber(S))
- return true;
+ if (S.front() == '.') {
+ State = FoundDot;
+ S = S.drop_front();
+ } else if (S.front() == 'e' || S.front() == 'E') {
+ State = FoundExponent;
+ S = S.drop_front();
+ } else {
+ return false;
+ }
- if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
- return true;
+ if (State == FoundDot) {
+ S = skipDigits(S);
+ if (S.empty())
+ return true;
+
+ if (S.front() == 'e' || S.front() == 'E') {
+ State = FoundExponent;
+ S = S.drop_front();
+ } else {
+ return false;
+ }
+ }
+
+ assert(FoundExponent && "Should have found exponent at this point.");
+ if (S.empty())
+ return false;
+
+ if (S.front() == '+' || S.front() == '-') {
+ S = S.drop_front();
+ if (S.empty())
+ return false;
+ }
- return false;
+ return skipDigits(S).empty();
}
inline bool isNull(StringRef S) {
OpenPOWER on IntegriCloud