diff options
author | Nick Terrell <terrelln@fb.com> | 2019-11-11 22:07:12 -0800 |
---|---|---|
committer | Fangrui Song <maskray@google.com> | 2019-11-11 22:14:28 -0800 |
commit | 43ff63477256d584cf506dba0c222c28231b0ccc (patch) | |
tree | d7a0c2ac8e758ca3809edf3db02aa39911cf0309 /llvm/lib/Support | |
parent | 3c4f8bb108b2a362db7d723fe9646a3d236fe60e (diff) | |
download | bcm5719-llvm-43ff63477256d584cf506dba0c222c28231b0ccc.tar.gz bcm5719-llvm-43ff63477256d584cf506dba0c222c28231b0ccc.zip |
[Support] Optimize SHA1 implementation
* Add inline to the helper functions because gcc-9 won't inline all of
them without the hint. I've avoided `__attribute__((always_inline))`
because gcc and clang will inline without it, and avoiding it improves
compatibility.
* Replace the byte-by-byte copy in update() with endian::read32be()
since perf reports that 1/2 of the time is spent copying into the
buffer before this patch.
When lld uses --build-id=sha1 it spends 30-45% of CPU in SHA1 depending on the binary (not wall-time since it is parallel). This patch speeds up SHA1 by a factor of 2 on clang-8 and 3 on gcc-6. This leads to a >10% improvement in overall linking time.
lld-speed-test benchmarks run on an Intel i9-9900k with Turbo disabled on CPU 0 compiled with clang-9. Stats recorded with `perf stat -r 5`. All inputs are using `--build-id=sha1`.
| Input | Before (seconds) | After (seconds) |
| --- | --- | --- |
| chrome | 2.14 | 1.82 (-15%) |
| chrome-icf | 2.56 | 2.29 (-10%) |
| clang | 0.65 | 0.53 (-18%) |
| clang-fsds | 0.69 | 0.58 (-16%) |
| clang-gdb-index | 21.71 | 19.3 (-11%) |
| gold | 0.42 | 0.34 (-19%) |
| gold-fsds | 0.431 | 0.355 (-17%) |
| linux-kernel | 0.625 | 0.575 (-8%) |
| llvm-as | 0.045 | 0.039 (-14%) |
| llvm-as-fsds | 0.044 | 0.039 (-11%) |
| mozilla | 11.3 | 9.8 (-13%) |
| mozilla-gc | 11.84 | 10.36 (-12%) |
| mozilla-O0 | 8.2 | 5.84 (-28%) |
| scylla | 5.59 | 4.52 (-19%) |
Reviewed By: ruiu, MaskRay
Differential Revision: https://reviews.llvm.org/D69295
Diffstat (limited to 'llvm/lib/Support')
-rw-r--r-- | llvm/lib/Support/SHA1.cpp | 54 |
1 files changed, 39 insertions, 15 deletions
diff --git a/llvm/lib/Support/SHA1.cpp b/llvm/lib/Support/SHA1.cpp index 47a5f07fbe7..a98ca41a335 100644 --- a/llvm/lib/Support/SHA1.cpp +++ b/llvm/lib/Support/SHA1.cpp @@ -16,6 +16,7 @@ #include "llvm/Support/SHA1.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Endian.h" #include "llvm/Support/Host.h" using namespace llvm; @@ -26,45 +27,45 @@ using namespace llvm; #define SHA_BIG_ENDIAN #endif -static uint32_t rol(uint32_t Number, int Bits) { +static inline uint32_t rol(uint32_t Number, int Bits) { return (Number << Bits) | (Number >> (32 - Bits)); } -static uint32_t blk0(uint32_t *Buf, int I) { return Buf[I]; } +static inline uint32_t blk0(uint32_t *Buf, int I) { return Buf[I]; } -static uint32_t blk(uint32_t *Buf, int I) { +static inline uint32_t blk(uint32_t *Buf, int I) { Buf[I & 15] = rol(Buf[(I + 13) & 15] ^ Buf[(I + 8) & 15] ^ Buf[(I + 2) & 15] ^ Buf[I & 15], 1); return Buf[I & 15]; } -static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, - int I, uint32_t *Buf) { +static inline void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, + uint32_t &E, int I, uint32_t *Buf) { E += ((B & (C ^ D)) ^ D) + blk0(Buf, I) + 0x5A827999 + rol(A, 5); B = rol(B, 30); } -static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, - int I, uint32_t *Buf) { +static inline void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, + uint32_t &E, int I, uint32_t *Buf) { E += ((B & (C ^ D)) ^ D) + blk(Buf, I) + 0x5A827999 + rol(A, 5); B = rol(B, 30); } -static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, - int I, uint32_t *Buf) { +static inline void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, + uint32_t &E, int I, uint32_t *Buf) { E += (B ^ C ^ D) + blk(Buf, I) + 0x6ED9EBA1 + rol(A, 5); B = rol(B, 30); } -static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, - int I, uint32_t *Buf) { +static inline void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D,
+ uint32_t &E, int I, uint32_t *Buf) { E += (((B | C) & D) | (B & C)) + blk(Buf, I) + 0x8F1BBCDC + rol(A, 5); B = rol(B, 30); } -static void r4(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, - int I, uint32_t *Buf) { +static inline void r4(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, + uint32_t &E, int I, uint32_t *Buf) { E += (B ^ C ^ D) + blk(Buf, I) + 0xCA62C1D6 + rol(A, 5); B = rol(B, 30); } @@ -210,8 +211,31 @@ void SHA1::writebyte(uint8_t Data) { } void SHA1::update(ArrayRef<uint8_t> Data) { - for (auto &C : Data) - writebyte(C); + InternalState.ByteCount += Data.size(); + + // Finish the current block. + if (InternalState.BufferOffset > 0) { + const size_t Remainder = std::min<size_t>( + Data.size(), BLOCK_LENGTH - InternalState.BufferOffset); + for (size_t I = 0; I < Remainder; ++I) + addUncounted(Data[I]); + Data = Data.drop_front(Remainder); + } + + // Fast buffer filling for large inputs. + while (Data.size() >= BLOCK_LENGTH) { + assert(InternalState.BufferOffset == 0); + assert(BLOCK_LENGTH % 4 == 0); + constexpr size_t BLOCK_LENGTH_32 = BLOCK_LENGTH / 4; + for (size_t I = 0; I < BLOCK_LENGTH_32; ++I) + InternalState.Buffer.L[I] = support::endian::read32be(&Data[I * 4]); + hashBlock(); + Data = Data.drop_front(BLOCK_LENGTH); + } + + // Finish the remainder. + for (uint8_t C : Data) + addUncounted(C); } void SHA1::pad() { |