author     Peter Collingbourne <peter@pcc.me.uk>   2019-01-22 23:54:49 +0000
committer  Peter Collingbourne <peter@pcc.me.uk>   2019-01-22 23:54:49 +0000
commit     bcd08c16bb7678d1ddf8d511188b310cabd74bc0 (patch)
tree       2ee55e3d3e83892553f7ad5caa4f5bf5fad9e55c
parent     342611114580412d814ae435e1d1b37be22f86e8 (diff)
download   bcm5719-llvm-bcd08c16bb7678d1ddf8d511188b310cabd74bc0.tar.gz
           bcm5719-llvm-bcd08c16bb7678d1ddf8d511188b310cabd74bc0.zip
COFF, ELF: ICF: Perform 2 rounds of relocation hash propagation.
LLD's performance on PGO instrumented Windows binaries was still not great even with the fix in D56955; out of the 2m41s linker runtime, around 2 minutes were still being spent in ICF. I looked into this more closely and discovered that the vast majority of the runtime was being spent segregating .pdata sections with the following relocation chain:

.pdata -> identical .text -> unique PGO counter (not eligible for ICF)

This patch causes us to perform 2 rounds of relocation hash propagation, which allows the hash for the .pdata sections to incorporate the identifier from the PGO counter. With that, the amount of time spent in ICF was reduced to about 2 seconds.

I also found that the same change led to a significant ICF performance improvement in a regular release build of Chromium's chrome_child.dll, where ICF time was reduced from around 1s to around 700ms. With the same change applied to the ELF linker, the median of 100 runs for lld-speed-test/chrome was reduced from 4.53s to 4.45s on my machine.

I also experimented with increasing the number of propagation rounds further, but I did not observe any further significant performance improvements linking Chromium or Firefox.

Differential Revision: https://reviews.llvm.org/D56986

llvm-svn: 351899
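To make the two-round propagation concrete, here is a minimal, self-contained C++ sketch. It is not the LLD implementation: the Section struct, the propagate() helper, and the example hash values are illustrative assumptions. It models the problematic chain from the commit message and shows how the ping-pong Class slots let a second round fold the unique PGO-counter hash into the .pdata hash.

// A minimal sketch of two-round relocation hash propagation. Not the LLD
// code: Section, propagate(), and the hash values below are illustrative.
#include <cstdint>
#include <iostream>
#include <vector>

struct Section {
  const char *Name;
  uint64_t ContentHash;        // stand-in for xxHash64(contents)
  std::vector<int> Targets;    // indices of sections referenced by relocations
  uint32_t Class[2] = {0, 0};  // ping-pong equivalence-class slots
};

// One propagation round: read class hashes from slot Cnt % 2 and write the
// combined hash into slot (Cnt + 1) % 2, as in the patch below.
static void propagate(std::vector<Section> &Secs, unsigned Cnt) {
  for (Section &S : Secs) {
    uint32_t Hash = S.Class[Cnt % 2];
    for (int T : S.Targets)
      Hash += Secs[T].Class[Cnt % 2];
    // Set the MSB so hash-derived classes never collide with plain IDs.
    S.Class[(Cnt + 1) % 2] = Hash | (1U << 31);
  }
}

int main() {
  // The chain from the commit message:
  //   .pdata -> identical .text -> unique PGO counter (not eligible for ICF)
  std::vector<Section> Secs = {
      {"pdata.A", 7, {2}},    {"pdata.B", 7, {3}},
      {"text.A", 42, {4}},    {"text.B", 42, {5}},
      {"counter.A", 111, {}}, {"counter.B", 222, {}},
  };

  // Seed slot 0 with each section's content hash.
  for (Section &S : Secs)
    S.Class[0] = static_cast<uint32_t>(S.ContentHash);

  // Round 0 alone leaves pdata.A and pdata.B with identical hashes, since
  // their direct targets (the .text sections) have identical contents;
  // round 1 folds the now-distinct .text hashes, and hence the unique
  // counters, into the .pdata hashes, separating them up front.
  for (unsigned Cnt = 0; Cnt != 2; ++Cnt)
    propagate(Secs, Cnt);

  // After an even number of rounds the combined hash is back in Class[0].
  for (const Section &S : Secs)
    std::cout << S.Name << ": " << S.Class[0] << '\n';
}

Note that with an even number of rounds the final combined hash lands back in slot 0, which appears to be why the patch also moves the initial xxHash64 seed from Class[1] to Class[0].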
-rw-r--r--  lld/COFF/ICF.cpp  20
-rw-r--r--  lld/ELF/ICF.cpp   25
2 files changed, 25 insertions(+), 20 deletions(-)
diff --git a/lld/COFF/ICF.cpp b/lld/COFF/ICF.cpp
index 88e67443c26..7684b2c80a3 100644
--- a/lld/COFF/ICF.cpp
+++ b/lld/COFF/ICF.cpp
@@ -262,19 +262,21 @@ void ICF::run(ArrayRef<Chunk *> Vec) {
// Initially, we use hash values to partition sections.
parallelForEach(Chunks, [&](SectionChunk *SC) {
- SC->Class[1] = xxHash64(SC->getContents());
+ SC->Class[0] = xxHash64(SC->getContents());
});
// Combine the hashes of the sections referenced by each section into its
// hash.
- parallelForEach(Chunks, [&](SectionChunk *SC) {
- uint32_t Hash = SC->Class[1];
- for (Symbol *B : SC->symbols())
- if (auto *Sym = dyn_cast_or_null<DefinedRegular>(B))
- Hash += Sym->getChunk()->Class[1];
- // Set MSB to 1 to avoid collisions with non-hash classs.
- SC->Class[0] = Hash | (1U << 31);
- });
+ for (unsigned Cnt = 0; Cnt != 2; ++Cnt) {
+ parallelForEach(Chunks, [&](SectionChunk *SC) {
+ uint32_t Hash = SC->Class[Cnt % 2];
+ for (Symbol *B : SC->symbols())
+ if (auto *Sym = dyn_cast_or_null<DefinedRegular>(B))
+ Hash += Sym->getChunk()->Class[Cnt % 2];
+ // Set MSB to 1 to avoid collisions with non-hash classs.
+ SC->Class[(Cnt + 1) % 2] = Hash | (1U << 31);
+ });
+ }
// From now on, sections in Chunks are ordered so that sections in
// the same group are consecutive in the vector.
diff --git a/lld/ELF/ICF.cpp b/lld/ELF/ICF.cpp
index 3f326fa337b..7c07a42a8a9 100644
--- a/lld/ELF/ICF.cpp
+++ b/lld/ELF/ICF.cpp
@@ -425,16 +425,17 @@ void ICF<ELFT>::forEachClass(llvm::function_ref<void(size_t, size_t)> Fn) {
// Combine the hashes of the sections referenced by the given section into its
// hash.
template <class ELFT, class RelTy>
-static void combineRelocHashes(InputSection *IS, ArrayRef<RelTy> Rels) {
- uint32_t Hash = IS->Class[1];
+static void combineRelocHashes(unsigned Cnt, InputSection *IS,
+ ArrayRef<RelTy> Rels) {
+ uint32_t Hash = IS->Class[Cnt % 2];
for (RelTy Rel : Rels) {
Symbol &S = IS->template getFile<ELFT>()->getRelocTargetSym(Rel);
if (auto *D = dyn_cast<Defined>(&S))
if (auto *RelSec = dyn_cast_or_null<InputSection>(D->Section))
- Hash += RelSec->Class[1];
+ Hash += RelSec->Class[Cnt % 2];
}
// Set MSB to 1 to avoid collisions with non-hash IDs.
- IS->Class[0] = Hash | (1U << 31);
+ IS->Class[(Cnt + 1) % 2] = Hash | (1U << 31);
}
static void print(const Twine &S) {
@@ -452,15 +453,17 @@ template <class ELFT> void ICF<ELFT>::run() {
// Initially, we use hash values to partition sections.
parallelForEach(Sections, [&](InputSection *S) {
- S->Class[1] = xxHash64(S->data());
+ S->Class[0] = xxHash64(S->data());
});
- parallelForEach(Sections, [&](InputSection *S) {
- if (S->AreRelocsRela)
- combineRelocHashes<ELFT>(S, S->template relas<ELFT>());
- else
- combineRelocHashes<ELFT>(S, S->template rels<ELFT>());
- });
+ for (unsigned Cnt = 0; Cnt != 2; ++Cnt) {
+ parallelForEach(Sections, [&](InputSection *S) {
+ if (S->AreRelocsRela)
+ combineRelocHashes<ELFT>(Cnt, S, S->template relas<ELFT>());
+ else
+ combineRelocHashes<ELFT>(Cnt, S, S->template rels<ELFT>());
+ });
+ }
// From now on, sections in Sections vector are ordered so that sections
// in the same equivalence class are consecutive in the vector.