diff options
author | Anton Blanchard <anton@samba.org> | 2014-10-02 15:44:21 +1000 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2014-10-02 16:04:21 +1000 |
commit | e35735b9a5d8d38d9ffe2f1f0cdcbb0d45c42eff (patch) | |
tree | 643f44507bdd55a621f7b772d877054fa7869f6e | |
parent | 2013add4ce73c93ae2148969a9ec3ecc8b1e26fa (diff) | |
download | blackbird-op-linux-e35735b9a5d8d38d9ffe2f1f0cdcbb0d45c42eff.tar.gz blackbird-op-linux-e35735b9a5d8d38d9ffe2f1f0cdcbb0d45c42eff.zip |
powerpc: Speed up clear_page by unrolling it
Unroll clear_page 8 times. A simple microbenchmark which
allocates and frees a zeroed page:
for (i = 0; i < iterations; i++) {
unsigned long p = __get_free_page(GFP_KERNEL | __GFP_ZERO);
free_page(p);
}
improves 20% on POWER8.
This assumes cacheline sizes won't grow beyond 512 bytes or
page sizes wont drop below 1kB, which is unlikely, but we could
add a runtime check during early init if it makes people nervous.
Michael found that some versions of gcc produce quite bad code
(all multiplies), so we give gcc a hand by using shifts and adds.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-rw-r--r-- | arch/powerpc/include/asm/page_64.h | 42 |
1 files changed, 31 insertions, 11 deletions
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index d0d6afb353d4..d908a46d05c0 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -42,20 +42,40 @@ typedef unsigned long pte_basic_t; -static __inline__ void clear_page(void *addr) +static inline void clear_page(void *addr) { - unsigned long lines, line_size; - - line_size = ppc64_caches.dline_size; - lines = ppc64_caches.dlines_per_page; - - __asm__ __volatile__( + unsigned long iterations; + unsigned long onex, twox, fourx, eightx; + + iterations = ppc64_caches.dlines_per_page / 8; + + /* + * Some verisions of gcc use multiply instructions to + * calculate the offsets so lets give it a hand to + * do better. + */ + onex = ppc64_caches.dline_size; + twox = onex << 1; + fourx = onex << 2; + eightx = onex << 3; + + asm volatile( "mtctr %1 # clear_page\n\ -1: dcbz 0,%0\n\ - add %0,%0,%3\n\ + .balign 16\n\ +1: dcbz 0,%0\n\ + dcbz %3,%0\n\ + dcbz %4,%0\n\ + dcbz %5,%0\n\ + dcbz %6,%0\n\ + dcbz %7,%0\n\ + dcbz %8,%0\n\ + dcbz %9,%0\n\ + add %0,%0,%10\n\ bdnz+ 1b" - : "=r" (addr) - : "r" (lines), "0" (addr), "r" (line_size) + : "=&r" (addr) + : "r" (iterations), "0" (addr), "b" (onex), "b" (twox), + "b" (twox+onex), "b" (fourx), "b" (fourx+onex), + "b" (twox+fourx), "b" (eightx-onex), "r" (eightx) : "ctr", "memory"); } |