summaryrefslogtreecommitdiffstats
path: root/tools/arch/x86/lib
diff options
context:
space:
mode:
authorArnaldo Carvalho de Melo <acme@redhat.com>2016-07-11 12:36:41 -0300
committerArnaldo Carvalho de Melo <acme@redhat.com>2016-07-12 15:20:32 -0300
commit7d7d1bf1d1dabe435ef50efb051724b8664749cb (patch)
tree25b24227fb2d78e03262785ba893c1fb21306d1c /tools/arch/x86/lib
parentc4b6014e8bb0c8d47fe5c71ebc604f31091e5d3f (diff)
downloadtalos-obmc-linux-7d7d1bf1d1dabe435ef50efb051724b8664749cb.tar.gz
talos-obmc-linux-7d7d1bf1d1dabe435ef50efb051724b8664749cb.zip
perf bench: Copy kernel files needed to build mem{cpy,set} x86_64 benchmarks
We can't access kernel files directly from tools/, so copy the required bits, and make sure that we detect when the original files, in the kernel, gets modified. Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: David Ahern <dsahern@gmail.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/n/tip-z7e76274ch5j4nugv048qacb@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/arch/x86/lib')
-rw-r--r--tools/arch/x86/lib/memcpy_64.S297
-rw-r--r--tools/arch/x86/lib/memset_64.S138
2 files changed, 435 insertions, 0 deletions
diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S
new file mode 100644
index 000000000000..2ec0b0abbfaa
--- /dev/null
+++ b/tools/arch/x86/lib/memcpy_64.S
@@ -0,0 +1,297 @@
+/* Copyright 2002 Andi Kleen */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+
+/*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
+ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
+ */
+
+.weak memcpy
+
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * rax original destination
+ */
+ENTRY(__memcpy)
+ENTRY(memcpy)
+ ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memcpy_erms", X86_FEATURE_ERMS
+
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ shrq $3, %rcx
+ andl $7, %edx
+ rep movsq
+ movl %edx, %ecx
+ rep movsb
+ ret
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+
+/*
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
+ */
+ENTRY(memcpy_erms)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ rep movsb
+ ret
+ENDPROC(memcpy_erms)
+
+ENTRY(memcpy_orig)
+ movq %rdi, %rax
+
+ cmpq $0x20, %rdx
+ jb .Lhandle_tail
+
+ /*
+ * We check whether memory false dependence could occur,
+ * then jump to corresponding copy mode.
+ */
+ cmp %dil, %sil
+ jl .Lcopy_backward
+ subq $0x20, %rdx
+.Lcopy_forward_loop:
+ subq $0x20, %rdx
+
+ /*
+ * Move in blocks of 4x8 bytes:
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq 2*8(%rsi), %r10
+ movq 3*8(%rsi), %r11
+ leaq 4*8(%rsi), %rsi
+
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, 2*8(%rdi)
+ movq %r11, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae .Lcopy_forward_loop
+ addl $0x20, %edx
+ jmp .Lhandle_tail
+
+.Lcopy_backward:
+ /*
+ * Calculate copy position to tail.
+ */
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * At most 3 ALU operations in one cycle,
+ * so append NOPS in the same 16 bytes trunk.
+ */
+ .p2align 4
+.Lcopy_backward_loop:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r8
+ movq -2*8(%rsi), %r9
+ movq -3*8(%rsi), %r10
+ movq -4*8(%rsi), %r11
+ leaq -4*8(%rsi), %rsi
+ movq %r8, -1*8(%rdi)
+ movq %r9, -2*8(%rdi)
+ movq %r10, -3*8(%rdi)
+ movq %r11, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae .Lcopy_backward_loop
+
+ /*
+ * Calculate copy position to head.
+ */
+ addl $0x20, %edx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
+.Lhandle_tail:
+ cmpl $16, %edx
+ jb .Lless_16bytes
+
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq -2*8(%rsi, %rdx), %r10
+ movq -1*8(%rsi, %rdx), %r11
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, -2*8(%rdi, %rdx)
+ movq %r11, -1*8(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_16bytes:
+ cmpl $8, %edx
+ jb .Lless_8bytes
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq -1*8(%rsi, %rdx), %r9
+ movq %r8, 0*8(%rdi)
+ movq %r9, -1*8(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_8bytes:
+ cmpl $4, %edx
+ jb .Lless_3bytes
+
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %ecx
+ movl -4(%rsi, %rdx), %r8d
+ movl %ecx, (%rdi)
+ movl %r8d, -4(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_3bytes:
+ subl $1, %edx
+ jb .Lend
+ /*
+ * Move data from 1 bytes to 3 bytes.
+ */
+ movzbl (%rsi), %ecx
+ jz .Lstore_1byte
+ movzbq 1(%rsi), %r8
+ movzbq (%rsi, %rdx), %r9
+ movb %r8b, 1(%rdi)
+ movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+ movb %cl, (%rdi)
+
+.Lend:
+ retq
+ENDPROC(memcpy_orig)
+
+#ifndef CONFIG_UML
+/*
+ * memcpy_mcsafe - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(memcpy_mcsafe)
+ cmpl $8, %edx
+ /* Less than 8 bytes? Go to byte copy loop */
+ jb .L_no_whole_words
+
+ /* Check for bad alignment of source */
+ testl $7, %esi
+ /* Already aligned */
+ jz .L_8byte_aligned
+
+ /* Copy one byte at a time until source is 8-byte aligned */
+ movl %esi, %ecx
+ andl $7, %ecx
+ subl $8, %ecx
+ negl %ecx
+ subl %ecx, %edx
+.L_copy_leading_bytes:
+ movb (%rsi), %al
+ movb %al, (%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_copy_leading_bytes
+
+.L_8byte_aligned:
+ /* Figure out how many whole cache lines (64-bytes) to copy */
+ movl %edx, %ecx
+ andl $63, %edx
+ shrl $6, %ecx
+ jz .L_no_whole_cache_lines
+
+ /* Loop copying whole cache lines */
+.L_cache_w0: movq (%rsi), %r8
+.L_cache_w1: movq 1*8(%rsi), %r9
+.L_cache_w2: movq 2*8(%rsi), %r10
+.L_cache_w3: movq 3*8(%rsi), %r11
+ movq %r8, (%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, 2*8(%rdi)
+ movq %r11, 3*8(%rdi)
+.L_cache_w4: movq 4*8(%rsi), %r8
+.L_cache_w5: movq 5*8(%rsi), %r9
+.L_cache_w6: movq 6*8(%rsi), %r10
+.L_cache_w7: movq 7*8(%rsi), %r11
+ movq %r8, 4*8(%rdi)
+ movq %r9, 5*8(%rdi)
+ movq %r10, 6*8(%rdi)
+ movq %r11, 7*8(%rdi)
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
+ decl %ecx
+ jnz .L_cache_w0
+
+ /* Are there any trailing 8-byte words? */
+.L_no_whole_cache_lines:
+ movl %edx, %ecx
+ andl $7, %edx
+ shrl $3, %ecx
+ jz .L_no_whole_words
+
+ /* Copy trailing words */
+.L_copy_trailing_words:
+ movq (%rsi), %r8
+ mov %r8, (%rdi)
+ leaq 8(%rsi), %rsi
+ leaq 8(%rdi), %rdi
+ decl %ecx
+ jnz .L_copy_trailing_words
+
+ /* Any trailing bytes? */
+.L_no_whole_words:
+ andl %edx, %edx
+ jz .L_done_memcpy_trap
+
+ /* Copy trailing bytes */
+ movl %edx, %ecx
+.L_copy_trailing_bytes:
+ movb (%rsi), %al
+ movb %al, (%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_copy_trailing_bytes
+
+ /* Copy successful. Return zero */
+.L_done_memcpy_trap:
+ xorq %rax, %rax
+ ret
+ENDPROC(memcpy_mcsafe)
+
+ .section .fixup, "ax"
+ /* Return -EFAULT for any failure */
+.L_memcpy_mcsafe_fail:
+ mov $-EFAULT, %rax
+ ret
+
+ .previous
+
+ _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
+ _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
+#endif
diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S
new file mode 100644
index 000000000000..e1229ecd2a82
--- /dev/null
+++ b/tools/arch/x86/lib/memset_64.S
@@ -0,0 +1,138 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+
+.weak memset
+
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the original function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ENTRY(memset)
+ENTRY(__memset)
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+ * to use it when possible. If not available, use fast string instructions.
+ *
+ * Otherwise, use original memset function.
+ */
+ ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memset_erms", X86_FEATURE_ERMS
+
+ movq %rdi,%r9
+ movq %rdx,%rcx
+ andl $7,%edx
+ shrq $3,%rcx
+ /* expand byte value */
+ movzbl %sil,%esi
+ movabs $0x0101010101010101,%rax
+ imulq %rsi,%rax
+ rep stosq
+ movl %edx,%ecx
+ rep stosb
+ movq %r9,%rax
+ ret
+ENDPROC(memset)
+ENDPROC(__memset)
+
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ENTRY(memset_erms)
+ movq %rdi,%r9
+ movb %sil,%al
+ movq %rdx,%rcx
+ rep stosb
+ movq %r9,%rax
+ ret
+ENDPROC(memset_erms)
+
+ENTRY(memset_orig)
+ movq %rdi,%r10
+
+ /* expand byte value */
+ movzbl %sil,%ecx
+ movabs $0x0101010101010101,%rax
+ imulq %rcx,%rax
+
+ /* align dst */
+ movl %edi,%r9d
+ andl $7,%r9d
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+
+ movq %rdx,%rcx
+ shrq $6,%rcx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decq %rcx
+ movq %rax,(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+ movq %rax,24(%rdi)
+ movq %rax,32(%rdi)
+ movq %rax,40(%rdi)
+ movq %rax,48(%rdi)
+ movq %rax,56(%rdi)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+ /* Handle tail in loops. The loops should be faster than hard
+ to predict jump tables. */
+ .p2align 4
+.Lhandle_tail:
+ movl %edx,%ecx
+ andl $63&(~7),%ecx
+ jz .Lhandle_7
+ shrl $3,%ecx
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq %rax,(%rdi)
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ andl $7,%edx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ decl %edx
+ movb %al,(%rdi)
+ leaq 1(%rdi),%rdi
+ jnz .Lloop_1
+
+.Lende:
+ movq %r10,%rax
+ ret
+
+.Lbad_alignment:
+ cmpq $7,%rdx
+ jbe .Lhandle_7
+ movq %rax,(%rdi) /* unaligned store */
+ movq $8,%r8
+ subq %r9,%r8
+ addq %r8,%rdi
+ subq %r8,%rdx
+ jmp .Lafter_bad_alignment
+.Lfinal:
+ENDPROC(memset_orig)
OpenPOWER on IntegriCloud