5 files changed, 52 insertions, 326 deletions
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
index a29fb75b33ac..95b6d9639fba 100644
--- a/arch/x86_64/lib/bitops.c
+++ b/arch/x86_64/lib/bitops.c
@@ -5,19 +5,23 @@
 #undef find_first_bit
 #undef find_next_bit
 
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+static inline long
+__find_first_zero_bit(const unsigned long * addr, unsigned long size)
 {
 	long d0, d1, d2;
 	long res;
 
+	/*
+	 * Test the size in words, not in bits.  A bit count in the
+	 * range -63..-1 is nonzero but rounds to zero words; if it
+	 * got past the check, scasq would never execute and the je
+	 * after it would act on stale flags.  Nobody should call us
+	 * like that, but find_next_zero_bit() does when offset and
+	 * size fall in the same word and it fails to find a zero
+	 * itself.
+	 */
+	size += 63;
+	size >>= 6;
 	if (!size)
 		return 0;
 	asm volatile(
@@ -30,12 +34,30 @@ inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
 		"  shlq $3,%%rdi\n"
 		"  addq %%rdi,%%rdx"
 		:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
-		:"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL),
-		 [addr] "r" (addr) : "memory");
+		:"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
+		 [addr] "S" (addr) : "memory");
+	/*
+	 * Any register would do for [addr] above; the "S" constraint
+	 * pins it to rsi because GCC otherwise tends to pick rbx,
+	 * which has to be saved, while rsi is free and does not.
+	 */
 	return res;
 }
 
 /**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ */
+long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+	return __find_first_zero_bit (addr, size);
+}
+
+/**
  * find_next_zero_bit - find the first zero bit in a memory region
  * @addr: The address to base the search on
  * @offset: The bitnumber to start searching at
@@ -43,7 +65,7 @@ inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
  */
 long find_next_zero_bit (const unsigned long * addr, long size, long offset)
 {
-	unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
+	const unsigned long * p = addr + (offset >> 6);
 	unsigned long set = 0;
 	unsigned long res, bit = offset&63;
 
@@ -63,8 +85,8 @@ long find_next_zero_bit (const unsigned long * addr, long size, long offset)
 	/*
 	 * No zero yet, search remaining full words for a zero
 	 */
-	res = find_first_zero_bit ((const unsigned long *)p,
-				   size - 64 * (p - (unsigned long *) addr));
+	res = __find_first_zero_bit (p, size - 64 * (p - addr));
+
 	return (offset + set + res);
 }
 
@@ -74,6 +96,19 @@ __find_first_bit(const unsigned long * addr, unsigned long size)
 	long d0, d1;
 	long res;
 
+	/*
+	 * Test the size in words, not in bits.  A bit count in
+	 * the range -63..-1 is nonzero but rounds to zero words;
+	 * if it got past the check, scasq would never execute and
+	 * the jz after it would act on stale flags.  Nobody should
+	 * call us like that, but find_next_bit() does when offset
+	 * and size fall in the same word and it fails to find a
+	 * one itself.
+	 */
+	size += 63;
+	size >>= 6;
+	if (!size)
+		return 0;
 	asm volatile(
 		"   repe; scasq\n"
 		"   jz 1f\n"
@@ -83,8 +118,7 @@ __find_first_bit(const unsigned long * addr, unsigned long size)
 		"   shlq $3,%%rdi\n"
 		"   addq %%rdi,%%rax"
 		:"=a" (res), "=&c" (d0), "=&D" (d1)
-		:"0" (0ULL),
-		 "1" ((size + 63) >> 6), "2" (addr),
+		:"0" (0ULL), "1" (size), "2" (addr),
 		 [addr] "r" (addr) : "memory");
 	return res;
 }
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 30a9da458c15..43d9fa136180 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -5,46 +5,8 @@
 	.globl clear_page
 	.p2align 4
 clear_page:
-	xorl   %eax,%eax
-	movl   $4096/64,%ecx
-	.p2align 4
-.Lloop:
-	decl	%ecx
-#define PUT(x) movq %rax,x*8(%rdi) 
-	movq %rax,(%rdi)
-	PUT(1)
-	PUT(2)
-	PUT(3)
-	PUT(4)
-	PUT(5)
-	PUT(6)
-	PUT(7)
-	leaq	64(%rdi),%rdi
-	jnz	.Lloop
-	nop
-	ret
-clear_page_end:	
-	
-	/* C stepping K8 run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-	
-#include <asm/cpufeature.h>
-	    	
-	.section .altinstructions,"a"
-	.align 8
-	.quad  clear_page
-	.quad  clear_page_c
-	.byte  X86_FEATURE_K8_C
-	.byte  clear_page_end-clear_page	
-	.byte  clear_page_c_end-clear_page_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-clear_page_c:
 	movl $4096/8,%ecx
 	xorl %eax,%eax
 	rep 
 	stosq
 	ret
-clear_page_c_end:
-	.previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index dd3aa47b6bf5..621a19769406 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,94 +8,7 @@
 	.globl copy_page
 	.p2align 4
 copy_page:
-	subq	$3*8,%rsp
-	movq	%rbx,(%rsp)
-	movq	%r12,1*8(%rsp)
-	movq	%r13,2*8(%rsp)
-			
-	movl	$(4096/64)-5,%ecx
-	.p2align 4
-.Loop64:	
-  	dec     %rcx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	prefetcht0 5*64(%rsi)
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-
-	leaq    64 (%rsi), %rsi
-	leaq    64 (%rdi), %rdi
-
-	jnz     .Loop64
-
-	movl	$5,%ecx
-	.p2align 4
-.Loop2:	
-	decl   %ecx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-	
-	leaq	64(%rdi),%rdi			
-	leaq	64(%rsi),%rsi			
-	
-	jnz	.Loop2		
-	
-	movq	(%rsp),%rbx
-	movq	1*8(%rsp),%r12
-	movq	2*8(%rsp),%r13
-	addq	$3*8,%rsp
-	ret
-	
-	/* C stepping K8 run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>		
-		
-	.section .altinstructions,"a"
-	.align 8
-	.quad  copy_page
-	.quad  copy_page_c
-	.byte  X86_FEATURE_K8_C
-	.byte  copy_page_c_end-copy_page_c
-	.byte  copy_page_c_end-copy_page_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
-copy_page_c:
 	movl $4096/8,%ecx
 	rep 
 	movsq 
 	ret
-copy_page_c_end:
-	.previous
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c46494fef5..92dd80544602 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
  * 
  * Output:
  * rax original destination
+ * 
+ * TODO: check best memcpy for PSC
  */	
 
  	.globl __memcpy
@@ -18,95 +20,6 @@
 	.p2align 4
 __memcpy:
 memcpy:		
-	pushq %rbx
-	movq %rdi,%rax
-
-	movl %edx,%ecx
-	shrl $6,%ecx
-	jz .Lhandle_tail
-	
-	.p2align 4
-.Lloop_64:
-	decl %ecx
-	
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
-
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
-
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
-
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
-		
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
-
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	jnz  .Lloop_64
-
-.Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz   .Lhandle_7
-	.p2align 4
-.Lloop_8: 
-	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi) 
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
-	jnz  .Lloop_8
-
-.Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
-	.p2align 4
-.Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi) 
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
-	
-.Lende: 	
-	popq %rbx
-	ret
-.Lfinal:
-	
-	/* C stepping K8 run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-	
-	.section .altinstructions,"a"
-	.align 8
-	.quad  memcpy
-	.quad  memcpy_c
-	.byte  X86_FEATURE_K8_C
-	.byte  .Lfinal-memcpy
-	.byte  memcpy_c_end-memcpy_c	
-	.previous
-
-	.section .altinstr_replacement,"ax"
- /* rdi	destination
-  * rsi source
-  * rdx count
-  */			
-memcpy_c:
 	movq %rdi,%rax
 	movl %edx,%ecx
 	shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
 	rep
 	movsb
 	ret
-memcpy_c_end:
-	.previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c40638640..2aa48f24ed1e 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
 	.p2align 4
 memset:	
 __memset:
-	movq %rdi,%r10
-	movq %rdx,%r11
-
-	/* expand byte value  */
-	movzbl %sil,%ecx
-	movabs $0x0101010101010101,%rax
-	mul    %rcx		/* with rax, clobbers rdx */
-
-	/* align dst */
-	movl  %edi,%r9d		
-	andl  $7,%r9d	
-	jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-	
-	movl %r11d,%ecx
-	shrl $6,%ecx
-	jz	 .Lhandle_tail
-
-	.p2align 4
-.Lloop_64:	
-	decl   %ecx
-	movq  %rax,(%rdi) 
-	movq  %rax,8(%rdi) 
-	movq  %rax,16(%rdi) 
-	movq  %rax,24(%rdi) 
-	movq  %rax,32(%rdi) 
-	movq  %rax,40(%rdi) 
-	movq  %rax,48(%rdi) 
-	movq  %rax,56(%rdi) 
-	leaq  64(%rdi),%rdi
-	jnz    .Lloop_64
-
-	/* Handle tail in loops. The loops should be faster than hard
-	   to predict jump tables. */ 
-	.p2align 4	   
-.Lhandle_tail:
-	movl	%r11d,%ecx
-	andl    $63&(~7),%ecx
-	jz 		.Lhandle_7
-	shrl	$3,%ecx
-	.p2align 4
-.Lloop_8:
-	decl   %ecx
-	movq  %rax,(%rdi)
-	leaq  8(%rdi),%rdi
-	jnz    .Lloop_8
-
-.Lhandle_7:
-	movl	%r11d,%ecx
-	andl	$7,%ecx
-	jz      .Lende
-	.p2align 4
-.Lloop_1:
-	decl    %ecx
-	movb 	%al,(%rdi)
-	leaq	1(%rdi),%rdi
-	jnz     .Lloop_1
-	
-.Lende:	
-	movq	%r10,%rax
-	ret
-
-.Lbad_alignment:
-	cmpq $7,%r11
-	jbe	.Lhandle_7
-	movq %rax,(%rdi)	/* unaligned store */
-	movq $8,%r8			
-	subq %r9,%r8 
-	addq %r8,%rdi
-	subq %r8,%r11
-	jmp .Lafter_bad_alignment
-
-	/* C stepping K8 run faster using the string instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>	
-		
-	.section .altinstructions,"a"
-	.align 8
-	.quad  memset
-	.quad  memset_c
-	.byte  X86_FEATURE_K8_C
-	.byte  memset_c_end-memset_c
-	.byte  memset_c_end-memset_c
-	.previous
-
-	.section .altinstr_replacement,"ax"
- /* rdi	destination
-  * rsi value
-  * rdx count
-  */			
-memset_c:	
 	movq %rdi,%r9
 	movl %edx,%r8d
 	andl $7,%r8d		
@@ -121,5 +29,3 @@ memset_c:
 	stosb
 	movq %r9,%rax
 	ret
-memset_c_end:
-	.previous