From 023ef184fff6ac2e7cba345708f35536a2a419cb Mon Sep 17 00:00:00 2001 From: Stuart Menefy Date: Fri, 28 Sep 2007 12:36:35 +0900 Subject: sh: __copy_user() optimizations for small copies. This implements a fast-path for small (less than 12 bytes) copies, with the existing path treated as the slow-path and left as the default behaviour for all other copy sizes. Signed-off-by: Stuart Menefy Signed-off-by: Paul Mundt --- arch/sh/mm/copy_page.S | 169 +++++++++++++++++++++++++++++++------------------ 1 file changed, 108 insertions(+), 61 deletions(-) (limited to 'arch/sh/mm') diff --git a/arch/sh/mm/copy_page.S b/arch/sh/mm/copy_page.S index ae039f2da162..a81dbdb05596 100644 --- a/arch/sh/mm/copy_page.S +++ b/arch/sh/mm/copy_page.S @@ -141,47 +141,38 @@ ENTRY(__copy_user_page) .long 9999b, 6000f ; \ .previous ENTRY(__copy_user) - tst r6,r6 ! Check explicitly for zero - bf 1f - rts - mov #0,r0 ! normal return -1: - mov.l r10,@-r15 - mov.l r9,@-r15 - mov.l r8,@-r15 + ! Check if small number of bytes + mov #11,r0 mov r4,r3 - add r6,r3 ! last destination address - mov #12,r0 ! Check if small number of bytes - cmp/gt r0,r6 - bt 2f - bra .L_cleanup_loop - nop -2: - neg r5,r0 ! Calculate bytes needed to align source + cmp/gt r0,r6 ! r6 (len) > r0 (11) + bf/s .L_cleanup_loop_no_pop + add r6,r3 ! last destination address + + ! Calculate bytes needed to align to src + mov.l r11,@-r15 + neg r5,r0 + mov.l r10,@-r15 add #4,r0 + mov.l r9,@-r15 and #3,r0 + mov.l r8,@-r15 tst r0,r0 - bt .L_jump - mov r0,r1 + bt 2f -.L_loop1: - ! Copy bytes to align source -EX( mov.b @r5+,r0 ) - dt r1 -EX( mov.b r0,@r4 ) +1: + ! Copy bytes to long word align src +EX( mov.b @r5+,r1 ) + dt r0 add #-1,r6 - bf/s .L_loop1 +EX( mov.b r1,@r4 ) + bf/s 1b add #1,r4 -.L_jump: - mov r6,r2 ! Calculate number of longwords to copy + ! Jump to appropriate routine depending on dest +2: mov #3,r1 + mov r6, r2 + and r4,r1 shlr2 r2 - tst r2,r2 - bt .L_cleanup - - mov r4,r0 ! Jump to appropriate routine - and #3,r0 - mov r0,r1 shll2 r1 mova .L_jump_tbl,r0 mov.l @(r0,r1),r1 @@ -195,43 +186,97 @@ EX( mov.b r0,@r4 ) .long .L_dest10 .long .L_dest11 +/* + * Come here if there are less than 12 bytes to copy + * + * Keep the branch target close, so the bf/s callee doesn't overflow + * and result in a more expensive branch being inserted. This is the + * fast-path for small copies, the jump via the jump table will hit the + * default slow-path cleanup. -PFM. + */ +.L_cleanup_loop_no_pop: + tst r6,r6 ! Check explicitly for zero + bt 1f + +2: +EX( mov.b @r5+,r0 ) + dt r6 +EX( mov.b r0,@r4 ) + bf/s 2b + add #1,r4 + +1: mov #0,r0 ! normal return +5000: + +# Exception handler: +.section .fixup, "ax" +6000: + mov.l 8000f,r1 + mov r3,r0 + jmp @r1 + sub r4,r0 + .align 2 +8000: .long 5000b + +.previous + rts + nop + ! Destination = 00 .L_dest00: - mov r2,r7 - shlr2 r7 - shlr r7 - tst r7,r7 - mov #7,r0 - bt/s 1f - and r0,r2 - .align 2 + ! Skip the large copy for small transfers + mov #(32+32-4), r0 + cmp/gt r6, r0 ! r0 (60) > r6 (len) + bt 1f + + ! Align dest to a 32 byte boundary + neg r4,r0 + add #0x20, r0 + and #0x1f, r0 + tst r0, r0 + bt 2f + + sub r0, r6 + shlr2 r0 +3: +EX( mov.l @r5+,r1 ) + dt r0 +EX( mov.l r1,@r4 ) + bf/s 3b + add #4,r4 + 2: EX( mov.l @r5+,r0 ) +EX( mov.l @r5+,r1 ) +EX( mov.l @r5+,r2 ) +EX( mov.l @r5+,r7 ) EX( mov.l @r5+,r8 ) EX( mov.l @r5+,r9 ) EX( mov.l @r5+,r10 ) -EX( mov.l r0,@r4 ) -EX( mov.l r8,@(4,r4) ) -EX( mov.l r9,@(8,r4) ) -EX( mov.l r10,@(12,r4) ) -EX( mov.l @r5+,r0 ) -EX( mov.l @r5+,r8 ) -EX( mov.l @r5+,r9 ) -EX( mov.l @r5+,r10 ) - dt r7 -EX( mov.l r0,@(16,r4) ) -EX( mov.l r8,@(20,r4) ) -EX( mov.l r9,@(24,r4) ) -EX( mov.l r10,@(28,r4) ) +EX( mov.l @r5+,r11 ) +EX( movca.l r0,@r4 ) + add #-32, r6 +EX( mov.l r1,@(4,r4) ) + mov #32, r0 +EX( mov.l r2,@(8,r4) ) + cmp/gt r6, r0 ! r0 (32) > r6 (len) +EX( mov.l r7,@(12,r4) ) +EX( mov.l r8,@(16,r4) ) +EX( mov.l r9,@(20,r4) ) +EX( mov.l r10,@(24,r4) ) +EX( mov.l r11,@(28,r4) ) bf/s 2b add #32,r4 - tst r2,r2 + +1: mov r6, r0 + shlr2 r0 + tst r0, r0 bt .L_cleanup 1: -EX( mov.l @r5+,r0 ) - dt r2 -EX( mov.l r0,@r4 ) +EX( mov.l @r5+,r1 ) + dt r0 +EX( mov.l r1,@r4 ) bf/s 1b add #4,r4 @@ -250,7 +295,7 @@ EX( mov.l r0,@r4 ) and r0,r2 2: dt r7 -#ifdef __LITTLE_ENDIAN__ +#ifdef CONFIG_CPU_LITTLE_ENDIAN EX( mov.l @r5+,r0 ) EX( mov.l @r5+,r1 ) EX( mov.l @r5+,r8 ) @@ -320,7 +365,7 @@ EX( mov.w r0,@(2,r4) ) 1: ! Read longword, write two words per iteration EX( mov.l @r5+,r0 ) dt r2 -#ifdef __LITTLE_ENDIAN__ +#ifdef CONFIG_CPU_LITTLE_ENDIAN EX( mov.w r0,@r4 ) shlr16 r0 EX( mov.w r0,@(2,r4) ) @@ -342,7 +387,7 @@ EX( mov.w r0,@r4 ) ! Read longword, write byte, word, byte per iteration EX( mov.l @r5+,r0 ) dt r2 -#ifdef __LITTLE_ENDIAN__ +#ifdef CONFIG_CPU_LITTLE_ENDIAN EX( mov.b r0,@r4 ) shlr8 r0 add #1,r4 @@ -379,6 +424,7 @@ EX( mov.b r0,@r4 ) .L_exit: mov #0,r0 ! normal return + 5000: # Exception handler: @@ -394,5 +440,6 @@ EX( mov.b r0,@r4 ) .previous mov.l @r15+,r8 mov.l @r15+,r9 + mov.l @r15+,r10 rts - mov.l @r15+,r10 + mov.l @r15+,r11 -- cgit v1.2.1