diff options
Diffstat (limited to 'arch/ppc/kernel/vector.S')
-rw-r--r-- | arch/ppc/kernel/vector.S | 217 |
1 files changed, 217 insertions, 0 deletions
diff --git a/arch/ppc/kernel/vector.S b/arch/ppc/kernel/vector.S new file mode 100644 index 000000000000..82a21346bf80 --- /dev/null +++ b/arch/ppc/kernel/vector.S @@ -0,0 +1,217 @@ +#include <asm/ppc_asm.h> +#include <asm/processor.h> + +/* + * The routines below are in assembler so we can closely control the + * usage of floating-point registers. These routines must be called + * with preempt disabled. + */ + .data +fpzero: + .long 0 +fpone: + .long 0x3f800000 /* 1.0 in single-precision FP */ +fphalf: + .long 0x3f000000 /* 0.5 in single-precision FP */ + + .text +/* + * Internal routine to enable floating point and set FPSCR to 0. + * Don't call it from C; it doesn't use the normal calling convention. + */ +fpenable: + mfmsr r10 + ori r11,r10,MSR_FP + mtmsr r11 + isync + stfd fr0,24(r1) + stfd fr1,16(r1) + stfd fr31,8(r1) + lis r11,fpzero@ha + mffs fr31 + lfs fr1,fpzero@l(r11) + mtfsf 0xff,fr1 + blr + +fpdisable: + mtfsf 0xff,fr31 + lfd fr31,8(r1) + lfd fr1,16(r1) + lfd fr0,24(r1) + mtmsr r10 + isync + blr + +/* + * Vector add, floating point. + */ + .globl vaddfp +vaddfp: + stwu r1,-32(r1) + mflr r0 + stw r0,36(r1) + bl fpenable + li r0,4 + mtctr r0 + li r6,0 +1: lfsx fr0,r4,r6 + lfsx fr1,r5,r6 + fadds fr0,fr0,fr1 + stfsx fr0,r3,r6 + addi r6,r6,4 + bdnz 1b + bl fpdisable + lwz r0,36(r1) + mtlr r0 + addi r1,r1,32 + blr + +/* + * Vector subtract, floating point. + */ + .globl vsubfp +vsubfp: + stwu r1,-32(r1) + mflr r0 + stw r0,36(r1) + bl fpenable + li r0,4 + mtctr r0 + li r6,0 +1: lfsx fr0,r4,r6 + lfsx fr1,r5,r6 + fsubs fr0,fr0,fr1 + stfsx fr0,r3,r6 + addi r6,r6,4 + bdnz 1b + bl fpdisable + lwz r0,36(r1) + mtlr r0 + addi r1,r1,32 + blr + +/* + * Vector multiply and add, floating point. + */ + .globl vmaddfp +vmaddfp: + stwu r1,-48(r1) + mflr r0 + stw r0,52(r1) + bl fpenable + stfd fr2,32(r1) + li r0,4 + mtctr r0 + li r7,0 +1: lfsx fr0,r4,r7 + lfsx fr1,r5,r7 + lfsx fr2,r6,r7 + fmadds fr0,fr0,fr2,fr1 + stfsx fr0,r3,r7 + addi r7,r7,4 + bdnz 1b + lfd fr2,32(r1) + bl fpdisable + lwz r0,52(r1) + mtlr r0 + addi r1,r1,48 + blr + +/* + * Vector negative multiply and subtract, floating point. + */ + .globl vnmsubfp +vnmsubfp: + stwu r1,-48(r1) + mflr r0 + stw r0,52(r1) + bl fpenable + stfd fr2,32(r1) + li r0,4 + mtctr r0 + li r7,0 +1: lfsx fr0,r4,r7 + lfsx fr1,r5,r7 + lfsx fr2,r6,r7 + fnmsubs fr0,fr0,fr2,fr1 + stfsx fr0,r3,r7 + addi r7,r7,4 + bdnz 1b + lfd fr2,32(r1) + bl fpdisable + lwz r0,52(r1) + mtlr r0 + addi r1,r1,48 + blr + +/* + * Vector reciprocal estimate. We just compute 1.0/x. + * r3 -> destination, r4 -> source. + */ + .globl vrefp +vrefp: + stwu r1,-32(r1) + mflr r0 + stw r0,36(r1) + bl fpenable + lis r9,fpone@ha + li r0,4 + lfs fr1,fpone@l(r9) + mtctr r0 + li r6,0 +1: lfsx fr0,r4,r6 + fdivs fr0,fr1,fr0 + stfsx fr0,r3,r6 + addi r6,r6,4 + bdnz 1b + bl fpdisable + lwz r0,36(r1) + mtlr r0 + addi r1,r1,32 + blr + +/* + * Vector reciprocal square-root estimate, floating point. + * We use the frsqrte instruction for the initial estimate followed + * by 2 iterations of Newton-Raphson to get sufficient accuracy. + * r3 -> destination, r4 -> source. + */ + .globl vrsqrtefp +vrsqrtefp: + stwu r1,-48(r1) + mflr r0 + stw r0,52(r1) + bl fpenable + stfd fr2,32(r1) + stfd fr3,40(r1) + stfd fr4,48(r1) + stfd fr5,56(r1) + lis r9,fpone@ha + lis r8,fphalf@ha + li r0,4 + lfs fr4,fpone@l(r9) + lfs fr5,fphalf@l(r8) + mtctr r0 + li r6,0 +1: lfsx fr0,r4,r6 + frsqrte fr1,fr0 /* r = frsqrte(s) */ + fmuls fr3,fr1,fr0 /* r * s */ + fmuls fr2,fr1,fr5 /* r * 0.5 */ + fnmsubs fr3,fr1,fr3,fr4 /* 1 - s * r * r */ + fmadds fr1,fr2,fr3,fr1 /* r = r + 0.5 * r * (1 - s * r * r) */ + fmuls fr3,fr1,fr0 /* r * s */ + fmuls fr2,fr1,fr5 /* r * 0.5 */ + fnmsubs fr3,fr1,fr3,fr4 /* 1 - s * r * r */ + fmadds fr1,fr2,fr3,fr1 /* r = r + 0.5 * r * (1 - s * r * r) */ + stfsx fr1,r3,r6 + addi r6,r6,4 + bdnz 1b + lfd fr5,56(r1) + lfd fr4,48(r1) + lfd fr3,40(r1) + lfd fr2,32(r1) + bl fpdisable + lwz r0,36(r1) + mtlr r0 + addi r1,r1,32 + blr |