1 files changed, 255 insertions, 0 deletions
diff --git a/src/ssx/ppc32/div64.S b/src/ssx/ppc32/div64.S
new file mode 100755
index 0000000..04ee008
--- /dev/null
+++ b/src/ssx/ppc32/div64.S
@@ -0,0 +1,255 @@
+// $Id: div64.S,v 1.1.1.1 2013/12/11 21:03:25 bcbrock Exp $
+// $Source: /afs/awd/projects/eclipz/KnowledgeBase/.cvsroot/eclipz/chips/p8/working/procedures/ssx/ppc32/div64.S,v $
+//-----------------------------------------------------------------------------
+// *! (C) Copyright International Business Machines Corp. 2013
+// *! All Rights Reserved -- Property of IBM
+// *! *** IBM Confidential ***
+//-----------------------------------------------------------------------------
+                
+/// \file div64.S
+/// \brief Unsigned 64/64 bit division
+///
+/// This is IBM code, originally part of OS Open.  The code has been slightly
+/// modified from its original form, both to be compatible with SSX and to
+/// change the function prototype slightly.
+///
+/// The code was provided by Matt Tyrlik in Raleigh.
+
+/* @#START#@
+**
+**      PSCN (Power Service and Control Network)
+**      Cage Controller OS Open Code
+**
+**      (C) Copyright International Business Machines Corporation 2002
+**      All Rights Reserved
+**      Licensed Material  -  Program Property of I B M
+**      Refer to copyright instructions: Form G120-2083
+**
+** Module:
+**      div64.s
+**
+** Description:
+**      Divide 64 bit unsigned values on 32 bit CPU
+**      div64(uint64_t dividend, uint64_t divisor, 
+**            uint64_t *quotient, uint64_t *remainder)
+**      
+**      Original source from:
+**      "The PowerPC Compiler Writer's Guide", pp62-65 by
+**      Steve Hoxey, Faraydon Karim, Bill Hay, Hank Warray,
+**      published by Warthman Associates, 240 Hamilton Avenue,
+**      Palo Alto, CA 94301, USA, 1996 for IBM.
+**      ISBN 0-9649654-0-2.
+**
+**      This version checks for divisor equal to zero.
+**
+** Environment:
+**      OS Open (XCOFF)
+**
+** Linkage:
+**      AIX 4.3.3
+**
+** @author
+**      Thomas Richter
+**
+** History:
+** Date      Author      Description
+** -----------------------------------------------------------------------------
+** 23-Sep-02 Richter     Created
+**
+** @#END#@*/
+
+        .nolist
+#include "ssx.h"
+        .list
+        
+        .set    r0, 0           /* Register names are plain numbers: rN == N. */
+        .set    r1, 1           /* NB: an rA operand of r0 (i.e. 0) in addi means the literal value 0, not the contents of GPR0. */
+        .set    r2, 2
+        .set    r3, 3
+        .set    r4, 4
+        .set    r5, 5
+        .set    r6, 6
+        .set    r7, 7
+        .set    r8, 8
+        .set    r9, 9
+        .set    r10, 10
+        .set    r11, 11
+        .set    r12, 12
+
+        .global_function __ppc32_udiv64 /* Not defined in this file; presumably a macro from ssx.h that exports the symbol -- confirm. */
+
+        /*
+        ** Code comment notation: 
+        ** msw = most-significant (high-order) word, i.e. bits 0..31 
+        ** lsw = least-significant (low-order) word, i.e. bits 32..63
+        ** LZ = Leading Zeroes 
+        ** SD = Significant Digits 
+        ** 
+        ** R3:R4 = Input parameter, dividend.
+        ** R5:R6 = Input parameter, divisor.
+        ** R7 = Output parameter, pointer to quotient.
+        ** R8 = Output parameter, pointer to remainder.
+        **
+        ** Pointer arguments point to a uint64_t.
+        **
+        ** Division is achieved using a shift/rotate/subtract algorithm
+        ** described below.
+        ** The registers are used as follows:
+        ** R3:R4 = dividend (upper 32bits:lower 32bits)
+        ** R5:R6 = divisor (upper 32bits:lower 32bits)
+        ** 
+        ** R7:R8 = temporary 64 bit register (upper 32bits:lower 32bits);
+        ** the output pointers are first copied to R11 and R12.
+        **
+        ** Here is the description from the book. The dividend is placed
+        ** in the low order part of a 4 (32bit) register sequence named
+        ** tmp-high:tmp-low:dividend-high:dividend-low or tmp:dvd for short.
+        ** 
+        ** Each iteration includes the following steps:
+        ** 1. Shift tmp:dvd by one bit to the left.
+        ** 2. Subtract the divisor from tmp. This is a 64 bit operation.
+        ** 3. If result is greater than or equal to zero, place result in tmp
+        **    and set the low order bit of dividend
+        ** 4. If result is negative, do not modify tmp and
+        **    clear the low order bit of dividend
+        ** 5. If the number of iterations is less than the width of the
+        **    dividend, goto step 1.
+        **
+        ** Now the algorithm can be improved by reducing the number of
+        ** iterations to be executed.
+        ** 1. Calculate the leading zeroes of the dividend.
+        ** 2. Calculate the leading zeroes of the divisor.
+        ** 3. Calculate the significant digits of the dividend (SD = 64 - LZ).
+        ** 4. Calculate the significant digits of the divisor.
+        **
+        ** Initial tmp := dvd >> (dvd.SD - dvs.SD)
+        ** Initial dvd := dvd << (dvd.LZ + dvs.SD)
+        ** Loops: dvd.SD - dvs.SD.
+        **
+        ** Warning: Special care must be taken if dvd.LZ == dvs.LZ. The code
+        ** below does so by reducing dvs.SD by one. This leads to the loop
+        ** being executed 1 more time than really necessary, but avoids
+        ** checking for the case when dvd.LZ == dvs.LZ.
+        ** Comparing dvd.LZ with dvs.LZ only compares the number of leading
+        ** zeroes; it does not tell whether the dividend is really greater
+        ** than the divisor.
+        ** Consider 16/17, both have an LZ value of 59. The code sets dvs.LZ
+        ** to 60. This results in dvs.SD being 4, thus one iteration, after
+        ** which tmp is the remainder 16.
+        */
+
+__ppc32_udiv64: // SSX
+        /* In: r3:r4 = dividend, r5:r6 = divisor, r7 -> quotient, r8 -> remainder. Copy the pointers to r11/r12 because r7:r8 is reused as tmp below. */
+        ori     r12, r8, 0              /* r12 = r8 (OR with 0 is a register move): remainder address */
+        ori     r11, r7, 0              /* r11 = r7: quotient address */
+
+        /* R0 = dvd.LZ: count the leading 0s of the 64 bit dividend (0..64) */
+        cmpwi   cr0, r3, 0      /* dvd.msw == 0? */
+        cntlzw  r0, r3          /* R0 = dvd.msw.LZ */
+        cntlzw  r9, r4          /* R9 = dvd.lsw.LZ */
+        bne     cr0, lab1       /* if(dvd.msw != 0) dvd.LZ = dvd.msw.LZ */
+        addi    r0, r9, 32      /* else dvd.LZ = dvd.lsw.LZ + 32 */
+lab1: 
+        /* R9 = dvs.LZ: count the leading 0s of the divisor; a zero divisor exits via lab10 */
+        cmpwi   cr0, r5, 0      /* dvs.msw == 0? */
+        cmpwi   cr1, r6, 0      /* dvs.lsw == 0? */
+        cntlzw  r9, r5          /* R9 = dvs.msw.LZ */
+        cntlzw  r10, r6         /* R10 = dvs.lsw.LZ */
+        bne     cr0, lab2       /* if(dvs.msw != 0) dvs.LZ = dvs.msw.LZ */
+        beq     cr1, lab10      /* dvs.msw == 0 and dvs.lsw == 0: divide by zero */
+        addi    r9, r10, 32     /* dvs.LZ = dvs.lsw.LZ + 32 */
+
+lab2:
+        /* Determine shift amounts to minimize the number of iterations  */
+        cmpw    cr0, r0, r9     /* Compare dvd.LZ to dvs.LZ */
+        subfic  r10, r0, 64     /* R10 = dvd.SD = 64 - dvd.LZ */
+        bgt     cr0, lab9       /* if(dvd.LZ > dvs.LZ) then dvs > dvd: quotient = 0 */
+        addi    r9, r9, 1       /* See comment above. ++dvs.LZ (or --dvs.SD) */
+        subfic  r9, r9, 64      /* R9 = dvs.SD (already reduced by one) */
+        add     r0, r0, r9      /* (dvd.LZ + dvs.SD) = left shift of dvd for */
+                                /* initial dvd */
+        subf    r9, r9, r10     /* (dvd.SD - dvs.SD) = right shift of dvd for */
+                                /* initial tmp */
+        mtctr   r9              /* Number of iterations = dvd.SD - dvs.SD (>= 1, since dvd.LZ <= dvs.LZ here) */
+        
+        /* R7:R8 = R3:R4 >> R9 (64 bit logical shift built from 32 bit shifts) */
+        cmpwi   cr0, r9, 32     /* compare R9 to 32 */
+        addi    r7, r9, -32     /* R7 = R9 - 32, the shift count used when R9 >= 32 */
+        blt     cr0, lab3       /* if(R9 < 32) jump to lab3 */
+        srw     r8, r3, r7      /* tmp.lsw = dvd.msw >> (R9 - 32) */
+        addi    r7, r0, 0       /* tmp.msw = 0 (rA field 0 in addi is the literal 0, not GPR0) */
+        b       lab4 
+
+lab3: 
+        srw     r8, r4, r9      /* R8 = dvd.lsw >> R9 */
+        subfic  r7, r9, 32      /* R7 = 32 - R9 */
+        slw     r7,r3,r7        /* R7 = dvd.msw << (32 - R9) */
+        or      r8, r8,r7       /* tmp.lsw = R8 | R7 */
+        srw     r7,r3,r9        /* tmp.msw = dvd.msw >> R9 */
+lab4:
+        /* R3:R4 = R3:R4 << R0 (64 bit shift built from 32 bit shifts) */
+        cmpwi   cr0, r0, 32     /* Compare R0 to 32 */
+        addic   r9, r0, -32     /* R9 = R0 - 32; addic (unlike addi) reads GPR0 as a register. CA is re-initialized at lab6 */
+        blt     cr0, lab5       /* if(R0 < 32) jump to lab5 */
+        slw     r3, r4, r9      /* dvd.msw = dvd.lsw << (R0 - 32) */
+        addi    r4, r0, 0       /* dvd.lsw = 0 (addi with rA field 0 loads the immediate) */
+        b       lab6 
+
+lab5: 
+        slw     r3, r3, r0      /* r3 = dvd.msw << r0 */
+        subfic  r9, r0, 32      /* r9 = 32 - r0 */
+        srw     r9, r4, r9      /* r9 = dvd.lsw >> (32 - r0) */
+        or      r3, r3, r9      /* dvd.msw = r3 | r9 */
+        slw     r4, r4, r0      /* dvd.lsw = dvd.lsw << r0 */
+lab6: 
+        /* Restoring division shift and subtract loop; runs CTR times */
+        addi    r10, r0, -1     /* r10 = -1 (immediate load); used at the end of the loop body to force CA = 1 */
+        addic   r7, r7, 0       /* Clear carry bit before loop starts (adding 0 never carries) */
+lab7: 
+        /*
+        ** tmp:dvd is considered one large register; each word is shifted left
+        ** 1 bit by adding it to itself. adde adds in the carry of the previous
+        ** word and creates a new one; CA entering dvd.lsw is the last quotient bit.
+        */
+        adde    r4, r4, r4      /* Shift dvd.lsw left 1 bit */
+        adde    r3, r3, r3      /* Shift dvd.msw to left 1 bit */
+        adde    r8, r8, r8      /* Shift tmp.lsw to left 1 bit */
+        adde    r7, r7, r7      /* Shift tmp.msw to left 1 bit */
+        subfc   r0, r6, r8      /* R0 = tmp.lsw - dvs.lsw, CA = 1 if no borrow */
+        subfe.  r9, r5, r7      /* R9 = tmp.msw - dvs.msw - borrow; cr0 reflects the sign of R9 */
+        blt     cr0, lab8       /* if(result < 0) keep tmp; CA as left by subfe. (0 on borrow) is the quotient bit */
+        or      r8, r0, r0      /* Move lsw: tmp.lsw = difference */
+        or      r7, r9, r9      /* Move msw: tmp.msw = difference */
+        addic   r0, r10, 1      /* Set carry bit: -1 + 1 carries out (quotient bit = 1) */
+
+lab8: 
+        bdnz    lab7            /* Decrement CTR, loop while CTR != 0 */
+
+        /* Write quotient and remainder: shift in the last quotient bit, then store msw at offset 0 and lsw at offset 4 */
+        adde    r4, r4, r4      /* quo.lsw (lsb = CA) */
+        adde    r3, r3, r3      /* quo.msw (lsb from lsw) */
+        stw     r4, 4(r11)      /* quo.lsw */
+        stw     r3, 0(r11)      /* quo.msw */
+        stw     r8, 4(r12)      /* rem.lsw */
+        stw     r7, 0(r12)      /* rem.msw */
+        blr
+
+lab9:
+        /* Quotient is 0, divisor > dividend; the remainder is the dividend itself */
+        addi    r0, r0, 0       /* r0 = 0 (addi with rA field 0 loads the immediate) */
+        stw     r3, 0(r12)      /* Store remainder = dividend (msw) */
+        stw     r4, 4(r12)      /* Store remainder = dividend (lsw) */
+        stw     r0, 0(r11)      /* Set quotient to zero */
+        stw     r0, 4(r11)
+        blr
+
+lab10:
+        /* Divisor is 0: quotient and remainder are both set to all ones */
+        addi    r0, r0, -1      /* r0 = -1 (addi with rA field 0 loads the immediate) */
+        stw     r0, 0(r12)      /* Set remainder to all ones (0xFFFFFFFF per word) */
+        stw     r0, 4(r12)
+        stw     r0, 0(r11)      /* Set quotient to all ones (0xFFFFFFFF per word) */
+        stw     r0, 4(r11)
+        blr
+
+        .epilogue __ppc32_udiv64