//-----------------------------------------------------------------------------
// *! (C) Copyright International Business Machines Corp. 2014
// *! All Rights Reserved -- Property of IBM
// *! *** IBM Confidential ***
//-----------------------------------------------------------------------------
                
/// \file div64.S
/// \brief Unsigned 64/64 bit division
///
/// This is IBM code, originally part of OS Open.  The code has been slightly
/// modified from its original form, both to be compatible with PK and to
/// change the function prototype slightly.
///
/// The code was provided by Matt Tyrlik in Raleigh.

/* @#START#@
**
**      PSCN (Power Service and Control Network)
**      Cage Controller OS Open Code
**
**      (C) Copyright International Business Machines Corporation 2002
**      All Rights Reserved
**      Licensed Material  -  Program Property of I B M
**      Refer to copyright instructions: Form G120-2083
**
** Module:
**      div64.s
**
** Description:
**      Divide 64 bit unsigned values on 32 bit CPU
**      div64(uint64_t dividen, uint64_t divisor, 
**            uint64_t *quotient, uint64_t *remainder)
**      
**      Original source from:
**      "The PowerPC Compiler Writer's Guide", pp62-65 by
**      Steve Hoxey, Faraydon Karim, Bill Hay, Hank Warray,
**      published by Warthman Associates, 240 Hamilton Avenue,
**      Palo Alto, CA 94301, USA, 1996 for IBM.
**      ISBN 0-9649654-0-2.
**
**      This version checks for divisor equal to zero.
**
** Environment:
**      OS Open (XCOFF)
**
** Linkage:
**      AIX 4.3.3
**
** @author
**      Thomas Richter
**
** History:
** Date      Author      Description
** -----------------------------------------------------------------------------
** 23-Sep-02 Richter     Created
**
** @#END#@*/

        .nolist
#include "pk.h"
        .list
        
        .global_function __ppe42_udiv64

        /*
        ** Code comment notation: 
        ** msw = most-significant (high-order) word, i.e. bits 0..31 
        ** lsw = least-significant (low-order) word, i.e. bits 32..63
        ** LZ = Leading Zeroes 
        ** SD = Significant Digits 
        ** 
        ** R3:R4 = Input parameter, dividend.
        ** R5:R6 = Input parameter, divisor.
        ** R7 = Output parameter, pointer to quotient.
        ** R8 = Output parameter, pointer to remainder.
        **
        ** Pointer arguments point to a uint64_t.
        **
        ** Division is achieved using a shift/rotate/substract algorithsm
        ** described above.
        ** The registers are used as follows:
        ** R3:R4 = dividend (upper 32bits:lower 32bits)
        ** R5:R6 = divisor (upper 32bits:lower 32bits)
        ** 
        ** R7:R8 = temporary 64 bit register (upper 32bits:lower 32bits)
        ** count the number of leading 0s in the dividend 
        **
        ** Here is the description from the book. The dividend is placed
        ** in the low order part of a 4 (32bit) register sequence named
        ** tmp-high:tmp-low:dividend-high:dividend:low or tmp:dvd for short.
        ** 
        ** Each iteration includes the following steps:
        ** 1. Shift tmp:dvd by one bit to the left.
        ** 2. Subtract the divisor from tmp. This is a 64 bit operation.
        ** 3. If result is greater than or equal, place result in tmp and
        **    set the low order bit of dividend
        ** 4. If result is negative, do not modify tmp and
        **    clear the low order bit of dividend
        ** 5. If the number of iterations is less than the width of the
        **    dividend, goto step 1.
        **
        ** Now the algorithm can be improved by reducing the number of
        ** iterations to be executed.
        ** 1. Calculate the leading zeroes of the dividend.
        ** 2. Calculate the leading zeroes of the divisor.
        ** 3. Calculate the significant ones of the dividend.
        ** 4. Calculate the significant ones of the divisor.
        **
        ** Initial tmp := dvd >> (dvd.SD - dvs.SD)
        ** Initial dvd := dvd << (dvd.LZ + dvs.SD)
        ** Loops: dvd.SD - dvs.SD.
        **
        ** Warning: Special care must be taken if dvd.LZ == dvs.LZ. The code
        ** below does so by reducing the number of dvs.SD by one. This leads
        ** to the loop being executed 1 more time than really necessary,
        ** but avoids to check for the case when dvd.LZ == dvs.LZ.
        ** This case (dvd.LZ == dvs.LZ) only checks for the number of leading
        ** zeroes, but does not check if dividend is really greater than the
        ** divisor.
        ** Consider 16/17, both have an LZ value of 59. The code sets dvs.LZ
        ** 60. This resutls in dvs.SD to 4, thus one iteration after which
        ** tmp is the remainder 16.
        */

__ppe42_udiv64: // PK
       
        /* push R30 & R31 onto the stack */
        stwu    r1, -16(r1)
        stvd    r30, 8(r1)

        /* Save result pointers on volatile spare registers */
        ori     r31, r8, 0              /* Save remainder address */
        ori     r30, r7, 0              /* Save quotient address */

        /* count the number of leading 0s in the dividend */
        cmpwi   cr0, r3, 0      /* dvd.msw == 0? */
        cntlzw  r0, r3          /* R0 = dvd.msw.LZ */
        cntlzw  r9, r4          /* R9 = dvd.lsw.LZ */
        bne     cr0, lab1       /* if(dvd.msw == 0) dvd.LZ = dvd.msw.LZ */
        addi    r0, r9, 32      /* dvd.LZ = dvd.lsw.LZ + 32 */
lab1: 
        /* count the number of leading 0s in the divisor */
        cmpwi   cr0, r5, 0      /* dvd.msw == 0? */
        cntlzw  r9, r5          /* R9 = dvs.msw.LZ */
        cntlzw  r10, r6         /* R10 = dvs.lsw.LZ */
        bne     cr0, lab2       /* if(dvs.msw == 0) dvs.LZ = dvs.msw.LZ */
        cmpwi   cr0, r6, 0      /* dvd.lsw == 0? */
        beq     cr0, lab10      /* dvs.msw == 0 */
        addi    r9, r10, 32     /* dvs.LZ = dvs.lsw.LZ + 32 */

lab2:
        /* Determine shift amounts to minimize the number of iterations  */
        cmpw    cr0, r0, r9     /* Compare dvd.LZ to dvs.LZ */
        subfic  r10, r0, 64     /* R10 = dvd.SD */
        bgt     cr0, lab9       /* if(dvs > dvd) quotient = 0 */
        addi    r9, r9, 1       /* See comment above. ++dvs.LZ (or --dvs.SD) */
        subfic  r9, r9, 64      /* R9 = dvs.SD */
        add     r0, r0, r9      /* (dvd.LZ + dvs.SD) = left shift of dvd for */
                                /* initial dvd */
        subf    r9, r9, r10     /* (dvd.SD - dvs.SD) = right shift of dvd for */
                                /* initial tmp */
        mtctr   r9              /* Number of iterations = dvd.SD - dvs.SD */
        
        /* R7:R8 = R3:R4 >> R9 */
        cmpwi   cr0, r9, 32     /* compare R9 to 32 */
        addi    r7, r9, -32
        blt     cr0, lab3       /* if(R9 < 32) jump to lab3 */
        srw     r8, r3, r7      /* tmp.lsw = dvd.msw >> (R9 - 32) */
        addi    r7, r0, 0       /* tmp.msw = 0 */
        b       lab4 

lab3: 
        srw     r8, r4, r9      /* R8 = dvd.lsw >> R9 */
        subfic  r7, r9, 32 
        slw     r7,r3,r7        /* R7 = dvd.msw << 32 - R9 */
        or      r8, r8,r7       /* tmp.lsw = R8 | R7 */
        srw     r7,r3,r9        /* tmp.msw = dvd.msw >> R9 */
lab4:
        /* R3:R4 = R3:R4 << R0 */
        cmpwi   cr0, r0, 32     /* Compare R0 to 32 */
        addic   r9, r0, -32 
        blt     cr0, lab5       /* if(R0 < 32) jump to lab5 */
        slw     r3, r4, r9      /* dvd.msw = dvd.lsw << R9 */
        addi    r4, r0, 0       /* dvd.lsw = 0 */
        b       lab6 

lab5: 
        slw     r3, r3, r0      /* r3 = dvd.msw << r0 */
        subfic  r9, r0, 32 
        srw     r9, r4, r9      /* r9 = dvd.lsw >> 32 - r0 */
        or      r3, r3, r9      /* dvd.msw = r3 | r9 */
        slw     r4, r4, r0      /* dvd.lsw = dvd.lsw << r0 */
lab6: 
        /* Restoring division shift and subtract loop */
        addi    r10, r0, -1     /* r10 = -1 */
        addic   r7, r7, 0       /* Clear carry bit before loop starts */
lab7: 
        /*
        ** tmp:dvd is considered one large register 
        ** each portion is shifted left 1 bit by adding it to itself 
        ** adde sums the carry from the previous and creates a new carry 
        */
        adde    r4, r4, r4      /* Shift dvd.lsw left 1 bit */
        adde    r3, r3, r3      /* Shift dvd.msw to left 1 bit */
        adde    r8, r8, r8      /* Shift tmp.lsw to left 1 bit */
        adde    r7, r7, r7      /* Shift tmp.msw to left 1 bit */
        subfc   r0, r6, r8      /* tmp.lsw - dvs.lsw */
        subfe.  r9, r5, r7      /* tmp.msw - dvs.msw */
        blt     cr0, lab8       /* if(result < 0) clear carry bit */
        or      r8, r0, r0      /* Move lsw */
        or      r7, r9, r9      /* Move msw */
        addic   r0, r10, 1      /* Set carry bit */

lab8: 
        bdnz    lab7 

        /* Write quotient and remainder */
        adde    r4, r4, r4      /* quo.lsw (lsb = CA) */
        adde    r3, r3, r3      /* quo.msw (lsb from lsw) */
        stw     r4, 4(r30)
        stw     r3, 0(r30)
        stw     r8, 4(r31)      /* rem.lsw */
        stw     r7, 0(r31)      /* rem.msw */
        b       lab11

lab9:
        /* Qoutient is 0, divisor > dividend */
        addi    r0, r0, 0
        stw     r3, 0(r31)      /* Store remainder */
        stw     r4, 4(r31)
        stw     r0, 0(r30)      /* Set quotient to zero */
        stw     r0, 4(r30)
        b       lab11

lab10:
        /* Divisor is 0 */
        addi    r0, r0, -1
        stw     r0, 0(r31)      /* Set remainder to zero */
        stw     r0, 4(r31)
        stw     r0, 0(r30)      /* Set quotient to zero */
        stw     r0, 4(r30)

lab11:
        //pop r30 & r31 from stack
        lvd     r30, 8(r1)
        lwz     r1, 0(r1)
        blr
        .epilogue __ppe42_udiv64