/* IBM_PROLOG_BEGIN_TAG                                                   */
/* This is an automatically generated prolog.                             */
/*                                                                        */
/* $Source: src/ssx/ppc32/div64.S $                                       */
/*                                                                        */
/* OpenPOWER OnChipController Project                                     */
/*                                                                        */
/* Contributors Listed Below - COPYRIGHT 2014,2016                        */
/* [+] International Business Machines Corp.                              */
/*                                                                        */
/*                                                                        */
/* Licensed under the Apache License, Version 2.0 (the "License");        */
/* you may not use this file except in compliance with the License.       */
/* You may obtain a copy of the License at                                */
/*                                                                        */
/*     http://www.apache.org/licenses/LICENSE-2.0                         */
/*                                                                        */
/* Unless required by applicable law or agreed to in writing, software    */
/* distributed under the License is distributed on an "AS IS" BASIS,      */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or        */
/* implied. See the License for the specific language governing           */
/* permissions and limitations under the License.                         */
/*                                                                        */
/* IBM_PROLOG_END_TAG                                                     */
//-----------------------------------------------------------------------------
// *! (C) Copyright International Business Machines Corp. 2014
// *! All Rights Reserved -- Property of IBM
// *! *** IBM Confidential ***
//-----------------------------------------------------------------------------
                
/// \file div64.S
/// \brief Unsigned 64/64 bit division
///
/// This is IBM code, originally part of OS Open.  The code has been slightly
/// modified from its original form, both to be compatible with SSX and to
/// change the function prototype slightly.
///
/// The code was provided by Matt Tyrlik in Raleigh.

/* @#START#@
**
**      PSCN (Power Service and Control Network)
**      Cage Controller OS Open Code
**
**      (C) Copyright International Business Machines Corporation 2002
**      All Rights Reserved
**      Licensed Material  -  Program Property of I B M
**      Refer to copyright instructions: Form G120-2083
**
** Module:
**      div64.s
**
** Description:
**      Divide 64 bit unsigned values on 32 bit CPU
**      div64(uint64_t dividen, uint64_t divisor, 
**            uint64_t *quotient, uint64_t *remainder)
**      
**      Original source from:
**      "The PowerPC Compiler Writer's Guide", pp62-65 by
**      Steve Hoxey, Faraydon Karim, Bill Hay, Hank Warray,
**      published by Warthman Associates, 240 Hamilton Avenue,
**      Palo Alto, CA 94301, USA, 1996 for IBM.
**      ISBN 0-9649654-0-2.
**
**      This version checks for divisor equal to zero.
**
** Environment:
**      OS Open (XCOFF)
**
** Linkage:
**      AIX 4.3.3
**
** @author
**      Thomas Richter
**
** History:
** Date      Author      Description
** -----------------------------------------------------------------------------
** 23-Sep-02 Richter     Created
**
** @#END#@*/

        .nolist
#include "ssx.h"
        .list
        
        .set    r0, 0
        .set    r1, 1
        .set    r2, 2
        .set    r3, 3
        .set    r4, 4
        .set    r5, 5
        .set    r6, 6
        .set    r7, 7
        .set    r8, 8
        .set    r9, 9
        .set    r10, 10
        .set    r11, 11
        .set    r12, 12

        .global_function __ppc32_udiv64

        /*
        ** Code comment notation: 
        ** msw = most-significant (high-order) word, i.e. bits 0..31 
        ** lsw = least-significant (low-order) word, i.e. bits 32..63
        ** LZ = Leading Zeroes 
        ** SD = Significant Digits 
        ** 
        ** R3:R4 = Input parameter, dividend.
        ** R5:R6 = Input parameter, divisor.
        ** R7 = Output parameter, pointer to quotient.
        ** R8 = Output parameter, pointer to remainder.
        **
        ** Pointer arguments point to a uint64_t.
        **
        ** Division is achieved using a shift/rotate/substract algorithsm
        ** described above.
        ** The registers are used as follows:
        ** R3:R4 = dividend (upper 32bits:lower 32bits)
        ** R5:R6 = divisor (upper 32bits:lower 32bits)
        ** 
        ** R7:R8 = temporary 64 bit register (upper 32bits:lower 32bits)
        ** count the number of leading 0s in the dividend 
        **
        ** Here is the description from the book. The dividend is placed
        ** in the low order part of a 4 (32bit) register sequence named
        ** tmp-high:tmp-low:dividend-high:dividend:low or tmp:dvd for short.
        ** 
        ** Each iteration includes the following steps:
        ** 1. Shift tmp:dvd by one bit to the left.
        ** 2. Subtract the divisor from tmp. This is a 64 bit operation.
        ** 3. If result is greater than or equal, place result in tmp and
        **    set the low order bit of dividend
        ** 4. If result is negative, do not modify tmp and
        **    clear the low order bit of dividend
        ** 5. If the number of iterations is less than the width of the
        **    dividend, goto step 1.
        **
        ** Now the algorithm can be improved by reducing the number of
        ** iterations to be executed.
        ** 1. Calculate the leading zeroes of the dividend.
        ** 2. Calculate the leading zeroes of the divisor.
        ** 3. Calculate the significant ones of the dividend.
        ** 4. Calculate the significant ones of the divisor.
        **
        ** Initial tmp := dvd >> (dvd.SD - dvs.SD)
        ** Initial dvd := dvd << (dvd.LZ + dvs.SD)
        ** Loops: dvd.SD - dvs.SD.
        **
        ** Warning: Special care must be taken if dvd.LZ == dvs.LZ. The code
        ** below does so by reducing the number of dvs.SD by one. This leads
        ** to the loop being executed 1 more time than really necessary,
        ** but avoids to check for the case when dvd.LZ == dvs.LZ.
        ** This case (dvd.LZ == dvs.LZ) only checks for the number of leading
        ** zeroes, but does not check if dividend is really greater than the
        ** divisor.
        ** Consider 16/17, both have an LZ value of 59. The code sets dvs.LZ
        ** 60. This resutls in dvs.SD to 4, thus one iteration after which
        ** tmp is the remainder 16.
        */

__ppc32_udiv64: // SSX
        /* Save result pointers on volatile spare registers */
        ori     r12, r8, 0              /* Save remainder address */
        ori     r11, r7, 0              /* Save quotient address */

        /* count the number of leading 0s in the dividend */
        cmpwi   cr0, r3, 0      /* dvd.msw == 0? */
        cntlzw  r0, r3          /* R0 = dvd.msw.LZ */
        cntlzw  r9, r4          /* R9 = dvd.lsw.LZ */
        bne     cr0, lab1       /* if(dvd.msw == 0) dvd.LZ = dvd.msw.LZ */
        addi    r0, r9, 32      /* dvd.LZ = dvd.lsw.LZ + 32 */
lab1: 
        /* count the number of leading 0s in the divisor */
        cmpwi   cr0, r5, 0      /* dvd.msw == 0? */
        cmpwi   cr1, r6, 0      /* dvd.lsw == 0? */
        cntlzw  r9, r5          /* R9 = dvs.msw.LZ */
        cntlzw  r10, r6         /* R10 = dvs.lsw.LZ */
        bne     cr0, lab2       /* if(dvs.msw == 0) dvs.LZ = dvs.msw.LZ */
        beq     cr1, lab10      /* dvs.msw == 0 */
        addi    r9, r10, 32     /* dvs.LZ = dvs.lsw.LZ + 32 */

lab2:
        /* Determine shift amounts to minimize the number of iterations  */
        cmpw    cr0, r0, r9     /* Compare dvd.LZ to dvs.LZ */
        subfic  r10, r0, 64     /* R10 = dvd.SD */
        bgt     cr0, lab9       /* if(dvs > dvd) quotient = 0 */
        addi    r9, r9, 1       /* See comment above. ++dvs.LZ (or --dvs.SD) */
        subfic  r9, r9, 64      /* R9 = dvs.SD */
        add     r0, r0, r9      /* (dvd.LZ + dvs.SD) = left shift of dvd for */
                                /* initial dvd */
        subf    r9, r9, r10     /* (dvd.SD - dvs.SD) = right shift of dvd for */
                                /* initial tmp */
        mtctr   r9              /* Number of iterations = dvd.SD - dvs.SD */
        
        /* R7:R8 = R3:R4 >> R9 */
        cmpwi   cr0, r9, 32     /* compare R9 to 32 */
        addi    r7, r9, -32
        blt     cr0, lab3       /* if(R9 < 32) jump to lab3 */
        srw     r8, r3, r7      /* tmp.lsw = dvd.msw >> (R9 - 32) */
        addi    r7, r0, 0       /* tmp.msw = 0 */
        b       lab4 

lab3: 
        srw     r8, r4, r9      /* R8 = dvd.lsw >> R9 */
        subfic  r7, r9, 32 
        slw     r7,r3,r7        /* R7 = dvd.msw << 32 - R9 */
        or      r8, r8,r7       /* tmp.lsw = R8 | R7 */
        srw     r7,r3,r9        /* tmp.msw = dvd.msw >> R9 */
lab4:
        /* R3:R4 = R3:R4 << R0 */
        cmpwi   cr0, r0, 32     /* Compare R0 to 32 */
        addic   r9, r0, -32 
        blt     cr0, lab5       /* if(R0 < 32) jump to lab5 */
        slw     r3, r4, r9      /* dvd.msw = dvd.lsw << R9 */
        addi    r4, r0, 0       /* dvd.lsw = 0 */
        b       lab6 

lab5: 
        slw     r3, r3, r0      /* r3 = dvd.msw << r0 */
        subfic  r9, r0, 32 
        srw     r9, r4, r9      /* r9 = dvd.lsw >> 32 - r0 */
        or      r3, r3, r9      /* dvd.msw = r3 | r9 */
        slw     r4, r4, r0      /* dvd.lsw = dvd.lsw << r0 */
lab6: 
        /* Restoring division shift and subtract loop */
        addi    r10, r0, -1     /* r10 = -1 */
        addic   r7, r7, 0       /* Clear carry bit before loop starts */
lab7: 
        /*
        ** tmp:dvd is considered one large register 
        ** each portion is shifted left 1 bit by adding it to itself 
        ** adde sums the carry from the previous and creates a new carry 
        */
        adde    r4, r4, r4      /* Shift dvd.lsw left 1 bit */
        adde    r3, r3, r3      /* Shift dvd.msw to left 1 bit */
        adde    r8, r8, r8      /* Shift tmp.lsw to left 1 bit */
        adde    r7, r7, r7      /* Shift tmp.msw to left 1 bit */
        subfc   r0, r6, r8      /* tmp.lsw - dvs.lsw */
        subfe.  r9, r5, r7      /* tmp.msw - dvs.msw */
        blt     cr0, lab8       /* if(result < 0) clear carry bit */
        or      r8, r0, r0      /* Move lsw */
        or      r7, r9, r9      /* Move msw */
        addic   r0, r10, 1      /* Set carry bit */

lab8: 
        bdnz    lab7 

        /* Write quotient and remainder */
        adde    r4, r4, r4      /* quo.lsw (lsb = CA) */
        adde    r3, r3, r3      /* quo.msw (lsb from lsw) */
        stw     r4, 4(r11)
        stw     r3, 0(r11)
        stw     r8, 4(r12)      /* rem.lsw */
        stw     r7, 0(r12)      /* rem.msw */
        blr

lab9:
        /* Qoutient is 0, divisor > dividend */
        addi    r0, r0, 0
        stw     r3, 0(r12)      /* Store remainder */
        stw     r4, 4(r12)
        stw     r0, 0(r11)      /* Set quotient to zero */
        stw     r0, 4(r11)
        blr

lab10:
        /* Divisor is 0 */
        addi    r0, r0, -1
        stw     r0, 0(r12)      /* Set remainder to zero */
        stw     r0, 4(r12)
        stw     r0, 0(r11)      /* Set quotient to zero */
        stw     r0, 4(r11)
        blr

        .epilogue __ppc32_udiv64