summaryrefslogtreecommitdiffstats
path: root/arch/parisc/lib/milli/div_const.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/parisc/lib/milli/div_const.S')
-rw-r--r--arch/parisc/lib/milli/div_const.S682
1 files changed, 0 insertions, 682 deletions
diff --git a/arch/parisc/lib/milli/div_const.S b/arch/parisc/lib/milli/div_const.S
deleted file mode 100644
index dd660076e944..000000000000
--- a/arch/parisc/lib/milli/div_const.S
+++ /dev/null
@@ -1,682 +0,0 @@
-/* 32 and 64-bit millicode, original author Hewlett-Packard
- adapted for gcc by Paul Bame <bame@debian.org>
- and Alan Modra <alan@linuxcare.com.au>.
-
- Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
-
- This file is part of GCC and is released under the terms of
- of the GNU General Public License as published by the Free Software
- Foundation; either version 2, or (at your option) any later version.
- See the file COPYING in the top-level GCC source directory for a copy
- of the license. */
-
-#include "milli.h"
-
-#ifdef L_div_const
-/* ROUTINE: $$divI_2
- . $$divI_3 $$divU_3
- . $$divI_4
- . $$divI_5 $$divU_5
- . $$divI_6 $$divU_6
- . $$divI_7 $$divU_7
- . $$divI_8
- . $$divI_9 $$divU_9
- . $$divI_10 $$divU_10
- .
- . $$divI_12 $$divU_12
- .
- . $$divI_14 $$divU_14
- . $$divI_15 $$divU_15
- . $$divI_16
- . $$divI_17 $$divU_17
- .
- . Divide by selected constants for single precision binary integers.
-
- INPUT REGISTERS:
- . arg0 == dividend
- . mrp == return pc
- . sr0 == return space when called externally
-
- OUTPUT REGISTERS:
- . arg0 = undefined
- . arg1 = undefined
- . ret1 = quotient
-
- OTHER REGISTERS AFFECTED:
- . r1 = undefined
-
- SIDE EFFECTS:
- . Causes a trap under the following conditions: NONE
- . Changes memory at the following places: NONE
-
- PERMISSIBLE CONTEXT:
- . Unwindable.
- . Does not create a stack frame.
- . Suitable for internal or external millicode.
- . Assumes the special millicode register conventions.
-
- DISCUSSION:
- . Calls other millicode routines using mrp: NONE
- . Calls other millicode routines: NONE */
-
-
-/* TRUNCATED DIVISION BY SMALL INTEGERS
-
- We are interested in q(x) = floor(x/y), where x >= 0 and y > 0
- (with y fixed).
-
- Let a = floor(z/y), for some choice of z. Note that z will be
- chosen so that division by z is cheap.
-
- Let r be the remainder(z/y). In other words, r = z - ay.
-
- Now, our method is to choose a value for b such that
-
- q'(x) = floor((ax+b)/z)
-
- is equal to q(x) over as large a range of x as possible. If the
- two are equal over a sufficiently large range, and if it is easy to
- form the product (ax), and it is easy to divide by z, then we can
- perform the division much faster than the general division algorithm.
-
- So, we want the following to be true:
-
- . For x in the following range:
- .
- . ky <= x < (k+1)y
- .
- . implies that
- .
- . k <= (ax+b)/z < (k+1)
-
- We want to determine b such that this is true for all k in the
- range {0..K} for some maximum K.
-
- Since (ax+b) is an increasing function of x, we can take each
- bound separately to determine the "best" value for b.
-
- (ax+b)/z < (k+1) implies
-
- (a((k+1)y-1)+b < (k+1)z implies
-
- b < a + (k+1)(z-ay) implies
-
- b < a + (k+1)r
-
- This needs to be true for all k in the range {0..K}. In
- particular, it is true for k = 0 and this leads to a maximum
- acceptable value for b.
-
- b < a+r or b <= a+r-1
-
- Taking the other bound, we have
-
- k <= (ax+b)/z implies
-
- k <= (aky+b)/z implies
-
- k(z-ay) <= b implies
-
- kr <= b
-
- Clearly, the largest range for k will be achieved by maximizing b,
- when r is not zero. When r is zero, then the simplest choice for b
- is 0. When r is not 0, set
-
- . b = a+r-1
-
- Now, by construction, q'(x) = floor((ax+b)/z) = q(x) = floor(x/y)
- for all x in the range:
-
- . 0 <= x < (K+1)y
-
- We need to determine what K is. Of our two bounds,
-
- . b < a+(k+1)r is satisfied for all k >= 0, by construction.
-
- The other bound is
-
- . kr <= b
-
- This is always true if r = 0. If r is not 0 (the usual case), then
- K = floor((a+r-1)/r), is the maximum value for k.
-
- Therefore, the formula q'(x) = floor((ax+b)/z) yields the correct
- answer for q(x) = floor(x/y) when x is in the range
-
- (0,(K+1)y-1) K = floor((a+r-1)/r)
-
- To be most useful, we want (K+1)y-1 = (max x) >= 2**32-1 so that
- the formula for q'(x) yields the correct value of q(x) for all x
- representable by a single word in HPPA.
-
- We are also constrained in that computing the product (ax), adding
- b, and dividing by z must all be done quickly, otherwise we will be
- better off going through the general algorithm using the DS
- instruction, which uses approximately 70 cycles.
-
- For each y, there is a choice of z which satisfies the constraints
- for (K+1)y >= 2**32. We may not, however, be able to satisfy the
- timing constraints for arbitrary y. It seems that z being equal to
- a power of 2 or a power of 2 minus 1 is as good as we can do, since
- it minimizes the time to do division by z. We want the choice of z
- to also result in a value for (a) that minimizes the computation of
- the product (ax). This is best achieved if (a) has a regular bit
- pattern (so the multiplication can be done with shifts and adds).
- The value of (a) also needs to be less than 2**32 so the product is
- always guaranteed to fit in 2 words.
-
- In actual practice, the following should be done:
-
- 1) For negative x, you should take the absolute value and remember
- . the fact so that the result can be negated. This obviously does
- . not apply in the unsigned case.
- 2) For even y, you should factor out the power of 2 that divides y
- . and divide x by it. You can then proceed by dividing by the
- . odd factor of y.
-
- Here is a table of some odd values of y, and corresponding choices
- for z which are "good".
-
- y z r a (hex) max x (hex)
-
- 3 2**32 1 55555555 100000001
- 5 2**32 1 33333333 100000003
- 7 2**24-1 0 249249 (infinite)
- 9 2**24-1 0 1c71c7 (infinite)
- 11 2**20-1 0 1745d (infinite)
- 13 2**24-1 0 13b13b (infinite)
- 15 2**32 1 11111111 10000000d
- 17 2**32 1 f0f0f0f 10000000f
-
- If r is 1, then b = a+r-1 = a. This simplifies the computation
- of (ax+b), since you can compute (x+1)(a) instead. If r is 0,
- then b = 0 is ok to use which simplifies (ax+b).
-
- The bit patterns for 55555555, 33333333, and 11111111 are obviously
- very regular. The bit patterns for the other values of a above are:
-
- y (hex) (binary)
-
- 7 249249 001001001001001001001001 << regular >>
- 9 1c71c7 000111000111000111000111 << regular >>
- 11 1745d 000000010111010001011101 << irregular >>
- 13 13b13b 000100111011000100111011 << irregular >>
-
- The bit patterns for (a) corresponding to (y) of 11 and 13 may be
- too irregular to warrant using this method.
-
- When z is a power of 2 minus 1, then the division by z is slightly
- more complicated, involving an iterative solution.
-
- The code presented here solves division by 1 through 17, except for
- 11 and 13. There are algorithms for both signed and unsigned
- quantities given.
-
- TIMINGS (cycles)
-
- divisor positive negative unsigned
-
- . 1 2 2 2
- . 2 4 4 2
- . 3 19 21 19
- . 4 4 4 2
- . 5 18 22 19
- . 6 19 22 19
- . 8 4 4 2
- . 10 18 19 17
- . 12 18 20 18
- . 15 16 18 16
- . 16 4 4 2
- . 17 16 18 16
-
- Now, the algorithm for 7, 9, and 14 is an iterative one. That is,
- a loop body is executed until the tentative quotient is 0. The
- number of times the loop body is executed varies depending on the
- dividend, but is never more than two times. If the dividend is
- less than the divisor, then the loop body is not executed at all.
- Each iteration adds 4 cycles to the timings.
-
- divisor positive negative unsigned
-
- . 7 19+4n 20+4n 20+4n n = number of iterations
- . 9 21+4n 22+4n 21+4n
- . 14 21+4n 22+4n 20+4n
-
- To give an idea of how the number of iterations varies, here is a
- table of dividend versus number of iterations when dividing by 7.
-
- smallest largest required
- dividend dividend iterations
-
- . 0 6 0
- . 7 0x6ffffff 1
- 0x1000006 0xffffffff 2
-
- There is some overlap in the range of numbers requiring 1 and 2
- iterations. */
-
-RDEFINE(t2,r1)
-RDEFINE(x2,arg0) /* r26 */
-RDEFINE(t1,arg1) /* r25 */
-RDEFINE(x1,ret1) /* r29 */
-
- SUBSPA_MILLI_DIV
- ATTR_MILLI
-
- .proc
- .callinfo millicode
- .entry
-/* NONE of these routines require a stack frame
- ALL of these routines are unwindable from millicode */
-
-GSYM($$divide_by_constant)
- .export $$divide_by_constant,millicode
-/* Provides a "nice" label for the code covered by the unwind descriptor
- for things like gprof. */
-
-/* DIVISION BY 2 (shift by 1) */
-GSYM($$divI_2)
- .export $$divI_2,millicode
- comclr,>= arg0,0,0
- addi 1,arg0,arg0
- MILLIRET
- extrs arg0,30,31,ret1
-
-
-/* DIVISION BY 4 (shift by 2) */
-GSYM($$divI_4)
- .export $$divI_4,millicode
- comclr,>= arg0,0,0
- addi 3,arg0,arg0
- MILLIRET
- extrs arg0,29,30,ret1
-
-
-/* DIVISION BY 8 (shift by 3) */
-GSYM($$divI_8)
- .export $$divI_8,millicode
- comclr,>= arg0,0,0
- addi 7,arg0,arg0
- MILLIRET
- extrs arg0,28,29,ret1
-
-/* DIVISION BY 16 (shift by 4) */
-GSYM($$divI_16)
- .export $$divI_16,millicode
- comclr,>= arg0,0,0
- addi 15,arg0,arg0
- MILLIRET
- extrs arg0,27,28,ret1
-
-/****************************************************************************
-*
-* DIVISION BY DIVISORS OF FFFFFFFF, and powers of 2 times these
-*
-* includes 3,5,15,17 and also 6,10,12
-*
-****************************************************************************/
-
-/* DIVISION BY 3 (use z = 2**32; a = 55555555) */
-
-GSYM($$divI_3)
- .export $$divI_3,millicode
- comb,<,N x2,0,LREF(neg3)
-
- addi 1,x2,x2 /* this cannot overflow */
- extru x2,1,2,x1 /* multiply by 5 to get started */
- sh2add x2,x2,x2
- b LREF(pos)
- addc x1,0,x1
-
-LSYM(neg3)
- subi 1,x2,x2 /* this cannot overflow */
- extru x2,1,2,x1 /* multiply by 5 to get started */
- sh2add x2,x2,x2
- b LREF(neg)
- addc x1,0,x1
-
-GSYM($$divU_3)
- .export $$divU_3,millicode
- addi 1,x2,x2 /* this CAN overflow */
- addc 0,0,x1
- shd x1,x2,30,t1 /* multiply by 5 to get started */
- sh2add x2,x2,x2
- b LREF(pos)
- addc x1,t1,x1
-
-/* DIVISION BY 5 (use z = 2**32; a = 33333333) */
-
-GSYM($$divI_5)
- .export $$divI_5,millicode
- comb,<,N x2,0,LREF(neg5)
-
- addi 3,x2,t1 /* this cannot overflow */
- sh1add x2,t1,x2 /* multiply by 3 to get started */
- b LREF(pos)
- addc 0,0,x1
-
-LSYM(neg5)
- sub 0,x2,x2 /* negate x2 */
- addi 1,x2,x2 /* this cannot overflow */
- shd 0,x2,31,x1 /* get top bit (can be 1) */
- sh1add x2,x2,x2 /* multiply by 3 to get started */
- b LREF(neg)
- addc x1,0,x1
-
-GSYM($$divU_5)
- .export $$divU_5,millicode
- addi 1,x2,x2 /* this CAN overflow */
- addc 0,0,x1
- shd x1,x2,31,t1 /* multiply by 3 to get started */
- sh1add x2,x2,x2
- b LREF(pos)
- addc t1,x1,x1
-
-/* DIVISION BY 6 (shift to divide by 2 then divide by 3) */
-GSYM($$divI_6)
- .export $$divI_6,millicode
- comb,<,N x2,0,LREF(neg6)
- extru x2,30,31,x2 /* divide by 2 */
- addi 5,x2,t1 /* compute 5*(x2+1) = 5*x2+5 */
- sh2add x2,t1,x2 /* multiply by 5 to get started */
- b LREF(pos)
- addc 0,0,x1
-
-LSYM(neg6)
- subi 2,x2,x2 /* negate, divide by 2, and add 1 */
- /* negation and adding 1 are done */
- /* at the same time by the SUBI */
- extru x2,30,31,x2
- shd 0,x2,30,x1
- sh2add x2,x2,x2 /* multiply by 5 to get started */
- b LREF(neg)
- addc x1,0,x1
-
-GSYM($$divU_6)
- .export $$divU_6,millicode
- extru x2,30,31,x2 /* divide by 2 */
- addi 1,x2,x2 /* cannot carry */
- shd 0,x2,30,x1 /* multiply by 5 to get started */
- sh2add x2,x2,x2
- b LREF(pos)
- addc x1,0,x1
-
-/* DIVISION BY 10 (shift to divide by 2 then divide by 5) */
-GSYM($$divU_10)
- .export $$divU_10,millicode
- extru x2,30,31,x2 /* divide by 2 */
- addi 3,x2,t1 /* compute 3*(x2+1) = (3*x2)+3 */
- sh1add x2,t1,x2 /* multiply by 3 to get started */
- addc 0,0,x1
-LSYM(pos)
- shd x1,x2,28,t1 /* multiply by 0x11 */
- shd x2,0,28,t2
- add x2,t2,x2
- addc x1,t1,x1
-LSYM(pos_for_17)
- shd x1,x2,24,t1 /* multiply by 0x101 */
- shd x2,0,24,t2
- add x2,t2,x2
- addc x1,t1,x1
-
- shd x1,x2,16,t1 /* multiply by 0x10001 */
- shd x2,0,16,t2
- add x2,t2,x2
- MILLIRET
- addc x1,t1,x1
-
-GSYM($$divI_10)
- .export $$divI_10,millicode
- comb,< x2,0,LREF(neg10)
- copy 0,x1
- extru x2,30,31,x2 /* divide by 2 */
- addib,TR 1,x2,LREF(pos) /* add 1 (cannot overflow) */
- sh1add x2,x2,x2 /* multiply by 3 to get started */
-
-LSYM(neg10)
- subi 2,x2,x2 /* negate, divide by 2, and add 1 */
- /* negation and adding 1 are done */
- /* at the same time by the SUBI */
- extru x2,30,31,x2
- sh1add x2,x2,x2 /* multiply by 3 to get started */
-LSYM(neg)
- shd x1,x2,28,t1 /* multiply by 0x11 */
- shd x2,0,28,t2
- add x2,t2,x2
- addc x1,t1,x1
-LSYM(neg_for_17)
- shd x1,x2,24,t1 /* multiply by 0x101 */
- shd x2,0,24,t2
- add x2,t2,x2
- addc x1,t1,x1
-
- shd x1,x2,16,t1 /* multiply by 0x10001 */
- shd x2,0,16,t2
- add x2,t2,x2
- addc x1,t1,x1
- MILLIRET
- sub 0,x1,x1
-
-/* DIVISION BY 12 (shift to divide by 4 then divide by 3) */
-GSYM($$divI_12)
- .export $$divI_12,millicode
- comb,< x2,0,LREF(neg12)
- copy 0,x1
- extru x2,29,30,x2 /* divide by 4 */
- addib,tr 1,x2,LREF(pos) /* compute 5*(x2+1) = 5*x2+5 */
- sh2add x2,x2,x2 /* multiply by 5 to get started */
-
-LSYM(neg12)
- subi 4,x2,x2 /* negate, divide by 4, and add 1 */
- /* negation and adding 1 are done */
- /* at the same time by the SUBI */
- extru x2,29,30,x2
- b LREF(neg)
- sh2add x2,x2,x2 /* multiply by 5 to get started */
-
-GSYM($$divU_12)
- .export $$divU_12,millicode
- extru x2,29,30,x2 /* divide by 4 */
- addi 5,x2,t1 /* cannot carry */
- sh2add x2,t1,x2 /* multiply by 5 to get started */
- b LREF(pos)
- addc 0,0,x1
-
-/* DIVISION BY 15 (use z = 2**32; a = 11111111) */
-GSYM($$divI_15)
- .export $$divI_15,millicode
- comb,< x2,0,LREF(neg15)
- copy 0,x1
- addib,tr 1,x2,LREF(pos)+4
- shd x1,x2,28,t1
-
-LSYM(neg15)
- b LREF(neg)
- subi 1,x2,x2
-
-GSYM($$divU_15)
- .export $$divU_15,millicode
- addi 1,x2,x2 /* this CAN overflow */
- b LREF(pos)
- addc 0,0,x1
-
-/* DIVISION BY 17 (use z = 2**32; a = f0f0f0f) */
-GSYM($$divI_17)
- .export $$divI_17,millicode
- comb,<,n x2,0,LREF(neg17)
- addi 1,x2,x2 /* this cannot overflow */
- shd 0,x2,28,t1 /* multiply by 0xf to get started */
- shd x2,0,28,t2
- sub t2,x2,x2
- b LREF(pos_for_17)
- subb t1,0,x1
-
-LSYM(neg17)
- subi 1,x2,x2 /* this cannot overflow */
- shd 0,x2,28,t1 /* multiply by 0xf to get started */
- shd x2,0,28,t2
- sub t2,x2,x2
- b LREF(neg_for_17)
- subb t1,0,x1
-
-GSYM($$divU_17)
- .export $$divU_17,millicode
- addi 1,x2,x2 /* this CAN overflow */
- addc 0,0,x1
- shd x1,x2,28,t1 /* multiply by 0xf to get started */
-LSYM(u17)
- shd x2,0,28,t2
- sub t2,x2,x2
- b LREF(pos_for_17)
- subb t1,x1,x1
-
-
-/* DIVISION BY DIVISORS OF FFFFFF, and powers of 2 times these
- includes 7,9 and also 14
-
-
- z = 2**24-1
- r = z mod x = 0
-
- so choose b = 0
-
- Also, in order to divide by z = 2**24-1, we approximate by dividing
- by (z+1) = 2**24 (which is easy), and then correcting.
-
- (ax) = (z+1)q' + r
- . = zq' + (q'+r)
-
- So to compute (ax)/z, compute q' = (ax)/(z+1) and r = (ax) mod (z+1)
- Then the true remainder of (ax)/z is (q'+r). Repeat the process
- with this new remainder, adding the tentative quotients together,
- until a tentative quotient is 0 (and then we are done). There is
- one last correction to be done. It is possible that (q'+r) = z.
- If so, then (q'+r)/(z+1) = 0 and it looks like we are done. But,
- in fact, we need to add 1 more to the quotient. Now, it turns
- out that this happens if and only if the original value x is
- an exact multiple of y. So, to avoid a three instruction test at
- the end, instead use 1 instruction to add 1 to x at the beginning. */
-
-/* DIVISION BY 7 (use z = 2**24-1; a = 249249) */
-GSYM($$divI_7)
- .export $$divI_7,millicode
- comb,<,n x2,0,LREF(neg7)
-LSYM(7)
- addi 1,x2,x2 /* cannot overflow */
- shd 0,x2,29,x1
- sh3add x2,x2,x2
- addc x1,0,x1
-LSYM(pos7)
- shd x1,x2,26,t1
- shd x2,0,26,t2
- add x2,t2,x2
- addc x1,t1,x1
-
- shd x1,x2,20,t1
- shd x2,0,20,t2
- add x2,t2,x2
- addc x1,t1,t1
-
- /* computed <t1,x2>. Now divide it by (2**24 - 1) */
-
- copy 0,x1
- shd,= t1,x2,24,t1 /* tentative quotient */
-LSYM(1)
- addb,tr t1,x1,LREF(2) /* add to previous quotient */
- extru x2,31,24,x2 /* new remainder (unadjusted) */
-
- MILLIRETN
-
-LSYM(2)
- addb,tr t1,x2,LREF(1) /* adjust remainder */
- extru,= x2,7,8,t1 /* new quotient */
-
-LSYM(neg7)
- subi 1,x2,x2 /* negate x2 and add 1 */
-LSYM(8)
- shd 0,x2,29,x1
- sh3add x2,x2,x2
- addc x1,0,x1
-
-LSYM(neg7_shift)
- shd x1,x2,26,t1
- shd x2,0,26,t2
- add x2,t2,x2
- addc x1,t1,x1
-
- shd x1,x2,20,t1
- shd x2,0,20,t2
- add x2,t2,x2
- addc x1,t1,t1
-
- /* computed <t1,x2>. Now divide it by (2**24 - 1) */
-
- copy 0,x1
- shd,= t1,x2,24,t1 /* tentative quotient */
-LSYM(3)
- addb,tr t1,x1,LREF(4) /* add to previous quotient */
- extru x2,31,24,x2 /* new remainder (unadjusted) */
-
- MILLIRET
- sub 0,x1,x1 /* negate result */
-
-LSYM(4)
- addb,tr t1,x2,LREF(3) /* adjust remainder */
- extru,= x2,7,8,t1 /* new quotient */
-
-GSYM($$divU_7)
- .export $$divU_7,millicode
- addi 1,x2,x2 /* can carry */
- addc 0,0,x1
- shd x1,x2,29,t1
- sh3add x2,x2,x2
- b LREF(pos7)
- addc t1,x1,x1
-
-/* DIVISION BY 9 (use z = 2**24-1; a = 1c71c7) */
-GSYM($$divI_9)
- .export $$divI_9,millicode
- comb,<,n x2,0,LREF(neg9)
- addi 1,x2,x2 /* cannot overflow */
- shd 0,x2,29,t1
- shd x2,0,29,t2
- sub t2,x2,x2
- b LREF(pos7)
- subb t1,0,x1
-
-LSYM(neg9)
- subi 1,x2,x2 /* negate and add 1 */
- shd 0,x2,29,t1
- shd x2,0,29,t2
- sub t2,x2,x2
- b LREF(neg7_shift)
- subb t1,0,x1
-
-GSYM($$divU_9)
- .export $$divU_9,millicode
- addi 1,x2,x2 /* can carry */
- addc 0,0,x1
- shd x1,x2,29,t1
- shd x2,0,29,t2
- sub t2,x2,x2
- b LREF(pos7)
- subb t1,x1,x1
-
-/* DIVISION BY 14 (shift to divide by 2 then divide by 7) */
-GSYM($$divI_14)
- .export $$divI_14,millicode
- comb,<,n x2,0,LREF(neg14)
-GSYM($$divU_14)
- .export $$divU_14,millicode
- b LREF(7) /* go to 7 case */
- extru x2,30,31,x2 /* divide by 2 */
-
-LSYM(neg14)
- subi 2,x2,x2 /* negate (and add 2) */
- b LREF(8)
- extru x2,30,31,x2 /* divide by 2 */
- .exit
- .procend
- .end
-#endif
OpenPOWER on IntegriCloud