Loading...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | /* * Integer division routine. * * Copyright (C) 1999 Hewlett-Packard Co * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com> */ /* Simple integer division. It uses the straight forward division algorithm. This may not be the absolutely fastest way to do it, but it's not horrible either. According to ski, the worst case scenario of dividing 0xffffffffffffffff by 1 takes 133 cycles. An alternative would be to use an algorithm similar to the floating point division algorithm (Newton-Raphson iteration), but that approach is rather tricky (one has to be very careful to get the last bit right...). While this algorithm is straight-forward, it does use a couple of neat ia-64 specific tricks: - it uses the floating point unit to determine the initial shift amount (shift = floor(ld(x)) - floor(ld(y))) - it uses predication to avoid a branch in the case where x < y (this is what p8 is used for) - it uses rotating registers and the br.ctop branch to implement a software-pipelined loop that's unrolled twice (without any code expansion!) - the code is relatively well scheduled to avoid unnecessary nops while maximizing parallelism */ #include <asm/asmmacro.h> #include <asm/break.h> .text .psr abi64 #ifdef __BIG_ENDIAN__ .psr msb .msb #else .psr lsb .lsb #endif #ifdef MODULO # define OP mod # define Q r9 # define R r8 #else # define OP div # define Q r8 # define R r9 #endif #ifdef SINGLE # define PREC si #else # define PREC di #endif #ifdef UNSIGNED # define SGN u # define INT_TO_FP(a,b) fma.s0 a=b,f1,f0 # define FP_TO_INT(a,b) fcvt.fxu.trunc.s0 a=b #else # define SGN # define INT_TO_FP(a,b) fcvt.xf a=b # define FP_TO_INT(a,b) fcvt.fx.trunc.s0 a=b #endif #define PASTE1(a,b) a##b #define PASTE(a,b) PASTE1(a,b) #define NAME PASTE(PASTE(__,SGN),PASTE(OP,PASTE(PREC,3))) GLOBAL_ENTRY(NAME) UNW(.prologue) alloc r2=ar.pfs,2,6,0,8 UNW(.save pr, r18) mov r18=pr #ifdef SINGLE # ifdef UNSIGNED zxt4 in0=in0 zxt4 in1=in1 # else sxt4 in0=in0 sxt4 in1=in1 # endif ;; #endif #ifndef UNSIGNED cmp.lt p6,p0=in0,r0 // x negative? cmp.lt p7,p0=in1,r0 // y negative? ;; (p6) sub in0=r0,in0 // make x positive (p7) sub in1=r0,in1 // ditto for y ;; #endif setf.sig f8=in0 UNW(.save ar.lc, r3) UNW(.body) mov r3=ar.lc // save ar.lc setf.sig f9=in1 ;; mov Q=0 // initialize q mov R=in0 // stash away x in a static register mov r16=1 // r16 = 1 INT_TO_FP(f8,f8) cmp.eq p8,p0=0,in0 // x==0? cmp.eq p9,p0=0,in1 // y==0? ;; INT_TO_FP(f9,f9) (p8) br.dpnt.few .L3 (p9) break __IA64_BREAK_KDB // attempted division by zero (should never happen) mov ar.ec=r0 // epilogue count = 0 ;; getf.exp r14=f8 // r14 = exponent of x getf.exp r15=f9 // r15 = exponent of y mov ar.lc=r0 // loop count = 0 ;; sub r17=r14,r15 // r17 = (exp of x - exp y) = shift amount cmp.ge p8,p0=r14,r15 ;; .rotr y[2], mask[2] // in0 and in1 may no longer be valid after // the first write to a rotating register! (p8) shl y[1]=in1,r17 // y[1] = y<<shift (p8) shl mask[1]=r16,r17 // mask[1] = 1<<shift (p8) mov ar.lc=r17 // loop count = r17 ;; .L1: (p8) cmp.geu.unc p9,p0=R,y[1]// p9 = (x >= y[1]) (p8) shr.u mask[0]=mask[1],1 // prepare mask[0] and y[0] for next (p8) shr.u y[0]=y[1],1 // iteration ;; (p9) sub R=R,y[1] // if (x >= y[1]), subtract y[1] from x (p9) add Q=Q,mask[1] // and set corresponding bit in q (Q) br.ctop.dptk.few .L1 // repeated unless ar.lc-- == 0 ;; .L2: #ifndef UNSIGNED # ifdef MODULO (p6) sub R=r0,R // set sign of remainder according to x # else (p6) sub Q=r0,Q // set sign of quotient ;; (p7) sub Q=r0,Q # endif #endif .L3: mov ar.pfs=r2 // restore ar.pfs mov ar.lc=r3 // restore ar.lc mov pr=r18,0xffffffffffff0000 // restore p16-p63 br.ret.sptk.few rp END(NAME) |