Loading...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | /* * * Optmized version of the standard do_csum() function * * Return: a 64bit quantity containing the 16bit Internet checksum * * Inputs: * in0: address of buffer to checksum (char *) * in1: length of the buffer (int) * * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co * Stephane Eranian <eranian@hpl.hp.com> * * 02/04/22 Ken Chen <kenneth.w.chen@intel.com> * Data locality study on the checksum buffer. * More optimization cleanup - remove excessive stop bits. * 02/04/08 David Mosberger <davidm@hpl.hp.com> * More cleanup and tuning. * 01/04/18 Jun Nakajima <jun.nakajima@intel.com> * Clean up and optimize and the software pipeline, loading two * back-to-back 8-byte words per loop. Clean up the initialization * for the loop. Support the cases where load latency = 1 or 2. * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default). */ #include <asm/asmmacro.h> // // Theory of operations: // The goal is to go as quickly as possible to the point where // we can checksum 16 bytes/loop. Before reaching that point we must // take care of incorrect alignment of first byte. // // The code hereafter also takes care of the "tail" part of the buffer // before entering the core loop, if any. The checksum is a sum so it // allows us to commute operations. So we do the "head" and "tail" // first to finish at full speed in the body. Once we get the head and // tail values, we feed them into the pipeline, very handy initialization. // // Of course we deal with the special case where the whole buffer fits // into one 8 byte word. In this case we have only one entry in the pipeline. // // We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for // possible load latency and also to accommodate for head and tail. // // The end of the function deals with folding the checksum from 64bits // down to 16bits taking care of the carry. // // This version avoids synchronization in the core loop by also using a // pipeline for the accumulation of the checksum in resultx[] (x=1,2). // // wordx[] (x=1,2) // |---| // | | 0 : new value loaded in pipeline // |---| // | | - : in transit data // |---| // | | LOAD_LATENCY : current value to add to checksum // |---| // | | LOAD_LATENCY+1 : previous value added to checksum // |---| (previous iteration) // // resultx[] (x=1,2) // |---| // | | 0 : initial value // |---| // | | LOAD_LATENCY-1 : new checksum // |---| // | | LOAD_LATENCY : previous value of checksum // |---| // | | LOAD_LATENCY+1 : final checksum when out of the loop // |---| // // // See RFC1071 "Computing the Internet Checksum" for various techniques for // calculating the Internet checksum. // // NOT YET DONE: // - Maybe another algorithm which would take care of the folding at the // end in a different manner // - Work with people more knowledgeable than me on the network stack // to figure out if we could not split the function depending on the // type of packet or alignment we get. Like the ip_fast_csum() routine // where we know we have at least 20bytes worth of data to checksum. // - Do a better job of handling small packets. // - Note on prefetching: it was found that under various load, i.e. ftp read/write, // nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8% // on the data that buffer points to (partly because the checksum is often preceded by // a copy_from_user()). This finding indiate that lfetch will not be beneficial since // the data is already in the cache. // #define saved_pfs r11 #define hmask r16 #define tmask r17 #define first1 r18 #define firstval r19 #define firstoff r20 #define last r21 #define lastval r22 #define lastoff r23 #define saved_lc r24 #define saved_pr r25 #define tmp1 r26 #define tmp2 r27 #define tmp3 r28 #define carry1 r29 #define carry2 r30 #define first2 r31 #define buf in0 #define len in1 #define LOAD_LATENCY 2 // XXX fix me #if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2) # error "Only 1 or 2 is supported/tested for LOAD_LATENCY." #endif #define PIPE_DEPTH (LOAD_LATENCY+2) #define ELD p[LOAD_LATENCY] // end of load #define ELD_1 p[LOAD_LATENCY+1] // and next stage // unsigned long do_csum(unsigned char *buf,long len) GLOBAL_ENTRY(do_csum) .prologue .save ar.pfs, saved_pfs alloc saved_pfs=ar.pfs,2,16,0,16 .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2] .rotp p[PIPE_DEPTH], pC1[2], pC2[2] mov ret0=r0 // in case we have zero length cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len) ;; add tmp1=buf,len // last byte's address .save pr, saved_pr mov saved_pr=pr // preserve predicates (rotation) (p6) br.ret.spnt.many rp // return if zero or negative length mov hmask=-1 // initialize head mask tbit.nz p15,p0=buf,0 // is buf an odd address? and first1=-8,buf // 8-byte align down address of first1 element and firstoff=7,buf // how many bytes off for first1 element mov tmask=-1 // initialize tail mask ;; adds tmp2=-1,tmp1 // last-1 and lastoff=7,tmp1 // how many bytes off for last element ;; sub tmp1=8,lastoff // complement to lastoff and last=-8,tmp2 // address of word containing last byte ;; sub tmp3=last,first1 // tmp3=distance from first1 to last .save ar.lc, saved_lc mov saved_lc=ar.lc // save lc cmp.eq p8,p9=last,first1 // everything fits in one word ? ld8 firstval=[first1],8 // load, ahead of time, "first1" word and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0 shl tmp2=firstoff,3 // number of bits ;; (p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed shl tmp1=tmp1,3 // number of bits (p9) adds tmp3=-8,tmp3 // effectively loaded ;; (p8) mov lastval=r0 // we don't need lastval if first1==last shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[ shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff] ;; .body #define count tmp3 (p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only (p9) and word2[0]=lastval,tmask // mask last it as appropriate shr.u count=count,3 // how many 8-byte? ;; // If count is odd, finish this 8-byte word so that we can // load two back-to-back 8-byte words per loop thereafter. and word1[0]=firstval,hmask // and mask it as appropriate tbit.nz p10,p11=count,0 // if (count is odd) ;; (p8) mov result1[0]=word1[0] (p9) add result1[0]=word1[0],word2[0] ;; cmp.ltu p6,p0=result1[0],word1[0] // check the carry cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte ;; (p6) adds result1[0]=1,result1[0] (p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word) (p11) br.cond.dptk .do_csum16 // if (count is even) // Here count is odd. ld8 word1[1]=[first1],8 // load an 8-byte word cmp.eq p9,p10=1,count // if (count == 1) adds count=-1,count // loaded an 8-byte word ;; add result1[0]=result1[0],word1[1] ;; cmp.ltu p6,p0=result1[0],word1[1] ;; (p6) adds result1[0]=1,result1[0] (p9) br.cond.sptk .do_csum_exit // if (count == 1) exit // Fall through to calculate the checksum, feeding result1[0] as // the initial value in result1[0]. // // Calculate the checksum loading two 8-byte words per loop. // .do_csum16: add first2=8,first1 shr.u count=count,1 // we do 16 bytes per loop ;; adds count=-1,count mov carry1=r0 mov carry2=r0 brp.loop.imp 1f,2f ;; mov ar.ec=PIPE_DEPTH mov ar.lc=count // set lc mov pr.rot=1<<16 // result1[0] must be initialized in advance. mov result2[0]=r0 ;; .align 32 1: (ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] (pC1[1])adds carry1=1,carry1 (ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1] (pC2[1])adds carry2=1,carry2 (ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] (ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY] 2: (p[0]) ld8 word1[0]=[first1],16 (p[0]) ld8 word2[0]=[first2],16 br.ctop.sptk 1b ;; // Since len is a 32-bit value, carry cannot be larger than a 64-bit value. (pC1[1])adds carry1=1,carry1 // since we miss the last one (pC2[1])adds carry2=1,carry2 ;; add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1 add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2 ;; cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1 cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2 ;; (p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1] (p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1] ;; add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] ;; cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1] ;; (p6) adds result1[0]=1,result1[0] ;; .do_csum_exit: // // now fold 64 into 16 bits taking care of carry // that's not very good because it has lots of sequentiality // mov tmp3=0xffff zxt4 tmp1=result1[0] shr.u tmp2=result1[0],32 ;; add result1[0]=tmp1,tmp2 ;; and tmp1=result1[0],tmp3 shr.u tmp2=result1[0],16 ;; add result1[0]=tmp1,tmp2 ;; and tmp1=result1[0],tmp3 shr.u tmp2=result1[0],16 ;; add result1[0]=tmp1,tmp2 ;; and tmp1=result1[0],tmp3 shr.u tmp2=result1[0],16 ;; add ret0=tmp1,tmp2 mov pr=saved_pr,0xffffffffffff0000 ;; // if buf was odd then swap bytes mov ar.pfs=saved_pfs // restore ar.ec (p15) mux1 ret0=ret0,@rev // reverse word ;; mov ar.lc=saved_lc (p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes br.ret.sptk.many rp // I (Jun Nakajima) wrote an equivalent code (see below), but it was // not much better than the original. So keep the original there so that // someone else can challenge. // // shr.u word1[0]=result1[0],32 // zxt4 result1[0]=result1[0] // ;; // add result1[0]=result1[0],word1[0] // ;; // zxt2 result2[0]=result1[0] // extr.u word1[0]=result1[0],16,16 // shr.u carry1=result1[0],32 // ;; // add result2[0]=result2[0],word1[0] // ;; // add result2[0]=result2[0],carry1 // ;; // extr.u ret0=result2[0],16,16 // ;; // add ret0=ret0,result2[0] // ;; // zxt2 ret0=ret0 // mov ar.pfs=saved_pfs // restore ar.ec // mov pr=saved_pr,0xffffffffffff0000 // ;; // // if buf was odd then swap bytes // mov ar.lc=saved_lc //(p15) mux1 ret0=ret0,@rev // reverse word // ;; //(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes // br.ret.sptk.many rp END(do_csum) |