Loading...
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | /* * * Optimized version of the standard strlen() function * * * Inputs: * in0 address of string * * Outputs: * ret0 the number of characters in the string (0 if empty string) * does not count the \0 * * Copyright (C) 1999, 2001 Hewlett-Packard Co * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> * * 09/24/99 S.Eranian add speculation recovery code */ #include <asm/asmmacro.h> // // // This is an enhanced version of the basic strlen. it includes a combination // of compute zero index (czx), parallel comparisons, speculative loads and // loop unroll using rotating registers. // // General Ideas about the algorithm: // The goal is to look at the string in chunks of 8 bytes. // so we need to do a few extra checks at the beginning because the // string may not be 8-byte aligned. In this case we load the 8byte // quantity which includes the start of the string and mask the unused // bytes with 0xff to avoid confusing czx. // We use speculative loads and software pipelining to hide memory // latency and do read ahead safely. This way we defer any exception. // // Because we don't want the kernel to be relying on particular // settings of the DCR register, we provide recovery code in case // speculation fails. The recovery code is going to "redo" the work using // only normal loads. 
If we still get a fault then we generate a // kernel panic. Otherwise we return the strlen as usual. // // The fact that speculation may fail can be caused, for instance, by // the DCR.dm bit being set. In this case TLB misses are deferred, i.e., // a NaT bit will be set if the translation is not present. The normal // load, on the other hand, will cause the translation to be inserted // if the mapping exists. // // It should be noted that we execute recovery code only when we need // to use the data that has been speculatively loaded: we don't execute // recovery code on pure read ahead data. // // Remarks: // - the cmp r0,r0 is used as a fast way to initialize a predicate // register to 1. This is required to make sure that we get the parallel // compare correct. // // - we don't use the epilogue counter to exit the loop but we need to set // it to zero beforehand. // // - after the loop we must test for NaT values because neither the // czx nor the cmp instruction raises a NaT consumption fault. We must be // careful not to look too far for a NaT we don't care about. // For instance we don't need to look at a NaT in val2 if the zero byte // was in val1. // // - Clearly performance tuning is required. 
//
//
//
// Symbolic names for the registers used by strlen below:
//   saved_pfs  copy of ar.pfs taken by alloc; written back before returning
//   tmp        scratch: misalignment of in0 (bytes, then bits), later the
//              final length adjustment
//   base       8-byte-aligned address of the word containing the start of
//              the string; used by the recovery code
//   orig       original (possibly unaligned) string address from in0
//   saved_pr   copy of the predicate registers; written back before returning
//   src        8-byte-aligned load pointer, post-incremented by 8 per load
//   mask       bits OR-ed into the first loaded word (see the file header:
//              hides the bytes that precede the string from czx)
//   val        word being examined in the recovery loop
//   val1/val2  czx1.r results for the two words examined per iteration
//              (8 means "no zero byte in this word")
//
#define saved_pfs	r11
#define tmp		r10
#define base		r16
#define orig		r17
#define saved_pr	r18
#define src		r19
#define mask		r20
#define val		r21
#define val1		r22
#define val2		r23

//
// strlen: return the length of the \0-terminated string at in0.
//
//   in0   address of the string
//   ret0  number of characters, not counting the \0 (0 if empty string)
//
// Main path: two 8-byte words are examined per loop iteration, with
// speculative loads (ld8.s) used to read ahead. If a word whose value is
// actually needed turns out to be NaT, control goes to "recover", which
// rescans the string from its beginning using normal loads only.
//
GLOBAL_ENTRY(strlen)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,11,0,0,8	// rotating must be multiple of 8

	.rotr v[2], w[2]		// declares our 4 aliases

	extr.u tmp=in0,0,3		// tmp=least significant 3 bits
	mov orig=in0			// keep track of initial byte address
	dep src=0,in0,0,3		// src=8byte-aligned in0 address
	.save pr, saved_pr
	mov saved_pr=pr			// preserve predicates (rotation)
	;;

	.body

	ld8 v[1]=[src],8		// must not speculate: can fail here
	shl tmp=tmp,3			// multiply by 8bits/byte
	mov mask=-1			// our mask
	;;
	ld8.s w[1]=[src],8		// speculatively load next
	cmp.eq p6,p0=r0,r0		// sets p6 to true for cmp.and
	sub tmp=64,tmp			// how many bits to shift our mask on the right
	;;
	shr.u mask=mask,tmp		// zero enough bits to hold v[1] valuable part
	mov ar.ec=r0			// clear epilogue counter (saved in ar.pfs)
	;;
	add base=-16,src		// keep track of aligned base (src is 2 loads ahead)
	or v[1]=v[1],mask		// now we have a safe initial byte pattern
	;;
	//
	// Main loop: each pass checks two previously loaded words (v[1], w[1])
	// for a zero byte while speculatively loading the next two.
	//
1:
	ld8.s v[0]=[src],8		// speculatively load next
	czx1.r val1=v[1]		// search 0 byte from right
	czx1.r val2=w[1]		// search 0 byte from right following 8bytes
	;;
	ld8.s w[0]=[src],8		// speculatively load next to next
	cmp.eq.and p6,p0=8,val1		// p6 = p6 and val1==8
	cmp.eq.and p6,p0=8,val2		// p6 = p6 and val2==8
(p6)	br.wtop.dptk.few 1b		// loop until p6 == 0
	;;
	//
	// We must try the recovery code iff
	// val1_is_nat || (val1==8 && val2_is_nat)
	//
	// XXX Fixme
	//	- there must be a better way of doing the test
	//
	cmp.eq p8,p9=8,val1		// p8 = no zero byte in val1, p9 = zero byte in val1 (disambiguate)
	tnat.nz p6,p7=val1		// test NaT on val1
(p6)	br.cond.spnt.few recover	// jump to recovery if val1 is NaT
	;;
	//
	// if we come here p7 is true, i.e., initialized for the
	// parallel (.and) compares below
	//
	cmp.eq.and p7,p0=8,val1		// val1==8?
	tnat.nz.and p7,p0=val2		// test NaT on val2
(p7)	br.cond.spnt.few recover	// jump to recovery if val2 is NaT
	;;
(p8)	mov val1=val2			// the other test got us out of the loop
(p8)	adds src=-16,src		// correct position when 3 ahead
(p9)	adds src=-24,src		// correct position when 4 ahead
	;;
	sub ret0=src,orig		// distance from base
	sub tmp=8,val1			// which byte in word
	mov pr=saved_pr,0xffffffffffff0000
	;;
	sub ret0=ret0,tmp		// adjust
	mov ar.pfs=saved_pfs		// because of ar.ec, restore no matter what
	br.ret.sptk.few rp		// end of normal execution

	//
	// Outlined recovery code when speculation failed
	//
	// This time we don't use speculation and rely on the normal exception
	// mechanism. That's why the loop is not as good as the previous one:
	// read ahead is not possible.
	//
	// IMPORTANT:
	// Please note that in the case of strlen() as opposed to strlen_user()
	// we don't use the exception mechanism, as this function is not
	// supposed to fail. If that happens it means we have a bug and the
	// code will cause a kernel fault.
	//
	// XXX Fixme
	//	- today we restart from the beginning of the string instead
	//	  of trying to continue where we left off.
	//
recover:
	ld8 val=[base],8		// will fail if unrecoverable fault
	;;
	or val=val,mask			// remask first bytes (mask is unchanged since entry)
	cmp.eq p0,p6=r0,r0		// nullify first ld8 in loop
	;;
	//
	// ar.ec is still zero here
	//
2:
(p6)	ld8 val=[base],8		// will fail if unrecoverable fault
	;;
	czx1.r val1=val			// search 0 byte from right
	;;
	cmp.eq p6,p0=8,val1		// val1==8 ?
(p6)	br.wtop.dptk.few 2b		// loop until p6 == 0
	;;				// (avoid WAW on p63)
	sub ret0=base,orig		// distance from base
	sub tmp=8,val1
	mov pr=saved_pr,0xffffffffffff0000
	;;
	sub ret0=ret0,tmp		// length=now - back -1
	mov ar.pfs=saved_pfs		// because of ar.ec, restore no matter what
	br.ret.sptk.few rp		// end of successful recovery code
END(strlen)