/*
 * arch/alpha/lib/ev6-copy_user.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Copy to/from user space, handling exceptions as we go..  This
 * isn't exactly pretty.
 *
 * This is essentially the same as "memcpy()", but with a few twists.
 * Notably, we have to make sure that $0 is always up-to-date and
 * contains the right "bytes left to copy" value (and that it is updated
 * only _after_ a successful copy). There is also some rather minor
 * exception setup stuff..
 *
 * NOTE! This is not directly C-callable, because the calling semantics are
 * different:
 *
 * Inputs:
 *	length in $0
 *	destination address in $6
 *	source address in $7
 *	return address in $28
 *
 *	$27 is also read: the ldgp at entry rebuilds $29 from it.
 *	NOTE(review): callers presumably have to enter with $27 holding the
 *	address of __copy_user -- confirm against the call sites.
 *
 * Outputs:
 *	bytes left to copy in $0
 *
 * Clobbers:
 *	$1,$2,$3,$4,$5,$6,$7
 *
 *	($29 is rewritten by the ldgp at entry.  $5 is listed above but is
 *	not referenced by any instruction in this file.)
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 */

/*
 * Allow an exception for an insn; exit if we get one.
 *
 * Both macros emit the wrapped instruction under the local label 99 and
 * add an entry for it to the __ex_table section.  They differ only in the
 * label named by that entry:
 *	EXI - names $exitin;  used below on the loads from the source
 *	EXO - names $exitout; used below on the stores to the destination
 * The instruction is passed as "x,y..." because its operand list contains
 * commas; "x,##y" pastes the pieces back together.
 */
#define EXI(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.gprel32 99b;			\
	lda $31, $exitin-99b($31);	\
	.previous

#define EXO(x,y...)			\
	99: x,##y;			\
	.section __ex_table,"a";	\
	.gprel32 99b;			\
	lda $31, $exitout-99b($31);	\
	.previous

/*
 * Flow through the labels below:
 *	entry		- zero length returns at once; 32 bytes or fewer go
 *			  straight to $onebyteloop
 *	$aligndest	- byte copies until the destination is 8-byte aligned
 *	$destaligned	- picks $misquad (source not 8-byte aligned) or
 *			  $quadaligned
 *	$misquad	- one quadword per trip, merged from two ldq_u loads
 *	$unroll4	- aligned copy, 32 bytes per trip
 *	$onequad	- aligned copy, 8 bytes per trip
 *	$onebyteloop	- byte-at-a-time loop for small copies and tails
 *	$exitout	- return; also the label named by every EXO entry
 *	$exitin		- label named by every EXI entry: zero the remaining
 *			  $0 bytes of the destination, then return
 *
 * The instruction order is hand-scheduled in groups of four ("quadpacks");
 * the slotting comments on the right describe each group.  Do not reorder.
 */

	.set noat
	.align 4
	.globl __copy_user
	.ent __copy_user
				# Pipeline info: Slotting & Comments
__copy_user:
	ldgp $29,0($27)		# we do exceptions -- we need the gp.
				# Macro instruction becomes ldah/lda
				# .. .. E  E
	.prologue 1
	subq $0, 32, $1		# .. E  .. ..	: Is this going to be a small copy?
	beq $0, $zerolength	# U  .. .. ..	: U L U L

	and $6,7,$3		# .. .. .. E	: is leading dest misalignment
	ble $1, $onebyteloop	# .. .. U  ..	: 1st branch : small amount of data
	beq $3, $destaligned	# .. U  .. ..	: 2nd (one cycle fetcher stall)
	subq $3, 8, $3		# E  .. .. ..	: L U U L : trip counter

/*
 * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U)
 * This loop aligns the destination a byte at a time
 * We know we have at least one trip through this loop
 *
 * $3 = (dest & 7) - 8 on entry: a negative count that is incremented once
 * per byte and ends the loop when it reaches zero.
 */
$aligndest:
	EXI( ldbu $1,0($7) )	# .. .. .. L	: Keep loads separate from stores
	addq $6,1,$6		# .. .. E  ..	: Section 3.8 in the CWG
	addq $3,1,$3		# .. E  .. ..	:
	nop			# E  .. .. ..	: U L U L

/*
 * the -1 is to compensate for the inc($6) done in a previous quadpack
 * which allows us zero dependencies within either quadpack in the loop
 */
	EXO( stb $1,-1($6) )	# .. .. .. L	:
	addq $7,1,$7		# .. .. E  ..	: Section 3.8 in the CWG
	subq $0,1,$0		# .. E  .. ..	:
	bne $3, $aligndest	# U  .. .. ..	: U L U L

/*
 * If we fell through into here, we have a minimum of 33 - 7 bytes
 * If we arrived via branch, we have a minimum of 32 bytes
 */
$destaligned:
	and $7,7,$1		# .. .. .. E	: Check _current_ source alignment
	bic $0,7,$4		# .. .. E  ..	: number bytes as a quadword loop
	EXI( ldq_u $3,0($7) )	# .. L  .. ..	: Forward fetch for fallthrough code
	beq $1,$quadaligned	# U  .. .. ..	: U L U L

/*
 * In the worst case, we've just executed an ldq_u here from 0($7)
 * and we'll repeat it once if we take the branch
 */

/*
 * Misaligned quadword loop - not unrolled.  Leave it that way.
 *
 * $3 holds the source quadword fetched on the previous trip (or by the
 * forward fetch above), $2 the one fetched on this trip.  extql/extqh take
 * the wanted bytes out of each and bis merges them into $1 for the store;
 * "bis $2,$2,$3" then carries $2 over to the next trip.
 */
$misquad:
	EXI( ldq_u $2,8($7) )	# .. .. .. L	:
	subq $4,8,$4		# .. .. E  ..	:
	extql $3,$7,$3		# .. U  .. ..	:
	extqh $2,$7,$1		# U  .. .. ..	: U U L L

	bis $3,$1,$1		# .. .. .. E	:
	EXO( stq $1,0($6) )	# .. .. L  ..	:
	addq $7,8,$7		# .. E  .. ..	:
	subq $0,8,$0		# E  .. .. ..	: U L L U

	addq $6,8,$6		# .. .. .. E	:
	bis $2,$2,$3		# .. .. E  ..	:
	nop			# .. E  .. ..	:
	bne $4,$misquad		# U  .. .. ..	: U L U L

	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	beq $0,$zerolength	# U  .. .. ..	: U L U L

/* We know we have at least one trip through the byte loop */
	EXI ( ldbu $2,0($7) )	# .. .. .. L	: No loads in the same quad
	addq $6,1,$6		# .. .. E  ..	: as the store (Section 3.8 in CWG)
	nop			# .. E  .. ..	:
	br $31, $dirtyentry	# L0 .. .. ..	: L U U L
/* Do the trailing byte loop load, then hop into the store part of the loop */

/*
 * A minimum of (33 - 7) bytes to do a quad at a time.
 * Based upon the usage context, it's worth the effort to unroll this loop
 * $0 - number of bytes to be moved
 * $4 - number of bytes to move as quadwords
 * $6 is current destination address
 * $7 is current source address
 */
$quadaligned:
	subq $4, 32, $2		# .. .. .. E	: do not unroll for small stuff
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	blt $2, $onequad	# U  .. .. ..	: U L U L

/*
 * There is a significant assumption here that the source and destination
 * addresses differ by more than 32 bytes.  In this particular case, a
 * sparsity of registers further bounds this to be a minimum of 8 bytes.
 * But if this isn't met, then the output result will be incorrect.
 * Furthermore, due to a lack of available registers, we really can't
 * unroll this to be an 8x loop (which would enable us to use the wh64
 * instruction memory hint instruction).
 */
$unroll4:
	EXI( ldq $1,0($7) )	# .. .. .. L
	EXI( ldq $2,8($7) )	# .. .. L  ..
	subq $4,32,$4		# .. E  .. ..
	nop			# E  .. .. ..	: U U L L

	addq $7,16,$7		# .. .. .. E
	EXO( stq $1,0($6) )	# .. .. L  ..
	EXO( stq $2,8($6) )	# .. L  .. ..
	subq $0,16,$0		# E  .. .. ..	: U L L U

	addq $6,16,$6		# .. .. .. E
	EXI( ldq $1,0($7) )	# .. .. L  ..
	EXI( ldq $2,8($7) )	# .. L  .. ..
	subq $4, 32, $3		# E  .. .. ..	: U U L L : is there enough for another trip?

	EXO( stq $1,0($6) )	# .. .. .. L
	EXO( stq $2,8($6) )	# .. .. L  ..
	subq $0,16,$0		# .. E  .. ..
	addq $7,16,$7		# E  .. .. ..	: U L L U

	nop			# .. .. .. E
	nop			# .. .. E  ..
	addq $6,16,$6		# .. E  .. ..
	bgt $3,$unroll4		# U  .. .. ..	: U L U L

	nop
	nop
	nop
	beq $4, $noquads

/* Aligned copy, one quadword per trip, until $4 reaches zero. */
$onequad:
	EXI( ldq $1,0($7) )
	subq $4,8,$4
	addq $7,8,$7
	nop

	EXO( stq $1,0($6) )
	subq $0,8,$0
	addq $6,8,$6
	bne $4,$onequad

$noquads:
	nop
	nop
	nop
	beq $0,$zerolength

/*
 * For small copies (or the tail of a larger copy), do a very simple byte loop.
 * There's no point in doing a lot of complex alignment calculations to try to
 * do quadword stuff for a small amount of data.
 *	$0 - remaining number of bytes left to copy
 *	$6 - current dest addr
 *	$7 - current source addr
 */
$onebyteloop:
	EXI ( ldbu $2,0($7) )	# .. .. .. L	: No loads in the same quad
	addq $6,1,$6		# .. .. E  ..	: as the store (Section 3.8 in CWG)
	nop			# .. E  .. ..	:
	nop			# E  .. .. ..	: U L U L

$dirtyentry:
/*
 * the -1 is to compensate for the inc($6) done in a previous quadpack
 * which allows us zero dependencies within either quadpack in the loop
 */
	EXO ( stb $2,-1($6) )	# .. .. .. L	:
	addq $7,1,$7		# .. .. E  ..	: advance source _after_ the store
	subq $0,1,$0		# .. E  .. ..	: change count _after_ copy
	bgt $0,$onebyteloop	# U  .. .. ..	: U L U L

$zerolength:
$exitout:			# Label named by every EXO exception-table entry
	nop			# .. .. .. E
	nop			# .. .. E  ..
	nop			# .. E  .. ..
	ret $31,($28),1		# L0 .. .. ..	: L U L U

$exitin:

	/* A stupid byte-by-byte zeroing of the rest of the output
	   buffer.  This cures security holes by never leaving
	   random kernel data around to be copied elsewhere.

	   The count is copied to $1 so that $0 still holds the
	   "bytes left to copy" value on return.  */

	nop
	nop
	nop
	mov $0,$1

$101:
	EXO ( stb $31,0($6) )	# L
	subq $1,1,$1		# E
	addq $6,1,$6		# E
	bgt $1,$101		# U

	nop
	nop
	nop
	ret $31,($28),1		# L0

	.end __copy_user