/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Copy a2 bytes from a1 to a0, handling overlapping regions.
	 *
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Registers written: a1, a2, a4, a5, a6, a7, t0 - t6.
	 * No stack is used.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse Shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0		/* t3 = first byte of dest */
	add  t4, a0, a2		/* t4 = one past the last byte of dest */
	add  a4, a1, a2		/* a4 = one past the last byte of src */

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest(Register t3) rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low-bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  That case never reaches
	 * here, because we always byte copy if n is less than
	 * (2 * SZREG).
	 * Otherwise, dest is already naturally aligned to SZREG.
	 *
	 * With n >= (2 * SZREG), t5 < t6 always holds, so every bulk
	 * copy loop below has at least one SZREG-sized store to do.
	 * The loops rely on this: they store before testing for exit.
	 */
	beq  t5, t3, 1f
		addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a full misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	/* src below dest: copy from the end so overlap is not clobbered */
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal  t0, .Lbyte_copy_until_aligned_forward

	/*
	 * dest (t3) is now SZREG aligned and src (a1) is not, so the
	 * shift amount in a6 is non-zero.
	 */
	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 * 	load_val1 = load_ptr[1];
	 * 	store_ptr += 2;
	 * 	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val0 = load_ptr[2];
	 * 	load_ptr += 2;
	 * 	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi  t3, t3, (2 * SZREG)
	srl   t0, t0, a6
	sll   t2, t1, a7
	or    t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	/* Odd number of words: t3 has stepped one word past t6 */
	beq   t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi  a1, a1, (2 * SZREG)
	srl   t1, t1, a6
	sll   t2, t0, a7
	or    t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne   t3, t6, 1b
	2:
	mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

	add  a1, t3, a5 /* Restore the src pointer */
	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal  t0, .Lbyte_copy_until_aligned_reverse

	/*
	 * dest end (t4) is now SZREG aligned and src end (a4) is not,
	 * so the shift amount in a6 is non-zero.
	 */
	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
	sub  a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute The Inverse Shift
	 * a7 = XLEN - a6 = XLEN + -a6
	 * 2s complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not  a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 * 	load_val0 = load_ptr[-1];
	 * 	store_ptr -= 2;
	 * 	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 * 	if (store_ptr == {a2})
	 * 		break;
	 *
	 * 	load_val1 = load_ptr[-2];
	 * 	load_ptr -= 2;
	 * 	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi  t4, t4, (-2 * SZREG)
	sll   t1, t1, a7
	srl   t2, t0, a6
	or    t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	/* Odd number of words: t4 has stepped one word below t5 */
	beq   t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi  a4, a4, (-2 * SZREG)
	sll   t0, t0, a7
	srl   t2, t1, a6
	or    t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne   t4, t5, 1b
	2:
	mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

	add  a4, t4, a5 /* Restore the src pointer */
	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
.Lcoaligned_copy:
	/* src below dest: copy from the end so overlap is not clobbered */
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi  a1, a1, SZREG
	addi  t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne   t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi  a4, a4, -SZREG
	addi  t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne   t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment.
 * At which point, a bulk copy method can be used by the
 * calling code.  These work on the same registers as the bulk
 * copy loops.  Therefore, the register values can be picked
 * up from where they were left and we avoid code duplication
 * without any overhead except the call in and return jumps.
 *
 * They are entered with "jal t0, <label>" and return through t0,
 * so t0 must not be used as a temporary inside them.
 */
.Lbyte_copy_until_aligned_forward:
	beq  t3, t5, 2f
	1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of data to copy.
 * At that point, they will call to return from memmove.
 */
.Lbyte_copy:
	/* src below dest: copy from the end so overlap is not clobbered */
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq  t3, t4, 2f
	1:
	lb   t1,  0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
	2:
	ret

.Lbyte_copy_reverse:
	beq  t4, t3, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1,  0(t4)
	bne  t4, t3, 1b
	2:

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)