Loading...
/* SPDX-License-Identifier: GPL-2.0-or-later WITH GCC-exception-2.0 */
#include <linux/linkage.h>
#include <asm/asmmacro.h>
#include <asm/core.h>

/*
 * XCHAL_NO_MUL is 1 only when the core configuration offers none of the
 * MUL16, MUL32 or MAC16 options, i.e. when every partial product has to
 * be computed by the software helper at the end of this file.
 */
#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 || XCHAL_HAVE_MAC16
#define XCHAL_NO_MUL 0
#else
#define XCHAL_NO_MUL 1
#endif

/*
 * __umulsidi3: unsigned 32 x 32 -> 64-bit multiply.
 *
 * In:      a2, a3  the two 32-bit operands
 * Out:     wh:wl   the 64-bit product; wh/wl are a2/a3 on big-endian
 *                  configurations and a3/a2 otherwise (see below)
 * Scratch: a6, a9, a11, plus a4/a5 on the MUL16/MUL32 path
 *
 * Without MUL32_HIGH the product is assembled from four 16 x 16 partial
 * products:
 *
 *	pp0 = lo(a2) * lo(a3)	pp1 = lo(a2) * hi(a3)
 *	pp2 = hi(a2) * lo(a3)	pp3 = hi(a2) * hi(a3)
 *
 *	product = (pp3 << 32) + ((pp1 + pp2) << 16) + pp0
 */
ENTRY(__umulsidi3)

#ifdef __XTENSA_CALL0_ABI__
	/*
	 * 32-byte frame: a12..a15 are stored at offsets 16..28 and
	 * reloaded before returning; offset 0 receives a0 when the
	 * software helper is used.
	 */
	abi_entry(32)
	s32i	a12, sp, 16
	s32i	a13, sp, 20
	s32i	a14, sp, 24
	s32i	a15, sp, 28
#elif XCHAL_NO_MUL
	/*
	 * Not a true leaf in this configuration: reserve enough stack
	 * for the CALL12s made to the helper function.
	 */
	abi_entry(32)
#else
	abi_entry_default
#endif

/* Register holding the high (wh) and low (wl) word of the result. */
#ifdef __XTENSA_EB__
#define wh a2
#define wl a3
#else
#define wh a3
#define wl a2
#endif /* __XTENSA_EB__ */

	/*
	 * The multiplication sequence below comes from the mulsf3
	 * routine in ieee754-sf.S, which carries further commentary.
	 */

#if XCHAL_HAVE_MUL32_HIGH

	/*
	 * The hardware yields both halves directly.  The low word goes
	 * through a6 because wl is one of the input registers, which
	 * muluh still has to read.
	 */
	mull	a6, a2, a3
	muluh	wh, a2, a3
	mov	wl, a6

#else /* ! MUL32_HIGH */

#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL
	/*
	 * Calling the multiply helper clobbers a0 and a8.  a8 holds
	 * nothing of interest here, so only a0 is preserved.
	 */
	s32i	a0, sp, 0
#endif

#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32

#define a2h a4
#define a3h a5

	/* Copy the upper 16 bits of each input into a4 / a5. */
	srli	a2h, a2, 16
	srli	a3h, a3, 16

#define a2l a2
#define a3l a3

#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
	/*
	 * MULL consumes all 32 bits, so mask the inputs down to their
	 * low halves.  MUL16 ignores the high bits and needs no mask.
	 */
	extui	a2, a2, 0, 16
	extui	a3, a3, 0, 16
#endif
#endif /* MUL16 || MUL32 */

/*
 * do_mul(dst, xreg, xhalf, yreg, yhalf)
 *
 * dst = (half "xhalf" of xreg) * (half "yhalf" of yreg), where each
 * half selector is l (low 16 bits) or h (high 16 bits).  One variant
 * is provided per multiply option.
 */
#if XCHAL_HAVE_MUL16

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mul16u	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MUL32

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	mull	dst, xreg ## xhalf, yreg ## yhalf

#elif XCHAL_HAVE_MAC16

/*
 * Pasting tokens after a period makes the preprocessor insert a space,
 * so do_mul builds an underscore-separated name and these defines map
 * it back to the real mnemonic.
 */
#define umul_aa_ll umul.aa.ll
#define umul_aa_lh umul.aa.lh
#define umul_aa_hl umul.aa.hl
#define umul_aa_hh umul.aa.hh

#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
	rsr	dst, ACCLO

#else /* no multiply hardware */

/* Extract the low (l) or high (h) 16 bits of src into dst. */
#define set_arg_l(dst, src) \
	extui	dst, src, 0, 16
#define set_arg_h(dst, src) \
	srli	dst, src, 16

#ifdef __XTENSA_CALL0_ABI__
/* Helper's custom CALL0 convention: args in a13/a14, result in a12. */
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a13, xreg); \
	set_arg_ ## yhalf (a14, yreg); \
	call0	.Lmul_mulsi3; \
	mov	dst, a12
#else
/* CALL12: args are placed in a14/a15 and the result is read from a14. */
#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
	set_arg_ ## xhalf (a14, xreg); \
	set_arg_ ## yhalf (a15, yreg); \
	call12	.Lmul_mulsi3; \
	mov	dst, a14
#endif /* __XTENSA_CALL0_ABI__ */

#endif /* no multiply hardware */

	/* a6 = pp1 + pp2 (mod 2^32); a9 = carry out of that addition. */
	do_mul(a6, a2, l, a3, h)	/* pp1 = lo(a2) * hi(a3) */
	do_mul(a11, a2, h, a3, l)	/* pp2 = hi(a2) * lo(a3) */
	movi	a9, 0
	add	a6, a6, a11
	bgeu	a6, a11, 1f		/* sum >= addend: no wrap-around */
	addi	a9, a9, 1
1:
	/*
	 * Move the upper half of the a9:a6 pair into place in a9.  The
	 * value obtained can still be incremented without a carry-out.
	 */
	ssai	16
	src	a9, a9, a6

	/* Low result word: a6 = (a6 << 16) + pp0, carry added to a9. */
	do_mul(a11, a2, l, a3, l)	/* pp0 = lo(a2) * lo(a3) */
	sll	a6, a6
	add	a6, a6, a11
	bgeu	a6, a11, 1f		/* sum >= addend: no wrap-around */
	addi	a9, a9, 1
1:
	/* High result word: wh = pp3 + a9. */
	do_mul(wh, a2, h, a3, h)	/* pp3 = hi(a2) * hi(a3) */
	add	wh, wh, a9
	mov	wl, a6

#endif /* !MUL32_HIGH */

#if defined(__XTENSA_CALL0_ABI__) && XCHAL_NO_MUL
	/* Reload the return address saved before the helper calls. */
	l32i	a0, sp, 0
#endif
#ifdef __XTENSA_CALL0_ABI__
	l32i	a12, sp, 16
	l32i	a13, sp, 20
	l32i	a14, sp, 24
	l32i	a15, sp, 28
	abi_ret(32)
#else
	abi_ret_default
#endif

#if XCHAL_NO_MUL

	/*
	 * do_addxN dst, scaled, addend, tmp
	 *
	 * dst = (scaled << log2(N)) + addend.  Uses ADDXn when the ADDX
	 * option is configured; otherwise shifts into tmp and adds.  The
	 * call sites below pass the same register for dst and tmp.
	 */
	.macro	do_addx2 dst, scaled, addend, tmp
#if XCHAL_HAVE_ADDX
	addx2	\dst, \scaled, \addend
#else
	slli	\tmp, \scaled, 1
	add	\dst, \tmp, \addend
#endif
	.endm

	.macro	do_addx4 dst, scaled, addend, tmp
#if XCHAL_HAVE_ADDX
	addx4	\dst, \scaled, \addend
#else
	slli	\tmp, \scaled, 2
	add	\dst, \tmp, \addend
#endif
	.endm

	.macro	do_addx8 dst, scaled, addend, tmp
#if XCHAL_HAVE_ADDX
	addx8	\dst, \scaled, \addend
#else
	slli	\tmp, \scaled, 3
	add	\dst, \tmp, \addend
#endif
	.endm

	/*
	 * Software multiply for Xtensa cores without multiply hardware:
	 * a simplified version of _mulsi3, used by do_mul above on the
	 * 16-bit chunks it extracts.
	 *
	 * With CALL0 a private convention applies: inputs arrive in a13
	 * and a14, the result is returned in a12, and a8 and a15 are
	 * clobbered.  The input registers themselves are shifted by the
	 * loop and therefore do not survive the call either.
	 *
	 * Otherwise the operands are taken from a2/a3 and the result is
	 * left in a2.
	 */
	.align	4
.Lmul_mulsi3:
	abi_entry_default

	/*
	 * mul_mulsi3_body dst, src1, src2, tmp1, tmp2
	 *
	 * Shift-and-add multiply, dst = src1 * src2.  Every pass looks
	 * at the low four bits of src1 and conditionally adds src2 times
	 * 1, 2, 4 and 8 to dst, then shifts src1 right and src2 left by
	 * four.  The loop ends once src1 has become zero.  src1, src2,
	 * tmp1 and tmp2 are all overwritten.
	 */
	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
	movi	\dst, 0
1:
	add	\tmp1, \src2, \dst
	extui	\tmp2, \src1, 0, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx2 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 1, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx4 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 2, 1
	movnez	\dst, \tmp1, \tmp2

	do_addx8 \tmp1, \src2, \dst, \tmp1
	extui	\tmp2, \src1, 3, 1
	movnez	\dst, \tmp1, \tmp2

	srli	\src1, \src1, 4
	slli	\src2, \src2, 4
	bnez	\src1, 1b
	.endm

#ifdef __XTENSA_CALL0_ABI__
	mul_mulsi3_body a12, a13, a14, a15, a8
#else
	/* a2 doubles as the result register, so move that operand to a4. */
	mov	a4, a2
	mul_mulsi3_body a2, a4, a3, a5, a6
#endif
	abi_ret_default
#endif /* XCHAL_NO_MUL */

ENDPROC(__umulsidi3)