/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 */

#include <linux/linkage.h>
#include <asm/cache.h>

/*
 * The memset implementation below is optimized to use prefetchw and prealloc
 * instructions on CPUs with a 64B L1 data cache line (L1_CACHE_SHIFT == 6).
 * If you want to implement an optimized memset for other possible L1 data
 * cache line lengths (32B and 128B), you should rewrite the code carefully,
 * checking that no prefetchw/prealloc instruction is issued for L1 cache
 * lines which don't belong to the memset area.
 */

#if L1_CACHE_SHIFT == 6

.macro PREALLOC_INSTR	reg, off
	prealloc	[\reg, \off]
.endm

.macro PREFETCHW_INSTR	reg, off
	prefetchw	[\reg, \off]
.endm

#else

.macro PREALLOC_INSTR	reg, off
.endm

.macro PREFETCHW_INSTR	reg, off
.endm

#endif

ENTRY_CFI(memset)
	mov.f	0, r2
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0		; don't clobber ret val

	PREFETCHW_INSTR	r0, 0	; Prefetch the first write location

;;; if length < 8
	brls.d.nt	r2, 8, .Lsmallchunk
	mov.f	lp_count, r2

	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	stb.ab	r1, [r3, 1]
	sub	r2, r2, 1
.Laligndestination:

;;; Destination is aligned; replicate the fill byte across a 32-bit word
	and	r1, r1, 0xFF
	asl	r4, r1, 8
	or	r4, r4, r1
	asl	r5, r4, 16
	or	r5, r5, r4
	mov	r4, r5

	sub3	lp_count, r2, 8
	cmp	r2, 64
	bmsk.hi	r2, r2, 5
	mov.ls	lp_count, 0
	add3.hi	r2, r2, 8

;;; Convert len to Dwords, unfold x8
	lsr.f	lp_count, lp_count, 6

	lpnz	@.Lset64bytes
	;; LOOP START
	PREALLOC_INSTR	r3, 64	; alloc next line w/o fetching

#ifdef CONFIG_ARC_HAS_LL64
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
#else
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
#endif
.Lset64bytes:

	lsr.f	lp_count, r2, 5	; Last remaining max 124 bytes
	lpnz	.Lset32bytes
	;; LOOP START
#ifdef CONFIG_ARC_HAS_LL64
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
	std.ab	r4, [r3, 8]
#else
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
	st.ab	r4, [r3, 4]
#endif
.Lset32bytes:

	and.f	lp_count, r2, 0x1F	; Last remaining 31 bytes
.Lsmallchunk:
	lpnz	.Lcopy3bytes
	;; LOOP START
	stb.ab	r1, [r3, 1]
.Lcopy3bytes:

	j	[blink]

END_CFI(memset)

ENTRY_CFI(memzero)
	; adjust bzero args to memset args
	mov	r2, r1
	b.d	memset		; tail call, so no need to tinker with blink
	mov	r1, 0
END_CFI(memzero)
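
/*
 * For reference, the byte-replication step above (which builds the store
 * pattern in r4/r5 before the std.ab/st.ab loops) corresponds to the C
 * sketch below. This is an illustrative note only, not code built as part
 * of this file, and the helper name is hypothetical.
 *
 *	static unsigned long long replicate_fill_byte(unsigned char c)
 *	{
 *		unsigned int pat = c;		// and r1, r1, 0xFF
 *		pat |= pat << 8;		// fill byte in the low two bytes
 *		pat |= pat << 16;		// fill byte in all four bytes
 *		return pat | ((unsigned long long)pat << 32);	// 64-bit pattern stored by std.ab
 *	}
 */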