/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Move a buffer from src to dst (alignment handled by the hardware).
 * If dest < src, or if the buffers do not overlap, call memcpy;
 * otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

ENTRY(memmove)
	cmp	dstin, src
	b.lo	memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap. */

	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15	/* Probably unaligned accesses. */

	ands	tmp2, src, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading unaligned bytes first so that src becomes
	 * aligned. The cost of these extra instructions is acceptable,
	 * and the subsequent accesses then use aligned addresses.
	 */
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then jump
	 * to the tail.
	 */
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	 * Interleave the load of the next 64-byte data block with the
	 * store of the 64 bytes loaded on the previous iteration.
	 */
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPROC(memmove)
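
/*
 * Illustrative sketch only (not part of the original source): the checks
 * at ENTRY(memmove) above correspond to the following C-level dispatch.
 * memcpy here is the kernel's forward copy; the rest of this file is the
 * backward copy used when dest overlaps the tail of src.
 *
 *	if (dest < src || dest >= src + n)
 *		return memcpy(dest, src, n);
 *	else
 *		copy n bytes backwards, from src + n down to src,
 *		and return dest;
 */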