/* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy.
 *
 * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
 * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	g7
#else
#define GLOBAL_SPARE	g5
#define ASI_BLK_P	0xf0
#define FPRS_FEF	0x04
#ifdef MEMCPY_DEBUG
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		 clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)	\
	faligndata	%f1, %f2, %f48;			\
	faligndata	%f2, %f3, %f50;			\
	faligndata	%f3, %f4, %f52;			\
	faligndata	%f4, %f5, %f54;			\
	faligndata	%f5, %f6, %f56;			\
	faligndata	%f6, %f7, %f58;			\
	faligndata	%f7, %f8, %f60;			\
	faligndata	%f8, %f9, %f62;

#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt)	\
	EX_LD(LOAD_BLK(%src, %fdest));				\
	EX_ST(STORE_BLK(%fsrc, %dest));				\
	add	%src, 0x40, %src;				\
	subcc	%len, 0x40, %len;				\
	be,pn	%xcc, jmptgt;					\
	 add	%dest, 0x40, %dest;				\

#define LOOP_CHUNK1(src, dest, len, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f0,  f48, len, branch_dest)
#define LOOP_CHUNK2(src, dest, len, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
#define LOOP_CHUNK3(src, dest, len, branch_dest)		\
	MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)

#define DO_SYNC	membar	#Sync;
#define STORE_SYNC(dest, fsrc)			\
	EX_ST(STORE_BLK(%fsrc, %dest));		\
	add	%dest, 0x40, %dest;		\
	DO_SYNC

#define STORE_JUMP(dest, fsrc, target)		\
	EX_ST(STORE_BLK(%fsrc, %dest));		\
	add	%dest, 0x40, %dest;		\
	ba,pt	%xcc, target;			\
	 nop;

#define FINISH_VISCHUNK(dest, f0, f1, left)	\
	subcc	%left, 8, %left;		\
	bl,pn	%xcc, 95f;			\
	 faligndata	%f0, %f1, %f48;		\
	EX_ST(STORE(std, %f48, %dest));		\
	add	%dest, 8, %dest;

#define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
	subcc	%left, 8, %left;			\
	bl,pn	%xcc, 95f;				\
	 fsrc2	%f0, %f1;

#define UNEVEN_VISCHUNK(dest, f0, f1, left)		\
	UNEVEN_VISCHUNK_LAST(dest, f0, f1, left)	\
	ba,a,pt	%xcc, 93f;

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align	64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx	%o2, 31, %g2
	cmp	%g2, 0
	tne	%xcc, 5
	PREAMBLE
	mov	%o0, %o4
	cmp	%o2, 0
	be,pn	%XCC, 85f
	 or	%o0, %o1, %o3
	cmp	%o2, 16
	blu,a,pn %XCC, 80f
	 or	%o3, %o2, %o3

	cmp	%o2, (5 * 64)
	blu,pt	%XCC, 70f
	 andcc	%o3, 0x7, %g0

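	/* At this point the short cases are out of the way: a zero
	 * length has already returned via 85f, anything under 16 bytes
	 * goes to 80f, and anything under five 64-byte blocks takes the
	 * simpler copy at 70f (the delay slot above checks whether src
	 * and dst are both 8-byte aligned on behalf of that path).
	 * Only the large copies fall through into the VIS block-move
	 * code below.
	 */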
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  */
	VISEntry

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc	%o0, 0x3f, %g2
	be,pt	%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	 sub	%o0, %o1, %GLOBAL_SPARE
	sub	%g2, 0x40, %g2
	sub	%g0, %g2, %g2
	sub	%o2, %g2, %o2
	andcc	%g2, 0x7, %g1
	be,pt	%icc, 2f
	 and	%g2, 0x38, %g2

1:	subcc	%g1, 0x1, %g1
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
	EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
	bgu,pt	%XCC, 1b
	 add	%o1, 0x1, %o1

	add	%o1, %GLOBAL_SPARE, %o0

2:	cmp	%g2, 0x0
	and	%o1, 0x7, %g1
	be,pt	%icc, 3f
	 alignaddr %o1, %g0, %o1

	EX_LD(LOAD(ldd, %o1, %f4))
1:	EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
	add	%o1, 0x8, %o1
	subcc	%g2, 0x8, %g2
	faligndata %f4, %f6, %f0
	EX_ST(STORE(std, %f0, %o0))
	be,pn	%icc, 3f
	 add	%o0, 0x8, %o0

	EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
	add	%o1, 0x8, %o1
	subcc	%g2, 0x8, %g2
	faligndata %f6, %f4, %f0
	EX_ST(STORE(std, %f0, %o0))
	bne,pt	%icc, 1b
	 add	%o0, 0x8, %o0

	/* Destination is 64-byte aligned. */
3:
	membar	#LoadStore | #StoreStore | #StoreLoad

	subcc	%o2, 0x40, %GLOBAL_SPARE
	add	%o1, %g1, %g1
	andncc	%GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
	srl	%g1, 3, %g2
	sub	%o2, %GLOBAL_SPARE, %g3
	andn	%o1, (0x40 - 1), %o1
	and	%g2, 7, %g2
	andncc	%g3, 0x7, %g3
	fsrc2	%f0, %f2
	sub	%g3, 0x8, %g3
	sub	%o2, %GLOBAL_SPARE, %o2

	add	%g1, %GLOBAL_SPARE, %g1
	subcc	%o2, %g3, %o2

	EX_LD(LOAD_BLK(%o1, %f0))
	add	%o1, 0x40, %o1
	add	%g1, %g3, %g1
	EX_LD(LOAD_BLK(%o1, %f16))
	add	%o1, 0x40, %o1
	sub	%GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
	EX_LD(LOAD_BLK(%o1, %f32))
	add	%o1, 0x40, %o1

	/* There are 8 instances of the unrolled loop,
	 * one for each possible alignment of the
	 * source buffer.  Each loop instance is 452
	 * bytes.
	 */
	sll	%g2, 3, %o3
	sub	%o3, %g2, %o3
	sllx	%o3, 4, %o3
	add	%o3, %g2, %o3
	sllx	%o3, 2, %g2
1:	rd	%pc, %o3
	add	%o3, %lo(1f - 1b), %o3
	jmpl	%o3 + %g2, %g0
	 nop

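	/* %g2 is ((src >> 3) & 7), i.e. which 8-byte word of its 64-byte
	 * block the source sits in.  The shift/add sequence above scales
	 * it by the instance size, ((((g2 << 3) - g2) << 4) + g2) << 2
	 * == g2 * 452, and jumps into the matching instance.
	 *
	 * Each instance streams three 64-byte blocks per iteration,
	 * rotating the %f0, %f16 and %f32 register banks so one block is
	 * realigned (FREG_FROB) and stored while the next is loaded.
	 * When a LOOP_CHUNK decrements the block count to zero it
	 * branches to the matching 1f/2f/3f tail, which stores the two
	 * blocks still held in registers and jumps to the 40f-63f finish
	 * code.  The 'ba 1b+4' re-enters the instance past its first
	 * faligndata, which was already executed in the delay slot.
	 */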
	.align	64
1:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt	%xcc, 1b+4
	 faligndata %f0, %f2, %f48
1:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_SYNC(o0, f48)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_JUMP(o0, f48, 40f)
2:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
	STORE_SYNC(o0, f48)
	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_JUMP(o0, f48, 48f)
3:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
	STORE_SYNC(o0, f48)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
	STORE_JUMP(o0, f48, 56f)

1:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt	%xcc, 1b+4
	 faligndata %f2, %f4, %f48
1:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_SYNC(o0, f48)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_JUMP(o0, f48, 41f)
2:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
	STORE_SYNC(o0, f48)
	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_JUMP(o0, f48, 49f)
3:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
	STORE_SYNC(o0, f48)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
	STORE_JUMP(o0, f48, 57f)

1:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt	%xcc, 1b+4
	 faligndata %f4, %f6, %f48
1:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_SYNC(o0, f48)
	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_JUMP(o0, f48, 42f)
2:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
	STORE_SYNC(o0, f48)
	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_JUMP(o0, f48, 50f)
3:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
	STORE_SYNC(o0, f48)
	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
	STORE_JUMP(o0, f48, 58f)

1:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt	%xcc, 1b+4
	 faligndata %f6, %f8, %f48
1:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_SYNC(o0, f48)
	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_JUMP(o0, f48, 43f)
2:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
	STORE_SYNC(o0, f48)
	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_JUMP(o0, f48, 51f)
3:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
	STORE_SYNC(o0, f48)
	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
	STORE_JUMP(o0, f48, 59f)

1:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt	%xcc, 1b+4
	 faligndata %f8, %f10, %f48
1:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_SYNC(o0, f48)
	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_JUMP(o0, f48, 44f)
2:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
	STORE_SYNC(o0, f48)
	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_JUMP(o0, f48, 52f)
3:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
	STORE_SYNC(o0, f48)
	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
	STORE_JUMP(o0, f48, 60f)

1:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt	%xcc, 1b+4
	 faligndata %f10, %f12, %f48
1:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_SYNC(o0, f48)
	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_JUMP(o0, f48, 45f)
2:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
	STORE_SYNC(o0, f48)
	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_JUMP(o0, f48, 53f)
3:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
	STORE_SYNC(o0, f48)
	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
	STORE_JUMP(o0, f48, 61f)

1:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt	%xcc, 1b+4
	 faligndata %f12, %f14, %f48
1:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_SYNC(o0, f48)
	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_JUMP(o0, f48, 46f)
2:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
	STORE_SYNC(o0, f48)
	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_JUMP(o0, f48, 54f)
3:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
	STORE_SYNC(o0, f48)
	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
	STORE_JUMP(o0, f48, 62f)

1:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
	ba,pt	%xcc, 1b+4
	 faligndata %f14, %f16, %f48
1:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_SYNC(o0, f48)
	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_JUMP(o0, f48, 47f)
2:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
	STORE_SYNC(o0, f48)
	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_JUMP(o0, f48, 55f)
3:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
	STORE_SYNC(o0, f48)
	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
	STORE_JUMP(o0, f48, 63f)

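	/* The labels below are the STORE_JUMP targets of the unrolled
	 * loops above, reached once no full 64-byte block remains; %g3
	 * counts down what is left of the 8-byte-aligned tail.  Each
	 * FINISH_VISCHUNK realigns and stores one more double from the
	 * data already in the FPU registers and falls through to the
	 * next; the UNEVEN_VISCHUNK variants branch to 93f to resume
	 * reading doublewords from memory, and 95f copies any sub-8-byte
	 * remainder one byte at a time.
	 */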
40:	FINISH_VISCHUNK(o0, f0, f2, g3)
41:	FINISH_VISCHUNK(o0, f2, f4, g3)
42:	FINISH_VISCHUNK(o0, f4, f6, g3)
43:	FINISH_VISCHUNK(o0, f6, f8, g3)
44:	FINISH_VISCHUNK(o0, f8, f10, g3)
45:	FINISH_VISCHUNK(o0, f10, f12, g3)
46:	FINISH_VISCHUNK(o0, f12, f14, g3)
47:	UNEVEN_VISCHUNK(o0, f14, f0, g3)
48:	FINISH_VISCHUNK(o0, f16, f18, g3)
49:	FINISH_VISCHUNK(o0, f18, f20, g3)
50:	FINISH_VISCHUNK(o0, f20, f22, g3)
51:	FINISH_VISCHUNK(o0, f22, f24, g3)
52:	FINISH_VISCHUNK(o0, f24, f26, g3)
53:	FINISH_VISCHUNK(o0, f26, f28, g3)
54:	FINISH_VISCHUNK(o0, f28, f30, g3)
55:	UNEVEN_VISCHUNK(o0, f30, f0, g3)
56:	FINISH_VISCHUNK(o0, f32, f34, g3)
57:	FINISH_VISCHUNK(o0, f34, f36, g3)
58:	FINISH_VISCHUNK(o0, f36, f38, g3)
59:	FINISH_VISCHUNK(o0, f38, f40, g3)
60:	FINISH_VISCHUNK(o0, f40, f42, g3)
61:	FINISH_VISCHUNK(o0, f42, f44, g3)
62:	FINISH_VISCHUNK(o0, f44, f46, g3)
63:	UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3)

93:	EX_LD(LOAD(ldd, %o1, %f2))
	add	%o1, 8, %o1
	subcc	%g3, 8, %g3
	faligndata %f0, %f2, %f8
	EX_ST(STORE(std, %f8, %o0))
	bl,pn	%xcc, 95f
	 add	%o0, 8, %o0
	EX_LD(LOAD(ldd, %o1, %f0))
	add	%o1, 8, %o1
	subcc	%g3, 8, %g3
	faligndata %f2, %f0, %f8
	EX_ST(STORE(std, %f8, %o0))
	bge,pt	%xcc, 93b
	 add	%o0, 8, %o0

95:	brz,pt	%o2, 2f
	 mov	%g1, %o1

1:	EX_LD(LOAD(ldub, %o1, %o3))
	add	%o1, 1, %o1
	subcc	%o2, 1, %o2
	EX_ST(STORE(stb, %o3, %o0))
	bne,pt	%xcc, 1b
	 add	%o0, 1, %o0

2:	membar	#StoreLoad | #StoreStore
	VISExit
	retl
	 mov	EX_RETVAL(%o4), %o0

	.align	64
70:	/* 16 <= len < (5 * 64) */
	bne,pn	%XCC, 75f
	 sub	%o0, %o1, %o3

72:	andn	%o2, 0xf, %GLOBAL_SPARE
	and	%o2, 0xf, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
	subcc	%GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add	%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3))
	bgu,pt	%XCC, 1b
	 add	%o1, 0x8, %o1
73:	andcc	%o2, 0x8, %g0
	be,pt	%XCC, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5))
	sub	%o2, 0x8, %o2
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add	%o1, 0x8, %o1
1:	andcc	%o2, 0x4, %g0
	be,pt	%XCC, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5))
	sub	%o2, 0x4, %o2
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add	%o1, 0x4, %o1
1:	cmp	%o2, 0
	be,pt	%XCC, 85f
	 nop
	ba,pt	%xcc, 90f
	 nop

75:	andcc	%o0, 0x7, %g1
	sub	%g1, 0x8, %g1
	be,pn	%icc, 2f
	 sub	%g0, %g1, %g1
	sub	%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1, %o5))
	subcc	%g1, 1, %g1
	EX_ST(STORE(stb, %o5, %o1 + %o3))
	bgu,pt	%icc, 1b
	 add	%o1, 1, %o1

2:	add	%o1, %o3, %o0
	andcc	%o1, 0x7, %g1
	bne,pt	%icc, 8f
	 sll	%g1, 3, %g1

	cmp	%o2, 16
	bgeu,pt	%icc, 72b
	 nop
	ba,a,pt	%xcc, 73b

8:	mov	64, %o3
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub	%o3, %g1, %o3
	andn	%o2, 0x7, %GLOBAL_SPARE
	sllx	%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
	subcc	%GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
	add	%o1, 0x8, %o1
	srlx	%g3, %o3, %o5
	or	%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add	%o0, 0x8, %o0
	bgu,pt	%icc, 1b
	 sllx	%g3, %g1, %g2

	srl	%g1, 3, %g1
	andcc	%o2, 0x7, %o2
	be,pn	%icc, 85f
	 add	%o1, %g1, %o1
	ba,pt	%xcc, 90f
	 sub	%o0, %o1, %o3

	.align	64
80:	/* 0 < len < 16 */
	andcc	%o3, 0x3, %g0
	bne,pn	%XCC, 90f
	 sub	%o0, %o1, %o3

1:	EX_LD(LOAD(lduw, %o1, %g1))
	subcc	%o2, 4, %o2
	EX_ST(STORE(stw, %g1, %o1 + %o3))
	bgu,pt	%XCC, 1b
	 add	%o1, 4, %o1

85:	retl
	 mov	EX_RETVAL(%o4), %o0

	.align	32
90:	EX_LD(LOAD(ldub, %o1, %g1))
	subcc	%o2, 1, %o2
	EX_ST(STORE(stb, %g1, %o1 + %o3))
	bgu,pt	%XCC, 90b
	 add	%o1, 1, %o1
	retl
	 mov	EX_RETVAL(%o4), %o0

	.size	FUNC_NAME, .-FUNC_NAME