root/arch/microblaze/lib/fastcopy.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /*
   2  * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
   3  * Copyright (C) 2008-2009 PetaLogix
   4  * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
   5  *
   6  * This file is subject to the terms and conditions of the GNU General
   7  * Public License.  See the file COPYING in the main directory of this
   8  * archive for more details.
   9  *
  10  * Written by Jim Law <jlaw@irispower.com>
  11  *
  12  * intended to replace:
  13  *      memcpy in memcpy.c and
  14  *      memmove in memmove.c
  15  * ... in arch/microblaze/lib
  16  *
  17  *
  18  * assly_fastcopy.S
  19  *
  20  * Attempt at quicker memcpy and memmove for MicroBlaze
  21  *      Input : Operand1 in Reg r5 - destination address
  22  *              Operand2 in Reg r6 - source address
  23  *              Operand3 in Reg r7 - number of bytes to transfer
   24  *      Output: Result in Reg r3 - starting destination address
  25  *
  26  *
  27  * Explanation:
  28  *      Perform (possibly unaligned) copy of a block of memory
  29  *      between mem locations with size of xfer spec'd in bytes
  30  */
  31 
  32 #include <linux/linkage.h>
   33         .text
   34         .globl  memcpy
   35         .type  memcpy, @function
   36         .ent    memcpy
   37 
/*
 * void *memcpy(void *d, const void *s, size_t c)
 * In:    r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:   r3 = original destination address (return value)
 * Temps: r4 = n (work counter), r8 = as (word-aligned source pointer),
 *        r9-r12 = t1-t4 / h / v scratch words
 * Copies ascending (low -> high addresses). memmove below branches to
 * fast_memcpy_ascending when s >= d, i.e. when an ascending copy cannot
 * destructively overlap.
 * Strategy: byte-copy until d is word-aligned, then 32-byte blocks,
 * then whole words, then trailing bytes.
 */
   38 memcpy:
   39 fast_memcpy_ascending:
   40         /* move d to return register as value of function */
   41         addi    r3, r5, 0
   42 
   43         addi    r4, r0, 4       /* n = 4 */
   44         cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
   45         blti    r4, a_xfer_end  /* if n < 0, less than one word to transfer */
   46 
   47         /* transfer first 0~3 bytes to get aligned dest address */
   48         andi    r4, r5, 3               /* n = d & 3 */
   49         /* if zero, destination already aligned */
   50         beqi    r4, a_dalign_done
   51         /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
   52         rsubi   r4, r4, 4
   53         rsub    r7, r4, r7              /* c = c - n adjust c */
   54 
   55 a_xfer_first_loop:
   56         /* if no bytes left to transfer, transfer the bulk */
   57         beqi    r4, a_dalign_done
   58         lbui    r11, r6, 0              /* h = *s */
   59         sbi     r11, r5, 0              /* *d = h */
   60         addi    r6, r6, 1               /* s++ */
   61         addi    r5, r5, 1               /* d++ */
   62         brid    a_xfer_first_loop       /* loop */
   63         addi    r4, r4, -1              /* n-- (IN DELAY SLOT) */
   64 
   65 a_dalign_done:
   66         addi    r4, r0, 32              /* n = 32 */
   67         cmpu    r4, r4, r7              /* n = c - n  (unsigned) */
   68         /* if n < 0, less than one block to transfer */
   69         blti    r4, a_block_done
   70 
   71 a_block_xfer:
   72         andi    r4, r7, 0xffffffe0      /* n = c & ~31 */
   73         rsub    r7, r4, r7              /* c = c - n */
   74 
   75         andi    r9, r6, 3               /* t1 = s & 3 */
   76         /* if temp != 0, unaligned transfers needed */
   77         bnei    r9, a_block_unaligned
   78 
   79 a_block_aligned:
   80         lwi     r9, r6, 0               /* t1 = *(s + 0) */
   81         lwi     r10, r6, 4              /* t2 = *(s + 4) */
   82         lwi     r11, r6, 8              /* t3 = *(s + 8) */
   83         lwi     r12, r6, 12             /* t4 = *(s + 12) */
   84         swi     r9, r5, 0               /* *(d + 0) = t1 */
   85         swi     r10, r5, 4              /* *(d + 4) = t2 */
   86         swi     r11, r5, 8              /* *(d + 8) = t3 */
   87         swi     r12, r5, 12             /* *(d + 12) = t4 */
   88         lwi     r9, r6, 16              /* t1 = *(s + 16) */
   89         lwi     r10, r6, 20             /* t2 = *(s + 20) */
   90         lwi     r11, r6, 24             /* t3 = *(s + 24) */
   91         lwi     r12, r6, 28             /* t4 = *(s + 28) */
   92         swi     r9, r5, 16              /* *(d + 16) = t1 */
   93         swi     r10, r5, 20             /* *(d + 20) = t2 */
   94         swi     r11, r5, 24             /* *(d + 24) = t3 */
   95         swi     r12, r5, 28             /* *(d + 28) = t4 */
   96         addi    r6, r6, 32              /* s = s + 32 */
   97         addi    r4, r4, -32             /* n = n - 32 */
   98         bneid   r4, a_block_aligned     /* while (n) loop */
   99         addi    r5, r5, 32              /* d = d + 32 (IN DELAY SLOT) */
  100         bri     a_block_done
  101 
        /*
         * Source is misaligned relative to the (now word-aligned) dest:
         * read aligned words from 'as' and merge each pair of adjacent
         * words with barrel shifts (bslli/bsrli) to build every
         * destination word. Three unrolled variants, one per byte offset.
         */
  102 a_block_unaligned:
  103         andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
  104         add     r6, r6, r4              /* s = s + n */
  105         lwi     r11, r8, 0              /* h = *(as + 0) */
  106 
  107         addi    r9, r9, -1
  108         beqi    r9, a_block_u1          /* t1 was 1 => 1 byte offset */
  109         addi    r9, r9, -1
  110         beqi    r9, a_block_u2          /* t1 was 2 => 2 byte offset */
  111 
  112 a_block_u3:
  113         bslli   r11, r11, 24    /* h = h << 24 */
  114 a_bu3_loop:
  115         lwi     r12, r8, 4      /* v = *(as + 4) */
  116         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  117         or      r9, r11, r9     /* t1 = h | t1 */
  118         swi     r9, r5, 0       /* *(d + 0) = t1 */
  119         bslli   r11, r12, 24    /* h = v << 24 */
  120         lwi     r12, r8, 8      /* v = *(as + 8) */
  121         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  122         or      r9, r11, r9     /* t1 = h | t1 */
  123         swi     r9, r5, 4       /* *(d + 4) = t1 */
  124         bslli   r11, r12, 24    /* h = v << 24 */
  125         lwi     r12, r8, 12     /* v = *(as + 12) */
  126         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  127         or      r9, r11, r9     /* t1 = h | t1 */
  128         swi     r9, r5, 8       /* *(d + 8) = t1 */
  129         bslli   r11, r12, 24    /* h = v << 24 */
  130         lwi     r12, r8, 16     /* v = *(as + 16) */
  131         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  132         or      r9, r11, r9     /* t1 = h | t1 */
  133         swi     r9, r5, 12      /* *(d + 12) = t1 */
  134         bslli   r11, r12, 24    /* h = v << 24 */
  135         lwi     r12, r8, 20     /* v = *(as + 20) */
  136         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  137         or      r9, r11, r9     /* t1 = h | t1 */
  138         swi     r9, r5, 16      /* *(d + 16) = t1 */
  139         bslli   r11, r12, 24    /* h = v << 24 */
  140         lwi     r12, r8, 24     /* v = *(as + 24) */
  141         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  142         or      r9, r11, r9     /* t1 = h | t1 */
  143         swi     r9, r5, 20      /* *(d + 20) = t1 */
  144         bslli   r11, r12, 24    /* h = v << 24 */
  145         lwi     r12, r8, 28     /* v = *(as + 28) */
  146         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  147         or      r9, r11, r9     /* t1 = h | t1 */
  148         swi     r9, r5, 24      /* *(d + 24) = t1 */
  149         bslli   r11, r12, 24    /* h = v << 24 */
  150         lwi     r12, r8, 32     /* v = *(as + 32) */
  151         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  152         or      r9, r11, r9     /* t1 = h | t1 */
  153         swi     r9, r5, 28      /* *(d + 28) = t1 */
  154         bslli   r11, r12, 24    /* h = v << 24 */
  155         addi    r8, r8, 32      /* as = as + 32 */
  156         addi    r4, r4, -32     /* n = n - 32 */
  157         bneid   r4, a_bu3_loop  /* while (n) loop */
  158         addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
  159         bri     a_block_done
  160 
  161 a_block_u1:
  162         bslli   r11, r11, 8     /* h = h << 8 */
  163 a_bu1_loop:
  164         lwi     r12, r8, 4      /* v = *(as + 4) */
  165         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  166         or      r9, r11, r9     /* t1 = h | t1 */
  167         swi     r9, r5, 0       /* *(d + 0) = t1 */
  168         bslli   r11, r12, 8     /* h = v << 8 */
  169         lwi     r12, r8, 8      /* v = *(as + 8) */
  170         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  171         or      r9, r11, r9     /* t1 = h | t1 */
  172         swi     r9, r5, 4       /* *(d + 4) = t1 */
  173         bslli   r11, r12, 8     /* h = v << 8 */
  174         lwi     r12, r8, 12     /* v = *(as + 12) */
  175         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  176         or      r9, r11, r9     /* t1 = h | t1 */
  177         swi     r9, r5, 8       /* *(d + 8) = t1 */
  178         bslli   r11, r12, 8     /* h = v << 8 */
  179         lwi     r12, r8, 16     /* v = *(as + 16) */
  180         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  181         or      r9, r11, r9     /* t1 = h | t1 */
  182         swi     r9, r5, 12      /* *(d + 12) = t1 */
  183         bslli   r11, r12, 8     /* h = v << 8 */
  184         lwi     r12, r8, 20     /* v = *(as + 20) */
  185         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  186         or      r9, r11, r9     /* t1 = h | t1 */
  187         swi     r9, r5, 16      /* *(d + 16) = t1 */
  188         bslli   r11, r12, 8     /* h = v << 8 */
  189         lwi     r12, r8, 24     /* v = *(as + 24) */
  190         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  191         or      r9, r11, r9     /* t1 = h | t1 */
  192         swi     r9, r5, 20      /* *(d + 20) = t1 */
  193         bslli   r11, r12, 8     /* h = v << 8 */
  194         lwi     r12, r8, 28     /* v = *(as + 28) */
  195         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  196         or      r9, r11, r9     /* t1 = h | t1 */
  197         swi     r9, r5, 24      /* *(d + 24) = t1 */
  198         bslli   r11, r12, 8     /* h = v << 8 */
  199         lwi     r12, r8, 32     /* v = *(as + 32) */
  200         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  201         or      r9, r11, r9     /* t1 = h | t1 */
  202         swi     r9, r5, 28      /* *(d + 28) = t1 */
  203         bslli   r11, r12, 8     /* h = v << 8 */
  204         addi    r8, r8, 32      /* as = as + 32 */
  205         addi    r4, r4, -32     /* n = n - 32 */
  206         bneid   r4, a_bu1_loop  /* while (n) loop */
  207         addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
  208         bri     a_block_done
  209 
  210 a_block_u2:
  211         bslli   r11, r11, 16    /* h = h << 16 */
  212 a_bu2_loop:
  213         lwi     r12, r8, 4      /* v = *(as + 4) */
  214         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  215         or      r9, r11, r9     /* t1 = h | t1 */
  216         swi     r9, r5, 0       /* *(d + 0) = t1 */
  217         bslli   r11, r12, 16    /* h = v << 16 */
  218         lwi     r12, r8, 8      /* v = *(as + 8) */
  219         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  220         or      r9, r11, r9     /* t1 = h | t1 */
  221         swi     r9, r5, 4       /* *(d + 4) = t1 */
  222         bslli   r11, r12, 16    /* h = v << 16 */
  223         lwi     r12, r8, 12     /* v = *(as + 12) */
  224         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  225         or      r9, r11, r9     /* t1 = h | t1 */
  226         swi     r9, r5, 8       /* *(d + 8) = t1 */
  227         bslli   r11, r12, 16    /* h = v << 16 */
  228         lwi     r12, r8, 16     /* v = *(as + 16) */
  229         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  230         or      r9, r11, r9     /* t1 = h | t1 */
  231         swi     r9, r5, 12      /* *(d + 12) = t1 */
  232         bslli   r11, r12, 16    /* h = v << 16 */
  233         lwi     r12, r8, 20     /* v = *(as + 20) */
  234         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  235         or      r9, r11, r9     /* t1 = h | t1 */
  236         swi     r9, r5, 16      /* *(d + 16) = t1 */
  237         bslli   r11, r12, 16    /* h = v << 16 */
  238         lwi     r12, r8, 24     /* v = *(as + 24) */
  239         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  240         or      r9, r11, r9     /* t1 = h | t1 */
  241         swi     r9, r5, 20      /* *(d + 20) = t1 */
  242         bslli   r11, r12, 16    /* h = v << 16 */
  243         lwi     r12, r8, 28     /* v = *(as + 28) */
  244         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  245         or      r9, r11, r9     /* t1 = h | t1 */
  246         swi     r9, r5, 24      /* *(d + 24) = t1 */
  247         bslli   r11, r12, 16    /* h = v << 16 */
  248         lwi     r12, r8, 32     /* v = *(as + 32) */
  249         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  250         or      r9, r11, r9     /* t1 = h | t1 */
  251         swi     r9, r5, 28      /* *(d + 28) = t1 */
  252         bslli   r11, r12, 16    /* h = v << 16 */
  253         addi    r8, r8, 32      /* as = as + 32 */
  254         addi    r4, r4, -32     /* n = n - 32 */
  255         bneid   r4, a_bu2_loop  /* while (n) loop */
  256         addi    r5, r5, 32      /* d = d + 32 (IN DELAY SLOT) */
  257 
  258 a_block_done:
  259         addi    r4, r0, 4       /* n = 4 */
  260         cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
  261         blti    r4, a_xfer_end  /* if n < 0, less than one word to transfer */
  262 
  263 a_word_xfer:
  264         andi    r4, r7, 0xfffffffc      /* n = c & ~3 */
  265         addi    r10, r0, 0              /* offset = 0 */
  266 
  267         andi    r9, r6, 3               /* t1 = s & 3 */
  268         /* if temp != 0, unaligned transfers needed */
  269         bnei    r9, a_word_unaligned
  270 
  271 a_word_aligned:
  272         lw      r9, r6, r10             /* t1 = *(s+offset) */
  273         sw      r9, r5, r10             /* *(d+offset) = t1 */
  274         addi    r4, r4,-4               /* n-- */
  275         bneid   r4, a_word_aligned      /* loop */
  276         addi    r10, r10, 4             /* offset++ (IN DELAY SLOT) */
  277 
  278         bri     a_word_done
  279 
  280 a_word_unaligned:
  281         andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
  282         lwi     r11, r8, 0              /* h = *(as + 0) */
  283         addi    r8, r8, 4               /* as = as + 4 */
  284 
  285         addi    r9, r9, -1
  286         beqi    r9, a_word_u1           /* t1 was 1 => 1 byte offset */
  287         addi    r9, r9, -1
  288         beqi    r9, a_word_u2           /* t1 was 2 => 2 byte offset */
  289 
  290 a_word_u3:
  291         bslli   r11, r11, 24    /* h = h << 24 */
  292 a_wu3_loop:
  293         lw      r12, r8, r10    /* v = *(as + offset) */
  294         bsrli   r9, r12, 8      /* t1 = v >> 8 */
  295         or      r9, r11, r9     /* t1 = h | t1 */
  296         sw      r9, r5, r10     /* *(d + offset) = t1 */
  297         bslli   r11, r12, 24    /* h = v << 24 */
  298         addi    r4, r4,-4       /* n = n - 4 */
  299         bneid   r4, a_wu3_loop  /* while (n) loop */
  300         addi    r10, r10, 4     /* offset = offset + 4 (IN DELAY SLOT) */
  301 
  302         bri     a_word_done
  303 
  304 a_word_u1:
  305         bslli   r11, r11, 8     /* h = h << 8 */
  306 a_wu1_loop:
  307         lw      r12, r8, r10    /* v = *(as + offset) */
  308         bsrli   r9, r12, 24     /* t1 = v >> 24 */
  309         or      r9, r11, r9     /* t1 = h | t1 */
  310         sw      r9, r5, r10     /* *(d + offset) = t1 */
  311         bslli   r11, r12, 8     /* h = v << 8 */
  312         addi    r4, r4,-4       /* n = n - 4 */
  313         bneid   r4, a_wu1_loop  /* while (n) loop */
  314         addi    r10, r10, 4     /* offset = offset + 4 (IN DELAY SLOT) */
  315 
  316         bri     a_word_done
  317 
  318 a_word_u2:
  319         bslli   r11, r11, 16    /* h = h << 16 */
  320 a_wu2_loop:
  321         lw      r12, r8, r10    /* v = *(as + offset) */
  322         bsrli   r9, r12, 16     /* t1 = v >> 16 */
  323         or      r9, r11, r9     /* t1 = h | t1 */
  324         sw      r9, r5, r10     /* *(d + offset) = t1 */
  325         bslli   r11, r12, 16    /* h = v << 16 */
  326         addi    r4, r4,-4       /* n = n - 4 */
  327         bneid   r4, a_wu2_loop  /* while (n) loop */
  328         addi    r10, r10, 4     /* offset = offset + 4 (IN DELAY SLOT) */
  329 
  330 a_word_done:
  331         add     r5, r5, r10     /* d = d + offset */
  332         add     r6, r6, r10     /* s = s + offset */
  333         rsub    r7, r10, r7     /* c = c - offset */
  334 
  335 a_xfer_end:
  336 a_xfer_end_loop:
  337         beqi    r7, a_done              /* while (c) */
  338         lbui    r9, r6, 0               /* t1 = *s */
  339         addi    r6, r6, 1               /* s++ */
  340         sbi     r9, r5, 0               /* *d = t1 */
  341         addi    r7, r7, -1              /* c-- */
  342         brid    a_xfer_end_loop         /* loop */
  343         addi    r5, r5, 1               /* d++ (IN DELAY SLOT) */
  344 
  345 a_done:
  346         rtsd    r15, 8
  347         nop
  348 
  349 .size  memcpy, . - memcpy
  350 .end memcpy
 351 /*----------------------------------------------------------------------------*/
  352         .globl  memmove
  353         .type  memmove, @function
  354         .ent    memmove
  355 
/*
 * void *memmove(void *d, const void *s, size_t c)
 * In:    r5 = d (destination), r6 = s (source), r7 = c (byte count)
 * Out:   r3 = original destination address (return value)
 * Overlap handling: if s >= d (n = s - d is non-negative), an ascending
 * copy cannot clobber unread source bytes, so reuse memcpy's
 * fast_memcpy_ascending path; otherwise copy descending (high -> low)
 * below, mirroring memcpy's align/block/word/byte phases in reverse.
 */
  356 memmove:
  357         cmpu    r4, r5, r6      /* n = s - d */
  358         bgei    r4,fast_memcpy_ascending
  359 
  360 fast_memcpy_descending:
  361         /* move d to return register as value of function */
  362         addi    r3, r5, 0
  363 
  364         add     r5, r5, r7      /* d = d + c */
  365         add     r6, r6, r7      /* s = s + c */
  366 
  367         addi    r4, r0, 4       /* n = 4 */
  368         cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
  369         blti    r4,d_xfer_end   /* if n < 0, less than one word to transfer */
  370 
  371         /* transfer first 0~3 bytes to get aligned dest address */
  372         andi    r4, r5, 3               /* n = d & 3 */
  373         /* if zero, destination already aligned */
  374         beqi    r4,d_dalign_done
  375         rsub    r7, r4, r7              /* c = c - n adjust c */
  376 
  377 d_xfer_first_loop:
  378         /* if no bytes left to transfer, transfer the bulk */
  379         beqi    r4,d_dalign_done
  380         addi    r6, r6, -1              /* s-- */
  381         addi    r5, r5, -1              /* d-- */
  382         lbui    r11, r6, 0              /* h = *s */
  383         sbi     r11, r5, 0              /* *d = h */
  384         brid    d_xfer_first_loop       /* loop */
  385         addi    r4, r4, -1              /* n-- (IN DELAY SLOT) */
  386 
  387 d_dalign_done:
  388         addi    r4, r0, 32      /* n = 32 */
  389         cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
  390         /* if n < 0, less than one block to transfer */
  391         blti    r4, d_block_done
  392 
  393 d_block_xfer:
  394         andi    r4, r7, 0xffffffe0      /* n = c & ~31 */
  395         rsub    r7, r4, r7              /* c = c - n */
  396 
  397         andi    r9, r6, 3               /* t1 = s & 3 */
  398         /* if temp != 0, unaligned transfers needed */
  399         bnei    r9, d_block_unaligned
  400 
  401 d_block_aligned:
  402         addi    r6, r6, -32             /* s = s - 32 */
  403         addi    r5, r5, -32             /* d = d - 32 */
  404         lwi     r9, r6, 28              /* t1 = *(s + 28) */
  405         lwi     r10, r6, 24             /* t2 = *(s + 24) */
  406         lwi     r11, r6, 20             /* t3 = *(s + 20) */
  407         lwi     r12, r6, 16             /* t4 = *(s + 16) */
  408         swi     r9, r5, 28              /* *(d + 28) = t1 */
  409         swi     r10, r5, 24             /* *(d + 24) = t2 */
  410         swi     r11, r5, 20             /* *(d + 20) = t3 */
  411         swi     r12, r5, 16             /* *(d + 16) = t4 */
  412         lwi     r9, r6, 12              /* t1 = *(s + 12) */
  413         lwi     r10, r6, 8              /* t2 = *(s + 8) */
  414         lwi     r11, r6, 4              /* t3 = *(s + 4) */
  415         lwi     r12, r6, 0              /* t4 = *(s + 0) */
  416         swi     r9, r5, 12              /* *(d + 12) = t1 */
  417         swi     r10, r5, 8              /* *(d + 8) = t2 */
  418         swi     r11, r5, 4              /* *(d + 4) = t3 */
  419         addi    r4, r4, -32             /* n = n - 32 */
  420         bneid   r4, d_block_aligned     /* while (n) loop */
  421         swi     r12, r5, 0              /* *(d + 0) = t4 (IN DELAY SLOT) */
  422         bri     d_block_done
  423 
        /*
         * Source is misaligned relative to the (now word-aligned) dest:
         * read aligned words from 'as' and merge each pair of adjacent
         * words with barrel shifts (bslli/bsrli) to build every
         * destination word, walking downward. Three unrolled variants,
         * one per byte offset.
         */
  424 d_block_unaligned:
  425         andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
  426         rsub    r6, r4, r6              /* s = s - n */
  427         lwi     r11, r8, 0              /* h = *(as + 0) */
  428 
  429         addi    r9, r9, -1
  430         beqi    r9,d_block_u1           /* t1 was 1 => 1 byte offset */
  431         addi    r9, r9, -1
  432         beqi    r9,d_block_u2           /* t1 was 2 => 2 byte offset */
  433 
  434 d_block_u3:
  435         bsrli   r11, r11, 8     /* h = h >> 8 */
  436 d_bu3_loop:
  437         addi    r8, r8, -32     /* as = as - 32 */
  438         addi    r5, r5, -32     /* d = d - 32 */
  439         lwi     r12, r8, 28     /* v = *(as + 28) */
  440         bslli   r9, r12, 24     /* t1 = v << 24 */
  441         or      r9, r11, r9     /* t1 = h | t1 */
  442         swi     r9, r5, 28      /* *(d + 28) = t1 */
  443         bsrli   r11, r12, 8     /* h = v >> 8 */
  444         lwi     r12, r8, 24     /* v = *(as + 24) */
  445         bslli   r9, r12, 24     /* t1 = v << 24 */
  446         or      r9, r11, r9     /* t1 = h | t1 */
  447         swi     r9, r5, 24      /* *(d + 24) = t1 */
  448         bsrli   r11, r12, 8     /* h = v >> 8 */
  449         lwi     r12, r8, 20     /* v = *(as + 20) */
  450         bslli   r9, r12, 24     /* t1 = v << 24 */
  451         or      r9, r11, r9     /* t1 = h | t1 */
  452         swi     r9, r5, 20      /* *(d + 20) = t1 */
  453         bsrli   r11, r12, 8     /* h = v >> 8 */
  454         lwi     r12, r8, 16     /* v = *(as + 16) */
  455         bslli   r9, r12, 24     /* t1 = v << 24 */
  456         or      r9, r11, r9     /* t1 = h | t1 */
  457         swi     r9, r5, 16      /* *(d + 16) = t1 */
  458         bsrli   r11, r12, 8     /* h = v >> 8 */
  459         lwi     r12, r8, 12     /* v = *(as + 12) */
  460         bslli   r9, r12, 24     /* t1 = v << 24 */
  461         or      r9, r11, r9     /* t1 = h | t1 */
  462         swi     r9, r5, 12      /* *(d + 12) = t1 */
  463         bsrli   r11, r12, 8     /* h = v >> 8 */
  464         lwi     r12, r8, 8      /* v = *(as + 8) */
  465         bslli   r9, r12, 24     /* t1 = v << 24 */
  466         or      r9, r11, r9     /* t1 = h | t1 */
  467         swi     r9, r5, 8       /* *(d + 8) = t1 */
  468         bsrli   r11, r12, 8     /* h = v >> 8 */
  469         lwi     r12, r8, 4      /* v = *(as + 4) */
  470         bslli   r9, r12, 24     /* t1 = v << 24 */
  471         or      r9, r11, r9     /* t1 = h | t1 */
  472         swi     r9, r5, 4       /* *(d + 4) = t1 */
  473         bsrli   r11, r12, 8     /* h = v >> 8 */
  474         lwi     r12, r8, 0      /* v = *(as + 0) */
  475         bslli   r9, r12, 24     /* t1 = v << 24 */
  476         or      r9, r11, r9     /* t1 = h | t1 */
  477         swi     r9, r5, 0       /* *(d + 0) = t1 */
  478         addi    r4, r4, -32     /* n = n - 32 */
  479         bneid   r4, d_bu3_loop  /* while (n) loop */
  480         bsrli   r11, r12, 8     /* h = v >> 8 (IN DELAY SLOT) */
  481         bri     d_block_done
  482 
  483 d_block_u1:
  484         bsrli   r11, r11, 24    /* h = h >> 24 */
  485 d_bu1_loop:
  486         addi    r8, r8, -32     /* as = as - 32 */
  487         addi    r5, r5, -32     /* d = d - 32 */
  488         lwi     r12, r8, 28     /* v = *(as + 28) */
  489         bslli   r9, r12, 8      /* t1 = v << 8 */
  490         or      r9, r11, r9     /* t1 = h | t1 */
  491         swi     r9, r5, 28      /* *(d + 28) = t1 */
  492         bsrli   r11, r12, 24    /* h = v >> 24 */
  493         lwi     r12, r8, 24     /* v = *(as + 24) */
  494         bslli   r9, r12, 8      /* t1 = v << 8 */
  495         or      r9, r11, r9     /* t1 = h | t1 */
  496         swi     r9, r5, 24      /* *(d + 24) = t1 */
  497         bsrli   r11, r12, 24    /* h = v >> 24 */
  498         lwi     r12, r8, 20     /* v = *(as + 20) */
  499         bslli   r9, r12, 8      /* t1 = v << 8 */
  500         or      r9, r11, r9     /* t1 = h | t1 */
  501         swi     r9, r5, 20      /* *(d + 20) = t1 */
  502         bsrli   r11, r12, 24    /* h = v >> 24 */
  503         lwi     r12, r8, 16     /* v = *(as + 16) */
  504         bslli   r9, r12, 8      /* t1 = v << 8 */
  505         or      r9, r11, r9     /* t1 = h | t1 */
  506         swi     r9, r5, 16      /* *(d + 16) = t1 */
  507         bsrli   r11, r12, 24    /* h = v >> 24 */
  508         lwi     r12, r8, 12     /* v = *(as + 12) */
  509         bslli   r9, r12, 8      /* t1 = v << 8 */
  510         or      r9, r11, r9     /* t1 = h | t1 */
  511         swi     r9, r5, 12      /* *(d + 12) = t1 */
  512         bsrli   r11, r12, 24    /* h = v >> 24 */
  513         lwi     r12, r8, 8      /* v = *(as + 8) */
  514         bslli   r9, r12, 8      /* t1 = v << 8 */
  515         or      r9, r11, r9     /* t1 = h | t1 */
  516         swi     r9, r5, 8       /* *(d + 8) = t1 */
  517         bsrli   r11, r12, 24    /* h = v >> 24 */
  518         lwi     r12, r8, 4      /* v = *(as + 4) */
  519         bslli   r9, r12, 8      /* t1 = v << 8 */
  520         or      r9, r11, r9     /* t1 = h | t1 */
  521         swi     r9, r5, 4       /* *(d + 4) = t1 */
  522         bsrli   r11, r12, 24    /* h = v >> 24 */
  523         lwi     r12, r8, 0      /* v = *(as + 0) */
  524         bslli   r9, r12, 8      /* t1 = v << 8 */
  525         or      r9, r11, r9     /* t1 = h | t1 */
  526         swi     r9, r5, 0       /* *(d + 0) = t1 */
  527         addi    r4, r4, -32     /* n = n - 32 */
  528         bneid   r4, d_bu1_loop  /* while (n) loop */
  529         bsrli   r11, r12, 24    /* h = v >> 24 (IN DELAY SLOT) */
  530         bri     d_block_done
  531 
  532 d_block_u2:
  533         bsrli   r11, r11, 16    /* h = h >> 16 */
  534 d_bu2_loop:
  535         addi    r8, r8, -32     /* as = as - 32 */
  536         addi    r5, r5, -32     /* d = d - 32 */
  537         lwi     r12, r8, 28     /* v = *(as + 28) */
  538         bslli   r9, r12, 16     /* t1 = v << 16 */
  539         or      r9, r11, r9     /* t1 = h | t1 */
  540         swi     r9, r5, 28      /* *(d + 28) = t1 */
  541         bsrli   r11, r12, 16    /* h = v >> 16 */
  542         lwi     r12, r8, 24     /* v = *(as + 24) */
  543         bslli   r9, r12, 16     /* t1 = v << 16 */
  544         or      r9, r11, r9     /* t1 = h | t1 */
  545         swi     r9, r5, 24      /* *(d + 24) = t1 */
  546         bsrli   r11, r12, 16    /* h = v >> 16 */
  547         lwi     r12, r8, 20     /* v = *(as + 20) */
  548         bslli   r9, r12, 16     /* t1 = v << 16 */
  549         or      r9, r11, r9     /* t1 = h | t1 */
  550         swi     r9, r5, 20      /* *(d + 20) = t1 */
  551         bsrli   r11, r12, 16    /* h = v >> 16 */
  552         lwi     r12, r8, 16     /* v = *(as + 16) */
  553         bslli   r9, r12, 16     /* t1 = v << 16 */
  554         or      r9, r11, r9     /* t1 = h | t1 */
  555         swi     r9, r5, 16      /* *(d + 16) = t1 */
  556         bsrli   r11, r12, 16    /* h = v >> 16 */
  557         lwi     r12, r8, 12     /* v = *(as + 12) */
  558         bslli   r9, r12, 16     /* t1 = v << 16 */
  559         or      r9, r11, r9     /* t1 = h | t1 */
  560         swi     r9, r5, 12      /* *(d + 12) = t1 */
  561         bsrli   r11, r12, 16    /* h = v >> 16 */
  562         lwi     r12, r8, 8      /* v = *(as + 8) */
  563         bslli   r9, r12, 16     /* t1 = v << 16 */
  564         or      r9, r11, r9     /* t1 = h | t1 */
  565         swi     r9, r5, 8       /* *(d + 8) = t1 */
  566         bsrli   r11, r12, 16    /* h = v >> 16 */
  567         lwi     r12, r8, 4      /* v = *(as + 4) */
  568         bslli   r9, r12, 16     /* t1 = v << 16 */
  569         or      r9, r11, r9     /* t1 = h | t1 */
  570         swi     r9, r5, 4       /* *(d + 4) = t1 */
  571         bsrli   r11, r12, 16    /* h = v >> 16 */
  572         lwi     r12, r8, 0      /* v = *(as + 0) */
  573         bslli   r9, r12, 16     /* t1 = v << 16 */
  574         or      r9, r11, r9     /* t1 = h | t1 */
  575         swi     r9, r5, 0       /* *(d + 0) = t1 */
  576         addi    r4, r4, -32     /* n = n - 32 */
  577         bneid   r4, d_bu2_loop  /* while (n) loop */
  578         bsrli   r11, r12, 16    /* h = v >> 16 (IN DELAY SLOT) */
  579 
  580 d_block_done:
  581         addi    r4, r0, 4       /* n = 4 */
  582         cmpu    r4, r4, r7      /* n = c - n  (unsigned) */
  583         blti    r4,d_xfer_end   /* if n < 0, less than one word to transfer */
  584 
  585 d_word_xfer:
  586         andi    r4, r7, 0xfffffffc      /* n = c & ~3 */
  587         rsub    r5, r4, r5              /* d = d - n */
  588         rsub    r6, r4, r6              /* s = s - n */
  589         rsub    r7, r4, r7              /* c = c - n */
  590 
  591         andi    r9, r6, 3               /* t1 = s & 3 */
  592         /* if temp != 0, unaligned transfers needed */
  593         bnei    r9, d_word_unaligned
  594 
  595 d_word_aligned:
  596         addi    r4, r4,-4               /* n-- */
  597         lw      r9, r6, r4              /* t1 = *(s+n) */
  598         bneid   r4, d_word_aligned      /* loop */
  599         sw      r9, r5, r4              /* *(d+n) = t1 (IN DELAY SLOT) */
  600 
  601         bri     d_word_done
  602 
  603 d_word_unaligned:
  604         andi    r8, r6, 0xfffffffc      /* as = s & ~3 */
  605         lw      r11, r8, r4             /* h = *(as + n) */
  606 
  607         addi    r9, r9, -1
  608         beqi    r9,d_word_u1            /* t1 was 1 => 1 byte offset */
  609         addi    r9, r9, -1
  610         beqi    r9,d_word_u2            /* t1 was 2 => 2 byte offset */
  611 
  612 d_word_u3:
  613         bsrli   r11, r11, 8     /* h = h >> 8 */
  614 d_wu3_loop:
  615         addi    r4, r4,-4       /* n = n - 4 */
  616         lw      r12, r8, r4     /* v = *(as + n) */
  617         bslli   r9, r12, 24     /* t1 = v << 24 */
  618         or      r9, r11, r9     /* t1 = h | t1 */
  619         sw      r9, r5, r4      /* *(d + n) = t1 */
  620         bneid   r4, d_wu3_loop  /* while (n) loop */
  621         bsrli   r11, r12, 8     /* h = v >> 8 (IN DELAY SLOT) */
  622 
  623         bri     d_word_done
  624 
  625 d_word_u1:
  626         bsrli   r11, r11, 24    /* h = h >> 24 */
  627 d_wu1_loop:
  628         addi    r4, r4,-4       /* n = n - 4 */
  629         lw      r12, r8, r4     /* v = *(as + n) */
  630         bslli   r9, r12, 8      /* t1 = v << 8 */
  631         or      r9, r11, r9     /* t1 = h | t1 */
  632         sw      r9, r5, r4      /* *(d + n) = t1 */
  633         bneid   r4, d_wu1_loop  /* while (n) loop */
  634         bsrli   r11, r12, 24    /* h = v >> 24 (IN DELAY SLOT) */
  635 
  636         bri     d_word_done
  637 
  638 d_word_u2:
  639         bsrli   r11, r11, 16    /* h = h >> 16 */
  640 d_wu2_loop:
  641         addi    r4, r4,-4       /* n = n - 4 */
  642         lw      r12, r8, r4     /* v = *(as + n) */
  643         bslli   r9, r12, 16     /* t1 = v << 16 */
  644         or      r9, r11, r9     /* t1 = h | t1 */
  645         sw      r9, r5, r4      /* *(d + n) = t1 */
  646         bneid   r4, d_wu2_loop  /* while (n) loop */
  647         bsrli   r11, r12, 16    /* h = v >> 16 (IN DELAY SLOT) */
  648 
  649 d_word_done:
  650 
  651 d_xfer_end:
  652 d_xfer_end_loop:
  653         beqi    r7, a_done              /* while (c); a_done is memcpy's identical rtsd/nop epilogue */
  654         addi    r6, r6, -1              /* s-- */
  655         lbui    r9, r6, 0               /* t1 = *s */
  656         addi    r5, r5, -1              /* d-- */
  657         sbi     r9, r5, 0               /* *d = t1 */
  658         brid    d_xfer_end_loop         /* loop */
  659         addi    r7, r7, -1              /* c-- (IN DELAY SLOT) */
  660 
  661 d_done:
  662         rtsd    r15, 8
  663         nop
  664 
  665 .size  memmove, . - memmove
  666 .end memmove

/* [<][>][^][v][top][bottom][index][help] */