root/arch/sh/lib64/memcpy.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
   3 /* Modified by SuperH, Inc. September 2003 */
   4 !
   5 ! Fast SH memcpy
   6 !
   7 ! by Toshiyasu Morita (tm@netcom.com)
   8 ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   9 ! SH5 code Copyright 2002 SuperH Ltd.
  10 !
  11 ! Entry: ARG0: destination pointer
  12 !        ARG1: source pointer
  13 !        ARG2: byte count
  14 !
  15 ! Exit:  RESULT: destination pointer
  16 !        any other registers in the range r0-r7: trashed
  17 !
  18 ! Notes: Usually one wants to do small reads and write a longword, but
  19 !        unfortunately it is difficult in some cases to concatenate bytes
  20 !        into a longword on the SH, so this does a longword read and small
  21 !        writes.
  22 !
  23 ! This implementation makes two assumptions about how it is called:
  24 !
  25 ! 1.: If the byte count is nonzero, the address of the last byte to be
  26 !     copied is unsigned greater than the address of the first byte to
  27 !     be copied.  This could be easily swapped for a signed comparison,
  28 !     but the algorithm used needs some comparison.
  29 !
  30 ! 2.: When there are two or three bytes in the last word of an 11-or-more
  31 !     bytes memory chunk to be copied, the rest of the word can be read
  32 !     without side effects.
  33 !     This could be easily changed by increasing the minimum size of
  34 !     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
  35 !     however, this would cost a few extra cycles on average.
  36 !     For SHmedia, the assumption is that any quadword can be read in its
  37 !     entirety if at least one byte is included in the copy.
  38 !
  39 
  40         .section .text..SHmedia32,"ax"
  41         .globl  memcpy
  42         .type   memcpy, @function
  43         .align  5
  44 
  45 memcpy:
  46 
  47 #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  48 #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  49 #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  50 #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  51 
  52         ld.b r3,0,r63
  53         pta/l Large,tr0
  54         movi 25,r0
  55         bgeu/u r4,r0,tr0
  56         nsb r4,r0
  57         shlli r0,5,r0
  58         movi (L1-L0+63*32 + 1) & 0xffff,r1
  59         sub r1, r0, r0
  60 L0:     ptrel r0,tr0
  61         add r2,r4,r5
  62         ptabs r18,tr1
  63         add r3,r4,r6
  64         blink tr0,r63
  65         
  66 /* Rearranged to make cut2 safe */
  67         .balign 8
  68 L4_7:   /* 4..7 byte memcpy cntd. */
  69         stlo.l r2, 0, r0
  70         or r6, r7, r6
  71         sthi.l r5, -1, r6
  72         stlo.l r5, -4, r6
  73         blink tr1,r63
  74 
  75         .balign 8
  76 L1:     /* 0 byte memcpy */
  77         nop
  78         blink tr1,r63
  79         nop
  80         nop
  81         nop
  82         nop
  83 
  84 L2_3:   /* 2 or 3 byte memcpy cntd. */
  85         st.b r5,-1,r6
  86         blink tr1,r63
  87 
  88         /* 1 byte memcpy */
  89         ld.b r3,0,r0
  90         st.b r2,0,r0
  91         blink tr1,r63
  92 
  93 L8_15:  /* 8..15 byte memcpy cntd. */
  94         stlo.q r2, 0, r0
  95         or r6, r7, r6
  96         sthi.q r5, -1, r6
  97         stlo.q r5, -8, r6
  98         blink tr1,r63
  99         
 100         /* 2 or 3 byte memcpy */
 101         ld.b r3,0,r0
 102         ld.b r2,0,r63
 103         ld.b r3,1,r1
 104         st.b r2,0,r0
 105         pta/l L2_3,tr0
 106         ld.b r6,-1,r6
 107         st.b r2,1,r1
 108         blink tr0, r63
 109 
 110         /* 4 .. 7 byte memcpy */
 111         LDUAL (r3, 0, r0, r1)
 112         pta L4_7, tr0
 113         ldlo.l r6, -4, r7
 114         or r0, r1, r0
 115         sthi.l r2, 3, r0
 116         ldhi.l r6, -1, r6
 117         blink tr0, r63
 118 
 119         /* 8 .. 15 byte memcpy */
 120         LDUAQ (r3, 0, r0, r1)
 121         pta L8_15, tr0
 122         ldlo.q r6, -8, r7
 123         or r0, r1, r0
 124         sthi.q r2, 7, r0
 125         ldhi.q r6, -1, r6
 126         blink tr0, r63
 127 
 128         /* 16 .. 24 byte memcpy */
 129         LDUAQ (r3, 0, r0, r1)
 130         LDUAQ (r3, 8, r8, r9)
 131         or r0, r1, r0
 132         sthi.q r2, 7, r0
 133         or r8, r9, r8
 134         sthi.q r2, 15, r8
 135         ldlo.q r6, -8, r7
 136         ldhi.q r6, -1, r6
 137         stlo.q r2, 8, r8
 138         stlo.q r2, 0, r0
 139         or r6, r7, r6
 140         sthi.q r5, -1, r6
 141         stlo.q r5, -8, r6
 142         blink tr1,r63
 143 
 144 Large:
 145         ld.b r2, 0, r63
 146         pta/l  Loop_ua, tr1
 147         ori r3, -8, r7
 148         sub r2, r7, r22
 149         sub r3, r2, r6
 150         add r2, r4, r5
 151         ldlo.q r3, 0, r0
 152         addi r5, -16, r5
 153         movi 64+8, r27 // could subtract r7 from that.
 154         stlo.q r2, 0, r0
 155         sthi.q r2, 7, r0
 156         ldx.q r22, r6, r0
 157         bgtu/l r27, r4, tr1
 158 
 159         addi r5, -48, r27
 160         pta/l Loop_line, tr0
 161         addi r6, 64, r36
 162         addi r6, -24, r19
 163         addi r6, -16, r20
 164         addi r6, -8, r21
 165 
 166 Loop_line:
 167         ldx.q r22, r36, r63
 168         alloco r22, 32
 169         addi r22, 32, r22
 170         ldx.q r22, r19, r23
 171         sthi.q r22, -25, r0
 172         ldx.q r22, r20, r24
 173         ldx.q r22, r21, r25
 174         stlo.q r22, -32, r0
 175         ldx.q r22, r6,  r0
 176         sthi.q r22, -17, r23
 177         sthi.q r22,  -9, r24
 178         sthi.q r22,  -1, r25
 179         stlo.q r22, -24, r23
 180         stlo.q r22, -16, r24
 181         stlo.q r22,  -8, r25
 182         bgeu r27, r22, tr0
 183 
 184 Loop_ua:
 185         addi r22, 8, r22
 186         sthi.q r22, -1, r0
 187         stlo.q r22, -8, r0
 188         ldx.q r22, r6, r0
 189         bgtu/l r5, r22, tr1
 190 
 191         add r3, r4, r7
 192         ldlo.q r7, -8, r1
 193         sthi.q r22, 7, r0
 194         ldhi.q r7, -1, r7
 195         ptabs r18,tr1
 196         stlo.q r22, 0, r0
 197         or r1, r7, r1
 198         sthi.q r5, 15, r1
 199         stlo.q r5, 8, r1
 200         blink tr1, r63
 201 
 202         .size memcpy,.-memcpy

/* [<][>][^][v][top][bottom][index][help] */