/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
/* Modified by SuperH, Inc. September 2003 */
!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
! SH5 code Copyright 2002 SuperH Ltd.
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG2: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
!     This could be easily changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
!     however, this would cost a few extra cycles on average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.
!
/*
 * memcpy for SHmedia.
 *
 * Register roles as used by the code below:
 *   r2  destination pointer; stores are addressed from it and it is never
 *       written, so it still holds the destination on return
 *   r3  source pointer; loads are addressed from it
 *   r4  byte count
 *   r5  r2 + r4 (one past the last destination byte); the large-copy path
 *       biases it by -16
 *   r6  r3 + r4 (one past the last source byte) on the small-copy paths;
 *       r3 - r2 (source minus destination) on the large-copy path
 *   r18 address the routine finally branches to (loaded into tr1 with
 *       ptabs); presumably the caller's return address -- confirm
 *       against the SH5 ABI
 *   r63 only ever appears as a destination whose value is never read here
 *
 * Structure:
 *   - count >= 25 branches to Large;
 *   - smaller counts are dispatched through a computed jump into a table
 *     of fixed-size slots that starts at L1.
 *
 * NOTE(review): besides r0-r7 this code also writes r8, r9, r19-r25, r27
 * and r36.  Confirm that all of these are caller-saved in the target ABI.
 */

	.section .text..SHmedia32,"ax"
	.globl memcpy
	.type memcpy, @function
	.align 5

memcpy:

/*
 * Helper macros: each expands to a lo/hi instruction pair addressing
 * P+O and P+O+7 (quadword, .q) or P+O+3 (longword, .l).  The callers
 * below OR the two loaded halves (D0, D1) together to form the value.
 * Only LDUAQ and LDUAL are invoked in this routine; STUAQ and STUAL are
 * defined but not used here.
 */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

	/*
	 * Entry / dispatch.
	 *
	 * The dispatch target works out to
	 *     L1 + (63 - nsb(count)) * 32 + 1
	 * because r1 = L1 - L0 + 63*32 + 1, r0 = r1 - nsb(count)*32, and
	 * ptrel is issued at L0 with r0 as the offset.
	 *
	 * Every size class therefore owns exactly 32 bytes of code starting
	 * at L1.  Do NOT add, remove or reorder instructions between L1 and
	 * the "16 .. 24 byte" entry, or the table breaks.
	 *
	 * NOTE(review): this relies on nsb(0) == 63 and on nsb decreasing by
	 * one each time the count's bit length grows by one.  The "+ 1" is
	 * presumably the SHmedia mode bit of the branch target.  Confirm
	 * both against the SHmedia ISA manual.
	 */
	ld.b r3,0,r63			/* read first source byte, value unused (presumably a cache touch) */
	pta/l Large,tr0
	movi 25,r0
	bgeu/u r4,r0,tr0		/* count >= 25 (unsigned): take the Large path */
	nsb r4,r0
	shlli r0,5,r0			/* r0 = nsb(count) * 32 */
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	sub r1, r0, r0			/* r0 = offset from L0 to the selected slot */
L0:	ptrel r0,tr0
	add r2,r4,r5			/* r5 = destination end */
	ptabs r18,tr1			/* tr1 = final exit target for the small-copy paths */
	add r3,r4,r6			/* r6 = source end */
	blink tr0,r63			/* jump into the table */

/* Rearranged to make cut2 safe */
	.balign 8
L4_7:	/* 4..7 byte memcpy cntd. */
	/* r0 = first longword of the source, r6/r7 = halves of the last one. */
	stlo.l r2, 0, r0
	or r6, r7, r6
	sthi.l r5, -1, r6		/* last four bytes go to r5-4 .. r5-1; may overlap the first four */
	stlo.l r5, -4, r6
	blink tr1,r63

	/*
	 * Start of the dispatch table.  The first slot (0 bytes) begins at L1;
	 * the nops pad it so that, together with the two instructions at L2_3,
	 * the slot is full (eight instructions, assuming 4-byte SHmedia
	 * encodings).
	 */
	.balign 8
L1:	/* 0 byte memcpy */
	nop
	blink tr1,r63
	nop
	nop
	nop
	nop

L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6			/* store the last byte (loaded by the 2-or-3 byte entry) */
	blink tr1,r63

	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

L8_15:	/* 8..15 byte memcpy cntd. */
	/* r0 = first quadword of the source, r6/r7 = halves of the last one. */
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6		/* last eight bytes go to r5-8 .. r5-1; may overlap the first eight */
	stlo.q r5, -8, r6
	blink tr1,r63

	/* 2 or 3 byte memcpy */
	/* Copies bytes 0 and 1 here, then the last byte at L2_3
	   (for a 2-byte copy, byte 1 is simply written twice). */
	ld.b r3,0,r0
	ld.b r2,0,r63			/* read first destination byte, value unused */
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6			/* r6 = last source byte */
	st.b r2,1,r1
	blink tr0, r63

	/* 4 .. 7 byte memcpy */
	/* Load the first and the last longword of the source, start storing
	   the first one, and finish at L4_7. */
	LDUAL (r3, 0, r0, r1)
	pta L4_7, tr0
	ldlo.l r6, -4, r7
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/* 8 .. 15 byte memcpy */
	/* Same scheme as the 4..7 byte case, using quadwords; finishes at L8_15. */
	LDUAQ (r3, 0, r0, r1)
	pta L8_15, tr0
	ldlo.q r6, -8, r7
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/* 16 .. 24 byte memcpy */
	/* Last table entry, so it is not limited to one 32-byte slot.
	   Copies source quadwords 0 and 1 plus the last quadword
	   (r6-8 .. r6-1) to the matching destination positions. */
	LDUAQ (r3, 0, r0, r1)
	LDUAQ (r3, 8, r8, r9)
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/*
	 * Large copy (count >= 25).
	 *
	 * Set-up:
	 *   r7  = r3 | -8, i.e. (r3 & 7) - 8, a value in -8 .. -1
	 *   r22 = r2 - r7 = destination + (8 - (source & 7)); running
	 *         destination pointer
	 *   r6  = r3 - r2, so r22 + r6 is always an 8-byte aligned source
	 *         address
	 *   r5  = destination end - 16
	 *
	 * r0 always carries the source quadword that belongs at the current
	 * r22.  It is loaded one step ahead of the stores that consume it.
	 */
Large:
	ld.b r2, 0, r63			/* read first destination byte, value unused */
	pta/l Loop_ua, tr1
	ori r3, -8, r7
	sub r2, r7, r22
	sub r3, r2, r6
	add r2, r4, r5
	ldlo.q r3, 0, r0		/* leading source bytes; presumably only up to the next 8-byte boundary -- confirm */
	addi r5, -16, r5
	movi 64+8, r27 // could subtract r7 from that.
	stlo.q r2, 0, r0		/* store the head at r2 .. r2+7; bytes from r22 on are rewritten below */
	sthi.q r2, 7, r0
	ldx.q r22, r6, r0		/* first aligned source quadword */
	bgtu/l r27, r4, tr1		/* 72 > count: skip the 32-byte loop, go to Loop_ua */

	/* Set-up for Loop_line: r27 = destination end - 64; r36, r19, r20 and
	   r21 are r6 plus 64, -24, -16 and -8, used as index registers
	   relative to r22. */
	addi r5, -48, r27
	pta/l Loop_line, tr0
	addi r6, 64, r36
	addi r6, -24, r19
	addi r6, -16, r20
	addi r6, -8, r21

	/* Copies 32 bytes per iteration: r0 (loaded earlier) plus three
	   freshly loaded quadwords, then reloads r0 for the next round. */
Loop_line:
	ldx.q r22, r36, r63		/* read source 64 bytes ahead, value unused (presumably a prefetch) */
	alloco r22, 32			/* NOTE(review): presumably allocates the destination cache block at r22+32 -- confirm */
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6, r0		/* source quadword for the next store at r22 */
	sthi.q r22, -17, r23
	sthi.q r22, -9, r24
	sthi.q r22, -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22, -8, r25
	bgeu r27, r22, tr0		/* repeat while r22 <= destination end - 64 */

	/* Copies 8 bytes per iteration.  tr1 still points here (set at Large). */
Loop_ua:
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	bgtu/l r5, r22, tr1		/* repeat while r22 < destination end - 16 */

	/* Tail: store the pending quadword in r0 at r22, then copy the last
	   eight source bytes (r7-8 .. r7-1, with r7 = source end) to the last
	   eight destination bytes (r5+8 .. r5+15).  The two stores may overlap. */
	add r3, r4, r7
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	ptabs r18,tr1			/* tr1 = final exit target */
	stlo.q r22, 0, r0
	or r1, r7, r1
	sthi.q r5, 15, r1
	stlo.q r5, 8, r1
	blink tr1, r63

	.size memcpy,.-memcpy