1
2
3
4 !
5 ! Fast SH memcpy
6 !
7 ! by Toshiyasu Morita (tm@netcom.com)
8 ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
9 ! SH5 code Copyright 2002 SuperH Ltd.
10 !
11 ! Entry: ARG0: destination pointer
12 ! ARG1: source pointer
13 ! ARG2: byte count
14 !
15 ! Exit: RESULT: destination pointer
16 ! any other registers in the range r0-r7: trashed
17 !
18 ! Notes: Usually one wants to do small reads and write a longword, but
19 ! unfortunately it is difficult in some cases to concatenate bytes
20 ! into a longword on the SH, so this does a longword read and small
21 ! writes.
22 !
23 ! This implementation makes two assumptions about how it is called:
24 !
25 ! 1.: If the byte count is nonzero, the address of the last byte to be
26 ! copied is unsigned greater than the address of the first byte to
27 ! be copied. This could be easily swapped for a signed comparison,
28 ! but the algorithm used needs some comparison.
29 !
30 ! 2.: When there are two or three bytes in the last word of an 11-or-more
31 ! bytes memory chunk to be copied, the rest of the word can be read
32 ! without side effects.
33 ! This could be easily changed by increasing the minimum size of
34 ! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
35 ! however, this would cost a few extra cycles on average.
36 ! For SHmedia, the assumption is that any quadword can be read in its
37 ! entirety if at least one byte is included in the copy.
38 !
39
! ---------------------------------------------------------------------
! memcpy (SHmedia): entry point and dispatch on the byte count.
!
! Registers as used by the code below:
!   r2  = destination pointer (never written by this routine, so it
!         still holds the destination pointer on return)
!   r3  = source pointer
!   r4  = byte count
!   r18 = return address; copied into tr1 for the final blink
!
! Counts of 25 or more branch to Large.  Smaller counts go through a
! computed branch into a table of 32-byte slots that starts at L1.
! The slot index is 63 - nsb(count):
!   count 0       -> L1
!   count 1       -> L1 + 32
!   count 2..3    -> L1 + 64
!   count 4..7    -> L1 + 96
!   count 8..15   -> L1 + 128
!   count 16..24  -> L1 + 160
! On arrival at a slot: r5 = destination + count, r6 = source + count,
! tr1 = return address.
!
! The four macros LDUAQ, STUAQ, LDUAL and STUAL each expand to the
! lo/hi instruction pair that accesses one unaligned quadword (.q, at
! offsets O and O+7) or longword (.l, at offsets O and O+3).  The load
! forms leave the two partial results in D0 and D1; the code below ORs
! them together itself.  STUAQ and STUAL are defined but not referenced
! in the code visible here.
! ---------------------------------------------------------------------
40 .section .text..SHmedia32,"ax"
41 .globl memcpy
42 .type memcpy, @function
43 .align 5
44
45 memcpy:
46
47 #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
48 #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
49 #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
50 #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
51
! Load source byte 0 with r63 as the target, so the value is discarded;
! presumably this touches the source early - TODO confirm intent.
52 ld.b r3,0,r63
53 pta/l Large,tr0
54 movi 25,r0
! count >= 25 (unsigned): take the Large path.
55 bgeu/u r4,r0,tr0
! r0 = nsb(count): 63 for 0, 62 for 1, 61 for 2..3, 60 for 4..7,
! 59 for 8..15, 58 for 16..24.
56 nsb r4,r0
! Scale by 32, the size of one table slot.
57 shlli r0,5,r0
! r1 = distance from L0 to L1 plus 63 slots.  The +1 presumably sets
! the low (SHmedia mode) bit of the branch target, and the mask
! presumably keeps the value within the movi immediate - TODO confirm.
58 movi (L1-L0+63*32 + 1) & 0xffff,r1
! r0 = r1 - 32 * nsb(count) = (L1 - L0) + 32 * (63 - nsb(count)) + 1.
59 sub r1, r0, r0
! tr0 = address of this ptrel instruction (L0) + r0, i.e. the slot.
60 L0: ptrel r0,tr0
61 add r2,r4,r5 ! r5 = destination + count
62 ptabs r18,tr1 ! tr1 = return address
63 add r3,r4,r6 ! r6 = source + count
64 blink tr0,r63 ! jump to the selected slot; link value discarded
65
66
! ---------------------------------------------------------------------
! Small-copy code for counts 0..24, reached through the computed branch
! at L0.
!
! State on entry to every slot (set up before the branch at L0):
!   r2 = destination, r3 = source, r4 = count,
!   r5 = destination + count, r6 = source + count, tr1 = return address.
!
! The dispatch assumes that the slots start at L1 and are exactly 32
! bytes apart.  Each SHmedia instruction is 4 bytes, so every slot is
! 8 instructions.  Short slots are padded with the continuation code of
! other slots (L2_3, L8_15) or with nops.  Do not add, remove or reorder
! instructions between L1 and the final blink without re-checking the
! slot offsets.
!
! Each multi-byte case copies a head chunk from the start of the buffer
! and a tail chunk that ends at the last byte; for counts that are not
! an exact multiple of the chunk size the two chunks overlap.
!
! NOTE(review): the 16..24 slot writes r8 and r9, which are outside the
! r0-r7 clobber range given in the file header - confirm this is
! allowed by the calling convention.
! ---------------------------------------------------------------------
67 .balign 8
! Continuation of the 4..7 byte slot (placed ahead of the table).
! On entry: r0 = first 4 source bytes, r6 and r7 = the two partial
! loads of the last 4 source bytes; the high part of r0 is already
! stored.
68 L4_7:
69 stlo.l r2, 0, r0
70 or r6, r7, r6 ! r6 = last 4 source bytes
71 sthi.l r5, -1, r6
72 stlo.l r5, -4, r6 ! store them so they end at destination + count - 1
73 blink tr1,r63 ! return
74
75 .balign 8
! Slot 0 (count 0): nothing to copy, return at once.  The nops pad the
! slot so that, together with L2_3, it is 8 instructions long.
76 L1:
77 nop
78 blink tr1,r63
79 nop
80 nop
81 nop
82 nop
83
! Continuation of the 2..3 byte slot; fills the rest of slot 0.
! On entry: r6 = last source byte; bytes 0 and 1 are already stored.
84 L2_3:
85 st.b r5,-1,r6 ! destination[count - 1] = last source byte
86 blink tr1,r63 ! return
87
88
! Slot 1 (count 1), at L1 + 32: copy the single byte.
89 ld.b r3,0,r0
90 st.b r2,0,r0
91 blink tr1,r63
92
! Continuation of the 8..15 byte slot; fills the rest of slot 1.
! On entry: r0 = first 8 source bytes, r6 and r7 = the two partial
! loads of the last 8 source bytes; the high part of r0 is already
! stored.
93 L8_15:
94 stlo.q r2, 0, r0
95 or r6, r7, r6 ! r6 = last 8 source bytes
96 sthi.q r5, -1, r6
97 stlo.q r5, -8, r6 ! store them so they end at destination + count - 1
98 blink tr1,r63 ! return
99
100
! Slot 2 (count 2..3), at L1 + 64: copy bytes 0 and 1 here, then the
! last byte at L2_3.  For count 2 the last byte is byte 1 again.
101 ld.b r3,0,r0
102 ld.b r2,0,r63 ! load from the destination, result discarded
103 ld.b r3,1,r1
104 st.b r2,0,r0
105 pta/l L2_3,tr0
106 ld.b r6,-1,r6 ! r6 = source[count - 1]
107 st.b r2,1,r1
108 blink tr0, r63
109
110
! Slot 3 (count 4..7), at L1 + 96: unaligned load of the first and the
! last 4 bytes, then finish at L4_7.  The macro on the next line
! expands to two instructions.
111 LDUAL (r3, 0, r0, r1)
112 pta L4_7, tr0
113 ldlo.l r6, -4, r7
114 or r0, r1, r0 ! r0 = first 4 source bytes
115 sthi.l r2, 3, r0
116 ldhi.l r6, -1, r6
117 blink tr0, r63
118
119
! Slot 4 (count 8..15), at L1 + 128: same scheme with 8-byte chunks,
! finished at L8_15.
120 LDUAQ (r3, 0, r0, r1)
121 pta L8_15, tr0
122 ldlo.q r6, -8, r7
123 or r0, r1, r0 ! r0 = first 8 source bytes
124 sthi.q r2, 7, r0
125 ldhi.q r6, -1, r6
126 blink tr0, r63
127
128
! Slot 5 (count 16..24), at L1 + 160: copy the first 16 bytes as two
! unaligned quadwords and the last 8 bytes as a third one.  This is the
! last slot, so it may be longer than 32 bytes.
129 LDUAQ (r3, 0, r0, r1)
130 LDUAQ (r3, 8, r8, r9)
131 or r0, r1, r0 ! r0 = source bytes 0..7
132 sthi.q r2, 7, r0
133 or r8, r9, r8 ! r8 = source bytes 8..15
134 sthi.q r2, 15, r8
135 ldlo.q r6, -8, r7
136 ldhi.q r6, -1, r6
137 stlo.q r2, 8, r8
138 stlo.q r2, 0, r0
139 or r6, r7, r6 ! r6 = last 8 source bytes
140 sthi.q r5, -1, r6
141 stlo.q r5, -8, r6
142 blink tr1,r63 ! return
143
! ---------------------------------------------------------------------
! Large: set-up for copies of 25 bytes or more (reached from the
! bgeu/u at the top of memcpy; r2 = destination, r3 = source,
! r4 = count).
!
! Register roles established here and used by the loops that follow:
!   r22 = destination cursor.  It starts at
!         destination + 8 - (source & 7), i.e. the destination address
!         that corresponds to the first 8-byte-aligned source address
!         above the source pointer.
!   r6  = source - destination, so r22 + r6 is the (aligned) source
!         address that matches the destination cursor.
!   r0  = source quadword already loaded for the cursor position but
!         not yet stored.
!   r5  = destination + count - 16, end bound for Loop_ua.
!   r27 = destination + count - 64, end bound for Loop_line.
!   r19, r20, r21 = r6 - 24, r6 - 16, r6 - 8; r36 = r6 + 64: index
!         registers for the ldx.q loads in Loop_line.
!   tr1 = Loop_ua, tr0 = Loop_line.
!
! NOTE(review): r19-r22, r27 and r36 are outside the r0-r7 clobber
! range given in the file header - confirm against the calling
! convention.
! ---------------------------------------------------------------------
144 Large:
145 ld.b r2, 0, r63 ! load destination byte 0, result discarded
146 pta/l Loop_ua, tr1
147 ori r3, -8, r7 ! r7 = (source & 7) - 8, a value in -8..-1
148 sub r2, r7, r22 ! r22 = destination + 8 - (source & 7)
149 sub r3, r2, r6 ! r6 = source - destination
150 add r2, r4, r5
151 ldlo.q r3, 0, r0 ! low partial load of the quadword at the source pointer
152 addi r5, -16, r5 ! r5 = destination + count - 16
153 movi 64+8, r27
! Store the head data at the destination with an unaligned lo/hi pair.
! Presumably any bytes at or beyond r22 are rewritten by the stores
! that follow - TODO confirm.
154 stlo.q r2, 0, r0
155 sthi.q r2, 7, r0
156 ldx.q r22, r6, r0 ! r0 = first aligned source quadword
! count < 72 (unsigned): skip the 32-byte loop and go to Loop_ua.
157 bgtu/l r27, r4, tr1
158
159 addi r5, -48, r27 ! r27 = destination + count - 64
160 pta/l Loop_line, tr0
161 addi r6, 64, r36
162 addi r6, -24, r19
163 addi r6, -16, r20
164 addi r6, -8, r21
165
! ---------------------------------------------------------------------
! Loop_line: main loop, 32 bytes per iteration.
!
! On entry (set up at Large): r22 = destination cursor, r6 = source
! minus destination, r0 = source quadword for the cursor position
! (loaded, not yet stored), r19/r20/r21 = r6 - 24/-16/-8,
! r36 = r6 + 64, r27 = destination + count - 64, tr0 = Loop_line.
!
! Each iteration advances r22 by 32, stores r0 and three newly loaded
! quadwords (r23, r24, r25) into the 32 bytes below the new r22 with
! unaligned sthi.q/stlo.q pairs, and reloads r0 for the next position.
! Loads and stores are interleaved; keep the instruction order.
!
! NOTE(review): r23-r25 are outside the r0-r7 clobber range given in
! the file header - confirm against the calling convention.
! ---------------------------------------------------------------------
166 Loop_line:
! Load from 64 bytes ahead of the current source position; the result
! is discarded (r63), presumably as a prefetch - TODO confirm.
167 ldx.q r22, r36, r63
! Cache allocate for the address r22 + 32; presumably so that the
! destination line need not be fetched before it is overwritten -
! TODO confirm.
168 alloco r22, 32
169 addi r22, 32, r22 ! advance the cursor; offsets below are relative to the new r22
170 ldx.q r22, r19, r23 ! r23 = source quadword for new r22 - 24
171 sthi.q r22, -25, r0
172 ldx.q r22, r20, r24 ! r24 = source quadword for new r22 - 16
173 ldx.q r22, r21, r25 ! r25 = source quadword for new r22 - 8
174 stlo.q r22, -32, r0 ! with the sthi.q above: r0 stored at new r22 - 32
175 ldx.q r22, r6, r0 ! r0 = source quadword for the new cursor position
176 sthi.q r22, -17, r23
177 sthi.q r22, -9, r24
178 sthi.q r22, -1, r25
179 stlo.q r22, -24, r23
180 stlo.q r22, -16, r24
181 stlo.q r22, -8, r25
! Repeat while r27 >= r22 (unsigned), i.e. cursor <= destination end - 64.
182 bgeu r27, r22, tr0
183
! ---------------------------------------------------------------------
! Loop_ua: 8 bytes per iteration, followed by the tail and the return.
!
! On entry (from Large directly, or by falling out of Loop_line):
!   r22 = destination cursor, r6 = source minus destination,
!   r0 = source quadword for the cursor position (loaded, not yet
!   stored), r5 = destination + count - 16, tr1 = Loop_ua,
!   r2/r3/r4 = the original destination, source and count.
!
! The tail stores the pending r0 and then copies the last 8 source
! bytes with an unaligned load/store so that they end exactly at
! destination + count - 1; this may overlap bytes already written.
! r2 is not modified, so it still holds the destination pointer when
! the routine returns.
! ---------------------------------------------------------------------
184 Loop_ua:
185 addi r22, 8, r22
186 sthi.q r22, -1, r0
187 stlo.q r22, -8, r0 ! with the sthi.q above: r0 stored at the old cursor
188 ldx.q r22, r6, r0 ! r0 = source quadword for the new cursor position
! Repeat while r5 > r22 (unsigned), i.e. cursor < destination end - 16.
189 bgtu/l r5, r22, tr1
190
191 add r3, r4, r7 ! r7 = source + count
192 ldlo.q r7, -8, r1
193 sthi.q r22, 7, r0
194 ldhi.q r7, -1, r7
195 ptabs r18,tr1 ! tr1 = return address (replaces the Loop_ua target)
196 stlo.q r22, 0, r0 ! with the sthi.q above: pending r0 stored at the cursor
197 or r1, r7, r1 ! r1 = last 8 source bytes
! r5 is destination + count - 16, so r5 + 15 is the last destination
! byte and r5 + 8 is destination + count - 8.
198 sthi.q r5, 15, r1
199 stlo.q r5, 8, r1
200 blink tr1, r63 ! return
201
202 .size memcpy,.-memcpy