1/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memcpy
5!
6! by Toshiyasu Morita (tm@netcom.com)
7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8! SH5 code Copyright 2002 SuperH Ltd.
9!
10! Entry: ARG0: destination pointer
11!        ARG1: source pointer
12!        ARG2: byte count
13!
14! Exit:  RESULT: destination pointer
15!        any other registers in the range r0-r7: trashed
16!
17! Notes: Usually one wants to do small reads and write a longword, but
18!        unfortunately it is difficult in some cases to concatenate bytes
19!        into a longword on the SH, so this does a longword read and small
20!        writes.
21!
22! This implementation makes two assumptions about how it is called:
23!
24! 1.: If the byte count is nonzero, the address of the last byte to be
25!     copied is unsigned greater than the address of the first byte to
26!     be copied.  This could be easily swapped for a signed comparison,
27!     but the algorithm used needs some comparison.
28!
29! 2.: When there are two or three bytes in the last word of an 11-or-more
30!     bytes memory chunk to be copied, the rest of the word can be read
31!     without side effects.
32!     This could be easily changed by increasing the minimum size of
33!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
34!     however, this would cost a few extra cycles on average.
35!     For SHmedia, the assumption is that any quadword can be read in its
36!     entirety if at least one byte is included in the copy.
37!
38
/*
 * memcpy -- SHmedia implementation.
 *
 * Operand order throughout is "sources first, destination last"
 * (e.g. "add r2,r4,r5" computes r5 = r2 + r4).
 *
 * Register usage, as established by the code below:
 *   r2   destination pointer.  It is only ever used as a store base and
 *        is never written, so it still holds the destination on exit.
 *   r3   source pointer (only used as a load base).
 *   r4   byte count.
 *   r18  address that every exit path branches to (ptabs r18 / blink).
 *   r63  used as the target of loads whose value is not needed and as
 *        the (unused) link operand of every blink.
 *
 * Registers written by this routine: r0, r1, r5, r6, r7, r8, r9,
 * r19, r20, r21, r22, r23, r24, r25, r27, r36 and the branch target
 * registers tr0 and tr1.
 * NOTE(review): that is a larger set than the "r0-r7" range named in
 * the header comment of this file -- confirm against the SH5 ABI that
 * all of these are caller-saved.
 *
 * Structure:
 *   count <  25 : computed jump into a table of 32-byte code slots, one
 *                 slot per power-of-two size class.
 *   count >= 25 : "Large" -- an 8-bytes-per-iteration loop (Loop_ua),
 *                 preceded by a 32-bytes-per-iteration loop (Loop_line)
 *                 when count >= 72.
 */
39	.section .text..SHmedia32,"ax"
40	.globl	memcpy
41	.type	memcpy, @function
42	.align	5
43
44memcpy:
45
/*
 * Helpers for accesses at an arbitrarily aligned address P+O:
 *   LDUAQ / STUAQ  quadword (8 bytes),  lo part at O, hi part at O+7
 *   LDUAL / STUAL  longword (4 bytes),  lo part at O, hi part at O+3
 * The load forms leave two partial results in D0 and D1; the callers
 * below merge them with "or" before storing.
 * STUAQ and STUAL are defined but not used in the code below; the
 * stores are written out as separate sthi/stlo instructions instead.
 */
46#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
47#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
48#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
49#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
50
/*
 * Entry / small-copy dispatch.
 *
 * The first load reads the first source byte into r63, i.e. the value
 * is discarded (presumably a prefetch of the source -- confirm).
 *
 * If count >= 25 (unsigned compare) control goes to Large.
 *
 * Otherwise the branch target is computed as
 *     L0 + ((L1 - L0) + 63*32 + 1) - 32 * nsb(count)
 *   = L1 + 32 * (63 - nsb(count)) + 1
 * which selects one 32-byte slot (8 instructions of 4 bytes) per size
 * class, matching the slot comments further down:
 *     L1 +   0 : count 0
 *     L1 +  32 : count 1
 *     L1 +  64 : count 2..3
 *     L1 +  96 : count 4..7
 *     L1 + 128 : count 8..15
 *     L1 + 160 : count 16..24
 * The "+ 1" sets the low bit of the target address (presumably the
 * SHmedia mode bit expected by ptrel -- confirm against the ISA manual).
 *
 * Before the jump the common values are set up:
 *     r5  = dst + count   (one past the last destination byte)
 *     r6  = src + count   (one past the last source byte)
 *     tr1 = r18           (exit target)
 *
 * IMPORTANT: the table only works while every slot between L1 and Large
 * contains exactly 8 instructions.  The continuation fragments L2_3 and
 * L8_15 are placed inside the table to fill otherwise unused slot space.
 * Do not add, remove or reorder instructions in that region.
 */
51	ld.b r3,0,r63
52	pta/l Large,tr0
53	movi 25,r0
54	bgeu/u r4,r0,tr0
55	nsb r4,r0
56	shlli r0,5,r0
57	movi (L1-L0+63*32 + 1) & 0xffff,r1
58	sub r1, r0, r0
59L0:	ptrel r0,tr0
60	add r2,r4,r5
61	ptabs r18,tr1
62	add r3,r4,r6
63	blink tr0,r63
64
65/* Rearranged to make cut2 safe */
/*
 * L4_7 is the second half of the 4..7 byte case (reached from the
 * "4 .. 7 byte memcpy" slot).  On entry r0 holds the merged first
 * longword, r7/r6 the two parts of the last longword.  It finishes the
 * store of the first 4 bytes at dst and stores the last 4 bytes so that
 * they end at dst + count (r5); the two stores may overlap.
 * This fragment lies before L1 and is therefore not part of the table.
 */
66	.balign 8
67L4_7:	/* 4..7 byte memcpy cntd. */
68	stlo.l r2, 0, r0
69	or r6, r7, r6
70	sthi.l r5, -1, r6
71	stlo.l r5, -4, r6
72	blink tr1,r63
73
/* ---- start of the 32-byte slot table; slot 0 begins at L1 ---- */
74	.balign 8
75L1:	/* 0 byte memcpy */
	/* Slot 0 (count 0): nothing to copy, exit at once.  The four nops
	   after the blink pad the slot so that, together with the two
	   instructions of L2_3, it is exactly 8 instructions long. */
76	nop
77	blink tr1,r63
78	nop
79	nop
80	nop
81	nop
82
	/* Tail of the 2..3 byte case, stored in the spare space of slot 0:
	   writes the last source byte (already loaded into r6) to
	   dst + count - 1. */
83L2_3:	/* 2 or 3 byte memcpy cntd. */
84	st.b r5,-1,r6
85	blink tr1,r63
86
	/* Slot 1 (count 1): 3 instructions, padded to 8 by L8_15 below. */
87	/* 1 byte memcpy */
88	ld.b r3,0,r0
89	st.b r2,0,r0
90	blink tr1,r63
91
	/* Tail of the 8..15 byte case, stored in the spare space of slot 1.
	   On entry r0 holds the merged first quadword and r7/r6 the two
	   parts of the last quadword.  Finishes the store of the first
	   8 bytes and stores the last 8 bytes ending at dst + count (r5);
	   the two stores may overlap. */
92L8_15:	/* 8..15 byte memcpy cntd. */
93	stlo.q r2, 0, r0
94	or r6, r7, r6
95	sthi.q r5, -1, r6
96	stlo.q r5, -8, r6
97	blink tr1,r63
98
	/* Slot 2 (count 2..3): copies bytes 0 and 1 directly and loads the
	   last source byte (src + count - 1) into r6 for L2_3 to store.
	   For count == 2 the last byte is byte 1, so it is written twice.
	   The load from r2 into r63 reads the first destination byte and
	   discards it (presumably to touch the destination line early --
	   confirm). */
99	/* 2 or 3 byte memcpy */
100	ld.b r3,0,r0
101	ld.b r2,0,r63
102	ld.b r3,1,r1
103	st.b r2,0,r0
104	pta/l L2_3,tr0
105	ld.b r6,-1,r6
106	st.b r2,1,r1
107	blink tr0, r63
108
	/* Slot 3 (count 4..7): loads the first longword (r0|r1) and the two
	   parts of the last longword (r7, r6), starts the store of the
	   first longword, then continues at L4_7. */
109	/* 4 .. 7 byte memcpy */
110	LDUAL (r3, 0, r0, r1)
111	pta L4_7, tr0
112	ldlo.l r6, -4, r7
113	or r0, r1, r0
114	sthi.l r2, 3, r0
115	ldhi.l r6, -1, r6
116	blink tr0, r63
117
	/* Slot 4 (count 8..15): same scheme as slot 3 with quadwords;
	   continues at L8_15. */
118	/* 8 .. 15 byte memcpy */
119	LDUAQ (r3, 0, r0, r1)
120	pta L8_15, tr0
121	ldlo.q r6, -8, r7
122	or r0, r1, r0
123	sthi.q r2, 7, r0
124	ldhi.q r6, -1, r6
125	blink tr0, r63
126
	/* Slot 5 (count 16..24): last slot, so it may be longer than 32
	   bytes.  Copies the first 16 bytes as two quadwords (r0, r8) and
	   the last 8 bytes (r6|r7) ending at dst + count; the last store
	   may overlap the first 16 bytes. */
127	/* 16 .. 24 byte memcpy */
128	LDUAQ (r3, 0, r0, r1)
129	LDUAQ (r3, 8, r8, r9)
130	or r0, r1, r0
131	sthi.q r2, 7, r0
132	or r8, r9, r8
133	sthi.q r2, 15, r8
134	ldlo.q r6, -8, r7
135	ldhi.q r6, -1, r6
136	stlo.q r2, 8, r8
137	stlo.q r2, 0, r0
138	or r6, r7, r6
139	sthi.q r5, -1, r6
140	stlo.q r5, -8, r6
141	blink tr1,r63
142
/*
 * Large: count >= 25.
 *
 * Values set up here:
 *   r7  = src | -8          = (src & 7) - 8, a value in -8..-1
 *   r22 = dst - r7          = dst advanced by 1..8 bytes
 *   r6  = src - dst         so that (destination address) + r6 is the
 *                           matching source address
 *   r5  = dst + count - 16  loop bound for Loop_ua
 *   r27 = 72                threshold for using Loop_line
 *   tr1 = Loop_ua
 * r22 is chosen so that r22 + r6 = (src & ~7) + 8, i.e. every
 * "ldx.q r22, r6" below reads an 8-byte aligned source quadword.
 *
 * The head of the source (ldlo.q at src) is written as one 8-byte
 * unaligned store at dst.  The part of that store at and beyond r22 is
 * written again by the first store of Loop_line / Loop_ua / the tail,
 * all of which start at the initial r22.
 *
 * The first aligned source quadword is preloaded into r0, and r0 is
 * carried from iteration to iteration in both loops.
 *
 * If 72 > count the line loop is skipped and control goes straight to
 * Loop_ua.
 */
143Large:
144	ld.b r2, 0, r63
145	pta/l  Loop_ua, tr1
146	ori r3, -8, r7
147	sub r2, r7, r22
148	sub r3, r2, r6
149	add r2, r4, r5
150	ldlo.q r3, 0, r0
151	addi r5, -16, r5
152	movi 64+8, r27 // could subtract r7 from that.
153	stlo.q r2, 0, r0
154	sthi.q r2, 7, r0
155	ldx.q r22, r6, r0
156	bgtu/l r27, r4, tr1
157
/*
 * Set-up for Loop_line (count >= 72):
 *   r27 = r5 - 48 = dst + count - 64   loop bound
 *   r36 = r6 + 64                      source offset 64 bytes ahead
 *   r19 = r6 - 24, r20 = r6 - 16, r21 = r6 - 8
 *         source offsets for the 2nd, 3rd and 4th quadword of a
 *         32-byte group, relative to r22 after it has been advanced.
 */
158	addi r5, -48, r27
159	pta/l Loop_line, tr0
160	addi r6, 64, r36
161	addi r6, -24, r19
162	addi r6, -16, r20
163	addi r6, -8, r21
164
/*
 * Loop_line: copies 32 bytes per iteration.
 *   - The load into r63 reads the source 64 bytes ahead of the current
 *     position and discards the value (presumably a prefetch).
 *   - "alloco r22, 32" refers to the destination 32 bytes ahead of the
 *     current position (presumably allocating that cache block without
 *     fetching it from memory -- confirm against the SH-5 manual).
 *   - r22 is advanced by 32; r0 (from the previous load) and the newly
 *     loaded r23, r24, r25 are then written with sthi.q/stlo.q pairs to
 *     r22-32, r22-24, r22-16 and r22-8.
 *   - The next aligned source quadword is loaded into r0.
 * Repeats while r27 >= r22 (unsigned), then falls through to Loop_ua.
 * Loads and stores are interleaved; keep the order when editing.
 */
165Loop_line:
166	ldx.q r22, r36, r63
167	alloco r22, 32
168	addi r22, 32, r22
169	ldx.q r22, r19, r23
170	sthi.q r22, -25, r0
171	ldx.q r22, r20, r24
172	ldx.q r22, r21, r25
173	stlo.q r22, -32, r0
174	ldx.q r22, r6,  r0
175	sthi.q r22, -17, r23
176	sthi.q r22,  -9, r24
177	sthi.q r22,  -1, r25
178	stlo.q r22, -24, r23
179	stlo.q r22, -16, r24
180	stlo.q r22,  -8, r25
181	bgeu r27, r22, tr0
182
/*
 * Loop_ua: copies 8 bytes per iteration.  Advances r22 by 8, writes the
 * quadword held in r0 to r22-8 .. r22-1 with an sthi.q/stlo.q pair and
 * loads the next aligned source quadword into r0.
 * Repeats while r5 > r22 (unsigned), r5 being dst + count - 16.
 * tr1 still points at Loop_ua here; it is reloaded with r18 in the tail.
 */
183Loop_ua:
184	addi r22, 8, r22
185	sthi.q r22, -1, r0
186	stlo.q r22, -8, r0
187	ldx.q r22, r6, r0
188	bgtu/l r5, r22, tr1
189
/*
 * Tail.  Writes the quadword still held in r0 to r22 .. r22+7, then
 * copies the last 8 source bytes (the lo/hi parts loaded relative to
 * r7 = src + count, merged into r1) to r5+8 .. r5+15, which is
 * dst + count - 8 .. dst + count - 1.  This last store may overlap
 * bytes that were already written.
 * tr1 is reloaded from r18 for the exit; r2 is unchanged.
 */
190	add r3, r4, r7
191	ldlo.q r7, -8, r1
192	sthi.q r22, 7, r0
193	ldhi.q r7, -1, r7
194	ptabs r18,tr1
195	stlo.q r22, 0, r0
196	or r1, r7, r1
197	sthi.q r5, 15, r1
198	stlo.q r5, 8, r1
199	blink tr1, r63
200
201	.size memcpy,.-memcpy
202