/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
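/*
 * The copy proceeds in increasing address order: for copies of at
 * least 16 bytes, a 0-15 byte head brings src to 16-byte alignment,
 * the bulk is copied in 16- and 64-byte blocks, and .Ltiny15 handles
 * the final 0-15 bytes.
 *
 * The ldp1/stp1, ldr1/str1, ldrh1/strh1 and ldrb1/strb1 helpers are
 * assumed to be macros provided by the file that includes this
 * template; dst advances with the copy while dstin preserves the
 * return value.
 */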
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	mov	dst, dstin
	cmp	count, #16
	/* When the length is less than 16 bytes, the accesses are left unaligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the leading bytes from src to dst in increasing address
	* order. This way the risk of overwriting the source data is
	* eliminated when the distance between src and dst is less than
	* 16. The memory accesses here are aligned.
	*/
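	/* Bits 0-3 of tmp2 select head copies of 1, 2, 4 and 8 bytes. */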
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
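	/*
	* Bits 5:4 of count give the number of remaining 16-byte blocks
	* (0-3); the branches below enter the ldp1/stp1 sequence so that
	* exactly that many blocks are copied.
	*/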
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	* Prefer to break one ldp/stp into several loads/stores that
	* access memory in increasing address order, rather than to
	* load/store 16 bytes from (src-16) to (dst-16) and step src
	* back to an aligned address, which is what the original
	* cortex-strings memcpy does. If the original scheme were kept
	* here, memmove would have to satisfy the precondition that src
	* is at least 16 bytes above dst, otherwise some source data
	* would be overwritten when memmove calls memcpy directly. To
	* keep memmove simple and to decouple memcpy from memmove, the
	* original scheme was dropped.
	*/
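	/* Bits 3-0 of count select tail copies of 8, 4, 2 and 1 bytes. */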
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*/
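	/* Copy 64 bytes as four 16-byte ldp1/stp1 pairs. */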
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	* Critical loop.  Start at a new cache line boundary.  Assuming
	* 64 bytes per line, this ensures the entire loop is in one line.
	*/
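	/* L1_CACHE_SHIFT is assumed to be provided by the includer (asm/cache.h in the kernel tree). */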
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Preload the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	* Interleave the loads of the next 64-byte block with the stores
	* of the previously loaded 64 bytes.
	*/
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
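	/* Store the final 64 bytes loaded by the last loop iteration. */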
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
