/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

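/* When this file is #included by the user-copy wrappers
 * (U3copy_from_user.S, U3copy_to_user.S), EX_LD/EX_ST are overridden
 * to attach exception-table entries to each load and store, and
 * LOAD/STORE/STORE_BLK are overridden to use the user-space ASIs.
 * The defaults below build the plain in-kernel memcpy.
 */
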
#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */

	.text
	.align		64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
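	/* Trap (software trap 5) if any bit at or above bit 31 of the
	 * length is set; such a value is almost certainly a negative
	 * length from a buggy caller.
	 */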
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	cmp		%o2, (3 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0
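	/* %o3 is (dst | src); the condition codes set by the andcc in
	 * the delay slot are consumed at 70f to decide whether both
	 * buffers are 8-byte aligned.
	 */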

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	 sub		%o0, %o1, GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

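	/* GLOBAL_SPARE now holds (dst - src), so a store to
	 * (%o1 + GLOBAL_SPARE) lands at the matching destination byte
	 * and only %o1 has to be advanced.  %g1 is the sub-doubleword
	 * part of the alignment fixup (copied bytewise below), %g2 the
	 * doubleword-aligned part.
	 */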
1:	subcc		%g1, 0x1, %g1
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
	EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

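	/* alignaddr rounded %o1 down to an 8-byte boundary and stored
	 * the misalignment in %gsr for the faligndata instructions;
	 * %g1 remembers (src & 0x7) so the byte-copy fixup can recover
	 * the true source address later.
	 */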
	EX_LD(LOAD(ldd, %o1, %f4))
1:	EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST(STORE(std, %f0, %o0))
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST(STORE(std, %f2, %o0))
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

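	/* Main streaming loop setup: prefetch well ahead of the reads
	 * (#one_read hints that each line is read only once), and prime
	 * the first two 64-byte groups of the software pipeline.
	 */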
3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x000, %f0))
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f6, %f8, %f22

	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	faligndata	%f8, %f10, %f24
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
	faligndata	%f10, %f12, %f26
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))

	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop

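	/* %o3 = (number of full 64-byte blocks) - 2; the two blocks not
	 * counted here are written by the epilogue at 2f.  Each loop
	 * iteration overlaps the block store of one group with the
	 * loads and faligndata of the next.
	 */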
	.align		64
1:
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	faligndata	%f12, %f14, %f28
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))

	faligndata	%f8, %f10, %f24
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1

	/* Finally we copy the last two full 64-byte blocks. */
2:
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	faligndata	%f12, %f14, %f28
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	faligndata	%f6, %f8, %f22
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
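	/* Block stores are not ordered with respect to the rest of the
	 * memory model; the membar #Sync ensures they have completed
	 * before the non-block tail copy below.
	 */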
	membar		#Sync

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD(LOAD(ldd, %o1 + 0x00, %f0))

1:	EX_LD(LOAD(ldd, %o1 + 0x08, %f2))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST(STORE(std, %f8, %o0))
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD(LOAD(ldd, %o1 + 0x08, %f0))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST(STORE(std, %f8, %o0))
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
2:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, 85f
	 sub		%o0, %o1, %o3

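	/* %o3 = (dst - src) again.  If the source is still not 8-byte
	 * aligned the remaining bytes go through the bytewise copier at
	 * 90f; otherwise we peel the tail off with the largest loads
	 * the residual length allows (8, 4, 2, then 1 bytes).
	 */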
	andcc		%g1, 0x7, %g0
	bne,pn		%icc, 90f
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1

1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o1, %o5))
	EX_ST(STORE(sth, %o5, %o1 + %o3))
	add		%o1, 0x2, %o1

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, 85f
	 nop
	EX_LD(LOAD(ldub, %o1, %o5))
	ba,pt		%xcc, 85f
	 EX_ST(STORE(stb, %o5, %o1 + %o3))

	.align		64
70: /* 16 <= len < 192 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

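	/* dst and src are both 8-byte aligned here.  72: moves 16 bytes
	 * per iteration with paired ldx/stx; 73: mops up a remainder of
	 * less than 16 bytes with one 8-byte and one 4-byte move as
	 * needed, deferring anything smaller to the byte copier at 90f.
	 */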
72:
	andn		%o2, 0xf, GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

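	/* Unaligned 16..191 byte copy: first copy single bytes until
	 * dst is 8-byte aligned, then either rejoin the aligned path
	 * at 72b/73b or, if src is still misaligned, fall through to
	 * the shift-and-merge loop at 8f.
	 */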
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5))
	EX_ST(STORE(stb, %o5, %o1 + %o3))
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

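	/* Source is not 8-byte aligned: read aligned doublewords and
	 * build each output doubleword from two neighbours with a
	 * sllx/srlx/or merge.  %g1 = (src & 7) * 8 is the left-shift
	 * count; %o3 = 64 - %g1 the right-shift count.
	 */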
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80: /* 0 < len < 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1))
	EX_ST(STORE(stw, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
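	/* Byte-at-a-time copy, used when no larger access is safe. */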
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o1 + %o3))
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME