/*
 *    Optimized memory copy routines.
 *
 *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *    Copyright (C) 2013 Helge Deller <deller@gmx.de>
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *    Portions derived from the GNU C Library
 *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are used to get the best performance under various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
 * fp regs. This is followed by loops that copy 32 or 16 bytes at a time using
 * general registers.  Unaligned copies are handled either by aligning the
 * destination and then using a shift-and-write method, or in a few cases by
 * falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version
 * of memcpy (written in C) is actually quite fast already. This routine is
 * able to beat it by 30-40% for aligned copies because of the loop unrolling,
 * but in some cases the glibc version is still slightly faster. This lends
 * more credibility to the idea that gcc can generate very good code as long
 * as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */
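
/*
 * For orientation, the block below is an illustrative sketch (kept under
 * "#if 0" and never compiled) of the overall structure the real routine
 * follows: align the destination with a byte loop, move the bulk with an
 * unrolled word loop, then finish the tail byte by byte.  The name
 * sketch_copy is hypothetical; the sketch ignores the space registers,
 * exception tables and fp-register variants used by the real code.
 */
#if 0
static void *sketch_copy(void *dstp, const void *srcp, unsigned long len)
{
	unsigned char *d = dstp;
	const unsigned char *s = srcp;

	/* Copy single bytes until the destination is word-aligned. */
	while (len && ((unsigned long)d & (sizeof(unsigned int) - 1))) {
		*d++ = *s++;
		len--;
	}

	/* Bulk copy, four words per iteration (assumes src is now aligned
	 * as well; the real code falls back to shifts otherwise). */
	while (len >= 4 * sizeof(unsigned int)) {
		const unsigned int *ws = (const unsigned int *)s;
		unsigned int *wd = (unsigned int *)d;
		unsigned int w0 = ws[0], w1 = ws[1], w2 = ws[2], w3 = ws[3];

		wd[0] = w0;
		wd[1] = w1;
		wd[2] = w2;
		wd[3] = w3;
		s += 4 * sizeof(unsigned int);
		d += 4 * sizeof(unsigned int);
		len -= 4 * sizeof(unsigned int);
	}

	/* Byte tail. */
	while (len--)
		*d++ = *s++;

	return dstp;
}
#endif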

#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)	do {					\
	volatile int dummy = 0;						\
	/* The following branch is never taken, it's just here to */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

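/* MERGE() combines two successive source words into one aligned result
 * word using the PA-RISC shrpw (shift right pair) instruction: sh_2 is
 * written to the SAR register and shrpw then extracts 32 bits from the
 * concatenation of w0 (high) and w1 (low).  On this big-endian machine it
 * is roughly equivalent to (w0 << sh_1) | (w1 >> sh_2), assuming
 * 0 < sh_2 < 32 as set up by copy_dstaligned() below. */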
#define MERGE(w0, sh_1, w1, sh_2)  ({					\
	unsigned int _r;						\
	asm volatile (							\
	"mtsar %3\n"							\
	"shrpw %1, %2, %%sar, %0\n"					\
	: "=r"(_r)							\
	: "r"(w0), "r"(w1), "r"(sh_2)					\
	);								\
	_r;								\
})
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

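/* The macros below emit a single load or store wrapped with a local label
 * and an exception-table entry (ASM_EXCEPTIONTABLE_ENTRY), so that a fault
 * on the access is fixed up by branching to the handler label passed in as
 * _e instead of oopsing.  The "_ai" variants (presumably "auto-increment")
 * use the ",ma" completer to post-modify the address register by _sz bytes.
 * "r8" is listed as clobbered, presumably because the fixup path may use it. */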
#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)		\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t"	\
	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

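/* Prefetching is done with plain loads whose target is %r0: on PA-RISC a
 * load to general register 0 only pulls the line into the cache and
 * discards the data, so it serves as a prefetch hint without clobbering
 * any register. */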
#ifdef  CONFIG_PREFETCH
static inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

static inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr) do { } while(0)
#define prefetch_dst(addr) do { } while(0)
#endif

#define PA_MEMCPY_OK		0
#define PA_MEMCPY_LOAD_ERROR	1
#define PA_MEMCPY_STORE_ERROR	2

/* Copy from an unaligned src to an aligned dst using shifts. Handles 4 words
 * per loop.  This code is derived from glibc.
 */
static noinline unsigned long copy_dstaligned(unsigned long dst,
					unsigned long src, unsigned long len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;

	/* prefetch_src((const void *)src); */

	/* Calculate how much to shift a word read at the memory-operation-
	   aligned src so that it lines up for the copy.  */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down.  */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return PA_MEMCPY_OK;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -=-1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -=-2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;			/* No-op.  */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return PA_MEMCPY_OK;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	return PA_MEMCPY_STORE_ERROR;
}
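
/*
 * Again purely for illustration (hypothetical name, never compiled): the
 * effect of copy_dstaligned() above without the 4-way unrolling, the
 * staged entry points (do0..do4) and the exception handling.  MERGE() is
 * modelled as the big-endian expression (hi << sh_1) | (lo >> sh_2);
 * src_aligned is the rounded-down source and, as used here, 0 < sh_1 < 32.
 */
#if 0
static void sketch_shift_copy(unsigned int *dst,
			      const unsigned int *src_aligned,
			      unsigned long nwords, int sh_1, int sh_2)
{
	unsigned int hi = src_aligned[0];
	unsigned long i;

	for (i = 0; i < nwords; i++) {
		unsigned int lo = src_aligned[i + 1];

		/* Build one aligned destination word from two source words. */
		dst[i] = (hi << sh_1) | (lo >> sh_2);
		hi = lo;
	}
}
#endif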


/* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR.
 * In case of an access fault the faulty address can be read from the per_cpu
 * exception data struct. */
static noinline unsigned long pa_memcpy_internal(void *dstp, const void *srcp,
					unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

#if 0
	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}
#endif

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return PA_MEMCPY_OK;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination.  */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	return PA_MEMCPY_LOAD_ERROR;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	return PA_MEMCPY_STORE_ERROR;
}


/* Returns 0 for success; otherwise returns the number of bytes not transferred. */
static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	unsigned long ret, fault_addr, reference;
	struct exception_data *d;

	ret = pa_memcpy_internal(dstp, srcp, len);
	if (likely(ret == PA_MEMCPY_OK))
		return 0;

	/* if a load or store fault occurred we can get the faulty address */
	d = this_cpu_ptr(&exception_data);
	fault_addr = d->fault_addr;

	/* error in load or store? */
	if (ret == PA_MEMCPY_LOAD_ERROR)
		reference = (unsigned long) srcp;
	else
		reference = (unsigned long) dstp;

	DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n",
		ret, len, fault_addr, reference);

	if (fault_addr >= reference)
		return len - (fault_addr - reference);
	else
		return len;
}
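
/* Example of the residual calculation above: if a 100-byte copy faults on
 * the load at srcp + 40, fault_addr - reference is 40 and pa_memcpy()
 * returns 100 - 40 = 60 bytes not copied. */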

#ifdef __KERNEL__
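/*
 * The user-copy wrappers below set up the space registers that the
 * s_space/d_space macros refer to: %sr1 is loaded with the space of the
 * source and %sr2 with the space of the destination before pa_memcpy()
 * is called.
 */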
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

EXPORT_SYMBOL(__copy_from_user);
unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}


void * memcpy(void * dst,const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);

long probe_kernel_read(void *dst, const void *src, size_t size)
{
	unsigned long addr = (unsigned long)src;

	if (addr < PAGE_SIZE)
		return -EFAULT;

	/* check for I/O space F_EXTEND(0xfff00000) access as well? */

	return __probe_kernel_read(dst, src, size);
}

#endif