/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump
 *		leal ebx, [ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register-forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far with any MMX approach tried.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/i387.h>
#include <asm/asm.h>

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

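	/*
	 * Prime the prefetch queue.  The prefetch at label 1 is covered by
	 * the exception table: if it faults (as seen on early Athlon
	 * engineering samples), the fixup patches it into a two-byte short
	 * jmp (opcode 0xEB) over the whole prefetch block, so the faulting
	 * prefetches are skipped on every later call.
	 */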
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

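	/*
	 * Main copy loop: 64 bytes per iteration, prefetching 320 bytes
	 * (five cache lines) ahead.  The final blocks are handled by the
	 * second loop below without prefetch, which keeps the prefetch
	 * stream from running off the end of the source buffer.
	 */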
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1:  prefetch 320(%0)\n"
		"2:  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache-bypass load/store.  The Cyrix III, K6 and
 *	other MMX-capable processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq stores are weakly ordered, an "sfence" is needed to
	 * order them with respect to subsequent stores:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__(
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

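	/*
	 * Copy the bulk of the page, prefetching 320 bytes ahead.  The loop
	 * stops 320 bytes (five cache lines) short of the end so that the
	 * prefetch never reaches past the source page; the remainder is
	 * copied by the no-prefetch loop below.
	 */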
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Since movntq stores are weakly ordered, an "sfence" is needed to
	 * order them with respect to subsequent stores:
	 */
	__asm__ __volatile__("sfence\n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *	Generic MMX implementation without the K7-specific streaming stores.
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

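	/*
	 * Copy the whole page, 64 bytes per iteration, prefetching 320
	 * bytes ahead.  Near the end of the page the prefetch runs past
	 * the source; that is harmless, since prefetch is only a hint and
	 * any faulting prefetch is patched out by the exception fixup.
	 */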
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy, but fall back to plain string
 * operations when the FPU cannot be used (e.g. in interrupt context):
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

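	/*
	 * Clear 1024 dwords (one 4096-byte page) with "rep stosl":
	 * eax = 0, ecx = dword count, edi = destination page.
	 */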
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

			: "=&c" (d0), "=&D" (d1)
			: "a" (0), "1" (page), "0" (1024)
			: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

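	/*
	 * Copy 1024 dwords (one 4096-byte page) with "rep movsl":
	 * ecx = dword count, edi = to, esi = from.
	 */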
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
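
/*
 * Usage sketch (not part of this file): on 32-bit builds with
 * CONFIG_X86_USE_3DNOW, the page helpers above are typically wired up
 * roughly like this in the arch page headers; the exact location and
 * wrappers shown here are illustrative of the pattern, not a definitive
 * interface:
 *
 *	#include <asm/mmx.h>
 *
 *	static inline void clear_page(void *page)
 *	{
 *		mmx_clear_page(page);
 *	}
 *
 *	static inline void copy_page(void *to, void *from)
 *	{
 *		mmx_copy_page(to, from);
 *	}
 */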