/*
 * MMX 3DNow! library helper functions
 *
 * To do:
 * We can use MMX just for prefetch in IRQs. This may be a win.
 *	(reported so on K6-III)
 * We should use a better code-neutral filler for the short jump:
 *	leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 * We also want to clobber the filler register so we don't get any
 *	register forwarding stalls on the filler.
 *
 * Add *user handling. Checksums are not a win with MMX on any CPU
 * tested so far for any MMX solution figured.
 *
 * 22/09/2000 - Arjan van de Ven
 *	Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/i387.h>
#include <asm/asm.h>

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6;		/* len/64 */

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);
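
/*
 * For reference, the structure of _mmx_memcpy() above, minus the MMX
 * register moves and the prefetch fault fixup, boils down to this
 * plain-C sketch (illustrative only; copy_one_64_byte_block() is a
 * hypothetical helper, not part of this file):
 *
 *	size_t blocks = len >> 6;		// whole 64-byte chunks
 *	while (blocks--) {
 *		copy_one_64_byte_block(to, from);
 *		from += 64;
 *		to += 64;
 *	}
 *	__memcpy(to, from, len & 63);		// remaining tail bytes
 *
 * The .fixup sections handle CPUs on which a prefetch can fault: the
 * exception table entry sends a faulting prefetch to label 3, which
 * overwrites the prefetch at label 1 with a two-byte short jmp
 * (0xEB + displacement, hence "jmp on 26 bytes" / "jmp on 5 bytes"),
 * so later passes simply skip the prefetch group.
 */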

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache-bypass load/store. The Cyrix III, K6 and
 * other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to order
	 * the stores with respect to whatever follows:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * Maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to order
	 * the stores with respect to whatever follows:
	 */
	__asm__ __volatile__("sfence\n"::);
	kernel_fpu_end();
}
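
/*
 * Note on the loop bounds above: the main copy loop prefetches 320
 * bytes ahead of the read pointer, so it runs for only
 * (4096-320)/64 = 59 iterations; the remaining 320/64 = 5 blocks are
 * copied by the second, prefetch-free loop, presumably so we never
 * prefetch past the end of the source page.
 */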

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7 specific streaming
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy, but fall back to the plain
 * string operations below when called from interrupt context:
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024)
		: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
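
/*
 * Typical usage (illustrative sketch, not part of this file): on
 * 32-bit builds that select the 3DNow! page operations, the generic
 * page helpers are expected to dispatch to the routines above along
 * these lines:
 *
 *	static inline void clear_page(void *page)
 *	{
 *		mmx_clear_page(page);
 *	}
 *
 *	static inline void copy_page(void *to, void *from)
 *	{
 *		mmx_copy_page(to, from);
 *	}
 */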