root/arch/x86/include/asm/xor.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


DEFINITIONS

This source file includes the following definitions.
  1. xor_sse_2
  2. xor_sse_2_pf64
  3. xor_sse_3
  4. xor_sse_3_pf64
  5. xor_sse_4
  6. xor_sse_4_pf64
  7. xor_sse_5
  8. xor_sse_5_pf64

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 #ifndef _ASM_X86_XOR_H
   3 #define _ASM_X86_XOR_H
   4 
   5 /*
   6  * Optimized RAID-5 checksumming functions for SSE.
   7  */
   8 
   9 /*
  10  * Cache avoiding checksumming functions utilizing KNI instructions
  11  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
  12  */
  13 
  14 /*
  15  * Based on
  16  * High-speed RAID5 checksumming functions utilizing SSE instructions.
  17  * Copyright (C) 1998 Ingo Molnar.
  18  */
  19 
  20 /*
  21  * x86-64 changes / gcc fixes from Andi Kleen.
  22  * Copyright 2002 Andi Kleen, SuSE Labs.
  23  *
  24  * This hasn't been optimized for the hammer yet, but there are likely
  25  * no advantages to be gotten from x86-64 here anyways.
  26  */
  27 
  28 #include <asm/fpu/api.h>
  29 
  30 #ifdef CONFIG_X86_32
  31 /* reduce register pressure */
  32 # define XOR_CONSTANT_CONSTRAINT "i"
  33 #else
  34 # define XOR_CONSTANT_CONSTRAINT "re"
  35 #endif
  36 
     /*
      * XOR_CONSTANT_CONSTRAINT is the inline-asm constraint used for the
      * [inc] operand (the constant 256UL) in every function below: "i"
      * (immediate only) on 32-bit builds, "re" (register or 32-bit
      * sign-extended immediate) otherwise.
      *
      * The remaining macros are string fragments that are pasted together
      * to build the asm templates. In all of them x is an index counted in
      * 16-byte units and y is the number of the XMM register to use:
      *
      *   OFFS(x)     - byte offset 16*x
      *   PF_OFFS(x)  - byte offset 256 + 16*x, used only by the prefetches
      *   PF0 .. PF4  - prefetchnta at PF_OFFS(x) from p1 .. p5
      *   LD(x, y)    - movaps from OFFS(x)(p1) into %xmm<y>
      *   ST(x, y)    - movaps from %xmm<y> to OFFS(x)(p1)
      *   XO1 .. XO4  - xorps OFFS(x)(p2 .. p5) into %xmm<y>
      *   NOP(x)      - expands to nothing; passed to BLK64 when no
      *                 prefetch is wanted
      */
  37 #define OFFS(x)         "16*("#x")"
  38 #define PF_OFFS(x)      "256+16*("#x")"
  39 #define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
  40 #define LD(x, y)        "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
  41 #define ST(x, y)        "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
  42 #define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
  43 #define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
  44 #define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
  45 #define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
  46 #define XO1(x, y)       "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
  47 #define XO2(x, y)       "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
  48 #define XO3(x, y)       "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
  49 #define XO4(x, y)       "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
  50 #define NOP(x)
  51 
     /*
      * BLK64(pf, op, i): one 64-byte step. Emits pf(i) once, then applies
      * op to the four consecutive 16-byte slots i .. i + 3, pairing them
      * with registers xmm0 .. xmm3. Used by the *_pf64 variants, which
      * therefore issue one prefetch per 64 bytes of each buffer.
      */
  52 #define BLK64(pf, op, i)                                \
  53                 pf(i)                                   \
  54                 op(i, 0)                                \
  55                         op(i + 1, 1)                    \
  56                                 op(i + 2, 2)            \
  57                                         op(i + 3, 3)
  58 
  59 static void
  60 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
  61 {
             /*
              * xor_sse_2 - XOR the buffer at p2 into the buffer at p1.
              *
              * @bytes: buffer length in bytes; only whole 256-byte chunks
              *         are processed (bytes >> 8), any remainder is ignored.
              * @p1:    destination buffer, also the first source operand.
              * @p2:    second source buffer; only read.
              *
              * NOTE(review): the asm loop tests the chunk counter after the
              * body (dec/jnz), so it always runs at least once; bytes is
              * presumably always >= 256 - confirm against callers.
              * NOTE(review): movaps is the aligned SSE move, so both buffers
              * are presumably 16-byte aligned - confirm against callers.
              */
  62         unsigned long lines = bytes >> 8;
  63 
             /* The SSE code is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
  64         kernel_fpu_begin();
  65 
  66         asm volatile(
             /*
              * BLOCK(i) handles the 64 bytes starting at 16-byte slot i:
              * load four slots of p1 into xmm0-xmm3, xor in the matching
              * slots of p2, store the result back to p1. Interleaved with
              * that, it prefetches p2 at PF_OFFS(i) and PF_OFFS(i + 2) and
              * p1 at PF_OFFS(i + 4) and PF_OFFS(i + 6).
              */
  67 #undef BLOCK
  68 #define BLOCK(i)                                        \
  69                 LD(i, 0)                                \
  70                         LD(i + 1, 1)                    \
  71                 PF1(i)                                  \
  72                                 PF1(i + 2)              \
  73                                 LD(i + 2, 2)            \
  74                                         LD(i + 3, 3)    \
  75                 PF0(i + 4)                              \
  76                                 PF0(i + 6)              \
  77                 XO1(i, 0)                               \
  78                         XO1(i + 1, 1)                   \
  79                                 XO1(i + 2, 2)           \
  80                                         XO1(i + 3, 3)   \
  81                 ST(i, 0)                                \
  82                         ST(i + 1, 1)                    \
  83                                 ST(i + 2, 2)            \
  84                                         ST(i + 3, 3)    \
  85 
  86 
             /*
              * Before entering the loop, prefetch p1 at PF_OFFS(0) and
              * PF_OFFS(2); inside the loop the p1 prefetches start at
              * PF_OFFS(4), so these two cover the offsets the loop skips.
              */
  87                 PF0(0)
  88                                 PF0(2)
  89 
  90         " .align 32                     ;\n"
  91         " 1:                            ;\n"
  92 
             /* Four 64-byte blocks = one 256-byte chunk per iteration. */
  93                 BLOCK(0)
  94                 BLOCK(4)
  95                 BLOCK(8)
  96                 BLOCK(12)
  97 
             /* Advance both pointers by [inc] (256) and loop while cnt != 0. */
  98         "       add %[inc], %[p1]       ;\n"
  99         "       add %[inc], %[p2]       ;\n"
 100         "       dec %[cnt]              ;\n"
 101         "       jnz 1b                  ;\n"
             /* cnt and the pointers are modified by the asm, hence "+r". */
 102         : [cnt] "+r" (lines),
 103           [p1] "+r" (p1), [p2] "+r" (p2)
 104         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 105         : "memory");
 106 
 107         kernel_fpu_end();
 108 }
 109 
 110 static void
 111 xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 112 {
             /*
              * xor_sse_2_pf64 - XOR the buffer at p2 into the buffer at p1,
              * issuing one prefetch per 64 bytes of each buffer.
              *
              * @bytes: buffer length in bytes; only whole 256-byte chunks
              *         are processed (bytes >> 8), any remainder is ignored.
              * @p1:    destination buffer, also the first source operand.
              * @p2:    second source buffer; only read.
              *
              * NOTE(review): the asm loop tests the chunk counter after the
              * body (dec/jnz), so it always runs at least once; bytes is
              * presumably always >= 256 - confirm against callers.
              */
 113         unsigned long lines = bytes >> 8;
 114 
             /* The SSE code is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 115         kernel_fpu_begin();
 116 
 117         asm volatile(
             /*
              * BLOCK(i) handles the 64 bytes starting at 16-byte slot i in
              * three BLK64 steps: prefetch p1 and load it into xmm0-xmm3,
              * prefetch p2 and xor it in, then store to p1 (NOP = no
              * prefetch for the store step).
              */
 118 #undef BLOCK
 119 #define BLOCK(i)                        \
 120                 BLK64(PF0, LD, i)       \
 121                 BLK64(PF1, XO1, i)      \
 122                 BLK64(NOP, ST, i)       \
 123 
 124         " .align 32                     ;\n"
 125         " 1:                            ;\n"
 126 
             /* Four 64-byte blocks = one 256-byte chunk per iteration. */
 127                 BLOCK(0)
 128                 BLOCK(4)
 129                 BLOCK(8)
 130                 BLOCK(12)
 131 
             /* Advance both pointers by [inc] (256) and loop while cnt != 0. */
 132         "       add %[inc], %[p1]       ;\n"
 133         "       add %[inc], %[p2]       ;\n"
 134         "       dec %[cnt]              ;\n"
 135         "       jnz 1b                  ;\n"
             /* cnt and the pointers are modified by the asm, hence "+r". */
 136         : [cnt] "+r" (lines),
 137           [p1] "+r" (p1), [p2] "+r" (p2)
 138         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 139         : "memory");
 140 
 141         kernel_fpu_end();
 142 }
 143 
 144 static void
 145 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 146           unsigned long *p3)
 147 {
             /*
              * xor_sse_3 - XOR the buffers at p2 and p3 into the buffer at p1.
              *
              * @bytes:  buffer length in bytes; only whole 256-byte chunks
              *          are processed (bytes >> 8), any remainder is ignored.
              * @p1:     destination buffer, also the first source operand.
              * @p2, @p3: further source buffers; only read.
              *
              * NOTE(review): the asm loop tests the chunk counter after the
              * body (dec/jnz), so it always runs at least once; bytes is
              * presumably always >= 256 - confirm against callers.
              * NOTE(review): movaps is the aligned SSE move, so the buffers
              * are presumably 16-byte aligned - confirm against callers.
              */
 148         unsigned long lines = bytes >> 8;
 149 
             /* The SSE code is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 150         kernel_fpu_begin();
 151 
 152         asm volatile(
             /*
              * BLOCK(i) handles the 64 bytes starting at 16-byte slot i:
              * load four slots of p1 into xmm0-xmm3, xor in the matching
              * slots of p2 and then p3, store the result back to p1.
              * Prefetches for p2 and p3 use PF_OFFS(i) and PF_OFFS(i + 2);
              * those for p1 use PF_OFFS(i + 4) and PF_OFFS(i + 6).
              */
 153 #undef BLOCK
 154 #define BLOCK(i) \
 155                 PF1(i)                                  \
 156                                 PF1(i + 2)              \
 157                 LD(i, 0)                                \
 158                         LD(i + 1, 1)                    \
 159                                 LD(i + 2, 2)            \
 160                                         LD(i + 3, 3)    \
 161                 PF2(i)                                  \
 162                                 PF2(i + 2)              \
 163                 PF0(i + 4)                              \
 164                                 PF0(i + 6)              \
 165                 XO1(i, 0)                               \
 166                         XO1(i + 1, 1)                   \
 167                                 XO1(i + 2, 2)           \
 168                                         XO1(i + 3, 3)   \
 169                 XO2(i, 0)                               \
 170                         XO2(i + 1, 1)                   \
 171                                 XO2(i + 2, 2)           \
 172                                         XO2(i + 3, 3)   \
 173                 ST(i, 0)                                \
 174                         ST(i + 1, 1)                    \
 175                                 ST(i + 2, 2)            \
 176                                         ST(i + 3, 3)    \
 177 
 178 
             /*
              * Before entering the loop, prefetch p1 at PF_OFFS(0) and
              * PF_OFFS(2); inside the loop the p1 prefetches start at
              * PF_OFFS(4), so these two cover the offsets the loop skips.
              */
 179                 PF0(0)
 180                                 PF0(2)
 181 
 182         " .align 32                     ;\n"
 183         " 1:                            ;\n"
 184 
             /* Four 64-byte blocks = one 256-byte chunk per iteration. */
 185                 BLOCK(0)
 186                 BLOCK(4)
 187                 BLOCK(8)
 188                 BLOCK(12)
 189 
             /* Advance all pointers by [inc] (256) and loop while cnt != 0. */
 190         "       add %[inc], %[p1]       ;\n"
 191         "       add %[inc], %[p2]       ;\n"
 192         "       add %[inc], %[p3]       ;\n"
 193         "       dec %[cnt]              ;\n"
 194         "       jnz 1b                  ;\n"
             /* cnt and the pointers are modified by the asm, hence "+r". */
 195         : [cnt] "+r" (lines),
 196           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
 197         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 198         : "memory");
 199 
 200         kernel_fpu_end();
 201 }
 202 
 203 static void
 204 xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 205                unsigned long *p3)
 206 {
             /*
              * xor_sse_3_pf64 - XOR the buffers at p2 and p3 into the buffer
              * at p1, issuing one prefetch per 64 bytes of each buffer.
              *
              * @bytes:  buffer length in bytes; only whole 256-byte chunks
              *          are processed (bytes >> 8), any remainder is ignored.
              * @p1:     destination buffer, also the first source operand.
              * @p2, @p3: further source buffers; only read.
              *
              * NOTE(review): the asm loop tests the chunk counter after the
              * body (dec/jnz), so it always runs at least once; bytes is
              * presumably always >= 256 - confirm against callers.
              */
 207         unsigned long lines = bytes >> 8;
 208 
             /* The SSE code is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 209         kernel_fpu_begin();
 210 
 211         asm volatile(
             /*
              * BLOCK(i) handles the 64 bytes starting at 16-byte slot i in
              * BLK64 steps: prefetch p1 and load it into xmm0-xmm3, prefetch
              * and xor in p2, prefetch and xor in p3, then store to p1
              * (NOP = no prefetch for the store step).
              */
 212 #undef BLOCK
 213 #define BLOCK(i)                        \
 214                 BLK64(PF0, LD, i)       \
 215                 BLK64(PF1, XO1, i)      \
 216                 BLK64(PF2, XO2, i)      \
 217                 BLK64(NOP, ST, i)       \
 218 
 219         " .align 32                     ;\n"
 220         " 1:                            ;\n"
 221 
             /* Four 64-byte blocks = one 256-byte chunk per iteration. */
 222                 BLOCK(0)
 223                 BLOCK(4)
 224                 BLOCK(8)
 225                 BLOCK(12)
 226 
             /* Advance all pointers by [inc] (256) and loop while cnt != 0. */
 227         "       add %[inc], %[p1]       ;\n"
 228         "       add %[inc], %[p2]       ;\n"
 229         "       add %[inc], %[p3]       ;\n"
 230         "       dec %[cnt]              ;\n"
 231         "       jnz 1b                  ;\n"
             /* cnt and the pointers are modified by the asm, hence "+r". */
 232         : [cnt] "+r" (lines),
 233           [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
 234         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 235         : "memory");
 236 
 237         kernel_fpu_end();
 238 }
 239 
 240 static void
 241 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 242           unsigned long *p3, unsigned long *p4)
 243 {
             /*
              * xor_sse_4 - XOR the buffers at p2, p3 and p4 into the buffer
              * at p1.
              *
              * @bytes:    buffer length in bytes; only whole 256-byte chunks
              *            are processed (bytes >> 8), any remainder is
              *            ignored.
              * @p1:       destination buffer, also the first source operand.
              * @p2 .. @p4: further source buffers; only read.
              *
              * NOTE(review): the asm loop tests the chunk counter after the
              * body (dec/jnz), so it always runs at least once; bytes is
              * presumably always >= 256 - confirm against callers.
              * NOTE(review): movaps is the aligned SSE move, so the buffers
              * are presumably 16-byte aligned - confirm against callers.
              */
 244         unsigned long lines = bytes >> 8;
 245 
             /* The SSE code is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 246         kernel_fpu_begin();
 247 
 248         asm volatile(
             /*
              * BLOCK(i) handles the 64 bytes starting at 16-byte slot i:
              * load four slots of p1 into xmm0-xmm3, xor in the matching
              * slots of p2, p3 and p4 in turn, store the result back to p1.
              * Prefetches for p2-p4 use PF_OFFS(i) and PF_OFFS(i + 2);
              * those for p1 use PF_OFFS(i + 4) and PF_OFFS(i + 6).
              */
 249 #undef BLOCK
 250 #define BLOCK(i) \
 251                 PF1(i)                                  \
 252                                 PF1(i + 2)              \
 253                 LD(i, 0)                                \
 254                         LD(i + 1, 1)                    \
 255                                 LD(i + 2, 2)            \
 256                                         LD(i + 3, 3)    \
 257                 PF2(i)                                  \
 258                                 PF2(i + 2)              \
 259                 XO1(i, 0)                               \
 260                         XO1(i + 1, 1)                   \
 261                                 XO1(i + 2, 2)           \
 262                                         XO1(i + 3, 3)   \
 263                 PF3(i)                                  \
 264                                 PF3(i + 2)              \
 265                 PF0(i + 4)                              \
 266                                 PF0(i + 6)              \
 267                 XO2(i, 0)                               \
 268                         XO2(i + 1, 1)                   \
 269                                 XO2(i + 2, 2)           \
 270                                         XO2(i + 3, 3)   \
 271                 XO3(i, 0)                               \
 272                         XO3(i + 1, 1)                   \
 273                                 XO3(i + 2, 2)           \
 274                                         XO3(i + 3, 3)   \
 275                 ST(i, 0)                                \
 276                         ST(i + 1, 1)                    \
 277                                 ST(i + 2, 2)            \
 278                                         ST(i + 3, 3)    \
 279 
 280 
             /*
              * Before entering the loop, prefetch p1 at PF_OFFS(0) and
              * PF_OFFS(2); inside the loop the p1 prefetches start at
              * PF_OFFS(4), so these two cover the offsets the loop skips.
              */
 281                 PF0(0)
 282                                 PF0(2)
 283 
 284         " .align 32                     ;\n"
 285         " 1:                            ;\n"
 286 
             /* Four 64-byte blocks = one 256-byte chunk per iteration. */
 287                 BLOCK(0)
 288                 BLOCK(4)
 289                 BLOCK(8)
 290                 BLOCK(12)
 291 
             /* Advance all pointers by [inc] (256) and loop while cnt != 0. */
 292         "       add %[inc], %[p1]       ;\n"
 293         "       add %[inc], %[p2]       ;\n"
 294         "       add %[inc], %[p3]       ;\n"
 295         "       add %[inc], %[p4]       ;\n"
 296         "       dec %[cnt]              ;\n"
 297         "       jnz 1b                  ;\n"
             /* cnt and the pointers are modified by the asm, hence "+r". */
 298         : [cnt] "+r" (lines), [p1] "+r" (p1),
 299           [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
 300         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 301         : "memory");
 302 
 303         kernel_fpu_end();
 304 }
 305 
 306 static void
 307 xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 308                unsigned long *p3, unsigned long *p4)
 309 {
             /*
              * xor_sse_4_pf64 - XOR the buffers at p2, p3 and p4 into the
              * buffer at p1, issuing one prefetch per 64 bytes of each
              * buffer.
              *
              * @bytes:    buffer length in bytes; only whole 256-byte chunks
              *            are processed (bytes >> 8), any remainder is
              *            ignored.
              * @p1:       destination buffer, also the first source operand.
              * @p2 .. @p4: further source buffers; only read.
              *
              * NOTE(review): the asm loop tests the chunk counter after the
              * body (dec/jnz), so it always runs at least once; bytes is
              * presumably always >= 256 - confirm against callers.
              */
 310         unsigned long lines = bytes >> 8;
 311 
             /* The SSE code is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 312         kernel_fpu_begin();
 313 
 314         asm volatile(
             /*
              * BLOCK(i) handles the 64 bytes starting at 16-byte slot i in
              * BLK64 steps: prefetch p1 and load it into xmm0-xmm3, prefetch
              * and xor in p2, p3 and p4 in turn, then store to p1
              * (NOP = no prefetch for the store step).
              */
 315 #undef BLOCK
 316 #define BLOCK(i)                        \
 317                 BLK64(PF0, LD, i)       \
 318                 BLK64(PF1, XO1, i)      \
 319                 BLK64(PF2, XO2, i)      \
 320                 BLK64(PF3, XO3, i)      \
 321                 BLK64(NOP, ST, i)       \
 322 
 323         " .align 32                     ;\n"
 324         " 1:                            ;\n"
 325 
             /* Four 64-byte blocks = one 256-byte chunk per iteration. */
 326                 BLOCK(0)
 327                 BLOCK(4)
 328                 BLOCK(8)
 329                 BLOCK(12)
 330 
             /* Advance all pointers by [inc] (256) and loop while cnt != 0. */
 331         "       add %[inc], %[p1]       ;\n"
 332         "       add %[inc], %[p2]       ;\n"
 333         "       add %[inc], %[p3]       ;\n"
 334         "       add %[inc], %[p4]       ;\n"
 335         "       dec %[cnt]              ;\n"
 336         "       jnz 1b                  ;\n"
             /* cnt and the pointers are modified by the asm, hence "+r". */
 337         : [cnt] "+r" (lines), [p1] "+r" (p1),
 338           [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
 339         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 340         : "memory");
 341 
 342         kernel_fpu_end();
 343 }
 344 
 345 static void
 346 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 347           unsigned long *p3, unsigned long *p4, unsigned long *p5)
 348 {
             /*
              * xor_sse_5 - XOR the buffers at p2, p3, p4 and p5 into the
              * buffer at p1.
              *
              * @bytes:    buffer length in bytes; only whole 256-byte chunks
              *            are processed (bytes >> 8), any remainder is
              *            ignored.
              * @p1:       destination buffer, also the first source operand.
              * @p2 .. @p5: further source buffers; only read.
              *
              * NOTE(review): the asm loop tests the chunk counter after the
              * body (dec/jnz), so it always runs at least once; bytes is
              * presumably always >= 256 - confirm against callers.
              * NOTE(review): movaps is the aligned SSE move, so the buffers
              * are presumably 16-byte aligned - confirm against callers.
              */
 349         unsigned long lines = bytes >> 8;
 350 
             /* The SSE code is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 351         kernel_fpu_begin();
 352 
 353         asm volatile(
             /*
              * BLOCK(i) handles the 64 bytes starting at 16-byte slot i:
              * load four slots of p1 into xmm0-xmm3, xor in the matching
              * slots of p2, p3, p4 and p5 in turn, store the result back to
              * p1. Prefetches for p2-p5 use PF_OFFS(i) and PF_OFFS(i + 2);
              * those for p1 use PF_OFFS(i + 4) and PF_OFFS(i + 6).
              */
 354 #undef BLOCK
 355 #define BLOCK(i) \
 356                 PF1(i)                                  \
 357                                 PF1(i + 2)              \
 358                 LD(i, 0)                                \
 359                         LD(i + 1, 1)                    \
 360                                 LD(i + 2, 2)            \
 361                                         LD(i + 3, 3)    \
 362                 PF2(i)                                  \
 363                                 PF2(i + 2)              \
 364                 XO1(i, 0)                               \
 365                         XO1(i + 1, 1)                   \
 366                                 XO1(i + 2, 2)           \
 367                                         XO1(i + 3, 3)   \
 368                 PF3(i)                                  \
 369                                 PF3(i + 2)              \
 370                 XO2(i, 0)                               \
 371                         XO2(i + 1, 1)                   \
 372                                 XO2(i + 2, 2)           \
 373                                         XO2(i + 3, 3)   \
 374                 PF4(i)                                  \
 375                                 PF4(i + 2)              \
 376                 PF0(i + 4)                              \
 377                                 PF0(i + 6)              \
 378                 XO3(i, 0)                               \
 379                         XO3(i + 1, 1)                   \
 380                                 XO3(i + 2, 2)           \
 381                                         XO3(i + 3, 3)   \
 382                 XO4(i, 0)                               \
 383                         XO4(i + 1, 1)                   \
 384                                 XO4(i + 2, 2)           \
 385                                         XO4(i + 3, 3)   \
 386                 ST(i, 0)                                \
 387                         ST(i + 1, 1)                    \
 388                                 ST(i + 2, 2)            \
 389                                         ST(i + 3, 3)    \
 390 
 391 
             /*
              * Before entering the loop, prefetch p1 at PF_OFFS(0) and
              * PF_OFFS(2); inside the loop the p1 prefetches start at
              * PF_OFFS(4), so these two cover the offsets the loop skips.
              */
 392                 PF0(0)
 393                                 PF0(2)
 394 
 395         " .align 32                     ;\n"
 396         " 1:                            ;\n"
 397 
             /* Four 64-byte blocks = one 256-byte chunk per iteration. */
 398                 BLOCK(0)
 399                 BLOCK(4)
 400                 BLOCK(8)
 401                 BLOCK(12)
 402 
             /* Advance all pointers by [inc] (256) and loop while cnt != 0. */
 403         "       add %[inc], %[p1]       ;\n"
 404         "       add %[inc], %[p2]       ;\n"
 405         "       add %[inc], %[p3]       ;\n"
 406         "       add %[inc], %[p4]       ;\n"
 407         "       add %[inc], %[p5]       ;\n"
 408         "       dec %[cnt]              ;\n"
 409         "       jnz 1b                  ;\n"
             /* cnt and the pointers are modified by the asm, hence "+r". */
 410         : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
 411           [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
 412         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 413         : "memory");
 414 
 415         kernel_fpu_end();
 416 }
 417 
 418 static void
 419 xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 420                unsigned long *p3, unsigned long *p4, unsigned long *p5)
 421 {
             /*
              * xor_sse_5_pf64 - XOR the buffers at p2, p3, p4 and p5 into
              * the buffer at p1, issuing one prefetch per 64 bytes of each
              * buffer.
              *
              * @bytes:    buffer length in bytes; only whole 256-byte chunks
              *            are processed (bytes >> 8), any remainder is
              *            ignored.
              * @p1:       destination buffer, also the first source operand.
              * @p2 .. @p5: further source buffers; only read.
              *
              * NOTE(review): the asm loop tests the chunk counter after the
              * body (dec/jnz), so it always runs at least once; bytes is
              * presumably always >= 256 - confirm against callers.
              */
 422         unsigned long lines = bytes >> 8;
 423 
             /* The SSE code is bracketed by kernel_fpu_begin()/kernel_fpu_end(). */
 424         kernel_fpu_begin();
 425 
 426         asm volatile(
             /*
              * BLOCK(i) handles the 64 bytes starting at 16-byte slot i in
              * BLK64 steps: prefetch p1 and load it into xmm0-xmm3, prefetch
              * and xor in p2, p3, p4 and p5 in turn, then store to p1
              * (NOP = no prefetch for the store step).
              */
 427 #undef BLOCK
 428 #define BLOCK(i)                        \
 429                 BLK64(PF0, LD, i)       \
 430                 BLK64(PF1, XO1, i)      \
 431                 BLK64(PF2, XO2, i)      \
 432                 BLK64(PF3, XO3, i)      \
 433                 BLK64(PF4, XO4, i)      \
 434                 BLK64(NOP, ST, i)       \
 435 
 436         " .align 32                     ;\n"
 437         " 1:                            ;\n"
 438 
             /* Four 64-byte blocks = one 256-byte chunk per iteration. */
 439                 BLOCK(0)
 440                 BLOCK(4)
 441                 BLOCK(8)
 442                 BLOCK(12)
 443 
             /* Advance all pointers by [inc] (256) and loop while cnt != 0. */
 444         "       add %[inc], %[p1]       ;\n"
 445         "       add %[inc], %[p2]       ;\n"
 446         "       add %[inc], %[p3]       ;\n"
 447         "       add %[inc], %[p4]       ;\n"
 448         "       add %[inc], %[p5]       ;\n"
 449         "       dec %[cnt]              ;\n"
 450         "       jnz 1b                  ;\n"
             /* cnt and the pointers are modified by the asm, hence "+r". */
 451         : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
 452           [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
 453         : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
 454         : "memory");
 455 
 456         kernel_fpu_end();
 457 }
 458 
 459 static struct xor_block_template xor_block_sse_pf64 = {
             /*
              * Template named "prefetch64-sse" that bundles the *_pf64
              * routines: do_N is the variant taking N buffers (p1 .. pN).
              * struct xor_block_template is declared outside this file.
              */
 460         .name = "prefetch64-sse",
 461         .do_2 = xor_sse_2_pf64,
 462         .do_3 = xor_sse_3_pf64,
 463         .do_4 = xor_sse_4_pf64,
 464         .do_5 = xor_sse_5_pf64,
 465 };
 466 
 467 #undef LD
     /*
      * This run of #undef lines removes the asm helper macros defined
      * earlier in this header before further headers are included below.
      * NOTE(review): OFFS, PF_OFFS and PF0 .. PF4 are not undefined here -
      * presumably still needed by the headers included below; confirm.
      */
 468 #undef XO1
 469 #undef XO2
 470 #undef XO3
 471 #undef XO4
 472 #undef ST
 473 #undef NOP
 474 #undef BLK64
 475 #undef BLOCK
 476 
 477 #undef XOR_CONSTANT_CONSTRAINT
 478 
     /* Pull in the word-size specific part of the XOR implementation. */
 479 #ifdef CONFIG_X86_32
 480 # include <asm/xor_32.h>
 481 #else
 482 # include <asm/xor_64.h>
 483 #endif
 484 
     /*
      * XOR_SELECT_TEMPLATE(FASTEST) simply forwards to AVX_SELECT().
      * AVX_SELECT is not defined in this file; presumably it comes from the
      * headers included above - confirm.
      */
 485 #define XOR_SELECT_TEMPLATE(FASTEST) \
 486         AVX_SELECT(FASTEST)
 487 
 488 #endif /* _ASM_X86_XOR_H */

/* [<][>][^][v][top][bottom][index][help] */