root/arch/arm64/lib/xor-neon.c


DEFINITIONS

This source file includes the following definitions:
  1. xor_arm64_neon_2
  2. xor_arm64_neon_3
  3. xor_arm64_neon_4
  4. xor_arm64_neon_5

// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

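/*
 * Each helper below XORs two to five equally sized source buffers into the
 * first one (p1) using NEON quadword loads and stores.  One loop iteration
 * processes sizeof(uint64x2_t) * 4 = 64 bytes per buffer, so callers are
 * expected to pass a byte count that is a nonzero multiple of 64.
 */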
void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
        unsigned long *p2)
{
        uint64_t *dp1 = (uint64_t *)p1;
        uint64_t *dp2 = (uint64_t *)p2;

        register uint64x2_t v0, v1, v2, v3;
        long lines = bytes / (sizeof(uint64x2_t) * 4);

        do {
                /* p1 ^= p2 */
                v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
                v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
                v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
                v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

                /* store */
                vst1q_u64(dp1 +  0, v0);
                vst1q_u64(dp1 +  2, v1);
                vst1q_u64(dp1 +  4, v2);
                vst1q_u64(dp1 +  6, v3);

                dp1 += 8;
                dp2 += 8;
        } while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
        unsigned long *p2, unsigned long *p3)
{
        uint64_t *dp1 = (uint64_t *)p1;
        uint64_t *dp2 = (uint64_t *)p2;
        uint64_t *dp3 = (uint64_t *)p3;

        register uint64x2_t v0, v1, v2, v3;
        long lines = bytes / (sizeof(uint64x2_t) * 4);

        do {
                /* p1 ^= p2 */
                v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
                v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
                v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
                v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

                /* p1 ^= p3 */
                v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
                v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
                v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
                v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

                /* store */
                vst1q_u64(dp1 +  0, v0);
                vst1q_u64(dp1 +  2, v1);
                vst1q_u64(dp1 +  4, v2);
                vst1q_u64(dp1 +  6, v3);

                dp1 += 8;
                dp2 += 8;
                dp3 += 8;
        } while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
        unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
        uint64_t *dp1 = (uint64_t *)p1;
        uint64_t *dp2 = (uint64_t *)p2;
        uint64_t *dp3 = (uint64_t *)p3;
        uint64_t *dp4 = (uint64_t *)p4;

        register uint64x2_t v0, v1, v2, v3;
        long lines = bytes / (sizeof(uint64x2_t) * 4);

        do {
                /* p1 ^= p2 */
                v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
                v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
                v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
                v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

                /* p1 ^= p3 */
                v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
                v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
                v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
                v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

                /* p1 ^= p4 */
                v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
                v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
                v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
                v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

                /* store */
                vst1q_u64(dp1 +  0, v0);
                vst1q_u64(dp1 +  2, v1);
                vst1q_u64(dp1 +  4, v2);
                vst1q_u64(dp1 +  6, v3);

                dp1 += 8;
                dp2 += 8;
                dp3 += 8;
                dp4 += 8;
        } while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
        unsigned long *p2, unsigned long *p3,
        unsigned long *p4, unsigned long *p5)
{
        uint64_t *dp1 = (uint64_t *)p1;
        uint64_t *dp2 = (uint64_t *)p2;
        uint64_t *dp3 = (uint64_t *)p3;
        uint64_t *dp4 = (uint64_t *)p4;
        uint64_t *dp5 = (uint64_t *)p5;

        register uint64x2_t v0, v1, v2, v3;
        long lines = bytes / (sizeof(uint64x2_t) * 4);

        do {
                /* p1 ^= p2 */
                v0 = veorq_u64(vld1q_u64(dp1 +  0), vld1q_u64(dp2 +  0));
                v1 = veorq_u64(vld1q_u64(dp1 +  2), vld1q_u64(dp2 +  2));
                v2 = veorq_u64(vld1q_u64(dp1 +  4), vld1q_u64(dp2 +  4));
                v3 = veorq_u64(vld1q_u64(dp1 +  6), vld1q_u64(dp2 +  6));

                /* p1 ^= p3 */
                v0 = veorq_u64(v0, vld1q_u64(dp3 +  0));
                v1 = veorq_u64(v1, vld1q_u64(dp3 +  2));
                v2 = veorq_u64(v2, vld1q_u64(dp3 +  4));
                v3 = veorq_u64(v3, vld1q_u64(dp3 +  6));

                /* p1 ^= p4 */
                v0 = veorq_u64(v0, vld1q_u64(dp4 +  0));
                v1 = veorq_u64(v1, vld1q_u64(dp4 +  2));
                v2 = veorq_u64(v2, vld1q_u64(dp4 +  4));
                v3 = veorq_u64(v3, vld1q_u64(dp4 +  6));

                /* p1 ^= p5 */
                v0 = veorq_u64(v0, vld1q_u64(dp5 +  0));
                v1 = veorq_u64(v1, vld1q_u64(dp5 +  2));
                v2 = veorq_u64(v2, vld1q_u64(dp5 +  4));
                v3 = veorq_u64(v3, vld1q_u64(dp5 +  6));

                /* store */
                vst1q_u64(dp1 +  0, v0);
                vst1q_u64(dp1 +  2, v1);
                vst1q_u64(dp1 +  4, v2);
                vst1q_u64(dp1 +  6, v3);

                dp1 += 8;
                dp2 += 8;
                dp3 += 8;
                dp4 += 8;
                dp5 += 8;
        } while (--lines > 0);
}

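/*
 * Table of the NEON helpers, exported so the arch XOR selection code can
 * pick them up (see the usage sketch after the listing).
 */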
struct xor_block_template const xor_block_inner_neon = {
        .name   = "__inner_neon__",
        .do_2   = xor_arm64_neon_2,
        .do_3   = xor_arm64_neon_3,
        .do_4   = xor_arm64_neon_4,
        .do_5   = xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");
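
For context, below is a minimal sketch of how the exported template is typically consumed, loosely modelled on the arch's asm/xor.h header; the wrapper names and the exact XOR_TRY_TEMPLATES wiring shown here are illustrative assumptions, not part of this file. Each do_N hook is wrapped in kernel_neon_begin()/kernel_neon_end() so the NEON register file may be used from kernel context, and the wrapped template is then offered to the generic XOR benchmarking and selection code in crypto/xor.c.

/*
 * Illustrative sketch (not part of xor-neon.c), assuming
 * CONFIG_KERNEL_MODE_NEON is enabled.
 */
#include <linux/raid/xor.h>
#include <asm/neon.h>

extern struct xor_block_template const xor_block_inner_neon;

static void xor_neon_2(unsigned long bytes, unsigned long *p1,
                       unsigned long *p2)
{
        kernel_neon_begin();
        xor_block_inner_neon.do_2(bytes, p1, p2);
        kernel_neon_end();
}

static void xor_neon_3(unsigned long bytes, unsigned long *p1,
                       unsigned long *p2, unsigned long *p3)
{
        kernel_neon_begin();
        xor_block_inner_neon.do_3(bytes, p1, p2, p3);
        kernel_neon_end();
}

static void xor_neon_4(unsigned long bytes, unsigned long *p1,
                       unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
        kernel_neon_begin();
        xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
        kernel_neon_end();
}

static void xor_neon_5(unsigned long bytes, unsigned long *p1,
                       unsigned long *p2, unsigned long *p3,
                       unsigned long *p4, unsigned long *p5)
{
        kernel_neon_begin();
        xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
        kernel_neon_end();
}

/* Wrapped template, benchmarked against the generic templates. */
static struct xor_block_template xor_block_arm64 = {
        .name   = "arm64_neon",
        .do_2   = xor_neon_2,
        .do_3   = xor_neon_3,
        .do_4   = xor_neon_4,
        .do_5   = xor_neon_5,
};

/* Offered to the benchmarking/selection loop in crypto/xor.c. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES                       \
        do {                                    \
                xor_speed(&xor_block_8regs);    \
                xor_speed(&xor_block_32regs);   \
                xor_speed(&xor_block_arm64);    \
        } while (0)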
