#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/fpu/api.h>

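/*
 * BLOCK(off, reg) is redefined by each routine below to process one
 * 32-byte chunk at byte offset 'off' through register %ymm<reg>; one
 * BLOCK16() expansion therefore covers a full 512-byte line with
 * registers ymm0-ymm3.
 */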
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

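/*
 * Compute p0 ^= p1 over 'bytes' bytes, one 512-byte line per loop
 * iteration: each BLOCK loads 32 bytes of p1 into a YMM register, XORs
 * in the matching 32 bytes of p0 and stores the result back to p0.
 * kernel_fpu_begin()/kernel_fpu_end() guard the YMM register usage.
 */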
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

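/* As xor_avx_2(), but folds two source lines (p1, p2) into p0. */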
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

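/* As xor_avx_2(), but folds three source lines (p1-p3) into p0. */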
static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

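/* As xor_avx_2(), but folds four source lines (p1-p4) into p0. */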
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

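/*
 * Template describing these routines to the generic XOR code
 * (struct xor_block_template, <linux/raid/xor.h>): do_N XORs N buffers
 * into the first one.
 */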
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

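/*
 * Hooks for the x86 xor template selection code: only benchmark and
 * select the AVX template when the CPU advertises AVX and the OS has
 * enabled extended state saving (OSXSAVE).
 */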
#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx && cpu_has_osxsave) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST)

#else

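/*
 * The assembler cannot emit AVX instructions; provide empty stand-ins
 * so callers compile unchanged and simply keep FASTEST.
 */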
#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif /* CONFIG_AS_AVX */
#endif /* _ASM_X86_XOR_AVX_H */