1
2
3
4
5
6
7
8
9
10 #include <linux/linkage.h>
11
/*
 * Register roles used throughout this file.
 */

/* Integer argument registers (1st..4th argument, System V AMD64 order). */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/* Accumulators: four 64-bit running sums per pass, one register per pass. */
#define		PASS0_SUMS	%ymm0
#define		PASS1_SUMS	%ymm1
#define		PASS2_SUMS	%ymm2
#define		PASS3_SUMS	%ymm3

/*
 * Key registers.  Each *_XMM name is the low 128 bits of the YMM register
 * of the same number.
 */
#define		K0		%ymm4
#define		K0_XMM		%xmm4
#define		K1		%ymm5
#define		K1_XMM		%xmm5
#define		K2		%ymm6
#define		K2_XMM		%xmm6
#define		K3		%ymm7
#define		K3_XMM		%xmm7

/*
 * Temporaries.  T3 also carries the message data into _nh_2xstride; the
 * T2_XMM/T3_XMM names are the low 128 bits of T2/T3.
 */
#define		T0		%ymm8
#define		T1		%ymm9
#define		T2		%ymm10
#define		T2_XMM		%xmm10
#define		T3		%ymm11
#define		T3_XMM		%xmm11
#define		T4		%ymm12
#define		T5		%ymm13
#define		T6		%ymm14
#define		T7		%ymm15
38
/*
 * _nh_split sums, lo
 *
 * Helper for _nh_2xstride.  Within each 128-bit lane of \sums, holding the
 * dwords (s0 s1 s2 s3), arrange the operands for vpmuludq, which multiplies
 * only the low dword of every qword:
 *
 *	\lo   gets s0 and s1 in the low dwords of its two qwords
 *	\sums gets s2 and s3 in the low dwords of its two qwords
 *
 * The remaining dword positions hold don't-care values.
 */
.macro _nh_split	sums, lo
	vpshufd		$0x10, \sums, \lo
	vpshufd		$0x32, \sums, \sums
.endm

/*
 * _nh_2xstride k0, k1, k2, k3
 *
 * Process 32 bytes of message (two 16-byte strides, one per 128-bit lane)
 * for all four passes at once.
 *
 * In:    T3        = 32 bytes of message
 *        \k0..\k3  = key words for pass 0..3
 * Out:   PASSn_SUMS += (m0 + k0) * (m2 + k2)   [qword 0 of each lane]
 *        PASSn_SUMS += (m1 + k1) * (m3 + k3)   [qword 1 of each lane]
 *        where the additions wrap at 32 bits and the products are 64-bit.
 * Clobb: T0-T7 (including the message in T3)
 */
.macro _nh_2xstride	k0, k1, k2, k3

	/*
	 * Stage 1: message words + key words, per pass.  T3 is both the
	 * message source and the pass-3 destination, so it is written last.
	 */
	vpaddd		\k0, T3, T0
	vpaddd		\k1, T3, T1
	vpaddd		\k2, T3, T2
	vpaddd		\k3, T3, T3

	/* Stage 2: line up the two multiplicands of every product. */
	_nh_split	T0, T4
	_nh_split	T1, T5
	_nh_split	T2, T6
	_nh_split	T3, T7

	/* Stage 3: 32 x 32 => 64-bit unsigned multiplies. */
	vpmuludq	T4, T0, T0
	vpmuludq	T5, T1, T1
	vpmuludq	T6, T2, T2
	vpmuludq	T7, T3, T3

	/* Stage 4: fold the products into the per-pass accumulators. */
	vpaddq		T0, PASS0_SUMS, PASS0_SUMS
	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
.endm
65
66
67
68
69
70
71
/*
 * nh_avx2 - compute four multiply-accumulate passes over a message with AVX2
 *
 * ABI:   System V AMD64
 * In:    KEY (%rdi)         = key; MESSAGE_LEN + 0x30 bytes are read
 *        MESSAGE (%rsi)     = message
 *        MESSAGE_LEN (%rdx) = message length in bytes
 *        HASH (%rcx)        = output; 32 bytes (four 64-bit sums, pass 0
 *                             first) are written with an unaligned store
 * Out:   nothing in registers
 * Clobb: %rdi, %rsi, %rdx, flags, %ymm0-%ymm15 (upper halves are zeroed by
 *        the vzeroupper on exit)
 *
 * The message is consumed in 16-byte strides.  Pass p pairs each stride
 * with the key bytes located 16 * p bytes further into the key than pass 0
 * uses, so one pair of 32-byte key registers serves two strides and two
 * adjacent passes.
 *
 * The tail code always loads a full 16-byte stride, so callers are assumed
 * to pass a MESSAGE_LEN that is a multiple of 16 -- NOTE(review): confirm
 * against the callers.  The length checks use signed conditions, so
 * MESSAGE_LEN is also assumed to be below 2^63.
 */
ENTRY(nh_avx2)

	/* Preload the keys for the first two strides and clear the sums. */
	vmovdqu		0x00(KEY), K0
	vmovdqu		0x10(KEY), K1
	add		$0x20, KEY
	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS

	/*
	 * Main loop: 64 bytes (four strides) per iteration.  MESSAGE_LEN is
	 * kept biased by -0x40, so it goes negative once fewer than 64 bytes
	 * remain.
	 */
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	/* Strides 0-1: K0/K1 are already loaded; fetch K2/K3. */
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3

	/*
	 * Strides 2-3: K2/K3 become the first two keys; the newly loaded
	 * K0/K1 are reused by the next iteration (or by the tail code).
	 */
	vmovdqu		0x20(MESSAGE), T3
	vmovdqu		0x20(KEY), K0
	vmovdqu		0x30(KEY), K1
	_nh_2xstride	K2, K3, K0, K1

	add		$0x40, MESSAGE
	add		$0x40, KEY
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	/*
	 * The -0x40 bias does not affect the low six bits, so masking
	 * recovers the number of bytes still to be processed (0..63).
	 */
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone

	cmp		$0x20, MESSAGE_LEN
	jl		.Llast

	/* At least 32 bytes remain: process two more strides. */
	vmovdqu		(MESSAGE), T3
	vmovdqu		0x00(KEY), K2
	vmovdqu		0x10(KEY), K3
	_nh_2xstride	K0, K1, K2, K3
	add		$0x20, MESSAGE
	add		$0x20, KEY
	sub		$0x20, MESSAGE_LEN
	jz		.Ldone
	/* One stride is left; .Llast expects its first two keys in K0/K1. */
	vmovdqa		K2, K0
	vmovdqa		K3, K1
.Llast:
	/*
	 * Final single stride, processed as if it were two.  Every
	 * instruction below is a VEX-encoded 128-bit operation, which zeroes
	 * bits 255:128 of its destination, so the upper lane of the message
	 * and of all four keys is zero and contributes (0 + 0) * (0 + 0) = 0
	 * to the sums.  The register-to-itself moves exist only for that
	 * zeroing side effect.
	 */
	vmovdqu		(MESSAGE), T3_XMM
	vmovdqa		K0_XMM, K0_XMM
	vmovdqa		K1_XMM, K1_XMM
	vmovdqu		0x00(KEY), K2_XMM
	vmovdqu		0x10(KEY), K3_XMM
	_nh_2xstride	K0, K1, K2, K3

.Ldone:
	/*
	 * Add up the four qwords of each accumulator and store the four
	 * totals.  With PASSn_SUMS = (nA nB nC nD), transpose so that each
	 * register holds the same-lettered qword of every pass:
	 *
	 *	T0 = (0A 1A 0C 1C)	T1 = (0B 1B 0D 1D)
	 *	T2 = (2A 3A 2C 3C)	T3 = (2B 3B 2D 3D)
	 */
	vpunpcklqdq	PASS1_SUMS, PASS0_SUMS, T0
	vpunpckhqdq	PASS1_SUMS, PASS0_SUMS, T1
	vpunpcklqdq	PASS3_SUMS, PASS2_SUMS, T2
	vpunpckhqdq	PASS3_SUMS, PASS2_SUMS, T3

	/*
	 *	T4 = (0A 1A 2A 3A)	T5 = (0B 1B 2B 3B)
	 *	T0 = (0C 1C 2C 3C)	T1 = (0D 1D 2D 3D)
	 */
	vinserti128	$0x1, T2_XMM, T0, T4
	vinserti128	$0x1, T3_XMM, T1, T5
	vperm2i128	$0x31, T2, T0, T0
	vperm2i128	$0x31, T3, T1, T1

	/* T0 = A + B + C + D for passes 0..3. */
	vpaddq		T5, T4, T4
	vpaddq		T1, T0, T0
	vpaddq		T4, T0, T0
	vmovdqu		T0, (HASH)

	/*
	 * This function dirtied the upper halves of the YMM registers.
	 * Clear them before returning so that legacy-SSE code executed
	 * afterwards does not incur AVX/SSE transition penalties.
	 */
	vzeroupper
	ret
ENDPROC(nh_avx2)