1
2
3
4
5
6
7
/*
 * load_16way - read 256 contiguous bytes from src into eight registers.
 *
 *   src     base address register of the input buffer
 *   x0..x7  destinations; xN receives the 32 bytes at src + 32 * N
 *
 * The unaligned move form is used, so src needs no particular alignment.
 * The 32-byte stride implies 256-bit (ymm) operands.
 */
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu		0x00(src), x0; \
	vmovdqu		0x20(src), x1; \
	vmovdqu		0x40(src), x2; \
	vmovdqu		0x60(src), x3; \
	vmovdqu		0x80(src), x4; \
	vmovdqu		0xa0(src), x5; \
	vmovdqu		0xc0(src), x6; \
	vmovdqu		0xe0(src), x7;
17
/*
 * store_16way - write eight registers to 256 contiguous bytes at dst.
 *
 *   dst     base address register of the output buffer
 *   x0..x7  sources; xN is written to the 32 bytes at dst + 32 * N
 *
 * The unaligned move form is used, so dst needs no particular alignment.
 * No register is modified.
 */
#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu		x0, 0x00(dst); \
	vmovdqu		x1, 0x20(dst); \
	vmovdqu		x2, 0x40(dst); \
	vmovdqu		x3, 0x60(dst); \
	vmovdqu		x4, 0x80(dst); \
	vmovdqu		x5, 0xa0(dst); \
	vmovdqu		x6, 0xc0(dst); \
	vmovdqu		x7, 0xe0(dst);
27
/*
 * store_cbc_16way - xor sixteen 128-bit values with the input shifted by
 * one 16-byte block, then store the result to dst.
 *
 *   src     base address register of the input buffer
 *   dst     base address register of the output buffer
 *   x0..x7  values to chain and store (modified in place)
 *   t0      scratch register (clobbered)
 *
 * With the register file viewed as sixteen 128-bit values v0..v15
 * (xN = { v(2N), v(2N+1) }), the macro computes v[i] ^= src_block[i - 1]
 * for i = 1..15.  v0 is xored with zero, i.e. left unchanged, so any
 * xor needed for the first value has to be done by the caller.
 * NOTE(review): this is the chaining pattern of CBC decryption - confirm
 * against the callers.
 */
#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
	/* t0 = { 0, src_block[0] }: only the upper half of x0 is chained */ \
	vpxor		t0, t0, t0; \
	vinserti128	$1, (src), t0, t0; \
	vpxor		t0, x0, x0; \
	/* remaining registers: 32-byte windows starting 16 bytes into src */ \
	vpxor		0x10(src), x1, x1; \
	vpxor		0x30(src), x2, x2; \
	vpxor		0x50(src), x3, x3; \
	vpxor		0x70(src), x4, x4; \
	vpxor		0x90(src), x5, x5; \
	vpxor		0xb0(src), x6, x6; \
	vpxor		0xd0(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
40
/*
 * inc_le128 - add 1 to a 128-bit little-endian integer held as two qwords
 * { low, high } in each 128-bit lane of x.
 *
 *   x          counter, updated in place
 *   minus_one  constant { low = -1, high = 0 } in every lane (not modified)
 *   tmp        scratch register (clobbered)
 *
 * The low qword wraps exactly when it equals -1 before the increment; in
 * that case the all-ones compare result is moved into the high qword slot
 * and subtracted, which adds the carry to the high qword.
 */
#define inc_le128(x, minus_one, tmp) \
	/* carry mask, taken from the not yet incremented value */ \
	vpcmpeqq	minus_one, x, tmp; \
	vpslldq		$8, tmp, tmp; \
	/* low += 1 (subtracting -1), then high += carry */ \
	vpsubq		minus_one, x, x; \
	vpsubq		tmp, x, x;
46
/*
 * add2_le128 - add 2 to a 128-bit little-endian integer held as two qwords
 * { low, high } in each 128-bit lane of x.
 *
 *   x          counter, updated in place
 *   minus_one  constant { low = -1, high = 0 } in every lane (not modified)
 *   minus_two  constant { low = -2, high = 0 } in every lane (not modified)
 *   tmp1, tmp2 scratch registers (clobbered)
 *
 * Adding 2 wraps the low qword exactly when it is -1 or -2, so the two
 * compare results are or-ed to form the carry into the high qword.
 */
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	/* carry mask, taken from the not yet updated value */ \
	vpcmpeqq	minus_one, x, tmp1; \
	vpcmpeqq	minus_two, x, tmp2; \
	vpor		tmp2, tmp1, tmp1; \
	vpslldq		$8, tmp1, tmp1; \
	/* low += 2 (subtracting -2), then high += carry */ \
	vpsubq		minus_two, x, x; \
	vpsubq		tmp1, x, x;
54
/*
 * load_ctr_16way - expand the 128-bit counter at (iv) into sixteen
 * consecutive counter values and advance the stored counter by 16.
 *
 *   iv        base address register of the 16-byte counter; read with an
 *             unaligned load, then overwritten with counter + 16
 *   bswap     memory operand holding a 16-byte vpshufb mask, broadcast to
 *             both lanes (presumably a byte-reversal mask - confirm
 *             against the caller)
 *   x0..x7    outputs: xN = shuffled { counter + 2N, counter + 2N + 1 }
 *   t0..t5    scratch registers (clobbered)
 *   t0x..t3x  must name the 128-bit (xmm) halves of t0..t3; the macro
 *             relies on writes through tNx being visible in tN
 *
 * The counter is handled as a little-endian { low, high } qword pair.
 */
#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
		       t1x, t2, t2x, t3, t3x, t4, t5) \
	/* constants per lane: t0 = { -1, 0 }, t4 = { -2, 0 } */ \
	vpcmpeqd	t0, t0, t0; \
	vpsrldq		$8, t0, t0; \
	vpaddq		t0, t0, t4; \
	\
	/* t2 = { counter, counter + 1 }, low lane taken from the t3x copy */ \
	vmovdqu		(iv), t2x; \
	vmovdqa		t2x, t3x; \
	inc_le128(t2x, t0x, t1x); \
	vbroadcasti128	bswap, t1; /* only now: t1x was scratch just above */ \
	vinserti128	$1, t2x, t3, t2; \
	vpshufb		t1, t2, x0; \
	\
	/* step both lanes by two for every further output register */ \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb		t1, t2, x1; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb		t1, t2, x2; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb		t1, t2, x3; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb		t1, t2, x4; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb		t1, t2, x5; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb		t1, t2, x6; \
	add2_le128(t2, t0, t4, t3, t5); \
	vpshufb		t1, t2, x7; \
	\
	/* upper lane holds counter + 15; one more step gives the next start */ \
	vextracti128	$1, t2, t2x; \
	inc_le128(t2x, t0x, t3x); \
	vmovdqu		t2x, (iv);
87
/*
 * store_ctr_16way - xor eight registers with 256 bytes of input and store
 * the result.
 *
 *   src     base address register of the input buffer
 *   dst     base address register of the output buffer
 *   x0..x7  values to combine (modified in place): xN ^= src[32N .. 32N+31]
 *
 * All eight xors are issued before the first store to dst.
 */
#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor		0x00(src), x0, x0; \
	vpxor		0x20(src), x1, x1; \
	vpxor		0x40(src), x2, x2; \
	vpxor		0x60(src), x3, x3; \
	vpxor		0x80(src), x4, x4; \
	vpxor		0xa0(src), x5, x5; \
	vpxor		0xc0(src), x6, x6; \
	vpxor		0xe0(src), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
98
/*
 * gf128mul_x_ble - shift each 128-bit lane of iv left by one bit and fold
 * the shifted-out bits back in through mask.
 *
 *   iv    value to update in place, two qwords { low, high } per lane
 *   mask  per-dword constants selecting what is xored in (not modified)
 *   tmp   scratch register (clobbered)
 *
 * vpaddq doubles the two qwords independently, so bit 63 (carry into the
 * high qword) and bit 127 (carry out of the lane) are recovered from the
 * sign masks of the original dwords.  vpshufd $0x13 orders those masks as
 * dwords { d3, d0, d1, d0 }: slot 0 reflects bit 127 and slot 2 reflects
 * bit 63.  Slots 1 and 3 carry an unrelated bit, so mask is expected to be
 * zero there.
 * NOTE(review): presumably mask holds the GF(2^128) reduction constant in
 * dword 0 and 1 in dword 2 - confirm against the constant definition.
 */
#define gf128mul_x_ble(iv, mask, tmp) \
	/* carry terms, derived from the value before it is shifted */ \
	vpsrad		$31, iv, tmp; \
	vpshufd		$0x13, tmp, tmp; \
	vpand		mask, tmp, tmp; \
	/* per-qword shift left by one, then apply the carry terms */ \
	vpaddq		iv, iv, iv; \
	vpxor		tmp, iv, iv;
105
/*
 * gf128mul_x2_ble - shift each 128-bit lane of iv left by two bits and fold
 * the shifted-out bits back in through mask1 and mask2.
 *
 *   iv          value to update in place, two qwords { low, high } per lane
 *   mask1       constants applied for the second-highest bit of each qword
 *               (original bits 62 and 126)
 *   mask2       constants applied for the highest bit of each qword
 *               (original bits 63 and 127)
 *   tmp0, tmp1  scratch registers (clobbered)
 *
 * tmp0 collects the dword sign masks of iv itself, tmp1 those of iv << 1.
 * Both are rearranged with vpshufd $0x13 to dwords { d3, d0, d1, d0 }, so
 * the masks are expected to be zero in dword slots 1 and 3.
 * NOTE(review): presumably mask1 is the single-shift constant and mask2 the
 * same constant shifted by one more bit - confirm against the definitions.
 */
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	/* capture both sets of outgoing bits before iv is shifted */ \
	vpsrad		$31, iv, tmp0; \
	vpaddq		iv, iv, tmp1; \
	vpsrad		$31, tmp1, tmp1; \
	/* per-qword shift left by two */ \
	vpsllq		$2, iv, iv; \
	/* route the bit masks to their target dwords and select constants */ \
	vpshufd		$0x13, tmp0, tmp0; \
	vpshufd		$0x13, tmp1, tmp1; \
	vpand		mask2, tmp0, tmp0; \
	vpand		mask1, tmp1, tmp1; \
	/* fold both corrections into the shifted value */ \
	vpxor		tmp0, iv, iv; \
	vpxor		tmp1, iv, iv;
117
/*
 * load_xts_16way - derive sixteen successive tweaks from the value at (iv),
 * xor them into 256 bytes of input and park the tweaks in dst.
 *
 *   iv        base address register of the 16-byte tweak T; read with an
 *             unaligned load, then overwritten with the tweak that follows
 *             the sixteenth one
 *   src       base address register of the input buffer
 *   dst       base address register of the output buffer; receives the
 *             tweaks here, to be consumed by store_xts_16way
 *   x0..x7    outputs: xN = src[32N .. 32N+31] ^ tweak pair N
 *   tiv       register holding the current tweak pair (clobbered)
 *   t0..t3    scratch registers (clobbered)
 *   tivx, t0x, t1x, t2x
 *             must name the 128-bit (xmm) halves of tiv, t0, t1, t2; the
 *             macro relies on writes through them being visible in the
 *             full registers
 *   xts_gf128mul_and_shl1_mask_0, xts_gf128mul_and_shl1_mask_1
 *             memory operands with the 16-byte constants for the one-bit
 *             and two-bit multiply steps, broadcast to both lanes
 *
 * The low lane of tiv starts at T and the high lane at T * x; every
 * gf128mul_x2_ble step then advances both lanes by two tweaks.  Each
 * 32-byte chunk of src is read before the same offset of dst is written.
 */
#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
		       xts_gf128mul_and_shl1_mask_0, \
		       xts_gf128mul_and_shl1_mask_1) \
	vbroadcasti128	xts_gf128mul_and_shl1_mask_0, t1; \
	\
	/* tiv = { T, T * x }, low lane taken from the t0x copy */ \
	vmovdqu		(iv), tivx; \
	vmovdqa		tivx, t0x; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vbroadcasti128	xts_gf128mul_and_shl1_mask_1, t2; /* t2x was scratch */ \
	vinserti128	$1, tivx, t0, tiv; \
	vpxor		0x00(src), tiv, x0; \
	vmovdqu		tiv, 0x00(dst); \
	\
	/* advance two tweaks at a time; xor with input and save the tweaks */ \
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor		0x20(src), tiv, x1; \
	vmovdqu		tiv, 0x20(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor		0x40(src), tiv, x2; \
	vmovdqu		tiv, 0x40(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor		0x60(src), tiv, x3; \
	vmovdqu		tiv, 0x60(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor		0x80(src), tiv, x4; \
	vmovdqu		tiv, 0x80(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor		0xa0(src), tiv, x5; \
	vmovdqu		tiv, 0xa0(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor		0xc0(src), tiv, x6; \
	vmovdqu		tiv, 0xc0(dst); \
	\
	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
	vpxor		0xe0(src), tiv, x7; \
	vmovdqu		tiv, 0xe0(dst); \
	\
	/* high lane is the sixteenth tweak; one more step gives the next */ \
	vextracti128	$1, tiv, tivx; \
	gf128mul_x_ble(tivx, t1x, t2x); \
	vmovdqu		tivx, (iv);
165
/*
 * store_xts_16way - xor eight registers with the 256 bytes currently held
 * at dst and write the result back to dst.
 *
 *   dst     base address register of the output buffer; read first, then
 *           overwritten
 *   x0..x7  values to combine (modified in place): xN ^= dst[32N .. 32N+31]
 *
 * load_xts_16way leaves its tweaks in dst, so this is the matching second
 * tweak xor.  All eight reads of dst happen before the first store.
 */
#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor		0x00(dst), x0, x0; \
	vpxor		0x20(dst), x1, x1; \
	vpxor		0x40(dst), x2, x2; \
	vpxor		0x60(dst), x3, x3; \
	vpxor		0x80(dst), x4, x4; \
	vpxor		0xa0(dst), x5, x5; \
	vpxor		0xc0(dst), x6, x6; \
	vpxor		0xe0(dst), x7, x7; \
	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);