/*
 * arch/score/lib/csum_partial.S
 *
 * Score Processor version.
 *
 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
 *  Lennox Wu <lennox.wu@sunplusct.com>
 *  Chen Liqin <liqin.chen@sunplusct.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see the file COPYING, or write
 * to the Free Software Foundation, Inc.,
 * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
25#include <linux/linkage.h>
26
/*
 * ADDC(sum, reg) -- ones'-complement accumulate (RFC 1071 style):
 *   sum += reg; if the 32-bit add wrapped (result unsigned-less than
 *   reg), fold the carry bit back into the low end.
 * Clobbers the condition flags; uses local label 9.
 */
#define ADDC(sum,reg)			\
	add	sum, sum, reg;		\
	cmp.c	reg, sum;		\
	bleu	9f;			\
	addi	sum, 0x1;		\
9:
33
/*
 * CSUM_BIGCHUNK(src, offset, sum) -- accumulate the 32 bytes at
 * [src + offset, src + offset + 0x1f] into 'sum' as eight word-sized
 * ADDC steps.  Words are loaded four at a time into r8-r11 (clobbered).
 * 'src' itself is NOT advanced -- callers adjust the pointer afterwards.
 */
#define CSUM_BIGCHUNK(src, offset, sum)		\
	lw	r8, [src, offset + 0x00];	\
	lw	r9, [src, offset + 0x04];	\
	lw	r10, [src, offset + 0x08];	\
	lw	r11, [src, offset + 0x0c];	\
	ADDC(sum, r8);				\
	ADDC(sum, r9);				\
	ADDC(sum, r10);				\
	ADDC(sum, r11);				\
	lw	r8, [src, offset + 0x10];	\
	lw	r9, [src, offset + 0x14];	\
	lw	r10, [src, offset + 0x18]; 	\
	lw	r11, [src, offset + 0x1c]; 	\
	ADDC(sum, r8);				\
	ADDC(sum, r9);				\
	ADDC(sum, r10);				\
	ADDC(sum, r11);				\

/* Register aliases (arguments arrive in r4/r5/r6 -- see csum_partial). */
#define src r4	/* 1st arg: buffer pointer, advanced as bytes are consumed */
#define dest r5	/* NOTE(review): never referenced below; r5 is used directly as the byte count */
#define sum r27	/* running 32-bit checksum accumulator */

	.text
/*
 * small_csumcpy: tail path -- checksum the last few bytes at 'src'.
 * On entry r10 holds the remaining byte count and r25 is non-zero iff
 * the buffer started on an odd address (both set up in csum_partial).
 * Falls through into 'fold', which reduces the 32-bit sum to 16 bits,
 * byte-swaps the result for odd input alignment, adds the caller's
 * partial checksum (r6) and returns the result in r4.
 */
/* unknown src alignment and < 8 bytes to go */
small_csumcpy:
	mv	r5, r10			/* r5 = remaining length */
	ldi	r9, 0x0
	cmpi.c	r25, 0x1
	beq pass_small_set_t7	/*already set, jump to pass_small_set_t7*/
	andri.c	r25,r4 , 0x1	/*Is src 2 bytes aligned?*/

pass_small_set_t7:
	/* Flag reuse: EQ is set either by "r25 == 1" above (the odd
	 * leading byte was already consumed in csum_partial) or by
	 * "src & 1 == 0" -- in both cases no lone byte is pending. */
	beq	aligned
	cmpi.c	r5, 0x0
	beq	fold			/* nothing left at all */
	lbu	r9, [src]		/* consume one byte to reach 2-byte alignment */
	slli	r9,r9, 0x8	/*Little endian*/
	ADDC(sum, r9)
	addi	src, 0x1
	subi.c	r5, 0x1

	/*len still a full word */
aligned:
	andri.c r8, r5, 0x4	/*Len >= 4?*/
	beq	len_less_4bytes

	/* Still a full word (4byte) to go,and the src is word aligned.*/
	andri.c	r8, src, 0x3	/*src is 4bytes aligned, so use LW!!*/
	beq	four_byte_aligned
	lhu 	r9, [src]	/* only 2-byte aligned: two halfword loads */
	addi	src, 2
	ADDC(sum, r9)
	lhu 	r9, [src]
	addi	src, 2
	ADDC(sum, r9)
	b len_less_4bytes

four_byte_aligned:		/* Len >=4 and four byte aligned */
	lw	r9, [src]
	addi	src, 4
	ADDC(sum, r9)

len_less_4bytes:		/* 2 byte aligned aligned and length<4B */
	andri.c r8, r5, 0x2
	beq	len_less_2bytes
	lhu	r9, [src]
	addi	src, 0x2	/* src+=2 */
	ADDC(sum, r9)

len_less_2bytes:		/* len = 1 */
	andri.c r8, r5, 0x1
	beq 	fold		/* less than 2 and not equal 1--> len=0 -> fold */
	lbu	r9, [src]	/* last odd byte, low half (little endian) */

fold_ADDC:
	ADDC(sum, r9)
fold:
	/* fold checksum: sum = (sum >> 16) + (sum & 0xffff), carry folded */
	slli	r26, sum, 16
	add	sum, sum, r26
	cmp.c	r26, sum
	srli	sum, sum, 16
	bleu 	1f 		/* if r26<=sum */
	addi	sum, 0x1 	/* r26>sum */
1:
	/* odd buffer alignment? r25 was set in csum_partial */
	cmpi.c	r25, 0x0
	beq	1f
	slli	r26, sum, 8	/* swap the two bytes of the 16-bit result */
	srli	sum, sum, 8
	or	sum, sum, r26
	andi	sum, 0xffff
1:
	/* NOTE(review): .set optimize/.set volatile bracket kept exactly
	 * as in the original -- assembler-mode directives around the
	 * final ADDC. */
	.set	optimize
	/* Add the passed partial csum. */
	ADDC(sum, r6)
	mv	r4, sum		/* return value */
	br	r3		/* return to caller */
	.set	volatile
133
	.align	5
/*
 * csum_partial -- compute a 32-bit partial Internet checksum.
 *
 * In:  r4 (src) = buffer, r5 = length in bytes, r6 = partial checksum
 *      to fold into the result (see the final ADDC in the tail path).
 * Out: r4 = folded partial checksum.
 * Clobbers (at least): r8-r11, r25, r26, r27 and the condition flags.
 *
 * Strategy: align 'src' in 1/2/4/8/16-byte steps, stream 128-, 64- and
 * 32-byte chunks through CSUM_BIGCHUNK, mop up whole words, then finish
 * in small_csumcpy (fold to 16 bits, fix odd-start byte order, add r6).
 */
ENTRY(csum_partial)
	ldi sum, 0
	ldi r25, 0			/* odd-start flag, may be set below */
	mv r10, r5			/* r10 carries the length for the tail path */
	cmpi.c	r5, 0x8
	blt	small_csumcpy		/* < 8(signed) bytes to copy */
	cmpi.c	r5, 0x0
	beq	out			/* NOTE(review): unreachable -- r5 >= 8 here */
	andri.c	r25, src, 0x1		/* odd buffer? */

	beq	word_align
hword_align:				/* 1 byte */
	lbu	r8, [src]
	subi	r5, 0x1
	slli	r8, r8, 8		/* leading odd byte -> high half (little endian) */
	ADDC(sum, r8)
	addi	src, 0x1

word_align:				/* 2 bytes */
	andri.c r8, src, 0x2		/* 4bytes(dword)_aligned? */
	beq	dword_align		/* not, maybe dword_align */
	lhu	r8, [src]
	subi	r5, 0x2
	ADDC(sum, r8)
	addi	src, 0x2

dword_align:				/* 4bytes */
	mv 	r26, r5			/* r26 = len; consumed by do_end_words if len < 56 */
	ldi 	r8, 56
	cmp.c	r8, r5
	bgtu	do_end_words		/* if a1(len)<t0(56) ,unsigned */
	andri.c	r26, src, 0x4
	beq	qword_align
	lw	r8, [src]
	subi	r5, 0x4
	ADDC(sum, r8)
	addi	src, 0x4

qword_align:				/* 8 bytes */
	andri.c r26, src, 0x8
	beq	oword_align
	lw	r8, [src, 0x0]
	lw	r9, [src, 0x4]
	subi	r5, 0x8			/* len-=0x8 */
	ADDC(sum, r8)
	ADDC(sum, r9)
	addi	src, 0x8

oword_align:				/* 16bytes */
	andri.c	r26, src, 0x10
	beq	begin_movement
	lw	r10, [src, 0x08]
	lw	r11, [src, 0x0c]
	lw	r8, [src, 0x00]
	lw	r9, [src, 0x04]
	ADDC(sum, r10)
	ADDC(sum, r11)
	ADDC(sum, r8)
	ADDC(sum, r9)
	subi	r5, 0x10
	addi	src, 0x10

begin_movement:
	srli.c	r26, r5, 0x7		/* len>=128? */
	beq	1f			/* len<128 */

/* r26 = len / 128, computed just above at begin_movement */
move_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	CSUM_BIGCHUNK(src, 0x40, sum)
	CSUM_BIGCHUNK(src, 0x60, sum)
	subi.c	r26, 0x01		/* r26 equals len/128 */
	addi	src, 0x80
	bne	move_128bytes

1:	/* len<128,we process 64byte here */
	andri.c	r10, r5, 0x40
	beq	1f

move_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	addi	src, 0x40

1:					/* len<64 */
	andri	r26, r5, 0x1c		/* 0x1c=28: byte count of the whole-word tail */
	andri.c	r10, r5, 0x20
	beq	do_end_words		/* decided by andri */

move_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	andri	r26, r5, 0x1c		/* recompute word-tail byte count */
	addri	src, src, 0x20

do_end_words:				/* len<32 */
	/* r26: word-tail byte count -- len & 0x1c when falling through,
	 * or the raw len (< 56) when entered from dword_align */
	cmpi.c	r26, 0x0
	beq	maybe_end_cruft		/* len<28 or len<56 */
	srli	r26, r26, 0x2		/* r26 = number of remaining whole words */

end_words:
	lw	r8, [src]
	subi.c	r26, 0x1		/* unit is 4 byte */
	ADDC(sum, r8)
	addi	src, 0x4
	cmpi.c	r26, 0x0
	bne	end_words		/* r26!=0 */

maybe_end_cruft:			/* len<4 */
	andri	r10, r5, 0x3		/* r10 = trailing byte count for small_csumcpy */

small_memcpy:
	mv	r5, r10
	j	small_csumcpy

out:
	/* NOTE(review): dead path (see above); if ever reached it would
	 * also drop the r6 partial sum instead of folding it in. */
	mv	r4, sum
	br	r3

END(csum_partial)
256