/*
 * arch/score/lib/csum_partial.S
 *
 * Score Processor version.
 *
 * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
 *  Lennox Wu <lennox.wu@sunplusct.com>
 *  Chen Liqin <liqin.chen@sunplusct.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see the file COPYING, or write
 * to the Free Software Foundation, Inc.,
 * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/linkage.h>

/*
 * ADDC(sum, reg): sum += reg with end-around carry.
 *
 * After the add, "reg > sum" (unsigned) means the addition wrapped, in
 * which case one is added back into sum.
 *
 * Clobbers: the condition flags (it executes cmp.c).  A flag value set
 * before an ADDC must not be consumed after it.
 * The numeric label 9 is used as the local skip target.
 */
#define ADDC(sum,reg)			\
	add	sum, sum, reg;		\
	cmp.c	reg, sum;		\
	bleu	9f;			\
	addi	sum, 0x1;		\
9:

/*
 * CSUM_BIGCHUNK(src, offset, sum): accumulate the eight words found at
 * [src + offset] .. [src + offset + 0x1c] (32 bytes) into sum.
 *
 * Clobbers: r8, r9, r10, r11 and the condition flags (via ADDC).
 * src is not advanced; the caller does that.
 */
#define CSUM_BIGCHUNK(src, offset, sum)		\
	lw	r8, [src, offset + 0x00];	\
	lw	r9, [src, offset + 0x04];	\
	lw	r10, [src, offset + 0x08];	\
	lw	r11, [src, offset + 0x0c];	\
	ADDC(sum, r8);				\
	ADDC(sum, r9);				\
	ADDC(sum, r10);				\
	ADDC(sum, r11);				\
	lw	r8, [src, offset + 0x10];	\
	lw	r9, [src, offset + 0x14];	\
	lw	r10, [src, offset + 0x18];	\
	lw	r11, [src, offset + 0x1c];	\
	ADDC(sum, r8);				\
	ADDC(sum, r9);				\
	ADDC(sum, r10);				\
	ADDC(sum, r11);

/*
 * Register roles used throughout this file:
 *   r4  (src)  current read pointer on entry; result on return
 *   r5         bytes remaining (the alias "dest" is defined but not
 *              used by any instruction in this file)
 *   r6         partial checksum supplied by the caller, folded in at
 *              the very end
 *   r27 (sum)  running 32-bit accumulator
 *   r25        non-zero when the buffer started on an odd address; the
 *              folded result is byte-swapped in that case
 *   r26        scratch / loop counter
 *   r8-r11     scratch for loaded data
 *   r10        also carries the byte count handed to small_csumcpy
 *   r3         return address (both exits use "br r3")
 *
 * NOTE(review): presumably this implements the kernel's
 * csum_partial(buff, len, sum) contract with the arguments arriving in
 * r4/r5/r6 - confirm against the Score calling convention.
 */
#define src r4
#define dest r5
#define sum r27

	.text
/*
 * small_csumcpy: sum the last (or only) few bytes, fold and return.
 *
 * Entered either directly from csum_partial when fewer than 8 bytes
 * were requested, or from its tail with the 0..3 leftover bytes.
 *
 * In:   r10 = bytes to process, src, sum, r25, r6 as described above
 * Out:  r4  = folded 16-bit sum plus the caller's partial sum (r6)
 * Exit: br r3
 */
small_csumcpy:
	mv	r5, r10
	ldi	r9, 0x0
	cmpi.c	r25, 0x1
	beq	pass_small_set_t7	/* odd start already handled by csum_partial */
	andri.c	r25, r4, 0x1		/* r25 = src & 1: is src 2-byte aligned? */

pass_small_set_t7:
	/*
	 * The flags come from whichever of the two tests above ran last.
	 * Equal means either the odd byte was consumed earlier (r25 == 1)
	 * or src is even, so no leading byte has to be peeled off here.
	 */
	beq	aligned
	cmpi.c	r5, 0x0
	beq	fold			/* nothing to read at all */
	lbu	r9, [src]
	slli	r9, r9, 0x8		/* lone leading byte goes to bits 15:8 (little endian) */
	ADDC(sum, r9)
	addi	src, 0x1
	subi.c	r5, 0x1

	/* src is now at least 2-byte aligned */
aligned:
	andri.c	r8, r5, 0x4		/* fewer than 8 bytes left, so bit 2 set <=> len >= 4 */
	beq	len_less_4bytes

	/* A full word remains: use one lw if src is 4-byte aligned, else two lhu. */
	andri.c	r8, src, 0x3
	beq	four_byte_aligned
	lhu	r9, [src]
	addi	src, 2
	ADDC(sum, r9)
	lhu	r9, [src]
	addi	src, 2
	ADDC(sum, r9)
	b	len_less_4bytes

four_byte_aligned:			/* len >= 4 and src 4-byte aligned */
	lw	r9, [src]
	addi	src, 4
	ADDC(sum, r9)

len_less_4bytes:			/* src 2-byte aligned, fewer than 4 bytes left */
	andri.c	r8, r5, 0x2
	beq	len_less_2bytes
	lhu	r9, [src]
	addi	src, 0x2
	ADDC(sum, r9)

len_less_2bytes:			/* at most one byte left */
	andri.c	r8, r5, 0x1
	beq	fold			/* bit 0 clear: no trailing byte */
	lbu	r9, [src]

fold_ADDC:
	ADDC(sum, r9)
fold:
	/*
	 * Fold the 32-bit accumulator to 16 bits: add the low half onto
	 * the high half, keep the high half and add the carry back in.
	 * The srli sits between cmp.c and bleu, so this sequence relies
	 * on srli (no .c suffix) leaving the flags untouched.
	 */
	slli	r26, sum, 16
	add	sum, sum, r26
	cmp.c	r26, sum
	srli	sum, sum, 16
	bleu	1f			/* r26 <= sum: no carry out */
	addi	sum, 0x1		/* r26 >  sum: add the carry back */
1:
	/* Buffer started on an odd address: swap the two result bytes. */
	cmpi.c	r25, 0x0
	beq	1f
	slli	r26, sum, 8
	srli	sum, sum, 8
	or	sum, sum, r26
	andi	sum, 0xffff
1:
	/*
	 * NOTE(review): .set optimize/.set volatile are assembler mode
	 * directives kept exactly as in the original; their precise
	 * effect is not visible from this file.
	 */
	.set	optimize
	/* Add the passed partial csum. */
	ADDC(sum, r6)
	mv	r4, sum
	br	r3
	.set	volatile

	.align	5
/*
 * csum_partial: checksum r5 bytes starting at r4 and add the partial
 * sum in r6.  The result is returned in r4 (see small_csumcpy, which
 * performs the final fold and the return for every path).
 *
 * Requests shorter than 8 bytes - including a zero length - are handed
 * straight to small_csumcpy, so no separate zero-length check is
 * needed on the long path.
 */
ENTRY(csum_partial)
	ldi	sum, 0
	ldi	r25, 0
	mv	r10, r5			/* small_csumcpy takes its count in r10 */
	cmpi.c	r5, 0x8
	blt	small_csumcpy		/* < 8 (signed) bytes to sum */
	andri.c	r25, src, 0x1		/* odd buffer? remembered in r25 for the fold */

	beq	word_align
hword_align:				/* consume 1 byte to reach 2-byte alignment */
	lbu	r8, [src]
	subi	r5, 0x1
	slli	r8, r8, 8
	ADDC(sum, r8)
	addi	src, 0x1

word_align:				/* consume 2 bytes to reach 4-byte alignment */
	andri.c	r8, src, 0x2
	beq	dword_align		/* bit 1 clear: already 4-byte aligned */
	lhu	r8, [src]
	subi	r5, 0x2
	ADDC(sum, r8)
	addi	src, 0x2

dword_align:				/* src is 4-byte aligned from here on */
	mv	r26, r5			/* byte count for do_end_words when len < 56 */
	ldi	r8, 56
	cmp.c	r8, r5
	bgtu	do_end_words		/* 56 > len (unsigned): skip the bulk stages */
	andri.c	r26, src, 0x4
	beq	qword_align
	lw	r8, [src]
	subi	r5, 0x4
	ADDC(sum, r8)
	addi	src, 0x4

qword_align:				/* consume 8 bytes to reach 16-byte alignment */
	andri.c	r26, src, 0x8
	beq	oword_align
	lw	r8, [src, 0x0]
	lw	r9, [src, 0x4]
	subi	r5, 0x8
	ADDC(sum, r8)
	ADDC(sum, r9)
	addi	src, 0x8

oword_align:				/* consume 16 bytes to reach 32-byte alignment */
	andri.c	r26, src, 0x10
	beq	begin_movement
	lw	r10, [src, 0x08]
	lw	r11, [src, 0x0c]
	lw	r8, [src, 0x00]
	lw	r9, [src, 0x04]
	ADDC(sum, r10)
	ADDC(sum, r11)
	ADDC(sum, r8)
	ADDC(sum, r9)
	subi	r5, 0x10
	addi	src, 0x10

	/*
	 * Bulk stages.  r5 is not decremented any more from here on; each
	 * stage instead tests the bit(s) of r5 that correspond to its size.
	 */
begin_movement:
	srli.c	r26, r5, 0x7		/* r26 = len / 128 */
	beq	1f			/* len < 128 */

move_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	CSUM_BIGCHUNK(src, 0x40, sum)
	CSUM_BIGCHUNK(src, 0x60, sum)
	subi.c	r26, 0x01		/* one 128-byte chunk done */
	addi	src, 0x80
	bne	move_128bytes

1:					/* optional 64-byte chunk (bit 6 of len) */
	andri.c	r10, r5, 0x40
	beq	1f

move_64bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	CSUM_BIGCHUNK(src, 0x20, sum)
	addi	src, 0x40

1:					/* optional 32-byte chunk (bit 5 of len) */
	andri	r26, r5, 0x1c		/* bytes in whole words still to do (0..28) */
	andri.c	r10, r5, 0x20
	beq	do_end_words		/* decided by the andri.c above */

move_32bytes:
	CSUM_BIGCHUNK(src, 0x00, sum)
	andri	r26, r5, 0x1c
	addri	src, src, 0x20

do_end_words:
	/*
	 * r26 holds a byte count: the full remaining length when arriving
	 * from dword_align (len < 56), or len & 0x1c after the bulk stages.
	 */
	cmpi.c	r26, 0x0
	beq	maybe_end_cruft
	srli	r26, r26, 0x2		/* bytes -> words */

end_words:
	lw	r8, [src]
	subi.c	r26, 0x1		/* one word done */
	ADDC(sum, r8)
	addi	src, 0x4
	cmpi.c	r26, 0x0		/* ADDC overwrote the flags, so re-test the counter */
	bne	end_words

maybe_end_cruft:			/* 0..3 trailing bytes */
	andri	r10, r5, 0x3

small_memcpy:
	mv	r5, r10
	j	small_csumcpy

END(csum_partial)