Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[cris-mirror.git] / arch / score / lib / checksum.S
blob1141f2b4a50186858f6889c74ae3b88867ec2020
1 /*
2  * arch/score/lib/checksum.S
3  *
4  * Score Processor version.
5  *
6  * Copyright (C) 2009 Sunplus Core Technology Co., Ltd.
7  *  Lennox Wu <lennox.wu@sunplusct.com>
8  *  Chen Liqin <liqin.chen@sunplusct.com>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, see the file COPYING, or write
22  * to the Free Software Foundation, Inc.,
23  * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
24  */
25 #include <linux/linkage.h>
/*
 * ADDC(sum, reg): sum += reg with end-around carry.
 *
 * After the add, reg is compared with the new value of sum.  If reg is
 * lower or equal (unsigned) the addition did not wrap and the branch
 * skips the increment; otherwise 1 is added to sum to fold the carry
 * back in.  The condition flags are overwritten.
 *
 * The macro body must end with its own numeric local label "9:", the
 * target of "bleu 9f"; it also terminates the backslash continuation
 * so that the following definition is not absorbed into this macro.
 */
27 #define ADDC(sum,reg)                   \
28         add     sum, sum, reg;          \
29         cmp.c   reg, sum;               \
30         bleu    9f;                     \
31         addi    sum, 0x1;               \
32 9:
/*
 * CSUM_BIGCHUNK(src, offset, sum): add the eight 32-bit words found at
 * src + offset .. src + offset + 0x1c into sum, using ADDC for every
 * word so that carries are folded back in.
 *
 * The words are loaded four at a time into r8-r11, which are therefore
 * overwritten, as are the condition flags (through ADDC).  src itself
 * is not advanced; callers step it after one or more expansions.
 *
 * The last line deliberately has no trailing backslash, so the macro
 * ends here regardless of what follows it in the file.
 */
34 #define CSUM_BIGCHUNK(src, offset, sum)         \
35         lw      r8, [src, offset + 0x00];       \
36         lw      r9, [src, offset + 0x04];       \
37         lw      r10, [src, offset + 0x08];      \
38         lw      r11, [src, offset + 0x0c];      \
39         ADDC(sum, r8);                          \
40         ADDC(sum, r9);                          \
41         ADDC(sum, r10);                         \
42         ADDC(sum, r11);                         \
43         lw      r8, [src, offset + 0x10];       \
44         lw      r9, [src, offset + 0x14];       \
45         lw      r10, [src, offset + 0x18];      \
46         lw      r11, [src, offset + 0x1c];      \
47         ADDC(sum, r8);                          \
48         ADDC(sum, r9);                          \
49         ADDC(sum, r10);                         \
50         ADDC(sum, r11);
/*
 * Register aliases used by the routines below.
 *
 *   src  (r4)  - current read pointer into the buffer; r4 also receives
 *                the result just before the routines return.
 *   dest (r5)  - not referenced by name in this file's visible code,
 *                which uses r5 directly as the remaining byte count.
 *   sum  (r27) - running 32-bit accumulator.
 */
52 #define src r4
53 #define dest r5
54 #define sum r27
56         .text
/*
 * small_csumcpy - sum the last few bytes of the buffer, fold the 32-bit
 * accumulator to 16 bits and return.
 *
 * Reached from csum_partial, either directly for short requests or via
 * small_memcpy for the tail of a longer buffer.
 *
 * In:   src (r4)  = current data pointer
 *       r10       = bytes still to process (copied into r5 here)
 *       r25       = 1 if the buffer is already known to start on an odd
 *                   address; otherwise it is recomputed from src below
 *       sum (r27) = running sum
 *       r6        = partial checksum, added to the folded result
 * Out:  r4 = result; control leaves through "br r3"
 * Uses: r5, r8, r9, r26 and the condition flags
 *
 * The two "1:" labels in the fold code are the targets of the nearest
 * preceding "1f" branches and must stay in place: without them those
 * branches would bind to the next "1:" further down the file.
 */
57 /* unknown src alignment and < 8 bytes to go */
58 small_csumcpy:
59         mv      r5, r10
60         ldi     r9, 0x0
61         cmpi.c  r25, 0x1
62         beq pass_small_set_t7   /*already set, jump to pass_small_set_t7*/
63         andri.c r25,r4 , 0x1    /*Is src 2 bytes aligned?*/
            /*
             * The branch below reuses whichever flags were set last: the
             * "equal" result of the cmpi.c when r25 was already 1, or
             * the andri.c result (zero means src is even) otherwise.
             */
65 pass_small_set_t7:
66         beq     aligned
67         cmpi.c  r5, 0x0
68         beq     fold
            /* odd start: take one byte, shifted into bits 15..8 */
69         lbu     r9, [src]
70         slli    r9,r9, 0x8      /*Little endian*/
71         ADDC(sum, r9)
72         addi    src, 0x1
73         subi.c  r5, 0x1
75         /*len still a full word */
76 aligned:
77         andri.c r8, r5, 0x4     /*Len >= 4?*/
78         beq     len_less_4bytes
80         /* Still a full word (4byte) to go,and the src is word aligned.*/
81         andri.c r8, src, 0x3    /*src is 4bytes aligned, so use LW!!*/
82         beq     four_byte_aligned
            /* not 4-byte aligned: read the word as two halfwords */
83         lhu     r9, [src]
84         addi    src, 2
85         ADDC(sum, r9)
86         lhu     r9, [src]
87         addi    src, 2
88         ADDC(sum, r9)
89         b len_less_4bytes
91 four_byte_aligned:              /* Len >=4 and four byte aligned */
92         lw      r9, [src]
93         addi    src, 4
94         ADDC(sum, r9)
96 len_less_4bytes:                /* 2 byte aligned and length<4B */
97         andri.c r8, r5, 0x2
98         beq     len_less_2bytes
99         lhu     r9, [src]
100         addi    src, 0x2        /* src+=2 */
101         ADDC(sum, r9)
103 len_less_2bytes:                /* len = 1 */
104         andri.c r8, r5, 0x1
105         beq     fold            /* less than 2 and not equal 1--> len=0 -> fold */
106         lbu     r9, [src]
108 fold_ADDC:
109         ADDC(sum, r9)
110 fold:
111         /* fold checksum */
            /*
             * Add the low 16 bits into the high 16 bits (r26 = sum << 16),
             * keep the high half, and add 1 if that addition wrapped.
             */
112         slli    r26, sum, 16
113         add     sum, sum, r26
114         cmp.c   r26, sum
115         srli    sum, sum, 16
116         bleu    1f              /* if r26<=sum */
117         addi    sum, 0x1        /* r26>sum */
118 1:
119         /* odd buffer alignment? r25 was set in csum_partial or above */
120         cmpi.c  r25, 0x0
121         beq     1f
            /* odd start address: swap the two bytes of the 16-bit result */
122         slli    r26, sum, 8
123         srli    sum, sum, 8
124         or      sum, sum, r26
125         andi    sum, 0xffff
126 1:
127         .set    optimize
128         /* Add the passed partial csum. */
129         ADDC(sum, r6)
130         mv      r4, sum
131         br      r3
132         .set    volatile
/*
 * csum_partial - sum a buffer 32 bits at a time with end-around carry;
 * the folding to 16 bits and the return happen in small_csumcpy.
 *
 * In:   src (r4) = buffer address
 *       r5       = length in bytes
 *       r6       = partial checksum, added in small_csumcpy's fold code
 * Out:  r4 = result (set on the return paths, which use "br r3")
 * Uses: r8-r11, r25, r26, sum (r27) and the condition flags
 *
 * NOTE(review): presumably this implements the kernel's
 * csum_partial(buff, len, sum) - confirm against the arch header.
 *
 * Outline:
 *   - lengths below 8 (signed compare) go straight to small_csumcpy;
 *   - otherwise src is aligned step by step (1, 2, 4, 8, 16 bytes),
 *     with r5 reduced by every byte consumed;
 *   - the bulk is summed in 128-, 64- and 32-byte chunks.  r5 is not
 *     reduced by these steps; each later stage tests individual bits
 *     of r5 instead;
 *   - remaining whole words are summed in end_words and the last
 *     (r5 & 3) bytes are handed to small_csumcpy through r10.
 */
134         .align  5
135 ENTRY(csum_partial)
136         ldi sum, 0
137         ldi r25, 0
138         mv r10, r5
139         cmpi.c  r5, 0x8
140         blt     small_csumcpy           /* < 8(signed) bytes to copy */
            /*
             * NOTE(review): every r5 below 8, including 0, has already
             * branched away above, so this zero test and the "out" label
             * look unreachable.  "out" would also return sum without
             * adding r6.
             */
141         cmpi.c  r5, 0x0
142         beq     out
143         andri.c r25, src, 0x1           /* odd buffer? */
145         beq     word_align
146 hword_align:                            /* 1 byte */
147         lbu     r8, [src]
148         subi    r5, 0x1
149         slli    r8, r8, 8
150         ADDC(sum, r8)
151         addi    src, 0x1
153 word_align:                             /* 2 bytes */
154         andri.c r8, src, 0x2            /* 4bytes(dword)_aligned? */
155         beq     dword_align             /* not, maybe dword_align */
156         lhu     r8, [src]
157         subi    r5, 0x2
158         ADDC(sum, r8)
159         addi    src, 0x2
            /*
             * src is 4-byte aligned from here on.  With fewer than 56
             * bytes left, skip the chunked code: r26 carries the whole
             * remaining length into do_end_words.
             */
161 dword_align:                            /* 4bytes */
162         mv      r26, r5                 /* maybe useless when len >=56 */
163         ldi     r8, 56
164         cmp.c   r8, r5
165         bgtu    do_end_words            /* if len (r5) < 56, unsigned */
166         andri.c r26, src, 0x4
167         beq     qword_align
168         lw      r8, [src]
169         subi    r5, 0x4
170         ADDC(sum, r8)
171         addi    src, 0x4
173 qword_align:                            /* 8 bytes */
174         andri.c r26, src, 0x8
175         beq     oword_align
176         lw      r8, [src, 0x0]
177         lw      r9, [src, 0x4]
178         subi    r5, 0x8                 /* len-=0x8 */
179         ADDC(sum, r8)
180         ADDC(sum, r9)
181         addi    src, 0x8
183 oword_align:                            /* 16bytes */
184         andri.c r26, src, 0x10
185         beq     begin_movement
186         lw      r10, [src, 0x08]
187         lw      r11, [src, 0x0c]
188         lw      r8, [src, 0x00]
189         lw      r9, [src, 0x04]
190         ADDC(sum, r10)
191         ADDC(sum, r11)
192         ADDC(sum, r8)
193         ADDC(sum, r9)
194         subi    r5, 0x10
195         addi    src, 0x10
            /* r26 = number of complete 128-byte blocks in r5 */
197 begin_movement:
198         srli.c  r26, r5, 0x7            /* len>=128? */
199         beq     1f                      /* len<128 */
201 /* r26 holds len >> 7, computed in begin_movement just above */
202 move_128bytes:
203         CSUM_BIGCHUNK(src, 0x00, sum)
204         CSUM_BIGCHUNK(src, 0x20, sum)
205         CSUM_BIGCHUNK(src, 0x40, sum)
206         CSUM_BIGCHUNK(src, 0x60, sum)
207         subi.c  r26, 0x01               /* r26 equals len/128 */
208         addi    src, 0x80
209         bne     move_128bytes
211 1:      /* len<128,we process 64byte here */
212         andri.c r10, r5, 0x40
213         beq     1f
215 move_64bytes:
216         CSUM_BIGCHUNK(src, 0x00, sum)
217         CSUM_BIGCHUNK(src, 0x20, sum)
218         addi    src, 0x40
            /* r26 = bytes that remain as whole words (bits 4..2 of r5) */
220 1:                                      /* len<64 */
221         andri   r26, r5, 0x1c           /* 0x1c=28 */
222         andri.c r10, r5, 0x20
223         beq     do_end_words            /* decided by andri */
225 move_32bytes:
226         CSUM_BIGCHUNK(src, 0x00, sum)
227         andri   r26, r5, 0x1c
228         addri   src, src, 0x20
230 do_end_words:                           /* len<32 */
231         /* r26 was set in dword_align (whole length) or above (r5 & 0x1c) */
232         cmpi.c  r26, 0x0
233         beq     maybe_end_cruft         /* no whole words left */
234         srli    r26, r26, 0x2
            /*
             * One word per iteration.  ADDC overwrites the flags set by
             * subi.c, so the counter is tested again with cmpi.c.
             */
236 end_words:
237         lw      r8, [src]
238         subi.c  r26, 0x1                /* unit is 4 byte */
239         ADDC(sum, r8)
240         addi    src, 0x4
241         cmpi.c  r26, 0x0
242         bne     end_words               /* r26!=0 */
            /* hand the last (r5 & 3) bytes to small_csumcpy through r10 */
244 maybe_end_cruft:                        /* len<4 */
245         andri   r10, r5, 0x3
247 small_memcpy:
248         mv      r5, r10
249         j       small_csumcpy
251 out:
252         mv      r4, sum
253         br      r3
255 END(csum_partial)