Correct PPTP server firewall rules chain.
[tomato/davidwu.git] / release / src / router / nettle / sparc32 / arcfour-crypt.asm
blob5b217ea637f6244e33602200a642e6dfa6a50eb0
1 C -*- mode: asm; asm-comment-char: ?C; -*-
2 C nettle, low-level cryptographic library
3 C
4 C Copyright (C) 2002, 2005 Niels Möller
5 C
6 C The nettle library is free software; you can redistribute it and/or modify
7 C it under the terms of the GNU Lesser General Public License as published by
8 C the Free Software Foundation; either version 2.1 of the License, or (at your
9 C option) any later version.
11 C The nettle library is distributed in the hope that it will be useful, but
12 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14 C License for more details.
16 C You should have received a copy of the GNU Lesser General Public License
17 C along with the nettle library; see the file COPYING.LIB. If not, write to
18 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 C MA 02111-1301, USA.
21 C Define to YES, to enable the complex code to special case SRC
22 C and DST with compatible alignment.
23 C NOTE(review): WITH_ALIGN is not referenced by any code visible in this file; confirm whether it is still needed.
24 define(<WITH_ALIGN>, <YES>)
26 C Registers
27 C CTX, LENGTH, DST and SRC hold the four arguments, in the order of the arcfour_crypt prototype further down.
28 define(<CTX>, <%i0>)
29 define(<LENGTH>,<%i1>)
30 define(<DST>, <%i2>)
31 define(<SRC>, <%i3>)
32 C I1 and I2 take turns as current and next table index and J is the second index. SI and SJ are scratch for ARCFOUR_BYTE, TMP receives a key stream byte, TMP2 a source byte, N counts the words left in the main loop and DATA collects the output bytes.
33 define(<I1>, <%i4>)
34 define(<I2>, <%i5>)
35 define(<J>, <%g1>)
36 define(<SI>, <%g2>)
37 define(<SJ>, <%g3>)
38 define(<TMP>, <%o0>)
39 define(<TMP2>, <%o1>)
40 define(<N>, <%o2>)
41 define(<DATA>, <%o3>)
42 C ARCFOUR_BYTE(i, ni, res)
43 C Computes the next byte of the key stream. On entry, i must
44 C already hold the index for the current access; the index for
45 C the next access, i + 1 modulo 256, is stored in ni. J is advanced,
46 C the table bytes at i and J are swapped, and the key byte goes to res.
47 C Clobbers SI and SJ. No instruction here sets the condition codes.
48 define(<ARCFOUR_BYTE>, <
49 ldub [CTX + $1], SI
50 add $1, 1, $2
51 add J, SI, J
52 and J, 0xff, J
53 ldub [CTX + J], SJ
54 and $2, 0xff, $2
55 stb SI, [CTX + J]
56 add SI, SJ, SI
57 and SI, 0xff, SI
58 stb SJ, [CTX + $1]
59 ldub [CTX + SI], $3
60 >)dnl
61 C Size in bytes of the stack frame allocated by the save at function entry.
62 C FIXME: Consider using the callers window
63 define(<FRAME_SIZE>, 104)
65 .file "arcfour-crypt.asm"
67 C arcfour_crypt(struct arcfour_ctx *ctx,
68 C unsigned length, uint8_t *dst,
69 C const uint8_t *src)
70 C XORs length bytes from src with the key stream, writes them to dst and stores the updated I and J back into ctx. Does nothing when length is zero.
71 .section ".text"
72 .align 16
73 .proc 020
75 PROLOGUE(nettle_arcfour_crypt)
76 C Open a new register window and return at once when LENGTH is zero
77 save %sp, -FRAME_SIZE, %sp
78 cmp LENGTH, 0
79 be .Lend
80 nop
82 C Load both I and J: I is the high byte, J the low byte of the halfword
83 lduh [CTX + ARCFOUR_I], I1
84 and I1, 0xff, J
85 srl I1, 8, I1
86 C Step I1 to the index of the next key stream byte, modulo 256
87 C We want an even address for DST
88 andcc DST, 1, %g0
89 add I1, 1 ,I1
90 beq .Laligned2
91 and I1, 0xff, I1
92 C DST is odd: handle one byte so that DST becomes even
93 mov I1, I2
94 ldub [SRC], DATA
95 ARCFOUR_BYTE(I2, I1, TMP)
96 subcc LENGTH, 1, LENGTH
97 add SRC, 1, SRC
98 xor DATA, TMP, DATA
99 stb DATA, [DST]
100 beq .Ldone
101 add DST, 1, DST
103 .Laligned2:
104 C DST is even here; with a single byte left go to the one byte tail
105 cmp LENGTH, 2
106 blu .Lfinal1
107 C Harmless delay slot instruction
108 andcc DST, 2, %g0
109 beq .Laligned4
110 nop
111 C DST is 2 modulo 4: handle two bytes with one halfword store
112 ldub [SRC], DATA
113 ARCFOUR_BYTE(I1, I2, TMP)
114 ldub [SRC + 1], TMP2
115 add SRC, 2, SRC
116 xor DATA, TMP, DATA
117 sll DATA, 8, DATA
119 ARCFOUR_BYTE(I2, I1, TMP)
120 xor TMP2, TMP, TMP
121 subcc LENGTH, 2, LENGTH
122 or DATA, TMP, DATA
124 sth DATA, [DST]
125 beq .Ldone
126 add DST, 2, DST
128 .Laligned4:
129 cmp LENGTH, 4
130 blu .Lfinal2
131 C Harmless delay slot instruction
132 srl LENGTH, 2, N
134 .Loop:
135 C Main loop, with aligned writes
136 C Each pass XORs four source bytes, first byte highest in DATA, and stores one word
137 C FIXME: Could check if SRC is aligned, and
138 C use 32-bit reads in that case.
140 ldub [SRC], DATA
141 ARCFOUR_BYTE(I1, I2, TMP)
142 ldub [SRC + 1], TMP2
143 xor TMP, DATA, DATA
144 sll DATA, 8, DATA
146 ARCFOUR_BYTE(I2, I1, TMP)
147 xor TMP2, TMP, TMP
148 ldub [SRC + 2], TMP2
149 or TMP, DATA, DATA
150 sll DATA, 8, DATA
152 ARCFOUR_BYTE(I1, I2, TMP)
153 xor TMP2, TMP, TMP
154 ldub [SRC + 3], TMP2
155 or TMP, DATA, DATA
156 sll DATA, 8, DATA
158 ARCFOUR_BYTE(I2, I1, TMP)
159 xor TMP2, TMP, TMP
160 or TMP, DATA, DATA
161 subcc N, 1, N
162 add SRC, 4, SRC
163 st DATA, [DST]
164 bne .Loop
165 add DST, 4, DST
166 C Keep the 0 to 3 bytes left over after the word loop
167 andcc LENGTH, 3, LENGTH
168 beq .Ldone
169 nop
171 .Lfinal2:
172 C DST address must be 2-aligned
173 cmp LENGTH, 2
174 blu .Lfinal1
175 nop
176 C Two or three bytes left: handle two of them with one halfword store
177 ldub [SRC], DATA
178 ARCFOUR_BYTE(I1, I2, TMP)
179 ldub [SRC + 1], TMP2
180 add SRC, 2, SRC
181 xor DATA, TMP, DATA
182 sll DATA, 8, DATA
184 ARCFOUR_BYTE(I2, I1, TMP)
185 xor TMP2, TMP, TMP
186 or DATA, TMP, DATA
187 C The condition codes are still those of the cmp at .Lfinal2, so the beq leaves when exactly two bytes were left
188 sth DATA, [DST]
189 beq .Ldone
190 add DST, 2, DST
191 C One byte left; the index is copied to I2 because .Ldone saves I2
192 .Lfinal1:
193 mov I1, I2
194 ldub [SRC], DATA
195 ARCFOUR_BYTE(I2, I1, TMP)
196 xor DATA, TMP, DATA
197 stb DATA, [DST]
199 .Ldone:
200 C Save back I and J
201 sll I2, 8, I2
202 or I2, J, I2
203 stuh I2, [CTX + ARCFOUR_I]
204 C Return to the caller; restore runs in the delay slot of ret
205 .Lend:
206 ret
207 restore
209 EPILOGUE(nettle_arcfour_crypt)
211 C Some stats from adriana.lysator.liu.se (SS1000E, 85 MHz), for AES 128
213 C 1: nettle-1.13 C-code
214 C 2: First working version of the assembler code
215 C 3: Moved load of source byte
216 C 4: Better instruction scheduling
217 C 5: Special case SRC and DST with compatible alignment
218 C 6: After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI])
219 C 7: Unrolled only twice, with byte-accesses
220 C 8: Unrolled, using 8-bit reads and aligned 32-bit writes.
222 C MB/s cycles/byte Code size (bytes)
223 C 1: 6.6 12.4 132
224 C 2: 5.6 14.5 116
225 C 3: 6.0 13.5 116
226 C 4: 6.5 12.4 116
227 C 5: 7.9 10.4 496
228 C 6: 8.3 9.7 496
229 C 7: 6.7 12.1 268
230 C 8: 8.3 9.8 768