Correct PPTP server firewall rules chain.
[tomato/davidwu.git] / release / src / router / nettle / arm / neon / umac-nh-n.asm
blob4ae876b5f52879aa493a15cbc8b8327062a8d040
1 C nettle, low-level cryptographics library
2 C
3 C Copyright (C) 2013 Niels Möller
4 C
5 C The nettle library is free software; you can redistribute it and/or modify
6 C it under the terms of the GNU Lesser General Public License as published by
7 C the Free Software Foundation; either version 2.1 of the License, or (at your
8 C option) any later version.
9 C
10 C The nettle library is distributed in the hope that it will be useful, but
11 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
13 C License for more details.
15 C You should have received a copy of the GNU Lesser General Public License
16 C along with the nettle library; see the file COPYING.LIB. If not, write to
17 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
18 C MA 02111-1301, USA.
C Assembler setup: the source name recorded for the object file and
C the FPU selection.
C NOTE(review): the .file string has no "-n" suffix although the
C routine defined in this file is _nettle_umac_nh_n; confirm the
C intended name.
20 .file "umac-nh.asm"
21 .fpu neon
C Core register roles. OUT, ITERS, KEY and LENGTH name r0-r3. MSG (r12)
C is loaded from the first stack word at function entry. SHIFT is r14
C (lr), which the function saves on the stack before reusing it.
23 define(<OUT>, <r0>)
24 define(<ITERS>, <r1>)
25 define(<KEY>, <r2>)
26 define(<LENGTH>, <r3>)
27 define(<MSG>, <r12>)
28 define(<SHIFT>, <r14>)
C NEON register roles.
C QA, QB: the 32 message bytes loaded per loop pass.
C QY0, QY1: 64-bit accumulators (two lanes each).
C DM (d4, the low half of q2): bits of the last 64-bit message word,
C carried from one load to the next while realigning the message.
C QC, QD: extra temporaries, used only by the 4-iteration path.
C QLEFT, QRIGHT: shift counts used to realign the message.
C QT0-QT2: scratch. QK0-QK2: subkey words.
C Of these, q4-q6 (QY1, QC, QD) are the ones the code saves with vpush
C before use.
30 define(<QA>, <q0>)
31 define(<QB>, <q1>)
32 define(<QY0>, <q3>) C Accumulates for the first two operations.
33 define(<DM>, <d4>)
34 define(<QY1>, <q4>) C Used for 3 and 4 iterations.
35 define(<QC>, <q5>)
36 define(<QD>, <q6>)
37 define(<QLEFT>, <q8>)
38 define(<QRIGHT>, <q9>)
39 define(<QT0>, <q10>)
40 define(<QT1>, <q11>)
41 define(<QT2>, <q12>)
42 define(<QK0>, <q13>)
43 define(<QK1>, <q14>)
44 define(<QK2>, <q15>)
46 C FIXME: Try permuting subkeys using vld4, vzip or similar.
C Code goes in .text, aligned to 2^3 = 8 bytes.
48 .text
49 .align 3
C _nettle_umac_nh_n: entry point.
C Inputs: OUT (r0), ITERS (r1), KEY (r2), LENGTH (r3), and MSG taken
C from the first word on the stack.
C NOTE(review): presumably these are the five arguments of the C
C prototype, in that order - confirm against the caller.
C This section saves lr, prepares 64-bit aligned message reads for an
C arbitrarily aligned MSG, clears QY0, and dispatches on ITERS:
C below 3 goes to .Lnh2, exactly 3 goes to .Lnh3, anything larger
C falls through to .Lnh4.
51 PROLOGUE(_nettle_umac_nh_n)
52 ldr MSG, [sp]
C Save lr: r14 is reused as SHIFT, and every path returns by popping
C this word into pc.
53 str lr, [sp, #-4]!
55 C Setup for 64-bit aligned reads
C SHIFT = byte offset of MSG inside its 8-byte word; Z is set when MSG
C is already aligned. MSG is then rounded down and the aligned word
C holding the first message byte is loaded into DM.
56 ands SHIFT, MSG, #7
57 and MSG, MSG, #-8
58 vld1.8 {DM}, [MSG :64]
C Unaligned: step past the word just loaded.
C Aligned: use offset 8 instead of 0, so the counts computed below are
C -64 and 0; MSG is not advanced, so the loop loads the same word again.
59 addne MSG, MSG, #8
60 addeq SHIFT, SHIFT, #8
62 C FIXME: Combine as rsb ?
C SHIFT = -(8 * offset): a negative count, used as a right shift.
63 lsl SHIFT, SHIFT, #3
64 neg SHIFT, SHIFT
66 C Right shift in QRIGHT (both halves)
C Lane 0 of the low half is written, then the low half is copied to
C the high half.
67 vmov.i32 D0REG(QRIGHT)[0], SHIFT
68 vmov.32 D1REG(QRIGHT), D0REG(QRIGHT)
C SHIFT = 64 - 8 * offset: the matching left shift, placed in QLEFT.
69 add SHIFT, SHIFT, #64
71 vmov.i32 D0REG(QLEFT)[0], SHIFT
72 vmov.32 D1REG(QLEFT), D0REG(QLEFT)
C r1 is ITERS. The flags set here are consumed by the two branches
C below; the NEON instructions in between leave them alone.
73 cmp r1, #3
C Clear the first accumulator.
74 vmov.i64 QY0, #0
C Pre-shift the first word so DM holds the bits that belong to the
C first realigned message word.
76 vshl.u64 DM, DM, D0REG(QRIGHT)
C ITERS below 3 (unsigned) -> .Lnh2; ITERS equal to 3 -> .Lnh3.
77 bcc .Lnh2
78 beq .Lnh3
C Four-iteration path: reached by fall-through when ITERS is greater
C than 3. Accumulates four 64-bit sums (two lanes in QY0, two in QY1)
C over the message and stores them to OUT.
80 .Lnh4:
81 C Permute key words, so we in each iteration have them in order
83 C P0: [0, 4,1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
84 C P4: [8,12,9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
86 C Also arrange the message words, so we get them as
87 C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
88 C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
90 C Then, accumulate Y0 (first two "iters") using
92 C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
93 C Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
95 C Next iteration is then
97 C Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7 + P7)
98 C Y1 += (M4+P8) * (M6+P10) + (M5+P9) * (M7 + P11)
100 C So we can reuse P4, P5, P6, P7 from the previous iteration.
102 C How to fit in registers? We need 4 Q regs for P0-P3, and one
103 C more for the last read key. We need at least two registers
104 C for the message (QA and QB, more if we want to expand only
105 C once). For the Y0 update, we can let the factors overwrite
106 C P0-P3, and for the Y1 update, we can overwrite M0-M3.
C q4-q6 are QY1, QC and QD; save them before use (restored by the
C vpop ahead of the return).
108 vpush {q4,q5,q6}
C Load the first 12 key words: QK0 = 0-3, QK1 = 4-7, QK2 = 8-11.
C QT0 and QT1 get copies of words 4-11 for the second permutation.
109 vld1.32 {QK0,QK1}, [KEY]!
110 vld1.32 {QK2}, [KEY]!
111 vmov QT0, QK1
112 vmov QT1, QK2
114 C Permute keys. QK2 is untouched, permuted subkeys put in QK0,QK1,QT0,QT1
115 vtrn.32 QK0, QK1 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
116 vswp D1REG(QK0), D0REG(QK1) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
117 vtrn.32 QT0, QT1 C Gives us [4,8,6,10] and [5 ,9,7,11]
118 vswp D1REG(QT0), D0REG(QT1) C Gives us [4,8,5, 9] and [6,10,7,11]
C Clear the second accumulator.
120 vmov.i64 QY1, #0
C One pass per 32 message bytes. The key word numbers in the comments
C below are those of the first pass; each pass reads 8 further key
C words.
121 .Loop4:
122 C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
C QC, QD take the right-shifted copies and QA, QB the left-shifted
C ones. Each 64-bit lane is then combined with the right-shifted
C previous word; the first lane uses DM, carried in from the previous
C load.
123 vld1.8 {QA, QB}, [MSG :64]!
124 vshl.u64 QC, QA, QRIGHT
125 vshl.u64 QD, QB, QRIGHT
126 vshl.u64 QA, QA, QLEFT
127 vshl.u64 QB, QB, QLEFT
128 veor D0REG(QA), D0REG(QA), DM
129 veor D1REG(QA), D1REG(QA), D0REG(QC)
130 veor D0REG(QB), D0REG(QB), D1REG(QC)
131 veor D1REG(QB), D1REG(QB), D0REG(QD)
C Keep the right-shifted last word for the next pass.
132 vmov DM, D1REG(QD)
134 C Explode message (too bad there's no vadd with scalar)
C Runs from the last word down to the first, so every source half is
C read before it is overwritten. Result, in message word numbers:
C QA = [0,0,1,1], QB = [2,2,3,3], QC = [4,4,5,5], QD = [6,6,7,7].
135 vdup.32 D1REG(QD), D1REG(QB)[1]
136 vdup.32 D0REG(QD), D1REG(QB)[0]
137 vdup.32 D1REG(QC), D0REG(QB)[1]
138 vdup.32 D0REG(QC), D0REG(QB)[0]
139 vdup.32 D1REG(QB), D1REG(QA)[1]
140 vdup.32 D0REG(QB), D1REG(QA)[0]
141 vdup.32 D1REG(QA), D0REG(QA)[1]
142 vdup.32 D0REG(QA), D0REG(QA)[0]
C Y0 update: add the message to the permuted keys in place (the key
C registers are overwritten), then multiply-accumulate the pairs.
144 vadd.i32 QK0, QK0, QA
145 vadd.i32 QK1, QK1, QB
146 vadd.i32 QT0, QT0, QC
147 vadd.i32 QT1, QT1, QD
149 vmlal.u32 QY0, D0REG(QK0), D0REG(QT0)
150 vmlal.u32 QY0, D1REG(QK0), D1REG(QT0)
151 vmlal.u32 QY0, D0REG(QK1), D0REG(QT1)
152 vmlal.u32 QY0, D1REG(QK1), D1REG(QT1)
154 C Next 4 subkeys
C Together with the 4 words saved in QK2 this gives 12 key words.
C They are permuted exactly as before the loop, and the newest 4 are
C saved in QK2 for the next pass.
155 vld1.32 {QT0,QT1}, [KEY]!
156 vmov QK0, QK2
157 vmov QK1, QT0
158 vmov QK2, QT1 C Save
159 vtrn.32 QK0, QK1 C Gives us [8,12,10,14] and [9,13,11,15]
160 vswp D1REG(QK0), D0REG(QK1) C Gives us [8,12,9,13] and [10,14,11,15]
161 vtrn.32 QT0, QT1 C Gives us [12,16,14,18] and [13,17,15,19]
162 vswp D1REG(QT0), D0REG(QT1) C Gives us [12,16,13,17] and [14,18,15,19]
C Y1 update: this time the sums overwrite the message registers, so
C the permuted keys in QK0, QK1, QT0, QT1 survive for the next pass.
164 vadd.i32 QA, QA, QK0
165 vadd.i32 QB, QB, QK1
166 vadd.i32 QC, QC, QT0
167 vadd.i32 QD, QD, QT1
C 32 bytes consumed. The flags set here are tested by the bhi below;
C the vmlal instructions in between leave them alone.
169 subs LENGTH, LENGTH, #32
171 vmlal.u32 QY1, D0REG(QA), D0REG(QC)
172 vmlal.u32 QY1, D1REG(QA), D1REG(QC)
173 vmlal.u32 QY1, D0REG(QB), D0REG(QD)
174 vmlal.u32 QY1, D1REG(QB), D1REG(QD)
C Loop while the subtraction left a nonzero result without borrow.
176 bhi .Loop4
C Store the four 64-bit sums, restore q4-q6, and return by popping the
C saved lr into pc.
178 vst1.64 {QY0, QY1}, [OUT]
180 vpop {q4,q5,q6}
182 ldr pc, [sp], #+4
C Three-iteration path (ITERS equal to 3). QY0 collects the first two
C sums, one per 64-bit lane. QY1 collects the third sum as two partial
C sums, which are added together after the loop. Three 64-bit words
C are stored to OUT.
184 .Lnh3:
C QY1 is q4; save it before use (restored by the vpop ahead of the
C return).
185 vpush {q4}
C Load the first 8 key words: QK0 = 0-3, QK1 = 4-7.
186 vld1.32 {QK0,QK1}, [KEY]!
187 vmov.i64 QY1, #0
C One pass per 32 message bytes. The key word numbers in the comments
C below are those of the first pass; each pass reads 8 further key
C words.
188 .Loop3:
189 C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
C QT0, QT1 take the right-shifted copies and QA, QB the left-shifted
C ones. Each 64-bit lane is then combined with the right-shifted
C previous word; the first lane uses DM, carried in from the previous
C load.
190 vld1.8 {QA, QB}, [MSG :64]!
191 vshl.u64 QT0, QA, QRIGHT
192 vshl.u64 QT1, QB, QRIGHT
193 vshl.u64 QA, QA, QLEFT
194 vshl.u64 QB, QB, QLEFT
195 veor D0REG(QA), D0REG(QA), DM
196 veor D1REG(QA), D1REG(QA), D0REG(QT0)
197 veor D0REG(QB), D0REG(QB), D1REG(QT0)
198 veor D1REG(QB), D1REG(QB), D0REG(QT1)
C Keep the right-shifted last word for the next pass.
199 vmov DM, D1REG(QT1)
C Next 4 key words (8-11 on the first pass).
201 vld1.32 {QK2}, [KEY]!
202 C Construct factors, with low half corresponding to first iteration,
203 C and high half corresponding to the second iteration.
204 vmov QT0, QK1
205 vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
206 vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
C QT1 = message words [0,0,1,1], plus keys [0,4,1,5].
207 vdup.32 D0REG(QT1), D0REG(QA)[0]
208 vdup.32 D1REG(QT1), D0REG(QA)[1]
209 vadd.i32 QT1, QT1, QK0
211 vmov QK0, QK2 C Save for next iteration
212 vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
213 vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
C QK1 = keys [4,8,5,9] plus message words [4,4,5,5].
215 vdup.32 D0REG(QT2), D0REG(QB)[0]
216 vdup.32 D1REG(QT2), D0REG(QB)[1]
217 vadd.i32 QK1, QK1, QT2
218 vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
219 vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
C Same again for message words 2,3 (with keys [2,6,3,7]) and message
C words 6,7 (with keys [6,10,7,11]).
221 vdup.32 D0REG(QT1), D1REG(QA)[0]
222 vdup.32 D1REG(QT1), D1REG(QA)[1]
223 vadd.i32 QT0, QT0, QT1
224 vdup.32 D0REG(QT1), D1REG(QB)[0]
225 vdup.32 D1REG(QT1), D1REG(QB)[1]
226 vadd.i32 QK2, QK2, QT1
228 vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
229 vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
C Third sum: no permutation needed. QK0 holds the 4 key words saved
C above and QK1 the 4 words loaded here, so QK0 and QK1 also provide
C the 8 key words the next pass starts from.
231 vld1.32 {QK1}, [KEY]!
232 vadd.i32 QA, QA, QK0
233 vadd.i32 QB, QB, QK1
C 32 bytes consumed. The flags set here are tested by the bhi below;
C the vmlal instructions in between leave them alone.
234 subs LENGTH, LENGTH, #32
235 vmlal.u32 QY1, D0REG(QA), D0REG(QB)
236 vmlal.u32 QY1, D1REG(QA), D1REG(QB)
C Loop while the subtraction left a nonzero result without borrow.
237 bhi .Loop3
C Fold the two partial sums of QY1 into its low half, then store both
C halves of QY0 followed by that total.
239 vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1)
240 vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]
C Restore q4 and return by popping the saved lr into pc.
242 vpop {q4}
244 ldr pc, [sp], #+4
C Two-iteration path: taken when ITERS is below 3 (unsigned compare).
C QY0 collects two sums, one per 64-bit lane, and both are stored to
C OUT. None of q4-q6 is touched here, so there is no vpush/vpop.
246 .Lnh2:
C Load the first 4 key words: QK0 = 0-3.
247 vld1.32 {QK0}, [KEY]!
C One pass per 32 message bytes. The key word numbers in the comments
C below are those of the first pass; each pass reads 8 further key
C words.
248 .Loop2:
249 C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
C QT0, QT1 take the right-shifted copies and QA, QB the left-shifted
C ones. Each 64-bit lane is then combined with the right-shifted
C previous word; the first lane uses DM, carried in from the previous
C load.
250 vld1.8 {QA, QB}, [MSG :64]!
251 vshl.u64 QT0, QA, QRIGHT
252 vshl.u64 QT1, QB, QRIGHT
253 vshl.u64 QA, QA, QLEFT
254 vshl.u64 QB, QB, QLEFT
255 veor D0REG(QA), D0REG(QA), DM
256 veor D1REG(QA), D1REG(QA), D0REG(QT0)
257 veor D0REG(QB), D0REG(QB), D1REG(QT0)
258 veor D1REG(QB), D1REG(QB), D0REG(QT1)
C Keep the right-shifted last word for the next pass.
259 vmov DM, D1REG(QT1)
C Next 8 key words (4-11 on the first pass).
261 vld1.32 {QK1,QK2}, [KEY]!
262 C Construct factors, with low half corresponding to first iteration,
263 C and high half corresponding to the second iteration.
264 vmov QT0, QK1
265 vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
266 vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
C QT1 = message words [0,0,1,1], plus keys [0,4,1,5].
267 vdup.32 D0REG(QT1), D0REG(QA)[0]
268 vdup.32 D1REG(QT1), D0REG(QA)[1]
269 vadd.i32 QT1, QT1, QK0
271 vmov QK0, QK2 C Save for next iteration
272 vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
273 vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
C QK1 = keys [4,8,5,9] plus message words [4,4,5,5].
275 vdup.32 D0REG(QT2), D0REG(QB)[0]
276 vdup.32 D1REG(QT2), D0REG(QB)[1]
277 vadd.i32 QK1, QK1, QT2
278 vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
279 vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
C Same again for message words 2,3 (with keys [2,6,3,7]) and message
C words 6,7 (with keys [6,10,7,11]).
281 vdup.32 D0REG(QT1), D1REG(QA)[0]
282 vdup.32 D1REG(QT1), D1REG(QA)[1]
283 vadd.i32 QT0, QT0, QT1
284 vdup.32 D0REG(QT1), D1REG(QB)[0]
285 vdup.32 D1REG(QT1), D1REG(QB)[1]
286 vadd.i32 QK2, QK2, QT1
C 32 bytes consumed. The flags set here are tested by the bhi below;
C the vmlal instructions in between leave them alone.
288 subs LENGTH, LENGTH, #32
290 vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
291 vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
C Loop while the subtraction left a nonzero result without borrow.
293 bhi .Loop2
C Store the two 64-bit sums.
294 vst1.64 {QY0}, [OUT]
C Return by popping the saved lr into pc. Nothing in this routine
C branches to .Lend; it is reached by fall-through only.
296 .Lend:
297 ldr pc, [sp], #+4
298 EPILOGUE(_nettle_umac_nh_n)