C nettle, low-level cryptographics library
C
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C Register assignments (m4 macros; the macro name must be followed
C immediately by its opening parenthesis to be expanded).
C
C NOTE(review): r14 is the ARM link register; the code that saves and
C restores it is not visible in this view -- confirm it exists.
define(<LENGTH>, <r3>)	C Byte counter, decremented by 32 per message block.
define(<SHIFT>, <r14>)	C Scalar shift count copied into QRIGHT/QLEFT.

define(<QY0>, <q3>) C Accumulates for the first two operations.
define(<QY1>, <q4>) C Used for 3 and 4 iterations.

define(<QRIGHT>, <q9>)	C Per-doubleword shift counts for vshl.u64.
C FIXME: Try permuting subkeys using vld4, vzip or similar.
C _nettle_umac_nh_n
C
C NEON code that forms 32-bit sums of message words and permuted key
C words and accumulates their 64-bit products into QY0/QY1 using
C vmlal.u32. Message data is read 32 bytes at a time from [MSG :64]
C with writeback, key words are read from [KEY] with writeback, and the
C accumulators are stored at [OUT]. LENGTH is decremented by 32 for each
C 32-byte message block.
C
C NOTE(review): the definitions of OUT, KEY, MSG, DM, QA-QD, QLEFT,
C QT0-QT2 and QK0-QK2, and any labels, branches, accumulator
C initialisation and return sequence, are not visible in this view. The
C comments below describe only the instructions shown; confirm the
C surrounding control flow against the complete file.
PROLOGUE(_nettle_umac_nh_n)

	C Setup for 64-bit aligned reads
	vld1.8	{DM}, [MSG :64]
	C NOTE(review): the flag-setting instruction that this conditional
	C add depends on is not visible in this view.
	addeq	SHIFT, SHIFT, #8

	C FIXME: Combine as rsb ?

	C Right shift in QRIGHT (both halves)
	vmov.i32 D0REG(QRIGHT)[0], SHIFT
	vmov.32	 D1REG(QRIGHT), D0REG(QRIGHT)

	C Shift count for QLEFT (both halves).
	C NOTE(review): SHIFT is not visibly modified between the QRIGHT
	C and QLEFT setup -- confirm against the complete file.
	vmov.i32 D0REG(QLEFT)[0], SHIFT
	vmov.32	 D1REG(QLEFT), D0REG(QLEFT)

	C Pre-shift the first doubleword; it is merged into the low half
	C of QA when the next message block is loaded.
	vshl.u64 DM, DM, D0REG(QRIGHT)

	C Permute key words, so we in each iteration have them in order
	C
	C P0: [0, 4,1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
	C P4: [8,12,9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
	C
	C Also arrange the message words, so we get them as
	C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
	C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
	C
	C Then, accumulate Y0 (first two "iters") using
	C
	C   Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
	C   Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
	C
	C Next iteration is then
	C
	C   Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7+P7)
	C   Y1 += (M4+P8) * (M6+P10) + (M5+P9) * (M7+P11)
	C
	C So we can reuse P4, P5, P6, P7 from the previous iteration.

	C How to fit in registers? We need 4 Q regs for P0-P3, and one
	C more for the last read key. We need at least two registers
	C for the message (QA and QB, more if we want to expand only
	C once). For the Y0 update, we can let the factors overwrite
	C P0-P3, and for the Y1 update, we can overwrite M0-M3.

	vld1.32 {QK0,QK1}, [KEY]!
	vld1.32 {QK2}, [KEY]!

	C Permute keys. QK2 is untouched, permuted subkeys put in QK0,QK1,QT0,QT1
	C NOTE(review): the instructions that copy key words into QT0/QT1
	C before the second vtrn are not visible in this view.
	vtrn.32 QK0, QK1		C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
	vswp	D1REG(QK0), D0REG(QK1)	C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
	vtrn.32 QT0, QT1		C Gives us [4,8,6,10] and [5 ,9,7,11]
	vswp	D1REG(QT0), D0REG(QT1)	C Gives us [4,8,5, 9] and [6,10,7,11]

	C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
	C Each doubleword shifted by QLEFT is combined with the preceding
	C doubleword shifted by QRIGHT; DM supplies the doubleword that
	C precedes the low half of QA.
	vld1.8 {QA, QB}, [MSG :64]!
	vshl.u64 QC, QA, QRIGHT
	vshl.u64 QD, QB, QRIGHT
	vshl.u64 QA, QA, QLEFT
	vshl.u64 QB, QB, QLEFT
	veor	D0REG(QA), D0REG(QA), DM
	veor	D1REG(QA), D1REG(QA), D0REG(QC)
	veor	D0REG(QB), D0REG(QB), D1REG(QC)
	veor	D1REG(QB), D1REG(QB), D0REG(QD)

	C Explode message (too bad there's no vadd with scalar)
	C Each 32-bit word is duplicated across one D register. The words
	C are handled from the highest down so that every source lane is
	C read before the register holding it is overwritten.
	vdup.32 D1REG(QD), D1REG(QB)[1]
	vdup.32 D0REG(QD), D1REG(QB)[0]
	vdup.32 D1REG(QC), D0REG(QB)[1]
	vdup.32 D0REG(QC), D0REG(QB)[0]
	vdup.32 D1REG(QB), D1REG(QA)[1]
	vdup.32 D0REG(QB), D1REG(QA)[0]
	vdup.32 D1REG(QA), D0REG(QA)[1]
	vdup.32 D0REG(QA), D0REG(QA)[0]

	C Add the exploded message words to the permuted keys; the
	C factors overwrite the key registers.
	vadd.i32 QK0, QK0, QA
	vadd.i32 QK1, QK1, QB
	vadd.i32 QT0, QT0, QC
	vadd.i32 QT1, QT1, QD

	C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
	vmlal.u32 QY0, D0REG(QK0), D0REG(QT0)
	vmlal.u32 QY0, D1REG(QK0), D1REG(QT0)
	vmlal.u32 QY0, D0REG(QK1), D0REG(QT1)
	vmlal.u32 QY0, D1REG(QK1), D1REG(QT1)

	C Read further key words and permute them as above.
	vld1.32 {QT0,QT1}, [KEY]!

	vtrn.32 QK0, QK1		C Gives us [8,12,10,14] and [9,13,11,15]
	vswp	D1REG(QK0), D0REG(QK1)	C Gives us [8,12,9,13] and [10,14,11,15]
	vtrn.32 QT0, QT1		C Gives us [12,16,14,18] and [13,17,15,19]
	vswp	D1REG(QT0), D0REG(QT1)	C Gives us [12,16,13,17] and [14,18,15,19]

	C One 32-byte message block consumed; sets the condition flags.
	subs	LENGTH, LENGTH, #32

	C Accumulate into Y1 from the message registers.
	vmlal.u32 QY1, D0REG(QA), D0REG(QC)
	vmlal.u32 QY1, D1REG(QA), D1REG(QC)
	vmlal.u32 QY1, D0REG(QB), D0REG(QD)
	vmlal.u32 QY1, D1REG(QB), D1REG(QD)

	C Store both accumulators (four doublewords).
	vst1.64 {QY0, QY1}, [OUT]

	C NOTE(review): a separate key/message sequence begins here; any
	C label or branch separating it from the store above is not
	C visible in this view.
	vld1.32 {QK0,QK1}, [KEY]!

	C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
	vld1.8 {QA, QB}, [MSG :64]!
	vshl.u64 QT0, QA, QRIGHT
	vshl.u64 QT1, QB, QRIGHT
	vshl.u64 QA, QA, QLEFT
	vshl.u64 QB, QB, QLEFT
	veor	D0REG(QA), D0REG(QA), DM
	veor	D1REG(QA), D1REG(QA), D0REG(QT0)
	veor	D0REG(QB), D0REG(QB), D1REG(QT0)
	veor	D1REG(QB), D1REG(QB), D0REG(QT1)

	vld1.32 {QK2}, [KEY]!
	C Construct factors, with low half corresponding to first iteration,
	C and high half corresponding to the second iteration.

	vtrn.32 QK0, QT0		C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
	vswp	D1REG(QK0), D0REG(QT0)	C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
	vdup.32 D0REG(QT1), D0REG(QA)[0]
	vdup.32 D1REG(QT1), D0REG(QA)[1]
	vadd.i32 QT1, QT1, QK0

	vmov	QK0, QK2		C Save for next iteration
	vtrn.32 QK1, QK2		C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
	vswp	D1REG(QK1), D0REG(QK2)	C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]

	vdup.32 D0REG(QT2), D0REG(QB)[0]
	vdup.32 D1REG(QT2), D0REG(QB)[1]
	vadd.i32 QK1, QK1, QT2
	vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
	vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)

	vdup.32 D0REG(QT1), D1REG(QA)[0]
	vdup.32 D1REG(QT1), D1REG(QA)[1]
	vadd.i32 QT0, QT0, QT1
	vdup.32 D0REG(QT1), D1REG(QB)[0]
	vdup.32 D1REG(QT1), D1REG(QB)[1]
	vadd.i32 QK2, QK2, QT1

	vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
	vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)

	vld1.32 {QK1}, [KEY]!

	C One 32-byte message block consumed; sets the condition flags.
	subs	LENGTH, LENGTH, #32
	vmlal.u32 QY1, D0REG(QA), D0REG(QB)
	vmlal.u32 QY1, D1REG(QA), D1REG(QB)

	C Add the two halves of QY1 into its low doubleword, then store
	C three doublewords: both halves of QY0 and the folded QY1.
	vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1)
	vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]

	C NOTE(review): a separate key/message sequence begins here; any
	C label or branch separating it from the store above is not
	C visible in this view.
	vld1.32 {QK0}, [KEY]!

	C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
	vld1.8 {QA, QB}, [MSG :64]!
	vshl.u64 QT0, QA, QRIGHT
	vshl.u64 QT1, QB, QRIGHT
	vshl.u64 QA, QA, QLEFT
	vshl.u64 QB, QB, QLEFT
	veor	D0REG(QA), D0REG(QA), DM
	veor	D1REG(QA), D1REG(QA), D0REG(QT0)
	veor	D0REG(QB), D0REG(QB), D1REG(QT0)
	veor	D1REG(QB), D1REG(QB), D0REG(QT1)

	vld1.32 {QK1,QK2}, [KEY]!
	C Construct factors, with low half corresponding to first iteration,
	C and high half corresponding to the second iteration.

	vtrn.32 QK0, QT0		C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
	vswp	D1REG(QK0), D0REG(QT0)	C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
	vdup.32 D0REG(QT1), D0REG(QA)[0]
	vdup.32 D1REG(QT1), D0REG(QA)[1]
	vadd.i32 QT1, QT1, QK0

	vmov	QK0, QK2		C Save for next iteration
	vtrn.32 QK1, QK2		C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
	vswp	D1REG(QK1), D0REG(QK2)	C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]

	vdup.32 D0REG(QT2), D0REG(QB)[0]
	vdup.32 D1REG(QT2), D0REG(QB)[1]
	vadd.i32 QK1, QK1, QT2
	vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
	vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)

	vdup.32 D0REG(QT1), D1REG(QA)[0]
	vdup.32 D1REG(QT1), D1REG(QA)[1]
	vadd.i32 QT0, QT0, QT1
	vdup.32 D0REG(QT1), D1REG(QB)[0]
	vdup.32 D1REG(QT1), D1REG(QB)[1]
	vadd.i32 QK2, QK2, QT1

	C One 32-byte message block consumed; sets the condition flags.
	subs	LENGTH, LENGTH, #32

	vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
	vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)

EPILOGUE(_nettle_umac_nh_n)