/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
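
	/*
	 * Change of basis into and out of the tower-field representation used
	 * by the bit-sliced S-box (per the Kaesper/Schwabe construction):
	 * in_bs_ch/out_bs_ch transform the eight bit planes on entry to and
	 * exit from the forward S-box, and the inv_* variants do the same for
	 * the inverse S-box. Each is a fixed sequence of XORs between planes.
	 */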
	.macro	in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro	out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro	inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5

	.macro	inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
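
	/*
	 * Multiplication and inversion in GF(2^8), decomposed via the tower
	 * fields GF(2^2) and GF(2^4) as in the Kaesper/Schwabe bit-sliced
	 * S-box: mul_gf4 multiplies two 2-bit field elements, mul_gf16_2
	 * performs two 4-bit multiplications that share one operand, and
	 * inv_gf256 combines these to invert every byte of the bit-sliced
	 * state in constant time.
	 */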
	.macro	mul_gf4, x0, x1, y0, y1, t0, t1

	.macro	mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro	mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			    y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro	inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			   t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
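
	/*
	 * The complete bit-sliced SubBytes/InvSubBytes step: change basis,
	 * invert in GF(2^8), change basis back. The affine constant of the
	 * AES S-box is not applied here; it is accounted for in the key data
	 * produced by aesbs_convert_key (see the 0x63 constant below).
	 */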
	.macro	sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
		      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b

	.macro	inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
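
	/*
	 * Fetch the next eight bit-sliced round-key vectors (128 bytes) into
	 * v16-v23: the key schedule is walked forwards for encryption and
	 * backwards for decryption.
	 */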
	ldp	q16, q17, [bskey], #128
	ldp	q18, q19, [bskey, #-96]
	ldp	q20, q21, [bskey, #-64]
	ldp	q22, q23, [bskey, #-32]

	ldp	q16, q17, [bskey, #-128]!
	ldp	q18, q19, [bskey, #32]
	ldp	q20, q21, [bskey, #64]
	ldp	q22, q23, [bskey, #96]
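
	/* XOR the bit-sliced round key in v16-v23 into the state vectors */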
	.macro	add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor	\x0\().16b, \x0\().16b, v16.16b
	eor	\x1\().16b, \x1\().16b, v17.16b
	eor	\x2\().16b, \x2\().16b, v18.16b
	eor	\x3\().16b, \x3\().16b, v19.16b
	eor	\x4\().16b, \x4\().16b, v20.16b
	eor	\x5\().16b, \x5\().16b, v21.16b
	eor	\x6\().16b, \x6\().16b, v22.16b
	eor	\x7\().16b, \x7\().16b, v23.16b
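
	/*
	 * ShiftRows/InvShiftRows is a pure byte permutation, so it is applied
	 * to each bit plane with a single tbl lookup using one of the
	 * permutation masks defined further down (SR/ISR and the M0 variants).
	 */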
	.macro	shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl	\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl	\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl	\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl	\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl	\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl	\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl	\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl	\x7\().16b, {\x7\().16b}, \mask\().16b
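
	/*
	 * MixColumns on the bit-sliced state: in this representation the
	 * column mixing reduces to XORs between bit planes combined with
	 * rotations of each plane by 32 and 64 bits (the ext #12 and ext #8
	 * operations below).
	 */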
	.macro	mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor	\x2\().16b, \x2\().16b, \t2\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor	\x3\().16b, \x3\().16b, \t3\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor	\x4\().16b, \x4\().16b, \t4\().16b
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor	\x5\().16b, \x5\().16b, \t5\().16b
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor	\x6\().16b, \x6\().16b, \t6\().16b
	eor	\t1\().16b, \t1\().16b, \x0\().16b
	eor	\x7\().16b, \x7\().16b, \t7\().16b
	ext	\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x1\().16b
	eor	\t0\().16b, \t0\().16b, \x7\().16b
	eor	\t1\().16b, \t1\().16b, \x7\().16b
	ext	\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t5\().16b, \t5\().16b, \x4\().16b
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	eor	\t6\().16b, \t6\().16b, \x5\().16b
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x3\().16b
	ext	\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x6\().16b
	ext	\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x2\().16b
	ext	\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x7\().16b
	ext	\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x7\().16b
	ext	\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor	\x7\().16b, \t1\().16b, \t5\().16b
	.ifb	\inv
	eor	\x2\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t3\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor	\t3\().16b, \t3\().16b, \x4\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x2\().16b, \x3\().16b, \t6\().16b
	eor	\x3\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x6\().16b, \t2\().16b
	mov	\x6\().16b, \t3\().16b
	.endif
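
	/*
	 * InvMixColumns: first apply the extra linear factor that turns
	 * MixColumns into its inverse (the 64-bit rotations and XORs below),
	 * then reuse the mix_cols macro for the rest of the work.
	 */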
	.macro	inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			      t0, t1, t2, t3, t4, t5, t6, t7
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t0\().16b, \t0\().16b, \x0\().16b
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t6\().16b, \t6\().16b, \x6\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x7\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t1\().16b, \t1\().16b, \x1\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x2\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x3\().16b
	eor	\t4\().16b, \t4\().16b, \x4\().16b
	eor	\t5\().16b, \t5\().16b, \x5\().16b
	eor	\x0\().16b, \x0\().16b, \t6\().16b
	eor	\x1\().16b, \x1\().16b, \t6\().16b
	eor	\x2\().16b, \x2\().16b, \t0\().16b
	eor	\x4\().16b, \x4\().16b, \t2\().16b
	eor	\x3\().16b, \x3\().16b, \t1\().16b
	eor	\x1\().16b, \x1\().16b, \t7\().16b
	eor	\x2\().16b, \x2\().16b, \t7\().16b
	eor	\x4\().16b, \x4\().16b, \t6\().16b
	eor	\x5\().16b, \x5\().16b, \t3\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t7\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
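
	/*
	 * swapmove_2x exchanges, for two register pairs at once, the bits
	 * selected by \mask between positions that are \n bits apart. For a
	 * single pair this is the classic SWAPMOVE step, roughly:
	 *
	 *	t = ((b >> n) ^ a) & mask;
	 *	a ^= t;
	 *	b ^= t << n;
	 */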
	.macro	swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr	\t0\().2d, \b0\().2d, #\n
	ushr	\t1\().2d, \b1\().2d, #\n
	eor	\t0\().16b, \t0\().16b, \a0\().16b
	eor	\t1\().16b, \t1\().16b, \a1\().16b
	and	\t0\().16b, \t0\().16b, \mask\().16b
	and	\t1\().16b, \t1\().16b, \mask\().16b
	eor	\a0\().16b, \a0\().16b, \t0\().16b
	shl	\t0\().2d, \t0\().2d, #\n
	eor	\a1\().16b, \a1\().16b, \t1\().16b
	shl	\t1\().2d, \t1\().2d, #\n
	eor	\b0\().16b, \b0\().16b, \t0\().16b
	eor	\b1\().16b, \b1\().16b, \t1\().16b
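
	/*
	 * Convert eight 128-bit blocks between the normal and the bit-sliced
	 * representation using three swapmove passes with masks 0x55, 0x33
	 * and 0x0f. The transform is an involution, so the same macro is used
	 * to convert back at the end of a cipher invocation.
	 */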
	.macro	bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi	\t0\().16b, #0x55
	movi	\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi	\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
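
	/*
	 * tbl permutation masks: SR/ISR apply ShiftRows/InvShiftRows to each
	 * bit plane, while the M0 variants additionally fold in the byte
	 * permutation applied to each block on entry to and exit from the
	 * bit-sliced core.
	 */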
M0:	.octa	0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa	0x0004080c05090d010a0e02060f03070b
SR:	.octa	0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa	0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa	0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa	0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa	0x0306090c00070a0d01040b0e0205080f

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
ENTRY(aesbs_convert_key)
	ld1	{v7.4s}, [x1], #16		// load round 0 key
	ld1	{v17.4s}, [x1], #16		// load round 1 key

	movi	v8.16b, #0x01			// bit masks

	str	q7, [x0], #16			// save round 0 key

	tbl	v7.16b, {v17.16b}, v16.16b
	ld1	{v17.4s}, [x1], #16		// load next round key
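
	/*
	 * Split the permuted round key into eight bit-plane masks: each cmtst
	 * sets every bit of a byte in v0-v7 when the corresponding bit of
	 * that key byte is set (the single-bit masks 0x01..0x80 live in
	 * v8-v15). This is the bit-sliced key format consumed by
	 * aesbs_encrypt8 and aesbs_decrypt8.
	 */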
	cmtst	v0.16b, v7.16b, v8.16b
	cmtst	v1.16b, v7.16b, v9.16b
	cmtst	v2.16b, v7.16b, v10.16b
	cmtst	v3.16b, v7.16b, v11.16b
	cmtst	v4.16b, v7.16b, v12.16b
	cmtst	v5.16b, v7.16b, v13.16b
	cmtst	v6.16b, v7.16b, v14.16b
	cmtst	v7.16b, v7.16b, v15.16b

	stp	q0, q1, [x0], #128
	stp	q2, q3, [x0, #-96]
	stp	q4, q5, [x0, #-64]
	stp	q6, q7, [x0, #-32]
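
	/*
	 * The final round key is stored with the S-box affine constant 0x63
	 * XORed into every byte; this compensates for the affine constant
	 * that the bit-sliced S-box implementation leaves out.
	 */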
	movi	v7.16b, #0x63			// compose .L63
	eor	v17.16b, v17.16b, v7.16b

ENDPROC(aesbs_convert_key)
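
	/*
	 * aesbs_encrypt8: encrypt the eight blocks held in v0-v7 using the
	 * bit-sliced key schedule at bskey. On entry each block is XORed with
	 * the round 0 key, byte-permuted and converted to the bit-sliced
	 * representation; the main loop then applies ShiftRows, SubBytes,
	 * MixColumns and AddRoundKey to all eight blocks at once.
	 */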
	ldr	q9, [bskey], #16		// round 0 key

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub	rounds, rounds, #1

	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
			v13, v14, v15
	subs	rounds, rounds, #1

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
			v13, v14, v15
	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	ldr	q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b

ENDPROC(aesbs_encrypt8)
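
	/*
	 * aesbs_decrypt8: the inverse of aesbs_encrypt8, decrypting the eight
	 * blocks in v0-v7. It walks the bit-sliced key schedule backwards and
	 * uses the inverse S-box and InvMixColumns primitives.
	 */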
	ldr	q9, [bskey, #-112]!		// round 0 key

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub	rounds, rounds, #1

	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
			v13, v14, v15
	subs	rounds, rounds, #1

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
			v13, v14, v15

	ldr	q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b

ENDPROC(aesbs_decrypt8)
	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
	.macro	__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
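
	/*
	 * Common ECB skeleton: load up to eight blocks from the input (x20),
	 * run the supplied \do8 routine on them, and store the results to the
	 * output (x19) in the register order the bit-sliced core leaves them
	 * in (\o0-\o7). x23 holds the number of blocks still to be processed.
	 */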
	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	ld1	{v1.16b}, [x20], #16
	ld1	{v2.16b}, [x20], #16
	ld1	{v3.16b}, [x20], #16
	ld1	{v4.16b}, [x20], #16
	ld1	{v5.16b}, [x20], #16
	ld1	{v6.16b}, [x20], #16
	ld1	{v7.16b}, [x20], #16

	st1	{\o0\().16b}, [x19], #16
	st1	{\o1\().16b}, [x19], #16
	st1	{\o2\().16b}, [x19], #16
	st1	{\o3\().16b}, [x19], #16
	st1	{\o4\().16b}, [x19], #16
	st1	{\o5\().16b}, [x19], #16
	st1	{\o6\().16b}, [x19], #16
	st1	{\o7\().16b}, [x19], #16

ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)
	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
ENTRY(aesbs_cbc_decrypt)
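
	/*
	 * CBC decryption: decrypt up to eight ciphertext blocks at a time
	 * with aesbs_decrypt8, XOR each result with the preceding ciphertext
	 * block (the IV for the very first block), and carry the last
	 * ciphertext block of the batch forward as the next IV.
	 */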
	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	ld1	{v1.16b}, [x20], #16
	ld1	{v2.16b}, [x20], #16
	ld1	{v3.16b}, [x20], #16
	ld1	{v4.16b}, [x20], #16
	ld1	{v5.16b}, [x20], #16
	ld1	{v6.16b}, [x20], #16

	ld1	{v24.16b}, [x24]		// load IV

	eor	v1.16b, v1.16b, v25.16b
	eor	v6.16b, v6.16b, v26.16b
	eor	v4.16b, v4.16b, v27.16b
	eor	v2.16b, v2.16b, v28.16b
	eor	v7.16b, v7.16b, v29.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v3.16b, v3.16b, v30.16b
	eor	v5.16b, v5.16b, v31.16b

	st1	{v0.16b}, [x19], #16
	st1	{v1.16b}, [x19], #16
	st1	{v6.16b}, [x19], #16
	st1	{v4.16b}, [x19], #16
	st1	{v2.16b}, [x19], #16
	st1	{v7.16b}, [x19], #16
	st1	{v3.16b}, [x19], #16

	ld1	{v24.16b}, [x20], #16
	st1	{v5.16b}, [x19], #16
1:	st1	{v24.16b}, [x24]		// store IV

ENDPROC(aesbs_cbc_decrypt)
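
	/*
	 * Compute the next XTS tweak: multiply the 128-bit tweak by x in
	 * GF(2^128), using the reduction polynomial x^128 + x^7 + x^2 + x + 1.
	 * Roughly, in C terms:
	 *
	 *	carry = tweak >> 127;
	 *	tweak = (tweak << 1) ^ (carry ? 0x87 : 0);
	 *
	 * The sshr/and/ext sequence propagates the carry between the two
	 * 64-bit halves and applies the 0x87 constant held in \const.
	 */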
	.macro	next_tweak, out, in, const, tmp
	sshr	\tmp\().2d,  \in\().2d,   #63
	and	\tmp\().16b, \tmp\().16b, \const\().16b
	add	\out\().2d,  \in\().2d,   \in\().2d
	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor	\out\().16b, \out\().16b, \tmp\().16b

.Lxts_mul_x:
CPU_LE(	.quad	1, 0x87		)
CPU_BE(	.quad	0x87, 1		)
	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
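
	/*
	 * __xts_crypt8 (shared by both directions): load up to eight input
	 * blocks, XOR each with its tweak and advance the tweak with
	 * next_tweak along the way. The first four tweaks stay in v25-v28;
	 * the later ones are spilled to the stack so they can be reapplied
	 * to the output blocks afterwards.
	 */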
	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	next_tweak	v26, v25, v30, v31
	eor	v0.16b, v0.16b, v25.16b

	ld1	{v1.16b}, [x20], #16
	next_tweak	v27, v26, v30, v31
	eor	v1.16b, v1.16b, v26.16b

	ld1	{v2.16b}, [x20], #16
	next_tweak	v28, v27, v30, v31
	eor	v2.16b, v2.16b, v27.16b

	ld1	{v3.16b}, [x20], #16
	next_tweak	v29, v28, v30, v31
	eor	v3.16b, v3.16b, v28.16b

	ld1	{v4.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset]
	eor	v4.16b, v4.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v5.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 16]
	eor	v5.16b, v5.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v6.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 32]
	eor	v6.16b, v6.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v7.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 48]
	eor	v7.16b, v7.16b, v29.16b
	next_tweak	v29, v29, v30, v31

ENDPROC(__xts_crypt8)
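
	/*
	 * Common XTS skeleton: generate the tweaks and whiten the input in
	 * __xts_crypt8, run the supplied \do8 cipher, then XOR each output
	 * block with the tweak it was combined with on input (v25-v28, plus
	 * the spilled copies reloaded into v16-v19) before storing it.
	 */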
	.macro	__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7

0:	ldr	q30, .Lxts_mul_x

	ldp	q16, q17, [sp, #.Lframe_local_offset]
	ldp	q18, q19, [sp, #.Lframe_local_offset + 32]

	eor	\o0\().16b, \o0\().16b, v25.16b
	eor	\o1\().16b, \o1\().16b, v26.16b
	eor	\o2\().16b, \o2\().16b, v27.16b
	eor	\o3\().16b, \o3\().16b, v28.16b

	st1	{\o0\().16b}, [x19], #16
	st1	{\o1\().16b}, [x19], #16
	st1	{\o2\().16b}, [x19], #16
	st1	{\o3\().16b}, [x19], #16

	eor	\o4\().16b, \o4\().16b, v16.16b
	eor	\o5\().16b, \o5\().16b, v17.16b
	eor	\o6\().16b, \o6\().16b, v18.16b
	eor	\o7\().16b, \o7\().16b, v19.16b

	st1	{\o4\().16b}, [x19], #16
	st1	{\o5\().16b}, [x19], #16
	st1	{\o6\().16b}, [x19], #16
	st1	{\o7\().16b}, [x19], #16

1:	st1	{v25.16b}, [x24]

ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)
	rev64	\v\().16b, \v\().16b

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[], u8 final[])
	 */
ENTRY(aesbs_ctr_encrypt)
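
	/*
	 * The counter is loaded from iv[] into x7:x8 and incremented there;
	 * up to eight consecutive counter blocks are generated per iteration,
	 * encrypted with aesbs_encrypt8 and XORed into the input. When final[]
	 * is non-NULL, one extra block is processed so that the keystream for
	 * the tail can be returned to the caller (see the comment below).
	 */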
	add	x23, x23, x26			// do one extra block if final

98:	ldp	x7, x8, [x24]

	csel	x23, x23, xzr, pl

	lsr	x9, x9, x26			// disregard the extra block

	ld1	{v8.16b}, [x20], #16
	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x19], #16

	ld1	{v9.16b}, [x20], #16
	eor	v1.16b, v1.16b, v9.16b
	st1	{v1.16b}, [x19], #16

	ld1	{v10.16b}, [x20], #16
	eor	v4.16b, v4.16b, v10.16b
	st1	{v4.16b}, [x19], #16

	ld1	{v11.16b}, [x20], #16
	eor	v6.16b, v6.16b, v11.16b
	st1	{v6.16b}, [x19], #16

	ld1	{v12.16b}, [x20], #16
	eor	v3.16b, v3.16b, v12.16b
	st1	{v3.16b}, [x19], #16

	ld1	{v13.16b}, [x20], #16
	eor	v7.16b, v7.16b, v13.16b
	st1	{v7.16b}, [x19], #16

	ld1	{v14.16b}, [x20], #16
	eor	v2.16b, v2.16b, v14.16b
	st1	{v2.16b}, [x19], #16

	ld1	{v15.16b}, [x20], #16
	eor	v5.16b, v5.16b, v15.16b
	st1	{v5.16b}, [x19], #16
	/*
	 * If we are handling the tail of the input (x6 != NULL), return the
	 * final keystream block back to the caller.
	 */

ENDPROC(aesbs_ctr_encrypt)