/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
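/*
 * Bit slicing: eight AES blocks are processed in parallel. Their 128
 * state bytes are transposed so that NEON register i holds bit i of
 * every byte, which turns SubBytes into a fixed sequence of bitwise
 * NEON instructions and keeps the whole cipher free of data dependent
 * memory accesses (and hence resistant to cache timing attacks).
 */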
#include <linux/linkage.h>
#include <asm/assembler.h>

	.macro	in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	.macro	out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	.macro	inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	.macro	inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
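// The *_bs_ch macros change the basis of the bit sliced state on entry to
// and exit from the GF(2^8) inversion at the heart of SubBytes; the inv_*
// variants do the same for the inverse S-box. The permuted parameter names
// mirror the register order in which the slices are produced and consumed.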
	.macro	mul_gf4, x0, x1, y0, y1, t0, t1
	.macro	mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	.macro	mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro	inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
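// Inversion in GF(2^8) is done in a tower field representation: GF(2^8)
// is treated as GF((2^4)^2) and GF(2^4) as GF((2^2)^2), so the inversion
// decomposes into a short network of GF(2^4) multiplications (mul_gf4,
// mul_gf4_n_gf4, mul_gf16_2) and plain XOR/AND operations, all of which
// run in constant time on NEON.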
	.macro	sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b

	.macro	inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
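// SubBytes/InvSubBytes on all eight slices at once: change basis, invert
// in GF(2^8), change basis back. Note that the slices come back out in a
// permuted register order (b7, b1, b4, b2, ... above), which the callers
// account for when wiring up the following round operations.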
	ldp	q16, q17, [bskey], #128
	ldp	q18, q19, [bskey, #-96]
	ldp	q20, q21, [bskey, #-64]
	ldp	q22, q23, [bskey, #-32]

	ldp	q16, q17, [bskey, #-128]!
	ldp	q18, q19, [bskey, #32]
	ldp	q20, q21, [bskey, #64]
	ldp	q22, q23, [bskey, #96]
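// One round of the bit sliced key schedule occupies 128 bytes (eight
// 16-byte slices). The first group of loads fetches the next round key
// while advancing bskey forwards (encryption); the second group uses a
// pre-decrement so that the schedule is walked backwards (decryption).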
	.macro	add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor	\x0\().16b, \x0\().16b, v16.16b
	eor	\x1\().16b, \x1\().16b, v17.16b
	eor	\x2\().16b, \x2\().16b, v18.16b
	eor	\x3\().16b, \x3\().16b, v19.16b
	eor	\x4\().16b, \x4\().16b, v20.16b
	eor	\x5\().16b, \x5\().16b, v21.16b
	eor	\x6\().16b, \x6\().16b, v22.16b
	eor	\x7\().16b, \x7\().16b, v23.16b
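// AddRoundKey needs no special treatment in bit sliced form: XOR is
// linear over GF(2), so each state slice is simply XORed with the
// matching key slice in v16-v23.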
	.macro	shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl	\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl	\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl	\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl	\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl	\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl	\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl	\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl	\x7\().16b, {\x7\().16b}, \mask\().16b
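// ShiftRows (and, with a different mask, InvShiftRows) is a pure byte
// permutation of the state, so it is applied with a single tbl per slice
// using one of the permutation constants defined below (SR, ISR, ...).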
	.macro	mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor	\x2\().16b, \x2\().16b, \t2\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor	\x3\().16b, \x3\().16b, \t3\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor	\x4\().16b, \x4\().16b, \t4\().16b
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor	\x5\().16b, \x5\().16b, \t5\().16b
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor	\x6\().16b, \x6\().16b, \t6\().16b
	eor	\t1\().16b, \t1\().16b, \x0\().16b
	eor	\x7\().16b, \x7\().16b, \t7\().16b
	ext	\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x1\().16b
	eor	\t0\().16b, \t0\().16b, \x7\().16b
	eor	\t1\().16b, \t1\().16b, \x7\().16b
	ext	\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t5\().16b, \t5\().16b, \x4\().16b
	eor	\x0\().16b, \x0\().16b, \t0\().16b
	eor	\t6\().16b, \t6\().16b, \x5\().16b
	eor	\x1\().16b, \x1\().16b, \t1\().16b
	ext	\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x3\().16b
	ext	\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x6\().16b
	ext	\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x2\().16b
	ext	\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t4\().16b, \t4\().16b, \x7\().16b
	ext	\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x7\().16b
	ext	\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor	\x7\().16b, \t1\().16b, \t5\().16b
	eor	\x2\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t3\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t2\().16b
	eor	\t3\().16b, \t3\().16b, \x4\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x2\().16b, \x3\().16b, \t6\().16b
	eor	\x3\().16b, \t0\().16b, \t4\().16b
	eor	\x4\().16b, \x6\().16b, \t2\().16b
	mov	\x6\().16b, \t3\().16b
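// MixColumns on the bit sliced state: since MixColumns is linear over
// GF(2), every output slice is an XOR of byte-rotated copies (ext #12,
// ext #8) of the input slices, with the GF(2^8) reduction folded into the
// cross-slice XOR pattern. The \inv argument (passed as 1 by inv_mix_cols
// below) selects a slightly different final recombination for decryption.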
	.macro	inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7
	ext	\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext	\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext	\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor	\t0\().16b, \t0\().16b, \x0\().16b
	ext	\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor	\t6\().16b, \t6\().16b, \x6\().16b
	ext	\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor	\t7\().16b, \t7\().16b, \x7\().16b
	ext	\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor	\t1\().16b, \t1\().16b, \x1\().16b
	ext	\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor	\t2\().16b, \t2\().16b, \x2\().16b
	ext	\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor	\t3\().16b, \t3\().16b, \x3\().16b
	eor	\t4\().16b, \t4\().16b, \x4\().16b
	eor	\t5\().16b, \t5\().16b, \x5\().16b
	eor	\x0\().16b, \x0\().16b, \t6\().16b
	eor	\x1\().16b, \x1\().16b, \t6\().16b
	eor	\x2\().16b, \x2\().16b, \t0\().16b
	eor	\x4\().16b, \x4\().16b, \t2\().16b
	eor	\x3\().16b, \x3\().16b, \t1\().16b
	eor	\x1\().16b, \x1\().16b, \t7\().16b
	eor	\x2\().16b, \x2\().16b, \t7\().16b
	eor	\x4\().16b, \x4\().16b, \t6\().16b
	eor	\x5\().16b, \x5\().16b, \t3\().16b
	eor	\x3\().16b, \x3\().16b, \t6\().16b
	eor	\x6\().16b, \x6\().16b, \t4\().16b
	eor	\x4\().16b, \x4\().16b, \t7\().16b
	eor	\x5\().16b, \x5\().16b, \t7\().16b
	eor	\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
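// InvMixColumns is not implemented directly: the inverse MixColumns
// matrix is factored into a simpler correction (the ext/eor sequence
// above) followed by the forward transform, so mix_cols is reused with
// inv=1.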
	.macro	swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr	\t0\().2d, \b0\().2d, #\n
	ushr	\t1\().2d, \b1\().2d, #\n
	eor	\t0\().16b, \t0\().16b, \a0\().16b
	eor	\t1\().16b, \t1\().16b, \a1\().16b
	and	\t0\().16b, \t0\().16b, \mask\().16b
	and	\t1\().16b, \t1\().16b, \mask\().16b
	eor	\a0\().16b, \a0\().16b, \t0\().16b
	shl	\t0\().2d, \t0\().2d, #\n
	eor	\a1\().16b, \a1\().16b, \t1\().16b
	shl	\t1\().2d, \t1\().2d, #\n
	eor	\b0\().16b, \b0\().16b, \t0\().16b
	eor	\b1\().16b, \b1\().16b, \t1\().16b
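// Two interleaved copies of the classic "swapmove" bit permutation, which
// exchanges the bits selected by \mask between \a and \b shifted by \n
// positions. Per register pair this is the scalar idiom:
//
//	t = ((b >> n) ^ a) & mask;
//	a ^= t;
//	b ^= t << n;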
	.macro	bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi	\t0\().16b, #0x55
	movi	\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi	\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
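// bitslice is an 8x8 bit matrix transpose built from three passes of
// swapmove with shift distances 1, 2 and 4 and masks 0x55, 0x33 and 0x0f:
// afterwards register i holds bit i of every state byte. The transform is
// its own inverse, so the cipher cores below simply invoke it a second
// time (on the permuted output order) to convert back.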
M0:	.octa	0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa	0x0004080c05090d010a0e02060f03070b
SR:	.octa	0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa	0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa	0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa	0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa	0x0306090c00070a0d01040b0e0205080f
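// tbl permutation masks: SR and ISR encode ShiftRows and InvShiftRows,
// M0 encodes the byte interleaving applied around the bitslice transpose,
// and the combined constants (M0SR, SRM0, M0ISR, ISRM0) merge the two so
// that a single tbl per register covers both steps.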
/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 */
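/*
 * Convert an expanded AES key schedule into the bit sliced layout used by
 * aesbs_encrypt8/aesbs_decrypt8: round 0 is stored as a single 16-byte
 * vector, and every subsequent round key is expanded with cmtst against
 * the bit masks in v8-v15 into eight slice vectors (128 bytes per round).
 * The S-box affine constant 0x63 is folded into the final round key
 * ("compose .L63" below) so that the bit sliced S-box can omit it.
 */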
SYM_FUNC_START(aesbs_convert_key)
	ld1	{v7.4s}, [x1], #16		// load round 0 key
	ld1	{v17.4s}, [x1], #16		// load round 1 key

	movi	v8.16b, #0x01			// bit masks

	str	q7, [x0], #16			// save round 0 key

	tbl	v7.16b, {v17.16b}, v16.16b
	ld1	{v17.4s}, [x1], #16		// load next round key

	cmtst	v0.16b, v7.16b, v8.16b
	cmtst	v1.16b, v7.16b, v9.16b
	cmtst	v2.16b, v7.16b, v10.16b
	cmtst	v3.16b, v7.16b, v11.16b
	cmtst	v4.16b, v7.16b, v12.16b
	cmtst	v5.16b, v7.16b, v13.16b
	cmtst	v6.16b, v7.16b, v14.16b
	cmtst	v7.16b, v7.16b, v15.16b

	stp	q0, q1, [x0], #128
	stp	q2, q3, [x0, #-96]
	stp	q4, q5, [x0, #-64]
	stp	q6, q7, [x0, #-32]

	movi	v7.16b, #0x63			// compose .L63
	eor	v17.16b, v17.16b, v7.16b

SYM_FUNC_END(aesbs_convert_key)
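/*
 * Encrypt eight 16-byte blocks held in v0-v7: XOR in the round 0 key and
 * apply the initial byte permutation with tbl, transpose into bit sliced
 * form, then run rounds of shift_rows/sbox/mix_cols/add_round_key before
 * transposing back and XORing in the last round key. Because the S-box
 * leaves the slices in a permuted register order, the results end up in
 * v0, v1, v4, v6, v3, v7, v2, v5 rather than v0-v7, and the callers store
 * them in that order.
 */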
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
	ldr	q9, [bskey], #16		// round 0 key

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub	rounds, rounds, #1

	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
	subs	rounds, rounds, #1

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	ldr	q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b
SYM_FUNC_END(aesbs_encrypt8)
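/*
 * aesbs_decrypt8 mirrors aesbs_encrypt8 but walks the bit sliced key
 * schedule backwards (note the pre-decrementing loads) and uses the
 * inverse transforms (inv_sbox, the InvShiftRows mask, inv_mix_cols);
 * its results come out in v0, v1, v6, v4, v2, v7, v3, v5.
 */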
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
	ldr	q9, [bskey, #-112]!		// round 0 key

	eor	v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor	v11.16b, v1.16b, v9.16b
	tbl	v0.16b, {v10.16b}, v8.16b
	eor	v12.16b, v2.16b, v9.16b
	tbl	v1.16b, {v11.16b}, v8.16b
	eor	v13.16b, v3.16b, v9.16b
	tbl	v2.16b, {v12.16b}, v8.16b
	eor	v14.16b, v4.16b, v9.16b
	tbl	v3.16b, {v13.16b}, v8.16b
	eor	v15.16b, v5.16b, v9.16b
	tbl	v4.16b, {v14.16b}, v8.16b
	eor	v10.16b, v6.16b, v9.16b
	tbl	v5.16b, {v15.16b}, v8.16b
	eor	v11.16b, v7.16b, v9.16b
	tbl	v6.16b, {v10.16b}, v8.16b
	tbl	v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub	rounds, rounds, #1

	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
	subs	rounds, rounds, #1

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \

	ldr	q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor	v0.16b, v0.16b, v12.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v12.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v2.16b, v2.16b, v12.16b
	eor	v7.16b, v7.16b, v12.16b
	eor	v3.16b, v3.16b, v12.16b
	eor	v5.16b, v5.16b, v12.16b
SYM_FUNC_END(aesbs_decrypt8)
/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 */
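/*
 * ECB is a thin wrapper around the cipher cores: __ecb_crypt loads up to
 * eight blocks from the input (the remaining block count in x23 being
 * clamped to zero with csel), runs aesbs_encrypt8 or aesbs_decrypt8, and
 * stores the results in the permuted register order the core produces,
 * skipping the loads and stores for blocks beyond the remaining count.
 */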
	.macro	__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7

	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	ld1	{v1.16b}, [x20], #16
	ld1	{v2.16b}, [x20], #16
	ld1	{v3.16b}, [x20], #16
	ld1	{v4.16b}, [x20], #16
	ld1	{v5.16b}, [x20], #16
	ld1	{v6.16b}, [x20], #16
	ld1	{v7.16b}, [x20], #16

	st1	{\o0\().16b}, [x19], #16
	st1	{\o1\().16b}, [x19], #16
	st1	{\o2\().16b}, [x19], #16
	st1	{\o3\().16b}, [x19], #16
	st1	{\o4\().16b}, [x19], #16
	st1	{\o5\().16b}, [x19], #16
	st1	{\o6\().16b}, [x19], #16
	st1	{\o7\().16b}, [x19], #16

SYM_FUNC_START(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

SYM_FUNC_START(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)
/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 */
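/*
 * CBC decryption: up to eight ciphertext blocks are loaded (with copies
 * of the ciphertext kept in v25-v31), decrypted in a single call, and
 * each plaintext is XORed with the preceding ciphertext block, using the
 * IV in v24 for the first one. The last ciphertext consumed is written
 * back to the IV buffer so the next call can continue the chain.
 */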
SYM_FUNC_START(aesbs_cbc_decrypt)

	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	ld1	{v1.16b}, [x20], #16
	ld1	{v2.16b}, [x20], #16
	ld1	{v3.16b}, [x20], #16
	ld1	{v4.16b}, [x20], #16
	ld1	{v5.16b}, [x20], #16
	ld1	{v6.16b}, [x20], #16

	ld1	{v24.16b}, [x24]		// load IV

	eor	v1.16b, v1.16b, v25.16b
	eor	v6.16b, v6.16b, v26.16b
	eor	v4.16b, v4.16b, v27.16b
	eor	v2.16b, v2.16b, v28.16b
	eor	v7.16b, v7.16b, v29.16b
	eor	v0.16b, v0.16b, v24.16b
	eor	v3.16b, v3.16b, v30.16b
	eor	v5.16b, v5.16b, v31.16b

	st1	{v0.16b}, [x19], #16
	st1	{v1.16b}, [x19], #16
	st1	{v6.16b}, [x19], #16
	st1	{v4.16b}, [x19], #16
	st1	{v2.16b}, [x19], #16
	st1	{v7.16b}, [x19], #16
	st1	{v3.16b}, [x19], #16
	ld1	{v24.16b}, [x20], #16
	st1	{v5.16b}, [x19], #16
1:	st1	{v24.16b}, [x24]		// store IV

SYM_FUNC_END(aesbs_cbc_decrypt)
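// next_tweak multiplies the current XTS tweak by x in GF(2^128) modulo
// x^128 + x^7 + x^2 + x + 1. Both 64-bit halves are shifted left by one
// (add \out, \in, \in); sshr #63 extracts the bits shifted out, and the
// and/ext/eor sequence folds them back in, carrying the low half's top
// bit into bit 0 of the high half and reducing the high half's top bit
// into the low byte as 0x87 (both constants are encoded in \const).
// Roughly, in scalar form:
//
//	out = (in << 1) ^ ((in >> 127) ? 0x87 : 0);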
	.macro	next_tweak, out, in, const, tmp
	sshr	\tmp\().2d, \in\().2d, #63
	and	\tmp\().16b, \tmp\().16b, \const\().16b
	add	\out\().2d, \in\().2d, \in\().2d
	ext	\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor	\out\().16b, \out\().16b, \tmp\().16b

/*
 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[])
 */
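/*
 * XTS: each block is XORed with its tweak both before and after the block
 * cipher, and the tweak for block i+1 is derived from the tweak for block
 * i with next_tweak. __xts_crypt8 below loads up to eight blocks and
 * applies the pre-whitening as it goes, keeping the first four tweaks in
 * v25-v28 and spilling the later ones to the stack frame, since they are
 * needed again after the cipher call.
 */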
SYM_FUNC_START_LOCAL(__xts_crypt8)

	csel	x23, x23, xzr, pl

	ld1	{v0.16b}, [x20], #16
	next_tweak	v26, v25, v30, v31
	eor	v0.16b, v0.16b, v25.16b

	ld1	{v1.16b}, [x20], #16
	next_tweak	v27, v26, v30, v31
	eor	v1.16b, v1.16b, v26.16b

	ld1	{v2.16b}, [x20], #16
	next_tweak	v28, v27, v30, v31
	eor	v2.16b, v2.16b, v27.16b

	ld1	{v3.16b}, [x20], #16
	next_tweak	v29, v28, v30, v31
	eor	v3.16b, v3.16b, v28.16b

	ld1	{v4.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset]
	eor	v4.16b, v4.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v5.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 16]
	eor	v5.16b, v5.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v6.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 32]
	eor	v6.16b, v6.16b, v29.16b
	next_tweak	v29, v29, v30, v31

	ld1	{v7.16b}, [x20], #16
	str	q29, [sp, #.Lframe_local_offset + 48]
	eor	v7.16b, v7.16b, v29.16b
	next_tweak	v29, v29, v30, v31

SYM_FUNC_END(__xts_crypt8)
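// The __xts_crypt macro below assembles the GF(2^128) reduction constant
// in v30, loads the initial tweak from the IV buffer and hands control to
// __xts_crypt8 together with the selected cipher core. Afterwards it XORs
// the outputs with the same tweaks (reloading the spilled ones into
// q16-q19), stores only the blocks that were actually loaded, and writes
// the next tweak back to the IV buffer for the following call.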
	.macro	__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7

	uzp1	v30.4s, v30.4s, v25.4s

	ldp	q16, q17, [sp, #.Lframe_local_offset]
	ldp	q18, q19, [sp, #.Lframe_local_offset + 32]

	eor	\o0\().16b, \o0\().16b, v25.16b
	eor	\o1\().16b, \o1\().16b, v26.16b
	eor	\o2\().16b, \o2\().16b, v27.16b
	eor	\o3\().16b, \o3\().16b, v28.16b

	st1	{\o0\().16b}, [x19], #16
	st1	{\o1\().16b}, [x19], #16
	st1	{\o2\().16b}, [x19], #16
	st1	{\o3\().16b}, [x19], #16

	eor	\o4\().16b, \o4\().16b, v16.16b
	eor	\o5\().16b, \o5\().16b, v17.16b
	eor	\o6\().16b, \o6\().16b, v18.16b
	eor	\o7\().16b, \o7\().16b, v19.16b

	st1	{\o4\().16b}, [x19], #16
	st1	{\o5\().16b}, [x19], #16
	st1	{\o6\().16b}, [x19], #16
	st1	{\o7\().16b}, [x19], #16

1:	st1	{v25.16b}, [x24]

SYM_FUNC_START(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_FUNC_START(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

	rev64	\v\().16b, \v\().16b

/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 iv[], u8 final[])
 */
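/*
 * CTR mode: the 128-bit big endian counter is read from the IV buffer
 * into x7/x8, eight consecutive counter blocks are generated (rev64
 * restoring the byte order), encrypted with the bit sliced core, and
 * XORed with the input to produce the output. When a final[] buffer is
 * supplied, one extra keystream block is produced for the partial tail
 * and returned to the caller.
 */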
SYM_FUNC_START(aesbs_ctr_encrypt)

	add	x23, x23, x26		// do one extra block if final

98:	ldp	x7, x8, [x24]

	csel	x23, x23, xzr, pl

	lsr	x9, x9, x26		// disregard the extra block

	ld1	{v8.16b}, [x20], #16
	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x19], #16

	ld1	{v9.16b}, [x20], #16
	eor	v1.16b, v1.16b, v9.16b
	st1	{v1.16b}, [x19], #16

	ld1	{v10.16b}, [x20], #16
	eor	v4.16b, v4.16b, v10.16b
	st1	{v4.16b}, [x19], #16

	ld1	{v11.16b}, [x20], #16
	eor	v6.16b, v6.16b, v11.16b
	st1	{v6.16b}, [x19], #16

	ld1	{v12.16b}, [x20], #16
	eor	v3.16b, v3.16b, v12.16b
	st1	{v3.16b}, [x19], #16

	ld1	{v13.16b}, [x20], #16
	eor	v7.16b, v7.16b, v13.16b
	st1	{v7.16b}, [x19], #16

	ld1	{v14.16b}, [x20], #16
	eor	v2.16b, v2.16b, v14.16b
	st1	{v2.16b}, [x19], #16

	ld1	{v15.16b}, [x20], #16
	eor	v5.16b, v5.16b, v15.16b
	st1	{v5.16b}, [x19], #16

	/*
	 * If we are handling the tail of the input (x6 != NULL), return the
	 * final keystream block back to the caller.
	 */

SYM_FUNC_END(aesbs_ctr_encrypt)