/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
	.fpu		crypto-neon-fp-armv8
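	/*
	 * On cores with the ARMv8 Crypto Extensions (selected by the .fpu
	 * directive above), a single vmull.p64 performs the whole
	 * 64x64 -> 128 bit carry-less multiply. The b1..b4 arguments of the
	 * macro below are ignored; they exist only so that it can be invoked
	 * with the same argument list as __pmull_p8.
	 */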
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
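	/*
	 * For reference, the value this macro produces is the plain carry-less
	 * (polynomial) product of its two 64-bit operands. A minimal C model
	 * of that product (a sketch of the maths only, not of the byte-sliced
	 * method used below; clmul64 is just an illustrative name) could look
	 * like this:
	 *
	 *	static void clmul64(u64 a, u64 b, u64 *lo, u64 *hi)
	 *	{
	 *		u64 l = 0, h = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < 64; i++) {
	 *			if ((b >> i) & 1) {
	 *				l ^= a << i;
	 *				if (i)
	 *					h ^= a >> (64 - i);
	 *			}
	 *		}
	 *		*lo = l;
	 *		*hi = h;
	 *	}
	 *
	 * __pmull_p8 arrives at the same result from eight vmull.p8
	 * (8x8 -> 16 bit) partial products that are shifted and XORed
	 * together.
	 */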
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	vext.8		t4l, \bd, \bd, #1	@ B1
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	vext.8		t3l, \bd, \bd, #2	@ B2
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	vext.8		t4l, \bd, \bd, #3	@ B3
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	vext.8		t3l, \bd, \bd, #4	@ B4
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vext.8		t0q, t0q, t0q, #15
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
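	@ The vext.8 rotations above move each folded partial sum to the byte
	@ offset implied by its weight (<< 8, << 16, << 24, << 32) so that,
	@ once the wrapped-around bytes are masked off, all terms can simply
	@ be XORed into \rq on top of D = A*B.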
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
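	//
	// The inputs are the three Karatsuba partial products XL, XM and XH
	// of a 128x128 bit multiply. MASK (set up in the entry code below) is
	// derived from the GHASH reduction polynomial
	// x^128 + x^7 + x^2 + x + 1, so the two carry-less multiplies by MASK
	// fold the 256 bit product back down to 128 bits in two passes.
	//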
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L

	vmull.p64	XL, T1_H, MASK
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
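	//
	// Same inputs and result as the p64 variant above, but the folding
	// multiplies by MASK are replaced with shift-and-XOR sequences so
	// that only plain NEON instructions are needed.
	//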
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H
	.macro		ghash_update, pn
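	/*
	 * Core GHASH loop: consume r0 blocks of input from [r2], folding each
	 * block into the running digest and multiplying by the hash key in
	 * GF(2^128) using the \pn (p64 or p8) multiply primitive. When the
	 * block count is a multiple of 4, the aggregated path below uses the
	 * precomputed powers of the key (HH = H^2, HH3 = H^3, HH4 = H^4) to
	 * amortise the reduction over four blocks at a time.
	 */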
	/* do the head block first, if supplied */

	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4
	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
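	@ At this point the four input blocks b0..b3 have been multiplied by
	@ H^4, H^3, H^2 and H respectively, with the running digest X folded
	@ into b0 first, i.e. the loop computes
	@     X' = (X ^ b0)*H^4 ^ b1*H^3 ^ b2*H^2 ^ b3*H
	@ and the partial products are accumulated in XH/XL/XM so that a
	@ single reduction covers all four blocks.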
	vld1.8		{XL2-XM2}, [r2]!

2:	vld1.64		{T1}, [r2]!

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
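	@ Karatsuba over GF(2): with a = a1:a0 and b = b1:b0 split into 64 bit
	@ halves,
	@     a*b = (a1*b1 << 128) ^ (a0*b0)
	@           ^ ((a1 ^ a0)*(b1 ^ b0) ^ a1*b1 ^ a0*b0) << 64
	@ so each 128x128 bit GHASH multiplication costs only three 64x64 bit
	@ carry-less multiplies; SHASH2_\pn holds the precomputed b1 ^ b0 of
	@ the hash key.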
	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
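	/*
	 * Per AAPCS: r0 = blocks, r1 = dg, r2 = src, r3 = k, with the head
	 * pointer passed on the stack.
	 */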
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H
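	@ The XORs above precompute b1 ^ b0 for H, H^2, H^3 and H^4; they feed
	@ the (a1 + a0)(b1 + b0) Karatsuba multiplies in ghash_update.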
	vshl.u64	MASK, MASK, #57
ENDPROC(pmull_ghash_update_p64)
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4
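	@ The byte rotations of the key computed above are the B1..B4 operands
	@ expected by __pmull_p8; doing them once at entry keeps them out of
	@ the per-block loop.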
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff
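	@ All-ones masks covering the low 16, 32 and 48 bits of each 64-bit
	@ lane; these serve as constants for the vmull.p8 based multiply path.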
ENDPROC(pmull_ghash_update_p8)