/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31
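
	/*
	 * Some of the aliases above overlap deliberately: IN1 is no longer
	 * live by the time XH is written, and the p64 and p8 code paths
	 * never run together, so SHASH2_p8/MASK and SHASH2_p64/k48 can
	 * share d28 and d31.
	 */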

	.text
	.fpu		crypto-neon-fp-armv8
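
	/*
	 * The b1..b4 operands are ignored here: they exist only so that
	 * __pmull_p64 and __pmull_p8 take the same arguments, allowing
	 * ghash_update to expand either variant via \pn.
	 */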
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
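	/*
	 * The idea: vmull.p8 performs eight independent 8x8 -> 16 bit
	 * carryless multiplies in one instruction.  Multiplying byte-rotated
	 * copies of the operands (A1..A3, B1..B4 below) and XOR-folding the
	 * halves of the results recovers the cross products of a byte-wise
	 * schoolbook multiplication, which are then masked, shifted into
	 * place (the vext #15..#12 steps) and summed together with the
	 * lane-wise product D = A*B.
	 */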
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm
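
	/*
	 * For reference, both __pmull_ variants compute the same
	 * 64x64 -> 128 bit carryless multiplication that this scalar C
	 * sketch describes (illustrative only, not part of the build):
	 *
	 *	static __uint128_t clmul64(uint64_t a, uint64_t b)
	 *	{
	 *		__uint128_t r = 0;
	 *
	 *		for (int i = 0; i < 64; i++)
	 *			if ((b >> i) & 1)	// bit i of b set:
	 *				r ^= (__uint128_t)a << i; // add a*x^i
	 *		return r;
	 *	}
	 */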

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
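	// The double-width product in XH:XL (with the middle limb in XM) is
	// folded back to 128 bits modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1; MASK is expected to hold the reflected
	// reduction constant 0xe1 << 57 (i.e. 0xc200000000000000), as set up
	// by the pmull_ghash_update_p64 entry point.
	//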
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
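	// Here the multiplications by the reduction constant are carried out
	// with shifts and XORs instead: in the bit-reflected representation,
	// the left shifts by 63, 62 and 57 and the matching right shifts by
	// 1, 2 and 7 bit positions (the latter produced cumulatively below)
	// correspond to the x, x^2 and x^7 terms of the polynomial.
	//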
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		1f

0:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1
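
	/*
	 * Karatsuba: compute the 128x128 bit product with only three 64x64
	 * bit multiplies plus a handful of XORs (additions in GF(2))
	 */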
	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
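	/*
	 * Rough scalar sketch of the operation performed below, assuming
	 * hypothetical helpers gf128mul() (multiplication in GF(2^128)),
	 * load_be128() and store_be128() (illustrative only, not part of
	 * the build):
	 *
	 *	be128 dg = load_be128(dg_ptr);
	 *
	 *	if (head)		// optional extra block, done first
	 *		dg = gf128mul(dg ^ load_be128(head), k);
	 *	while (blocks--) {	// then 'blocks' blocks from src
	 *		dg = gf128mul(dg ^ load_be128(src), k);
	 *		src += 16;
	 *	}
	 *	store_be128(dg_ptr, dg);
	 */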
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p64, SHASH_L, SHASH_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H
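
	/*
	 * Precompute the byte-rotated copies of SHASH that ghash_update
	 * passes to __pmull_p8 (b1..b4), so that the rotations of this
	 * fixed operand need not be redone for every block.
	 */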
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4
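
	/*
	 * Constants used as masks by __pmull_p8 when reassembling the
	 * partial products, so that its vext based byte rotations act as
	 * shifts.
	 */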
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)