Linux 5.7.7
[linux/fpc-iii.git] / arch / arm / crypto / nh-neon-core.S
blob434d80ab531c2a600fbcffc89c21e6a8ad5ef284
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * NH - ε-almost-universal hash function, NEON accelerated version
4  *
5  * Copyright 2018 Google LLC
6  *
7  * Author: Eric Biggers <ebiggers@google.com>
8  */
10 #include <linux/linkage.h>
12         .text
13         .fpu            neon
/*
 * Register roles used throughout this file.
 *
 * NOTE(review): q4-q7 overlap d8-d15, which the AAPCS lists as callee-saved,
 * and nothing in this file saves or restores them.  Presumably callers only
 * run this code inside a region where the NEON register file is already
 * owned by the caller - confirm against the glue code.
 */

	// Core registers: the four arguments of nh_neon(), in order.
	KEY		.req	r0	// key pointer, post-incremented by every key load
	MESSAGE		.req	r1	// message pointer, post-incremented by every message load
	MESSAGE_LEN	.req	r2	// byte counter, decremented as strides are consumed
	HASH		.req	r3	// destination of the final vst1

	// One Q accumulator per pass (two 64-bit lanes); _A/_B name its two
	// D halves, which are added together when the hash is finalized.
	PASS0_SUMS	.req	q0
	PASS0_SUM_A	.req	d0
	PASS0_SUM_B	.req	d1

	PASS1_SUMS	.req	q1
	PASS1_SUM_A	.req	d2
	PASS1_SUM_B	.req	d3

	PASS2_SUMS	.req	q2
	PASS2_SUM_A	.req	d4
	PASS2_SUM_B	.req	d5

	PASS3_SUMS	.req	q3
	PASS3_SUM_A	.req	d6
	PASS3_SUM_B	.req	d7

	// Window of four 16-byte key strides; the role of each register
	// rotates from one _nh_stride invocation to the next.
	K0		.req	q4
	K1		.req	q5
	K2		.req	q6
	K3		.req	q7

	// Scratch: message-plus-key sums.  _L/_H name the two D halves that
	// are fed to the widening multiply-accumulate.
	T0		.req	q8
	T0_L		.req	d16
	T0_H		.req	d17

	T1		.req	q9
	T1_L		.req	d18
	T1_H		.req	d19

	T2		.req	q10
	T2_L		.req	d20
	T2_H		.req	d21

	T3		.req	q11
	T3_L		.req	d22
	T3_H		.req	d23
/*
 * _nh_stride k0, k1, k2, k3
 *
 * Process one 16-byte message stride for all four passes at once.
 *
 * In:      \k0, \k1, \k2 - key strides already held in registers, used by
 *                          passes 0, 1 and 2 respectively
 * Out:     \k3           - overwritten with the next 16 bytes read from KEY,
 *                          then used by pass 3
 * Updates: PASS0_SUMS..PASS3_SUMS (accumulated into),
 *          MESSAGE and KEY (each post-incremented past the bytes just read)
 * Clobbers: T0-T3
 */
.macro _nh_stride	k0, k1, k2, k3

	// Fetch the message stride; T3 holds it until the last add below.
	vld1.8		{T3}, [MESSAGE]!

	// Fetch the one key stride that is not in a register yet.
	vld1.32		{\k3}, [KEY]!

	// Per-pass sums of message words and key words (32-bit lanes, wrapping).
	// T3 doubles as the message copy, so its own sum has to be formed last.
	vadd.i32	T0, T3, \k0
	vadd.i32	T1, T3, \k1
	vadd.i32	T2, T3, \k2
	vadd.i32	T3, T3, \k3

	// Unsigned 32x32 => 64 multiply of the low half by the high half of
	// each sum, accumulated into that pass's pair of 64-bit lanes.
	vmlal.u32	PASS0_SUMS, T0_L, T0_H
	vmlal.u32	PASS1_SUMS, T1_L, T1_H
	vmlal.u32	PASS2_SUMS, T2_L, T2_H
	vmlal.u32	PASS3_SUMS, T3_L, T3_H
.endm
/*
 * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
 *		u8 hash[NH_HASH_BYTES])
 *
 * Runs four NH passes over the message in parallel.  Pass i pairs each
 * 16-byte message stride with the key stride i positions further along, so
 * the key is read 48 bytes beyond the message length.  The four 64-bit pass
 * sums are written to hash[] in pass order (32 bytes in total).
 *
 * It's guaranteed that message_len % 16 == 0.
 */
ENTRY(nh_neon)

	// Preload the first three key strides and clear the accumulators.
	// The fourth key stride is fetched by the first _nh_stride.
	vld1.32		{K0,K1}, [KEY]!
	  vmov.i64	PASS0_SUMS, #0
	  vmov.i64	PASS1_SUMS, #0
	vld1.32		{K2}, [KEY]!
	  vmov.i64	PASS2_SUMS, #0
	  vmov.i64	PASS3_SUMS, #0

	// MESSAGE_LEN is kept biased by -64 while the main loop runs, so the
	// flags from each subs say whether another full 64 bytes remain.
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	blt		.Ltail
.Lfour_strides:
	// Four strides per iteration: the key registers rotate through every
	// role once and end up back in their starting assignment.
	_nh_stride	K0, K1, K2, K3
	_nh_stride	K1, K2, K3, K0
	_nh_stride	K2, K3, K0, K1
	_nh_stride	K3, K0, K1, K2
	subs		MESSAGE_LEN, MESSAGE_LEN, #64
	bge		.Lfour_strides

.Ltail:
	// The biased count is in [-64, -1] here; masking with 63 recovers the
	// leftover byte count, which is 0, 16, 32 or 48.
	ands		MESSAGE_LEN, MESSAGE_LEN, #63
	beq		.Lfinish
	_nh_stride	K0, K1, K2, K3

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Lfinish
	_nh_stride	K1, K2, K3, K0

	subs		MESSAGE_LEN, MESSAGE_LEN, #16
	beq		.Lfinish
	_nh_stride	K2, K3, K0, K1

.Lfinish:
	// Fold the two 64-bit lanes of each accumulator into a single sum,
	// then store the four sums to the hash buffer.
	vadd.i64	T0_L, PASS0_SUM_A, PASS0_SUM_B
	vadd.i64	T0_H, PASS1_SUM_A, PASS1_SUM_B
	vadd.i64	T1_L, PASS2_SUM_A, PASS2_SUM_B
	vadd.i64	T1_H, PASS3_SUM_A, PASS3_SUM_B
	vst1.8		{T0-T1}, [HASH]
	bx		lr
ENDPROC(nh_neon)