ARM: dma-api: fix max_pfn off-by-one error in __dma_supported()
[linux/fpc-iii.git] / arch / arm64 / crypto / nh-neon-core.S
blob51c0a534ef87ccf1083f48051b1e617913479c46
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * NH - ε-almost-universal hash function, ARM64 NEON accelerated version
4  *
5  * Copyright 2018 Google LLC
6  *
7  * Author: Eric Biggers <ebiggers@google.com>
8  */
10 #include <linux/linkage.h>
// Register aliases used by _nh_stride and nh_neon().
//
// x0-x3 are named after the four arguments of nh_neon(), in prototype order
// (key, message, message_len, hash).
12         KEY             .req    x0
13         MESSAGE         .req    x1
14         MESSAGE_LEN     .req    x2
15         HASH            .req    x3
// One accumulator vector (two 64-bit lanes) per pass; nh_neon() zeroes them
// and _nh_stride adds products into them with umlal.
17         PASS0_SUMS      .req    v0
18         PASS1_SUMS      .req    v1
19         PASS2_SUMS      .req    v2
20         PASS3_SUMS      .req    v3
// Four 16-byte key strides. Successive _nh_stride invocations in nh_neon()
// rotate which of these aliases receives the newly loaded stride.
21         K0              .req    v4
22         K1              .req    v5
23         K2              .req    v6
24         K3              .req    v7
// Scratch vectors. In _nh_stride, T0-T3 hold the message+key sums and T4-T7
// receive copies of their upper halves; nh_neon() reuses T0/T1 for the final
// sums.
// NOTE(review): v8-v15 are callee-saved (low 64 bits) under AAPCS64 and are
// not saved or restored anywhere in the visible code -- presumably acceptable
// for the callers of nh_neon(); confirm.
25         T0              .req    v8
26         T1              .req    v9
27         T2              .req    v10
28         T3              .req    v11
29         T4              .req    v12
30         T5              .req    v13
31         T6              .req    v14
32         T7              .req    v15
// _nh_stride k0, k1, k2, k3
//
// Process one 16-byte message stride for all four passes.
//   \k0, \k1, \k2  vector registers that already hold key strides
//   \k3            vector register that receives the next 16 bytes read
//                  from KEY
// For pass i (0..3) the four 32-bit message words are added to the four
// 32-bit words of \ki. The sums in lanes 0 and 1 are then multiplied by the
// sums in lanes 2 and 3 respectively, and the two 64-bit products are added
// to the two lanes of PASSi_SUMS.
// Side effects: MESSAGE and KEY each advance by 16 bytes; \k3, T0-T3 and the
// low 64 bits of T4-T7 are overwritten.
34 .macro _nh_stride       k0, k1, k2, k3
36         // Load next message stride
37         ld1             {T3.16b}, [MESSAGE], #16
39         // Load next key stride
40         ld1             {\k3\().4s}, [KEY], #16
42         // Add message words to key words
           // T3 is the source of the message words for all four adds, so it
           // is only overwritten by the last one.
43         add             T0.4s, T3.4s, \k0\().4s
44         add             T1.4s, T3.4s, \k1\().4s
45         add             T2.4s, T3.4s, \k2\().4s
46         add             T3.4s, T3.4s, \k3\().4s
48         // Multiply 32x32 => 64 and accumulate
           // umlal with .2s sources reads only the low two 32-bit lanes, so
           // the upper 64 bits of each sum (lanes 2 and 3) are first copied
           // into the low 64 bits of a second register.
49         mov             T4.d[0], T0.d[1]
50         mov             T5.d[0], T1.d[1]
51         mov             T6.d[0], T2.d[1]
52         mov             T7.d[0], T3.d[1]
           // PASSi_SUMS[j] += sum_i[j] * sum_i[j + 2] for j = 0, 1 (unsigned).
53         umlal           PASS0_SUMS.2d, T0.2s, T4.2s
54         umlal           PASS1_SUMS.2d, T1.2s, T5.2s
55         umlal           PASS2_SUMS.2d, T2.2s, T6.2s
56         umlal           PASS3_SUMS.2d, T3.2s, T7.2s
57 .endm
59 /*
60  * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
61  *              u8 hash[NH_HASH_BYTES])
62  *
63  * It's guaranteed that message_len % 16 == 0.
64  */
// nh_neon: accumulate the four pass sums over the message and store them to
// the buffer addressed by HASH.
//
// Flow: preload three key strides and zero the accumulators, consume the
// message 64 bytes (four strides) per loop iteration, handle up to three
// remaining 16-byte strides, then fold each accumulator to a single 64-bit
// sum and store the four sums (32 bytes) to [HASH].
// Modifies x0-x2 (KEY, MESSAGE, MESSAGE_LEN), v0-v15 and the condition flags.
65 SYM_FUNC_START(nh_neon)
           // Load the first three key strides (48 bytes); the fourth is
           // loaded by the first _nh_stride. The indented movi instructions
           // zero the four accumulators in between the loads.
67         ld1             {K0.4s,K1.4s}, [KEY], #32
68           movi          PASS0_SUMS.2d, #0
69           movi          PASS1_SUMS.2d, #0
70         ld1             {K2.4s}, [KEY], #16
71           movi          PASS2_SUMS.2d, #0
72           movi          PASS3_SUMS.2d, #0
           // MESSAGE_LEN -= 64; a negative (signed) result means fewer than
           // 64 bytes are available, so the four-stride loop is skipped.
74         subs            MESSAGE_LEN, MESSAGE_LEN, #64
75         blt             .Lloop4_done
76 .Lloop4:
           // Each _nh_stride loads the next key stride into its last
           // argument, so the arguments rotate by one per stride; after four
           // strides the assignment is back to K0, K1, K2, K3.
77         _nh_stride      K0, K1, K2, K3
78         _nh_stride      K1, K2, K3, K0
79         _nh_stride      K2, K3, K0, K1
80         _nh_stride      K3, K0, K1, K2
           // Continue while at least 64 more bytes remain.
81         subs            MESSAGE_LEN, MESSAGE_LEN, #64
82         bge             .Lloop4
84 .Lloop4_done:
           // MESSAGE_LEN is now (bytes remaining - 64); masking with 63
           // recovers the bytes remaining, which is 0, 16, 32 or 48 given
           // the message_len % 16 == 0 guarantee. The exact-zero tests
           // below depend on that guarantee.
85         ands            MESSAGE_LEN, MESSAGE_LEN, #63
86         beq             .Ldone
87         _nh_stride      K0, K1, K2, K3
89         subs            MESSAGE_LEN, MESSAGE_LEN, #16
90         beq             .Ldone
91         _nh_stride      K1, K2, K3, K0
93         subs            MESSAGE_LEN, MESSAGE_LEN, #16
94         beq             .Ldone
           // Third and last possible remaining stride (48 bytes were left).
95         _nh_stride      K2, K3, K0, K1
97 .Ldone:
98         // Sum the accumulators for each pass, then store the sums to 'hash'
           // addp adds the two 64-bit lanes of each source register:
           // T0 = {pass 0 sum, pass 1 sum}, T1 = {pass 2 sum, pass 3 sum}.
99         addp            T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
100         addp            T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
101         st1             {T0.16b,T1.16b}, [HASH]
102         ret
103 SYM_FUNC_END(nh_neon)