/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The generated code of this file depends on the following RISC-V extensions:
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')

#include <linux/linkage.h>

.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS          v16     // LMUL=4 (most of the time)
#define TWEAKS_BREV     v20     // LMUL=4 (most of the time)
#define MULTS_BREV      v24     // LMUL=4 (most of the time)

// xts_init initializes the following values:
//
//        TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
//        TWEAKS_BREV: same as TWEAKS, but bit-reversed
//        MULTS_BREV: N 128-bit values x^N, bit-reversed. Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte. The zvkg extension provides the vgmul
// instruction which does multiplication in this field. Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring. Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
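//
// For reference, a minimal C model of the serial multiply-by-x that the
// vgmul-based approach replaces (illustrative only, not part of this file;
// the function name is made up):
//
//        // XTS convention: 16 bytes, bit 0 of byte 0 = coefficient of x^0.
//        static void xts_mul_x(unsigned char t[16])
//        {
//                unsigned int carry = 0;
//
//                for (int i = 0; i < 16; i++) {
//                        unsigned int next_carry = t[i] >> 7;
//
//                        t[i] = (t[i] << 1) | carry;
//                        carry = next_carry;
//                }
//                if (carry)
//                        t[0] ^= 0x87;  // reduce mod x^128 + x^7 + x^2 + x + 1
//        }
//
// vgmul instead multiplies in the GHASH bit order, where each byte of its
// inputs and outputs is the bit-reversal (vbrev8) of the representation above.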

        // Load the first tweak T.
        vsetivli        zero, 4, e32, m1, ta, ma
        vle32.v         TWEAKS, (TWEAKP)

        // If there's only one block (or no blocks at all), then skip the tweak
        // sequence computation because (at most) T itself is needed.
        li              t0, 16
        ble             LEN, t0, .Linit_single_block\@
        // Save a copy of T bit-reversed in v12.
        vbrev8.v        v12, TWEAKS

        //
        // Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
        // that N <= 128. Though, this code actually requires N < 64 (or
        // equivalently VLEN < 2048) due to the use of 64-bit intermediate
        // values here and in the x^N computation later.
        //
        vsetvli         VL, LEN32, e32, m4, ta, ma
        srli            t0, VL, 2       // t0 = N (num blocks)
        // Generate two sequences, each with N 32-bit values:
        // v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
        vsetvli         zero, t0, e32, m1, ta, ma
        vmv.v.i         v0, 1
        vid.v           v1
        // Use vzext to zero-extend the sequences to 64 bits. Reinterpret them
        // as two sequences, each with 2*N 32-bit values:
        // v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
        vsetvli         zero, t0, e64, m2, ta, ma
        vzext.vf2       v2, v0
        vzext.vf2       v4, v1
        slli            t1, t0, 1       // t1 = 2*N
        vsetvli         zero, t1, e32, m2, ta, ma
        // Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
        // widening to 64 bits per element. When reinterpreted as N 128-bit
        // values, this is the needed sequence of 128-bit values 1 << i (x^i).
        vwsll.vv        v8, v2, v4
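        // Worked example (assuming N = 3): the two e32 sequences above are
        //        v0 = [1, 1, 1]           and  v1 = [0, 1, 2],
        // which vzext turns into the e32 views
        //        v2 = [1, 0, 1, 0, 1, 0]  and  v4 = [0, 0, 1, 0, 2, 0],
        // so vwsll produces the 64-bit elements [1, 0, 2, 0, 4, 0], i.e. the
        // three 128-bit values 1, 2, 4 = x^0, x^1, x^2 when each (low, high)
        // pair of 64-bit elements is read as one little-endian 128-bit value.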

        // Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
        // multiply by x^i. This gives the sequence T*(x^i), bit-reversed.
        vsetvli         zero, LEN32, e32, m4, ta, ma
        vmv.v.i         TWEAKS_BREV, 0
        vaesz.vs        TWEAKS_BREV, v12
        vbrev8.v        v8, v8
        vgmul.vv        TWEAKS_BREV, v8

        // Save a copy of the sequence T*(x^i) with the bit reversal undone.
        vbrev8.v        TWEAKS, TWEAKS_BREV

        // Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
        li              t1, 1
        sll             t1, t1, t0      // t1 = 1 << N
        vsetivli        zero, 2, e64, m1, ta, ma
        vmv.v.i         v0, 0
        vsetivli        zero, 1, e64, m1, tu, ma
        vmv.v.x         v0, t1
        vbrev8.v        v0, v0
        vsetvli         zero, LEN32, e32, m4, ta, ma
        vmv.v.i         MULTS_BREV, 0
        vaesz.vs        MULTS_BREV, v0

.Linit_single_block\@:
        vbrev8.v        TWEAKS_BREV, TWEAKS

// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed. This is
// the multiplier required to advance the tweak by one.
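// (For example, in the XTS byte order x is 0x02 in the first byte, since only
// the x^1 coefficient is set; reversing the bits within that byte, as vbrev8
// does, gives 0x40.)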
        li              t0, 0x40
        vsetivli        zero, 4, e32, m1, ta, ma
        vmv.v.i         MULTS_BREV, 0
        vsetivli        zero, 1, e8, m1, tu, ma
        vmv.v.x         MULTS_BREV, t0

.macro  __aes_xts_crypt  enc, keylen
        // With 16 < len <= 31, there's no main loop, just ciphertext stealing.
        beqz            LEN32, .Lcts_without_main_loop\@

        vsetvli         VLMAX, zero, e32, m4, ta, ma
        vsetvli         VL, LEN32, e32, m4, ta, ma

        // Encrypt or decrypt VL/4 blocks.
        vxor.vv         TMP0, TMP0, TWEAKS
        aes_crypt       TMP0, \enc, \keylen
        vxor.vv         TMP0, TMP0, TWEAKS
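        // (The vxor.vv before and after aes_crypt implement the standard XTS
        // per-block formula C = E_K(P ^ T) ^ T, with the data blocks in TMP0
        // and the per-block tweaks T in TWEAKS; decryption uses the AES
        // decryption rounds in the middle instead.)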

        // Update the pointers and the remaining length.

        // Check whether more blocks remain.
        beqz            LEN32, .Lmain_loop_done\@

        // Compute the next sequence of tweaks by multiplying the previous
        // sequence by x^N. Store the result in both bit-reversed order and
        // regular order (i.e. with the bit reversal undone).
        vgmul.vv        TWEAKS_BREV, MULTS_BREV
        vbrev8.v        TWEAKS, TWEAKS_BREV

        // Since we compute the tweak multipliers x^N in advance, we require
        // that each iteration process the same length except possibly the last.
        // This conflicts slightly with the behavior allowed by the RISC-V
        // Vector Extension, where CPUs can select a lower length for both of
        // the last two iterations. E.g., vl might take the sequence of values
        // [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
        // can use x^4 again instead of computing x^3. Therefore, we explicitly
        // keep the vl at VLMAX if there is at least VLMAX remaining.
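        //
        // In C terms, the intended vl schedule is roughly the following (a
        // sketch only; "remaining", "len32" and "vlmax" stand in for the
        // LEN32 and VLMAX registers):
        //
        //        size_t vl;
        //
        //        for (size_t remaining = len32; remaining != 0; remaining -= vl) {
        //                vl = (remaining >= vlmax) ? vlmax : remaining;
        //                /* ... encrypt/decrypt vl/4 blocks, multiply tweaks by x^N ... */
        //        }
        //
        // so only the final iteration may run with a reduced length.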

        // Compute the next tweak.
        vsetivli        zero, 4, e32, m4, ta, ma
        vslidedown.vx   TWEAKS_BREV, TWEAKS_BREV, t0    // Extract last tweak
        vsetivli        zero, 4, e32, m1, ta, ma
        vgmul.vv        TWEAKS_BREV, MULTS_BREV         // Advance to next tweak

        bnez            TAIL_LEN, .Lcts\@

        // Update *TWEAKP to contain the next tweak.
        vbrev8.v        TWEAKS, TWEAKS_BREV
        vse32.v         TWEAKS, (TWEAKP)

.Lcts_without_main_loop\@:
        // TWEAKS_BREV now contains the next tweak. Compute the one after that.
        vsetivli        zero, 4, e32, m1, ta, ma
        vmv.v.v         TMP0, TWEAKS_BREV
        vgmul.vv        TMP0, MULTS_BREV
        // Undo the bit reversal of the next two tweaks and store them in TMP1
        // and TMP2, such that TMP1 is the first needed and TMP2 the second.
        // (Ciphertext stealing uses the two tweaks in opposite orders for
        // encryption and decryption, hence the conditional below.)
.if \enc
        vbrev8.v        TMP1, TWEAKS_BREV
        vbrev8.v        TMP2, TMP0
.else
        vbrev8.v        TMP1, TMP0
        vbrev8.v        TMP2, TWEAKS_BREV
.endif

        // Encrypt/decrypt the last full block.
        vxor.vv         TMP0, TMP0, TMP1
        aes_crypt       TMP0, \enc, \keylen
        vxor.vv         TMP0, TMP0, TMP1

        // Swap the first TAIL_LEN bytes of the above result with the tail.
        // Note that to support in-place encryption/decryption, the load from
        // the input tail must happen before the store to the output tail.
        vsetvli         zero, TAIL_LEN, e8, m1, tu, ma

        // Encrypt/decrypt again and store the last full block.
        vsetivli        zero, 4, e32, m1, ta, ma
        vxor.vv         TMP0, TMP0, TMP2
        aes_crypt       TMP0, \enc, \keylen
        vxor.vv         TMP0, TMP0, TMP2
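        //
        // A minimal C model of this ciphertext-stealing sequence for the
        // encryption case (a sketch only: xts_cts_encrypt and xts_block are
        // made-up names, and xts_block() stands for the xor / aes_crypt / xor
        // step used above):
        //
        //        void xts_cts_encrypt(u8 *dst, const u8 *src, size_t tail_len,
        //                             const u8 tweak1[16], const u8 tweak2[16])
        //        {
        //                u8 block[16], tail[16];
        //
        //                /* Encrypt the last full block with the first tweak. */
        //                xts_block(block, src, tweak1);
        //                /* Read the input tail before the output tail is
        //                   written, so that in-place operation works. */
        //                memcpy(tail, src + 16, tail_len);
        //                /* Steal the first tail_len bytes of the result as
        //                   the final partial ciphertext block. */
        //                memcpy(dst + 16, block, tail_len);
        //                /* Graft the plaintext tail in and encrypt again with
        //                   the second tweak. */
        //                memcpy(block, tail, tail_len);
        //                xts_block(dst, block, tweak2);
        //        }
        //
        // Decryption is the same except that the two tweaks are used in the
        // opposite order, which is why TMP1 and TMP2 are assigned differently
        // for encryption and decryption above.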

.macro  aes_xts_crypt  enc
        // Check whether the length is a multiple of the AES block size.
        andi            TAIL_LEN, LEN, 15

        // The length isn't a multiple of the AES block size, so ciphertext
        // stealing will be required. Ciphertext stealing involves special
        // handling of the partial block and the last full block, so subtract
        // the length of both from the length to be processed in the main loop.
        sub             LEN, LEN, TAIL_LEN
        addi            LEN, LEN, -16

        // LEN and LEN32 now contain the total length of the blocks that will be
        // processed in the main loop, in bytes and 32-bit words respectively.
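        // (For example, with a 100-byte message: TAIL_LEN = 100 & 15 = 4, the
        // main loop processes 100 - 4 - 16 = 80 bytes, and the remaining
        // 16 + 4 bytes are handled by the ciphertext stealing code.)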

        aes_begin       KEYP, 128f, 192f
        __aes_xts_crypt \enc, 256
128:
        __aes_xts_crypt \enc, 128
192:
        __aes_xts_crypt \enc, 192

// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
//                                       const u8 *in, u8 *out, size_t len,
//                                       u8 tweak[16]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call. If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
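//
// For illustration, a caller could process a long message in chunks like this
// (a sketch only: the 4096-byte chunk size is arbitrary, "step" is a made-up
// name, key/in/out/len/tweak are as in the prototype above, and the real
// callers live in the kernel's C glue code):
//
//        size_t step = 4096;        /* any multiple of 16 */
//
//        while (len >= step + 16) {
//                aes_xts_encrypt_zvkned_zvbb_zvkg(key, in, out, step, tweak);
//                in += step;
//                out += step;
//                len -= step;
//        }
//        /* Last call: len >= 16, but it need not be a multiple of 16. */
//        aes_xts_encrypt_zvkned_zvbb_zvkg(key, in, out, len, tweak);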

SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
        aes_xts_crypt   1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

// Same prototype and calling convention as the encryption function.
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
        aes_xts_crypt   0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)