/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
7 #include <linux/linkage.h>
/*
 * BLAKE2s initialization vector: the eight 32-bit words 0x6A09E667,
 * 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C,
 * 0x1F83D9AB, 0x5BE0CD19 (same constants as the SHA-256 IV), packed
 * little-endian into two 128-bit values.  Placed in a mergeable
 * 32-byte-entity rodata section so the linker can deduplicate it.
 */
9 .section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
11 IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
12 .octa 0x5BE0CD191F83D9AB9B05688C510E527F
/*
 * pshufb mask that rotates each 32-bit lane right by 16 bits: every
 * dword's bytes are reordered as [2,3,0,1] (read the .octa from its
 * least-significant byte upward).  Used for the G-function's ror 16.
 */
13 .section .rodata.cst16.ROT16, "aM", @progbits, 16
15 ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
/*
 * pshufb mask that rotates each 32-bit lane right by 8 bits: every
 * dword's bytes are reordered as [1,2,3,0].  Used for the G-function's
 * ror 8 (a byte shuffle is cheaper than shift+or on pre-AVX512 parts).
 */
16 .section .rodata.cst16.ROR328, "aM", @progbits, 16
18 ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
/*
 * BLAKE2s message-schedule permutation: one row of 16 byte indices per
 * round, 10 rounds (0xa0 = 160 bytes total; the SSSE3 code below ends
 * its round loop at SIGMA+0xa0).
 *
 * NOTE(review): these rows are NOT the textbook BLAKE2s sigma order
 * (round 0 would be 0,1,2,...,15); they appear pre-permuted to suit
 * the vectorized four-words-at-a-time gathering done by the SSSE3
 * path.  Confirm against the scalar reference before editing values.
 */
19 .section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
22 .byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
23 .byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
24 .byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
25 .byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
26 .byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
27 .byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
28 .byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
29 .byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
30 .byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
31 .byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
32 #ifdef CONFIG_AS_AVX512
/*
 * AVX-512 variant of the sigma table: the same 10 rounds of 16 message
 * indices, widened to 32-bit .long entries so each row can be used
 * directly as a vpermi2d index vector in the round loop below
 * (16 dwords * 4 bytes * 10 rounds = 640 bytes).
 *
 * NOTE(review): row contents differ from the byte SIGMA table above;
 * presumably permuted differently to suit the vpermi2d gather pattern.
 */
33 .section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
36 .long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
37 .long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
38 .long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
39 .long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
40 .long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
41 .long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
42 .long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
43 .long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
44 .long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
45 .long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
46 #endif /* CONFIG_AS_AVX512 */
/*
 * blake2s_compress_ssse3 -- SSSE3 BLAKE2s compression function.
 *
 * NOTE(review): this view of the file is incomplete (the embedded
 * numbering skips, e.g. 58->67 and 80->92), so register setup for
 * xmm0/xmm2, the loop branches, and several gather steps are not
 * visible; only what is shown is documented.  From the visible
 * addressing: rdi presumably points to the blake2s state (128-bit
 * loads/stores at offsets 0x10 and 0x20), rsi to the message block
 * (indexed as 32-bit words via sigma byte indices), and rcx walks the
 * SIGMA table (movzbl 0x8(%rcx) etc.) -- TODO confirm against the C
 * prototype of blake2s_compress_ssse3.
 */
49 SYM_FUNC_START(blake2s_compress_ssse3)
/* Load state words 4..7 and the t/f block at rdi+0x20. */
53 movdqu 0x10(%rdi),%xmm1
/* Keep the two ror32 byte-shuffle masks resident for the whole call. */
54 movdqa ROT16(%rip),%xmm12
55 movdqa ROR328(%rip),%xmm13
56 movdqu 0x20(%rdi),%xmm14
/* r8 = one-past-the-end of SIGMA (0xa0 = 10 rounds * 16 bytes),
 * presumably the round-loop termination sentinel. */
58 leaq SIGMA+0xa0(%rip),%r8
/* v[12..15] = IV[4..7] ^ (t/f words) per the BLAKE2s init step. */
67 pxor IV+0x10(%rip),%xmm3
/* Gather message words one dword at a time; rax holds a sigma index
 * loaded by instructions not visible in this chunk. */
71 movd (%rsi,%rax,4),%xmm4
73 movd (%rsi,%rax,4),%xmm5
75 movd (%rsi,%rax,4),%xmm6
77 movd (%rsi,%rax,4),%xmm7
/* Pack the gathered dwords into one 128-bit message operand. */
80 punpcklqdq %xmm6,%xmm4
92 movd (%rsi,%rax,4),%xmm5
94 movd (%rsi,%rax,4),%xmm6
96 movd (%rsi,%rax,4),%xmm7
98 movd (%rsi,%rax,4),%xmm4
100 punpckldq %xmm4,%xmm7
101 punpcklqdq %xmm7,%xmm5
/* Diagonalize: rotate the a/c/d rows so the second half-round mixes
 * the matrix diagonals (0x93/0x4e/0x39 = lane rotations by 1/2/3). */
112 pshufd $0x93,%xmm0,%xmm0
113 pshufd $0x4e,%xmm3,%xmm3
114 pshufd $0x39,%xmm2,%xmm2
/* Diagonal half-round gather: sigma bytes 8..11 select the words. */
115 movzbl 0x8(%rcx),%eax
116 movd (%rsi,%rax,4),%xmm6
117 movzbl 0x9(%rcx),%eax
118 movd (%rsi,%rax,4),%xmm7
119 movzbl 0xa(%rcx),%eax
120 movd (%rsi,%rax,4),%xmm4
121 movzbl 0xb(%rcx),%eax
122 movd (%rsi,%rax,4),%xmm5
123 punpckldq %xmm7,%xmm6
124 punpckldq %xmm5,%xmm4
125 punpcklqdq %xmm4,%xmm6
/* Second diagonal gather: sigma bytes 12..15. */
136 movzbl 0xc(%rcx),%eax
137 movd (%rsi,%rax,4),%xmm7
138 movzbl 0xd(%rcx),%eax
139 movd (%rsi,%rax,4),%xmm4
140 movzbl 0xe(%rcx),%eax
141 movd (%rsi,%rax,4),%xmm5
142 movzbl 0xf(%rcx),%eax
143 movd (%rsi,%rax,4),%xmm6
144 punpckldq %xmm4,%xmm7
145 punpckldq %xmm6,%xmm5
146 punpcklqdq %xmm5,%xmm7
/* Undiagonalize: inverse lane rotations restore column order. */
157 pshufd $0x39,%xmm0,%xmm0
158 pshufd $0x4e,%xmm3,%xmm3
159 pshufd $0x93,%xmm2,%xmm2
/* Write the updated state halves back (unaligned-safe stores). */
171 movdqu %xmm1,0x10(%rdi)
172 movdqu %xmm14,0x20(%rdi)
175 SYM_FUNC_END(blake2s_compress_ssse3)
177 #ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 -- AVX-512VL BLAKE2s compression function.
 *
 * NOTE(review): this chunk is missing lines (numbering skips, e.g.
 * 185->187 and 241->243), so the counter setup in xmm5, the loop
 * compare instructions feeding the two jne's, and the save of the
 * input state into xmm10/xmm11 are not visible; those roles are
 * inferred and should be confirmed against the full file.  Visible
 * layout: rdi = state (loads/stores at 0x10/0x20), rsi = 64-byte
 * message block (vmovdqu 0x20(%rsi),%ymm7), rax walks SIGMA2.
 */
178 SYM_FUNC_START(blake2s_compress_avx512)
/* Load state words 4..7 (xmm1) and the t/f block (xmm4). */
180 vmovdqu 0x10(%rdi),%xmm1
181 vmovdqu 0x20(%rdi),%xmm4
/* Keep both IV halves resident across all blocks. */
183 vmovdqa IV(%rip),%xmm14
184 vmovdqa IV+16(%rip),%xmm15
185 jmp .Lblake2s_compress_avx512_mainloop
187 .Lblake2s_compress_avx512_mainloop:
/* Advance the 64-bit message counter in xmm4; xmm5 presumably holds
 * the per-block increment, set by code not visible here. */
190 vpaddq %xmm5,%xmm4,%xmm4
/* v[12..15] = IV[4..7] ^ (t/f) per the BLAKE2s init step. */
192 vpxor %xmm15,%xmm4,%xmm3
/* Upper half of the 64-byte message block (lower half load is in a
 * line not shown in this chunk). */
194 vmovdqu 0x20(%rsi),%ymm7
/* rax = SIGMA2 cursor for the 10-round loop. */
196 leaq SIGMA2(%rip),%rax
198 .Lblake2s_compress_avx512_roundloop:
/* Use this round's two SIGMA2 index rows to gather all 16 message
 * words from ymm6/ymm7 in two vpermi2d shuffles. */
200 vmovdqa -0x40(%rax),%ymm8
201 vmovdqa -0x20(%rax),%ymm9
202 vpermi2d %ymm7,%ymm6,%ymm8
203 vpermi2d %ymm7,%ymm6,%ymm9
/* Column half-round: a += m; a += b; d = ror32(d ^ a, 16);
 * c += d; b = ror32(b ^ c, 12).  vprord replaces the SSSE3
 * shuffle-mask rotates. */
206 vpaddd %xmm8,%xmm0,%xmm0
207 vpaddd %xmm1,%xmm0,%xmm0
208 vpxor %xmm0,%xmm3,%xmm3
209 vprord $0x10,%xmm3,%xmm3
210 vpaddd %xmm3,%xmm2,%xmm2
211 vpxor %xmm2,%xmm1,%xmm1
212 vprord $0xc,%xmm1,%xmm1
/* High lane of the gathered words = second message operand. */
213 vextracti128 $0x1,%ymm8,%xmm8
214 vpaddd %xmm8,%xmm0,%xmm0
215 vpaddd %xmm1,%xmm0,%xmm0
216 vpxor %xmm0,%xmm3,%xmm3
217 vprord $0x8,%xmm3,%xmm3
218 vpaddd %xmm3,%xmm2,%xmm2
219 vpxor %xmm2,%xmm1,%xmm1
220 vprord $0x7,%xmm1,%xmm1
/* Diagonalize (lane rotations by 1/2/3). */
221 vpshufd $0x93,%xmm0,%xmm0
222 vpshufd $0x4e,%xmm3,%xmm3
223 vpshufd $0x39,%xmm2,%xmm2
/* Diagonal half-round, same G pattern with the ymm9 gathers. */
224 vpaddd %xmm9,%xmm0,%xmm0
225 vpaddd %xmm1,%xmm0,%xmm0
226 vpxor %xmm0,%xmm3,%xmm3
227 vprord $0x10,%xmm3,%xmm3
228 vpaddd %xmm3,%xmm2,%xmm2
229 vpxor %xmm2,%xmm1,%xmm1
230 vprord $0xc,%xmm1,%xmm1
231 vextracti128 $0x1,%ymm9,%xmm9
232 vpaddd %xmm9,%xmm0,%xmm0
233 vpaddd %xmm1,%xmm0,%xmm0
234 vpxor %xmm0,%xmm3,%xmm3
235 vprord $0x8,%xmm3,%xmm3
236 vpaddd %xmm3,%xmm2,%xmm2
237 vpxor %xmm2,%xmm1,%xmm1
238 vprord $0x7,%xmm1,%xmm1
/* Undiagonalize (inverse lane rotations). */
239 vpshufd $0x39,%xmm0,%xmm0
240 vpshufd $0x4e,%xmm3,%xmm3
241 vpshufd $0x93,%xmm2,%xmm2
/* Loop condition computed by an instruction not visible here
 * (presumably cmp %rax vs. end of SIGMA2). */
243 jne .Lblake2s_compress_avx512_roundloop
/* Feed-forward finalization: h ^= v[0..7] ^ v[8..15].  xmm10/xmm11
 * presumably hold the input h saved before the round loop (save not
 * visible in this chunk). */
244 vpxor %xmm10,%xmm0,%xmm0
245 vpxor %xmm11,%xmm1,%xmm1
246 vpxor %xmm2,%xmm0,%xmm0
247 vpxor %xmm3,%xmm1,%xmm1
249 jne .Lblake2s_compress_avx512_mainloop
/* Store the updated state; t/f in xmm4 persists across blocks. */
251 vmovdqu %xmm1,0x10(%rdi)
252 vmovdqu %xmm4,0x20(%rdi)
255 SYM_FUNC_END(blake2s_compress_avx512)
256 #endif /* CONFIG_AS_AVX512 */