/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>
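
/*
 * Constant data shared by the implementations below.  IV holds the eight
 * 32-bit BLAKE2s initialization words (the same constants as the SHA-256
 * initial hash values), stored as two 128-bit .octa values covering
 * IV[0..3] and IV[4..7].
 */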
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
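
/*
 * ROT16 is a pshufb byte-shuffle mask: applied to a register of 32-bit
 * words it rotates each word right by 16 bits (swaps the two halfwords).
 */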
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
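
/*
 * ROR328 is the corresponding pshufb mask for rotating each 32-bit word
 * right by 8 bits.
 */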
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
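
/*
 * SIGMA is the BLAKE2s message-word schedule, one 16-byte row per round
 * for the ten rounds.  The rows are not in the textbook sigma order: they
 * are pre-permuted to match the order in which the vectorized round below
 * gathers message words.
 */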
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
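
/*
 * SIGMA2 is the message schedule used by the AVX-512 path, expressed as
 * 32-bit lane indices (16 .long values per round, ten rounds) so that a
 * whole round's worth of message words can be selected with vpermi2d
 * instead of scalar gather loads.  Like SIGMA above, the rows are
 * pre-arranged for the vectorized word order rather than the textbook
 * sigma order.
 */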
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */

#ifdef CONFIG_AS_SSSE3
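/*
 * void blake2s_compress_ssse3(struct blake2s_state *state, const u8 *block,
 *				size_t nblocks, u32 inc);
 *
 * %rdi = state (h[8] at offset 0x00, t[2] at 0x20, f[2] at 0x28),
 * %rsi = message block(s), %rdx = number of blocks, %rcx = counter increment.
 * Rotations by 16 and 8 use the pshufb masks above; rotations by 12 and 7
 * are done with shift/or pairs.
 */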
SYM_FUNC_START(blake2s_compress_ssse3)
	movdqu 0x10(%rdi),%xmm1
	movdqa ROT16(%rip),%xmm12
	movdqa ROR328(%rip),%xmm13
	movdqu 0x20(%rdi),%xmm14
	leaq SIGMA+0xa0(%rip),%r8
	pxor IV+0x10(%rip),%xmm3
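	/*
	 * At this point xmm3 holds v[12..15] = IV[4..7] ^ { t0, t1, f0, f1 },
	 * and %r8 points just past the ten SIGMA rows, serving as the round
	 * loop's end marker.  The movzbl/movd pairs below gather four message
	 * words at a time, indexed by the current SIGMA row, and the punpck*
	 * instructions pack them into a single XMM register.
	 */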
	movd (%rsi,%rax,4),%xmm4
	movd (%rsi,%rax,4),%xmm5
	movd (%rsi,%rax,4),%xmm6
	movd (%rsi,%rax,4),%xmm7
	punpcklqdq %xmm6,%xmm4
	movd (%rsi,%rax,4),%xmm5
	movd (%rsi,%rax,4),%xmm6
	movd (%rsi,%rax,4),%xmm7
	movd (%rsi,%rax,4),%xmm4
	punpckldq %xmm6,%xmm5
	punpckldq %xmm4,%xmm7
	punpcklqdq %xmm7,%xmm5
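	/*
	 * First half of the round done: rotate the rows of the state so that
	 * the same four-way G computation now operates on the diagonals.
	 */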
	pshufd $0x93,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x39,%xmm2,%xmm2
	movzbl 0x8(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x9(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xa(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xb(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	punpckldq %xmm7,%xmm6
	punpckldq %xmm5,%xmm4
	punpcklqdq %xmm4,%xmm6
	movzbl 0xc(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xd(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xe(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0xf(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	punpckldq %xmm4,%xmm7
	punpckldq %xmm6,%xmm5
	punpcklqdq %xmm5,%xmm7
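	/*
	 * Second (diagonal) half of the round done: shuffle the rows back to
	 * their column order before the next round.
	 */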
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x93,%xmm2,%xmm2
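	/*
	 * All blocks processed: store the updated hash words and the block
	 * counter back into the state structure.
	 */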
	movdqu %xmm1,0x10(%rdi)
	movdqu %xmm14,0x20(%rdi)
SYM_FUNC_END(blake2s_compress_ssse3)
#endif /* CONFIG_AS_SSSE3 */

#ifdef CONFIG_AS_AVX512
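/*
 * void blake2s_compress_avx512(struct blake2s_state *state, const u8 *block,
 *				 size_t nblocks, u32 inc);
 *
 * Same interface as the SSSE3 version.  This path loads each 64-byte message
 * block into two YMM registers, applies the SIGMA2 schedule with vpermi2d,
 * and uses vprord for all four G-function rotation amounts (16, 12, 8, 7).
 */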
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu 0x10(%rdi),%xmm1
	vmovdqu 0x20(%rdi),%xmm4
	vmovdqa IV(%rip),%xmm14
	vmovdqa IV+16(%rip),%xmm15
	jmp .Lblake2s_compress_avx512_mainloop
.Lblake2s_compress_avx512_mainloop:
	vpaddq %xmm5,%xmm4,%xmm4
	vpxor %xmm15,%xmm4,%xmm3
	vmovdqu 0x20(%rsi),%ymm7
	leaq SIGMA2(%rip),%rax
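	/*
	 * %rax walks the SIGMA2 table, consuming two 16-dword permutation
	 * vectors (64 bytes) per round, for the ten BLAKE2s rounds.
	 */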
.Lblake2s_compress_avx512_roundloop:
	vmovdqa -0x40(%rax),%ymm8
	vmovdqa -0x20(%rax),%ymm9
	vpermi2d %ymm7,%ymm6,%ymm8
	vpermi2d %ymm7,%ymm6,%ymm9
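	/*
	 * ymm8 and ymm9 now hold this round's message words: ymm8 feeds the
	 * column step and ymm9 the diagonal step, with the low 128-bit lane
	 * used for the first message addition of each step and the high lane
	 * (extracted below) for the second.
	 */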
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1
	vextracti128 $0x1,%ymm8,%xmm8
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1
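	/* Column step done: rotate the rows to operate on the diagonals. */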
	vpshufd $0x93,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x39,%xmm2,%xmm2
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1
	vextracti128 $0x1,%ymm9,%xmm9
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1
	vpshufd $0x39,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x93,%xmm2,%xmm2
	jne .Lblake2s_compress_avx512_roundloop
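	/*
	 * Feed-forward: xmm10/xmm11 hold the h words saved at the top of the
	 * main loop, so these four xors compute h[i] ^= v[i] ^ v[i + 8].
	 */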
	vpxor %xmm10,%xmm0,%xmm0
	vpxor %xmm11,%xmm1,%xmm1
	vpxor %xmm2,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	jne .Lblake2s_compress_avx512_mainloop
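	/*
	 * All blocks processed: write the new chaining value and the updated
	 * block counter back to the state structure.
	 */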
	vmovdqu %xmm1,0x10(%rdi)
	vmovdqu %xmm4,0x20(%rdi)
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */