Linux 4.13.16
[linux/fpc-iii.git] / arch / x86 / crypto / poly1305-sse2-x86_64.S
blobc88c670cb5fc6d4b6331ba18882fae34038400b4
1 /*
2  * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
3  *
4  * Copyright (C) 2015 Martin Willi
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  */
12 #include <linux/linkage.h>
/*
 * ANMASK: 0x3ffffff in the low 32 bits of each 64-bit half. ANDed (pand)
 * with packed message words to keep only the low 26 bits of each limb.
 */
14 .section        .rodata.cst16.ANMASK, "aM", @progbits, 16
15 .align 16
16 ANMASK: .octa 0x0000000003ffffff0000000003ffffff
/*
 * ORMASK: 0x01000000 (1 << 24) in the low 32 bits of each 64-bit half.
 * ORed (por) into the top limb of both blocks in poly1305_2block_sse2.
 */
18 .section        .rodata.cst16.ORMASK, "aM", @progbits, 16
19 .align 16
20 ORMASK: .octa 0x00000000010000000000000001000000
22 .text
/* Accumulator h[5]: five consecutive 32-bit words addressed through %rdi. */
24 #define h0 0x00(%rdi)
25 #define h1 0x04(%rdi)
26 #define h2 0x08(%rdi)
27 #define h3 0x0c(%rdi)
28 #define h4 0x10(%rdi)
/* Key r[5]: five consecutive 32-bit words addressed through %rdx. */
29 #define r0 0x00(%rdx)
30 #define r1 0x04(%rdx)
31 #define r2 0x08(%rdx)
32 #define r3 0x0c(%rdx)
33 #define r4 0x10(%rdx)
/*
 * Four 32-bit stack slots at %rsp; poly1305_block_sse2 reserves 16 bytes
 * and stores r1..r4 * 5 there.
 */
34 #define s1 0x00(%rsp)
35 #define s2 0x04(%rsp)
36 #define s3 0x08(%rsp)
37 #define s4 0x0c(%rsp)
/* Input pointer; advanced past each processed block. */
38 #define m %rsi
/* Packed accumulator limbs: h01 = [h1, h0], h23 = [h3, h2], h44 = [h4, h4]. */
39 #define h01 %xmm0
40 #define h23 %xmm1
41 #define h44 %xmm2
/* SSE temporaries. */
42 #define t1 %xmm3
43 #define t2 %xmm4
44 #define t3 %xmm5
45 #define t4 %xmm6
/* Holds ANMASK for the whole of poly1305_block_sse2. */
46 #define mask %xmm7
/*
 * 64-bit sums d0..d4 of the limb products. d4 lives in %r12, which the
 * functions push on entry and pop before returning.
 */
47 #define d0 %r8
48 #define d1 %r9
49 #define d2 %r10
50 #define d3 %r11
51 #define d4 %r12
53 ENTRY(poly1305_block_sse2)
54         # %rdi: Accumulator h[5]
55         # %rsi: 16 byte input block m
56         # %rdx: Poly1305 key r[5]
57         # %rcx: Block count
           #
           # For each of the %rcx blocks: h = (h + m) * r, partially reduced,
           # with h, r and m handled as five 26-bit limbs. h[] is updated in
           # place and %rsi advances by 16 bytes per block. The count is only
           # tested after a block has been processed (dec/jnz), so %rcx must
           # be non-zero on entry.
           #
           # Scratch: %rax, %r8-%r11, %xmm0-%xmm7. %rbx and %r12 are saved and
           # restored. 16 bytes of stack hold s1..s4.
59         # This single block variant tries to improve performance by doing two
60         # multiplications in parallel using SSE instructions. There is quite
61         # some quadword packing involved, hence the speedup is marginal.
63         push            %rbx
64         push            %r12
65         sub             $0x10,%rsp
67         # s1..s4 = r1..r4 * 5
           # (the 32-bit load zero-extends into %rax, so 64-bit addressing in
           # lea yields the same low 32 bits without an address-size prefix)
68         mov             r1,%eax
69         lea             (%rax,%rax,4),%eax
70         mov             %eax,s1
71         mov             r2,%eax
72         lea             (%rax,%rax,4),%eax
73         mov             %eax,s2
74         mov             r3,%eax
75         lea             (%rax,%rax,4),%eax
76         mov             %eax,s3
77         mov             r4,%eax
78         lea             (%rax,%rax,4),%eax
79         mov             %eax,s4
81         movdqa          ANMASK(%rip),mask
83 .Ldoblock:
84         # h01 = [0, h1, 0, h0]
85         # h23 = [0, h3, 0, h2]
86         # h44 = [0, h4, 0, h4]
87         movd            h0,h01
88         movd            h1,t1
89         movd            h2,h23
90         movd            h3,t2
91         movd            h4,h44
92         punpcklqdq      t1,h01
93         punpcklqdq      t2,h23
94         punpcklqdq      h44,h44
96         # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
97         movd            0x00(m),t1
98         movd            0x03(m),t2
99         psrld           $2,t2
100         punpcklqdq      t2,t1
101         pand            mask,t1
102         paddd           t1,h01
103         # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
104         movd            0x06(m),t1
105         movd            0x09(m),t2
106         psrld           $4,t1
107         psrld           $6,t2
108         punpcklqdq      t2,t1
109         pand            mask,t1
110         paddd           t1,h23
111         # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
112         mov             0x0c(m),%eax
113         shr             $8,%eax
114         or              $0x01000000,%eax
115         movd            %eax,t1
116         pshufd          $0xc4,t1,t1
117         paddd           t1,h44
119         # t1[0] = h0 * r0 + h2 * s3
120         # t1[1] = h1 * s4 + h3 * s2
121         movd            r0,t1
122         movd            s4,t2
123         punpcklqdq      t2,t1
124         pmuludq         h01,t1
125         movd            s3,t2
126         movd            s2,t3
127         punpcklqdq      t3,t2
128         pmuludq         h23,t2
129         paddq           t2,t1
130         # t2[0] = h0 * r1 + h2 * s4
131         # t2[1] = h1 * r0 + h3 * s3
132         movd            r1,t2
133         movd            r0,t3
134         punpcklqdq      t3,t2
135         pmuludq         h01,t2
136         movd            s4,t3
137         movd            s3,t4
138         punpcklqdq      t4,t3
139         pmuludq         h23,t3
140         paddq           t3,t2
141         # t3[0] = h4 * s1
142         # t3[1] = h4 * s2
143         movd            s1,t3
144         movd            s2,t4
145         punpcklqdq      t4,t3
146         pmuludq         h44,t3
147         # d0 = t1[0] + t1[1] + t3[0]
148         # d1 = t2[0] + t2[1] + t3[1]
149         movdqa          t1,t4
150         punpcklqdq      t2,t4
151         punpckhqdq      t2,t1
152         paddq           t4,t1
153         paddq           t3,t1
154         movq            t1,d0
155         psrldq          $8,t1
156         movq            t1,d1
158         # t1[0] = h0 * r2 + h2 * r0
159         # t1[1] = h1 * r1 + h3 * s4
160         movd            r2,t1
161         movd            r1,t2
162         punpcklqdq      t2,t1
163         pmuludq         h01,t1
164         movd            r0,t2
165         movd            s4,t3
166         punpcklqdq      t3,t2
167         pmuludq         h23,t2
168         paddq           t2,t1
169         # t2[0] = h0 * r3 + h2 * r1
170         # t2[1] = h1 * r2 + h3 * r0
171         movd            r3,t2
172         movd            r2,t3
173         punpcklqdq      t3,t2
174         pmuludq         h01,t2
175         movd            r1,t3
176         movd            r0,t4
177         punpcklqdq      t4,t3
178         pmuludq         h23,t3
179         paddq           t3,t2
180         # t3[0] = h4 * s3
181         # t3[1] = h4 * s4
182         movd            s3,t3
183         movd            s4,t4
184         punpcklqdq      t4,t3
185         pmuludq         h44,t3
186         # d2 = t1[0] + t1[1] + t3[0]
187         # d3 = t2[0] + t2[1] + t3[1]
188         movdqa          t1,t4
189         punpcklqdq      t2,t4
190         punpckhqdq      t2,t1
191         paddq           t4,t1
192         paddq           t3,t1
193         movq            t1,d2
194         psrldq          $8,t1
195         movq            t1,d3
197         # t1[0] = h0 * r4 + h2 * r2
198         # t1[1] = h1 * r3 + h3 * r1
199         movd            r4,t1
200         movd            r3,t2
201         punpcklqdq      t2,t1
202         pmuludq         h01,t1
203         movd            r2,t2
204         movd            r1,t3
205         punpcklqdq      t3,t2
206         pmuludq         h23,t2
207         paddq           t2,t1
208         # t3[0] = h4 * r0
209         movd            r0,t3
210         pmuludq         h44,t3
211         # d4 = t1[0] + t1[1] + t3[0]
212         movdqa          t1,t4
213         psrldq          $8,t4
214         paddq           t4,t1
215         paddq           t3,t1
216         movq            t1,d4
            # Partial reduction: propagate carries d0 -> d1 -> ... -> d4,
            # keeping 26 bits per limb. h0 is held in %rbx until the end.
218         # d1 += d0 >> 26
219         mov             d0,%rax
220         shr             $26,%rax
221         add             %rax,d1
222         # h0 = d0 & 0x3ffffff
223         mov             d0,%rbx
224         and             $0x3ffffff,%ebx
226         # d2 += d1 >> 26
227         mov             d1,%rax
228         shr             $26,%rax
229         add             %rax,d2
230         # h1 = d1 & 0x3ffffff
231         mov             d1,%rax
232         and             $0x3ffffff,%eax
233         mov             %eax,h1
235         # d3 += d2 >> 26
236         mov             d2,%rax
237         shr             $26,%rax
238         add             %rax,d3
239         # h2 = d2 & 0x3ffffff
240         mov             d2,%rax
241         and             $0x3ffffff,%eax
242         mov             %eax,h2
244         # d4 += d3 >> 26
245         mov             d3,%rax
246         shr             $26,%rax
247         add             %rax,d4
248         # h3 = d3 & 0x3ffffff
249         mov             d3,%rax
250         and             $0x3ffffff,%eax
251         mov             %eax,h3
253         # h0 += (d4 >> 26) * 5
            # (done in 64-bit registers, like the other carries, so the
            # product is never truncated to 32 bits)
254         mov             d4,%rax
255         shr             $26,%rax
256         lea             (%rax,%rax,4),%rax
257         add             %rax,%rbx
258         # h4 = d4 & 0x3ffffff
259         mov             d4,%rax
260         and             $0x3ffffff,%eax
261         mov             %eax,h4
263         # h1 += h0 >> 26
264         mov             %rbx,%rax
265         shr             $26,%rax
266         add             %eax,h1
267         # h0 = h0 & 0x3ffffff
268         andl            $0x3ffffff,%ebx
269         mov             %ebx,h0
271         add             $0x10,m
272         dec             %rcx
273         jnz             .Ldoblock
275         add             $0x10,%rsp
276         pop             %r12
277         pop             %rbx
278         ret
279 ENDPROC(poly1305_block_sse2)
/* Derived key u[5] (r^2): five consecutive 32-bit words addressed through %r8. */
282 #define u0 0x00(%r8)
283 #define u1 0x04(%r8)
284 #define u2 0x08(%r8)
285 #define u3 0x0c(%r8)
286 #define u4 0x10(%r8)
/*
 * Packed limbs of the two blocks handled per iteration. %xmm3 and %xmm4
 * are skipped because they are the t1/t2 temporaries.
 */
287 #define hc0 %xmm0
288 #define hc1 %xmm1
289 #define hc2 %xmm2
290 #define hc3 %xmm5
291 #define hc4 %xmm6
/* ruN = [ rN, uN ]: key limb of r packed with the matching limb of u. */
292 #define ru0 %xmm7
293 #define ru1 %xmm8
294 #define ru2 %xmm9
295 #define ru3 %xmm10
296 #define ru4 %xmm11
/* svN = ruN * 5, i.e. [ sN, vN ]. */
297 #define sv1 %xmm12
298 #define sv2 %xmm13
299 #define sv3 %xmm14
300 #define sv4 %xmm15
/* %r8 now carries the u[] pointer, so d0 moves to %r13. */
301 #undef d0
302 #define d0 %r13
304 ENTRY(poly1305_2block_sse2)
305         # %rdi: Accumulator h[5]
306         # %rsi: 16 byte input block m
307         # %rdx: Poly1305 key r[5]
308         # %rcx: Doubleblock count
309         # %r8:  Poly1305 derived key r^2 u[5]
            #
            # Each iteration consumes 32 bytes at %rsi (two 16 byte blocks)
            # and updates h[] in place. The count is only tested after a
            # doubleblock has been processed (dec/jnz), so %rcx must be
            # non-zero on entry.
            #
            # Scratch: %rax, %r9-%r11, %xmm0-%xmm15. %rbx, %r12 and %r13 are
            # saved and restored.
311         # This two-block variant further improves performance by using loop
312         # unrolled block processing. This is more straightforward and does
313         # less byte shuffling, but requires a second Poly1305 key r^2:
314         # h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r
316         push            %rbx
317         push            %r12
318         push            %r13
320         # combine r0,u0
321         movd            u0,ru0
322         movd            r0,t1
323         punpcklqdq      t1,ru0
325         # combine r1,u1 and s1=r1*5,v1=u1*5
326         movd            u1,ru1
327         movd            r1,t1
328         punpcklqdq      t1,ru1
329         movdqa          ru1,sv1
330         pslld           $2,sv1
331         paddd           ru1,sv1
333         # combine r2,u2 and s2=r2*5,v2=u2*5
334         movd            u2,ru2
335         movd            r2,t1
336         punpcklqdq      t1,ru2
337         movdqa          ru2,sv2
338         pslld           $2,sv2
339         paddd           ru2,sv2
341         # combine r3,u3 and s3=r3*5,v3=u3*5
342         movd            u3,ru3
343         movd            r3,t1
344         punpcklqdq      t1,ru3
345         movdqa          ru3,sv3
346         pslld           $2,sv3
347         paddd           ru3,sv3
349         # combine r4,u4 and s4=r4*5,v4=u4*5
350         movd            u4,ru4
351         movd            r4,t1
352         punpcklqdq      t1,ru4
353         movdqa          ru4,sv4
354         pslld           $2,sv4
355         paddd           ru4,sv4
357 .Ldoblock2:
            # Low lane: first block plus the accumulator (multiplied by u).
            # High lane: second block (multiplied by r).
358         # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
359         movd            0x00(m),hc0
360         movd            0x10(m),t1
361         punpcklqdq      t1,hc0
362         pand            ANMASK(%rip),hc0
363         movd            h0,t1
364         paddd           t1,hc0
365         # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
366         movd            0x03(m),hc1
367         movd            0x13(m),t1
368         punpcklqdq      t1,hc1
369         psrld           $2,hc1
370         pand            ANMASK(%rip),hc1
371         movd            h1,t1
372         paddd           t1,hc1
373         # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
374         movd            0x06(m),hc2
375         movd            0x16(m),t1
376         punpcklqdq      t1,hc2
377         psrld           $4,hc2
378         pand            ANMASK(%rip),hc2
379         movd            h2,t1
380         paddd           t1,hc2
381         # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
382         movd            0x09(m),hc3
383         movd            0x19(m),t1
384         punpcklqdq      t1,hc3
385         psrld           $6,hc3
386         pand            ANMASK(%rip),hc3
387         movd            h3,t1
388         paddd           t1,hc3
389         # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
390         movd            0x0c(m),hc4
391         movd            0x1c(m),t1
392         punpcklqdq      t1,hc4
393         psrld           $8,hc4
394         por             ORMASK(%rip),hc4
395         movd            h4,t1
396         paddd           t1,hc4
398         # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
399         movdqa          ru0,t1
400         pmuludq         hc0,t1
401         # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
402         movdqa          sv4,t2
403         pmuludq         hc1,t2
404         paddq           t2,t1
405         # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
406         movdqa          sv3,t2
407         pmuludq         hc2,t2
408         paddq           t2,t1
409         # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
410         movdqa          sv2,t2
411         pmuludq         hc3,t2
412         paddq           t2,t1
413         # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
414         movdqa          sv1,t2
415         pmuludq         hc4,t2
416         paddq           t2,t1
417         # d0 = t1[0] + t1[1]
418         movdqa          t1,t2
419         psrldq          $8,t2
420         paddq           t2,t1
421         movq            t1,d0
423         # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
424         movdqa          ru1,t1
425         pmuludq         hc0,t1
426         # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
427         movdqa          ru0,t2
428         pmuludq         hc1,t2
429         paddq           t2,t1
430         # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
431         movdqa          sv4,t2
432         pmuludq         hc2,t2
433         paddq           t2,t1
434         # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
435         movdqa          sv3,t2
436         pmuludq         hc3,t2
437         paddq           t2,t1
438         # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
439         movdqa          sv2,t2
440         pmuludq         hc4,t2
441         paddq           t2,t1
442         # d1 = t1[0] + t1[1]
443         movdqa          t1,t2
444         psrldq          $8,t2
445         paddq           t2,t1
446         movq            t1,d1
448         # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
449         movdqa          ru2,t1
450         pmuludq         hc0,t1
451         # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
452         movdqa          ru1,t2
453         pmuludq         hc1,t2
454         paddq           t2,t1
455         # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
456         movdqa          ru0,t2
457         pmuludq         hc2,t2
458         paddq           t2,t1
459         # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
460         movdqa          sv4,t2
461         pmuludq         hc3,t2
462         paddq           t2,t1
463         # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
464         movdqa          sv3,t2
465         pmuludq         hc4,t2
466         paddq           t2,t1
467         # d2 = t1[0] + t1[1]
468         movdqa          t1,t2
469         psrldq          $8,t2
470         paddq           t2,t1
471         movq            t1,d2
473         # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
474         movdqa          ru3,t1
475         pmuludq         hc0,t1
476         # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
477         movdqa          ru2,t2
478         pmuludq         hc1,t2
479         paddq           t2,t1
480         # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
481         movdqa          ru1,t2
482         pmuludq         hc2,t2
483         paddq           t2,t1
484         # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
485         movdqa          ru0,t2
486         pmuludq         hc3,t2
487         paddq           t2,t1
488         # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
489         movdqa          sv4,t2
490         pmuludq         hc4,t2
491         paddq           t2,t1
492         # d3 = t1[0] + t1[1]
493         movdqa          t1,t2
494         psrldq          $8,t2
495         paddq           t2,t1
496         movq            t1,d3
498         # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
499         movdqa          ru4,t1
500         pmuludq         hc0,t1
501         # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
502         movdqa          ru3,t2
503         pmuludq         hc1,t2
504         paddq           t2,t1
505         # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
506         movdqa          ru2,t2
507         pmuludq         hc2,t2
508         paddq           t2,t1
509         # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
510         movdqa          ru1,t2
511         pmuludq         hc3,t2
512         paddq           t2,t1
513         # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
514         movdqa          ru0,t2
515         pmuludq         hc4,t2
516         paddq           t2,t1
517         # d4 = t1[0] + t1[1]
518         movdqa          t1,t2
519         psrldq          $8,t2
520         paddq           t2,t1
521         movq            t1,d4
            # Partial reduction: propagate carries d0 -> d1 -> ... -> d4,
            # keeping 26 bits per limb. h0 is held in %rbx until the end.
523         # d1 += d0 >> 26
524         mov             d0,%rax
525         shr             $26,%rax
526         add             %rax,d1
527         # h0 = d0 & 0x3ffffff
528         mov             d0,%rbx
529         and             $0x3ffffff,%ebx
531         # d2 += d1 >> 26
532         mov             d1,%rax
533         shr             $26,%rax
534         add             %rax,d2
535         # h1 = d1 & 0x3ffffff
536         mov             d1,%rax
537         and             $0x3ffffff,%eax
538         mov             %eax,h1
540         # d3 += d2 >> 26
541         mov             d2,%rax
542         shr             $26,%rax
543         add             %rax,d3
544         # h2 = d2 & 0x3ffffff
545         mov             d2,%rax
546         and             $0x3ffffff,%eax
547         mov             %eax,h2
549         # d4 += d3 >> 26
550         mov             d3,%rax
551         shr             $26,%rax
552         add             %rax,d4
553         # h3 = d3 & 0x3ffffff
554         mov             d3,%rax
555         and             $0x3ffffff,%eax
556         mov             %eax,h3
558         # h0 += (d4 >> 26) * 5
            # (64-bit arithmetic: every d value sums products from two
            # blocks, so (d4 >> 26) * 5 may not fit in 32 bits)
559         mov             d4,%rax
560         shr             $26,%rax
561         lea             (%rax,%rax,4),%rax
562         add             %rax,%rbx
563         # h4 = d4 & 0x3ffffff
564         mov             d4,%rax
565         and             $0x3ffffff,%eax
566         mov             %eax,h4
568         # h1 += h0 >> 26
569         mov             %rbx,%rax
570         shr             $26,%rax
571         add             %eax,h1
572         # h0 = h0 & 0x3ffffff
573         andl            $0x3ffffff,%ebx
574         mov             %ebx,h0
576         add             $0x20,m
577         dec             %rcx
578         jnz             .Ldoblock2
580         pop             %r13
581         pop             %r12
582         pop             %rbx
583         ret
584 ENDPROC(poly1305_2block_sse2)