Linux 4.6-rc6
[linux/fpc-iii.git] / arch / x86 / crypto / poly1305-sse2-x86_64.S
blob338c748054ed2fefeb88935e1ed849e2445d2aba
1 /*
2  * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
3  *
4  * Copyright (C) 2015 Martin Willi
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  */
12 #include <linux/linkage.h>
/*
 * 128-bit constants shared by both routines in this file. They are only
 * ever used as load/AND/OR sources, so they are placed in read-only data.
 * The 16-byte alignment is required: they are accessed with movdqa and as
 * memory operands of legacy SSE instructions.
 */
.section        .rodata
.align 16

/* 26-bit limb mask in the low dword of each qword lane */
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
/* Bit 24 in the low dword of each qword lane: the 2^128 bit of the top limb */
ORMASK: .octa 0x00000000010000000000000001000000

.text
/* Accumulator limbs h[0..4]: five 32-bit words addressed through %rdi */
#define h0      0x00(%rdi)
#define h1      0x04(%rdi)
#define h2      0x08(%rdi)
#define h3      0x0c(%rdi)
#define h4      0x10(%rdi)

/* Key limbs r[0..4]: five 32-bit words addressed through %rdx */
#define r0      0x00(%rdx)
#define r1      0x04(%rdx)
#define r2      0x08(%rdx)
#define r3      0x0c(%rdx)
#define r4      0x10(%rdx)

/*
 * Stack scratch holding s1..s4 = r1..r4 * 5. Only meaningful while
 * poly1305_block_sse2 has its 16-byte frame allocated below %rsp.
 */
#define s1      0x00(%rsp)
#define s2      0x04(%rsp)
#define s3      0x08(%rsp)
#define s4      0x0c(%rsp)

/* Input pointer, advanced as blocks are consumed */
#define m       %rsi

/* Accumulator limbs packed two per register, one per qword lane */
#define h01     %xmm0
#define h23     %xmm1
#define h44     %xmm2

/* SIMD scratch registers */
#define t1      %xmm3
#define t2      %xmm4
#define t3      %xmm5
#define t4      %xmm6

/* Holds ANMASK for the duration of the single-block loop */
#define mask    %xmm7

/* 64-bit limb products d0..d4 before carry propagation */
#define d0      %r8
#define d1      %r9
#define d2      %r10
#define d3      %r11
#define d4      %r12
/*
 * poly1305_block_sse2 - absorb whole 16-byte blocks into a Poly1305 state
 *
 * For every block the routine computes h = (h + m + 2^128) * r, reduced
 * modulo 2^130 - 5, with h and r held as five 26-bit limbs stored in 32-bit
 * words. The stored result is only partially reduced: carries are pushed
 * through the limbs once, so h1 may end up slightly above 26 bits.
 *
 * In:       %rdi  accumulator h[5], updated in place
 *           %rsi  input, %rcx * 16 bytes
 *           %rdx  key r[5]
 *           %rcx  number of blocks; zero returns without touching h
 * Stack:    16 bytes of scratch for s1..s4, plus the saved %rbx and %r12
 * Clobbers: %rax, %rcx, %rsi, %r8-%r11, %xmm0-%xmm7, flags
 */
ENTRY(poly1305_block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Block count

        # This single block variant tries to improve performance by doing two
        # multiplications in parallel using SSE instructions. There is quite
        # some quadword packing involved, hence the speedup is marginal.

        # A zero block count must not reach the loop: dec/jnz would wrap
        # around and keep reading past the input.
        test            %rcx,%rcx
        jz              .Lblock_ret

        push            %rbx
        push            %r12
        sub             $0x10,%rsp

        # s1..s4 = r1..r4 * 5
        # The 32-bit loads zero-extend into %rax, so 64-bit addressing gives
        # the same product without an address-size override.
        mov             r1,%eax
        lea             (%rax,%rax,4),%eax
        mov             %eax,s1
        mov             r2,%eax
        lea             (%rax,%rax,4),%eax
        mov             %eax,s2
        mov             r3,%eax
        lea             (%rax,%rax,4),%eax
        mov             %eax,s3
        mov             r4,%eax
        lea             (%rax,%rax,4),%eax
        mov             %eax,s4

        movdqa          ANMASK(%rip),mask

.Ldoblock:
        # h01 = [0, h1, 0, h0]
        # h23 = [0, h3, 0, h2]
        # h44 = [0, h4, 0, h4]
        movd            h0,h01
        movd            h1,t1
        movd            h2,h23
        movd            h3,t2
        movd            h4,h44
        punpcklqdq      t1,h01
        punpcklqdq      t2,h23
        punpcklqdq      h44,h44

        # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
        movd            0x00(m),t1
        movd            0x03(m),t2
        psrld           $2,t2
        punpcklqdq      t2,t1
        pand            mask,t1
        paddd           t1,h01
        # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
        movd            0x06(m),t1
        movd            0x09(m),t2
        psrld           $4,t1
        psrld           $6,t2
        punpcklqdq      t2,t1
        pand            mask,t1
        paddd           t1,h23
        # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
        mov             0x0c(m),%eax
        shr             $8,%eax
        or              $0x01000000,%eax
        movd            %eax,t1
        pshufd          $0xc4,t1,t1
        paddd           t1,h44

        # t1[0] = h0 * r0 + h2 * s3
        # t1[1] = h1 * s4 + h3 * s2
        movd            r0,t1
        movd            s4,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            s3,t2
        movd            s2,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t2[0] = h0 * r1 + h2 * s4
        # t2[1] = h1 * r0 + h3 * s3
        movd            r1,t2
        movd            r0,t3
        punpcklqdq      t3,t2
        pmuludq         h01,t2
        movd            s4,t3
        movd            s3,t4
        punpcklqdq      t4,t3
        pmuludq         h23,t3
        paddq           t3,t2
        # t3[0] = h4 * s1
        # t3[1] = h4 * s2
        movd            s1,t3
        movd            s2,t4
        punpcklqdq      t4,t3
        pmuludq         h44,t3
        # d0 = t1[0] + t1[1] + t3[0]
        # d1 = t2[0] + t2[1] + t3[1]
        movdqa          t1,t4
        punpcklqdq      t2,t4
        punpckhqdq      t2,t1
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d0
        psrldq          $8,t1
        movq            t1,d1

        # t1[0] = h0 * r2 + h2 * r0
        # t1[1] = h1 * r1 + h3 * s4
        movd            r2,t1
        movd            r1,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            r0,t2
        movd            s4,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t2[0] = h0 * r3 + h2 * r1
        # t2[1] = h1 * r2 + h3 * r0
        movd            r3,t2
        movd            r2,t3
        punpcklqdq      t3,t2
        pmuludq         h01,t2
        movd            r1,t3
        movd            r0,t4
        punpcklqdq      t4,t3
        pmuludq         h23,t3
        paddq           t3,t2
        # t3[0] = h4 * s3
        # t3[1] = h4 * s4
        movd            s3,t3
        movd            s4,t4
        punpcklqdq      t4,t3
        pmuludq         h44,t3
        # d2 = t1[0] + t1[1] + t3[0]
        # d3 = t2[0] + t2[1] + t3[1]
        movdqa          t1,t4
        punpcklqdq      t2,t4
        punpckhqdq      t2,t1
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d2
        psrldq          $8,t1
        movq            t1,d3

        # t1[0] = h0 * r4 + h2 * r2
        # t1[1] = h1 * r3 + h3 * r1
        movd            r4,t1
        movd            r3,t2
        punpcklqdq      t2,t1
        pmuludq         h01,t1
        movd            r2,t2
        movd            r1,t3
        punpcklqdq      t3,t2
        pmuludq         h23,t2
        paddq           t2,t1
        # t3[0] = h4 * r0
        movd            r0,t3
        pmuludq         h44,t3
        # d4 = t1[0] + t1[1] + t3[0]
        movdqa          t1,t4
        psrldq          $8,t4
        paddq           t4,t1
        paddq           t3,t1
        movq            t1,d4

        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
        add             %rax,d1
        # h0 = d0 & 0x3ffffff
        # Kept in %rbx until the d4 carry has been folded back in; the
        # 32-bit and clears the upper half of %rbx.
        mov             d0,%rbx
        and             $0x3ffffff,%ebx

        # d2 += d1 >> 26
        mov             d1,%rax
        shr             $26,%rax
        add             %rax,d2
        # h1 = d1 & 0x3ffffff
        mov             d1,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h1

        # d3 += d2 >> 26
        mov             d2,%rax
        shr             $26,%rax
        add             %rax,d3
        # h2 = d2 & 0x3ffffff
        mov             d2,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h2

        # d4 += d3 >> 26
        mov             d3,%rax
        shr             $26,%rax
        add             %rax,d4
        # h3 = d3 & 0x3ffffff
        mov             d3,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h3

        # h0 += (d4 >> 26) * 5
        # Done in 64 bits like the other carries, so the result does not
        # rely on the carry times 5 fitting into 32 bits.
        mov             d4,%rax
        shr             $26,%rax
        lea             (%rax,%rax,4),%rax
        add             %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4

        # h1 += h0 >> 26
        mov             %rbx,%rax
        shr             $26,%rax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        andl            $0x3ffffff,%ebx
        mov             %ebx,h0

        add             $0x10,m
        dec             %rcx
        jnz             .Ldoblock

        add             $0x10,%rsp
        pop             %r12
        pop             %rbx
.Lblock_ret:
        ret
ENDPROC(poly1305_block_sse2)
/* Limbs u[0..4] of the derived key r^2: five 32-bit words addressed via %r8 */
#define u0      0x00(%r8)
#define u1      0x04(%r8)
#define u2      0x08(%r8)
#define u3      0x0c(%r8)
#define u4      0x10(%r8)

/*
 * Per-limb accumulators, one block per qword lane. %xmm3 and %xmm4 are
 * skipped because they remain the t1/t2 scratch registers.
 */
#define hc0     %xmm0
#define hc1     %xmm1
#define hc2     %xmm2
#define hc3     %xmm5
#define hc4     %xmm6

/* Key limbs paired per lane: [ r_i, u_i ] */
#define ru0     %xmm7
#define ru1     %xmm8
#define ru2     %xmm9
#define ru3     %xmm10
#define ru4     %xmm11

/* The paired key limbs multiplied by 5: [ s_i, v_i ] */
#define sv1     %xmm12
#define sv2     %xmm13
#define sv3     %xmm14
#define sv4     %xmm15

/* %r8 carries the u pointer in the two-block routine, so d0 moves to %r13 */
#undef d0
#define d0      %r13
/*
 * poly1305_2block_sse2 - absorb pairs of 16-byte blocks into a Poly1305 state
 *
 * Each iteration consumes 32 bytes (blocks m1 and m2) and computes
 *     h = (h + m1) * r^2 + m2 * r
 * reduced modulo 2^130 - 5, where both blocks carry the 2^128 pad bit. The
 * low qword lane of every vector handles (h + m1) * u with u = r^2, the high
 * lane handles m2 * r; the two lanes are summed per limb afterwards. All
 * values are five 26-bit limbs stored in 32-bit words, and the stored result
 * is only partially reduced.
 *
 * In:       %rdi  accumulator h[5], updated in place
 *           %rsi  input, %rcx * 32 bytes
 *           %rdx  key r[5]
 *           %rcx  number of double blocks; zero returns without touching h
 *           %r8   derived key u[5] = r^2
 * Clobbers: %rax, %rcx, %rsi, %r9-%r11, %xmm0-%xmm15, flags
 *           (%rbx, %r12 and %r13 are saved and restored)
 */
ENTRY(poly1305_2block_sse2)
        # %rdi: Accumulator h[5]
        # %rsi: 16 byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Doubleblock count
        # %r8:  Poly1305 derived key r^2 u[5]

        # This two-block variant further improves performance by using loop
        # unrolled block processing. This is more straight forward and does
        # less byte shuffling, but requires a second Poly1305 key r^2:
        # h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r

        # A zero count must not reach the loop: dec/jnz would wrap around
        # and keep reading past the input.
        test            %rcx,%rcx
        jz              .Lblock2_ret

        push            %rbx
        push            %r12
        push            %r13

        # combine r0,u0
        movd            u0,ru0
        movd            r0,t1
        punpcklqdq      t1,ru0

        # combine r1,u1 and s1=r1*5,v1=u1*5
        movd            u1,ru1
        movd            r1,t1
        punpcklqdq      t1,ru1
        movdqa          ru1,sv1
        pslld           $2,sv1
        paddd           ru1,sv1

        # combine r2,u2 and s2=r2*5,v2=u2*5
        movd            u2,ru2
        movd            r2,t1
        punpcklqdq      t1,ru2
        movdqa          ru2,sv2
        pslld           $2,sv2
        paddd           ru2,sv2

        # combine r3,u3 and s3=r3*5,v3=u3*5
        movd            u3,ru3
        movd            r3,t1
        punpcklqdq      t1,ru3
        movdqa          ru3,sv3
        pslld           $2,sv3
        paddd           ru3,sv3

        # combine r4,u4 and s4=r4*5,v4=u4*5
        movd            u4,ru4
        movd            r4,t1
        punpcklqdq      t1,ru4
        movdqa          ru4,sv4
        pslld           $2,sv4
        paddd           ru4,sv4

.Ldoblock2:
        # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
        movd            0x00(m),hc0
        movd            0x10(m),t1
        punpcklqdq      t1,hc0
        pand            ANMASK(%rip),hc0
        movd            h0,t1
        paddd           t1,hc0
        # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
        movd            0x03(m),hc1
        movd            0x13(m),t1
        punpcklqdq      t1,hc1
        psrld           $2,hc1
        pand            ANMASK(%rip),hc1
        movd            h1,t1
        paddd           t1,hc1
        # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
        movd            0x06(m),hc2
        movd            0x16(m),t1
        punpcklqdq      t1,hc2
        psrld           $4,hc2
        pand            ANMASK(%rip),hc2
        movd            h2,t1
        paddd           t1,hc2
        # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
        movd            0x09(m),hc3
        movd            0x19(m),t1
        punpcklqdq      t1,hc3
        psrld           $6,hc3
        pand            ANMASK(%rip),hc3
        movd            h3,t1
        paddd           t1,hc3
        # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
        movd            0x0c(m),hc4
        movd            0x1c(m),t1
        punpcklqdq      t1,hc4
        psrld           $8,hc4
        por             ORMASK(%rip),hc4
        movd            h4,t1
        paddd           t1,hc4

        # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
        movdqa          ru0,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
        movdqa          sv2,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
        movdqa          sv1,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d0 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d0

        # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
        movdqa          ru1,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
        movdqa          sv2,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d1 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d1

        # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
        movdqa          ru2,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
        movdqa          sv3,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d2 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d2

        # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
        movdqa          ru3,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
        movdqa          ru2,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
        movdqa          sv4,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d3 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d3

        # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
        movdqa          ru4,t1
        pmuludq         hc0,t1
        # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
        movdqa          ru3,t2
        pmuludq         hc1,t2
        paddq           t2,t1
        # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
        movdqa          ru2,t2
        pmuludq         hc2,t2
        paddq           t2,t1
        # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
        movdqa          ru1,t2
        pmuludq         hc3,t2
        paddq           t2,t1
        # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
        movdqa          ru0,t2
        pmuludq         hc4,t2
        paddq           t2,t1
        # d4 = t1[0] + t1[1]
        movdqa          t1,t2
        psrldq          $8,t2
        paddq           t2,t1
        movq            t1,d4

        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
        add             %rax,d1
        # h0 = d0 & 0x3ffffff
        # Kept in %rbx until the d4 carry has been folded back in; the
        # 32-bit and clears the upper half of %rbx.
        mov             d0,%rbx
        and             $0x3ffffff,%ebx

        # d2 += d1 >> 26
        mov             d1,%rax
        shr             $26,%rax
        add             %rax,d2
        # h1 = d1 & 0x3ffffff
        mov             d1,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h1

        # d3 += d2 >> 26
        mov             d2,%rax
        shr             $26,%rax
        add             %rax,d3
        # h2 = d2 & 0x3ffffff
        mov             d2,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h2

        # d4 += d3 >> 26
        mov             d3,%rax
        shr             $26,%rax
        add             %rax,d4
        # h3 = d3 & 0x3ffffff
        mov             d3,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h3

        # h0 += (d4 >> 26) * 5
        # Every d limb sums the products of two blocks here, so the carry
        # out of d4 times 5 may not fit into 32 bits. Do the multiply and
        # the add in 64 bits, like the other carries, to avoid truncation.
        mov             d4,%rax
        shr             $26,%rax
        lea             (%rax,%rax,4),%rax
        add             %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4

        # h1 += h0 >> 26
        mov             %rbx,%rax
        shr             $26,%rax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        andl            $0x3ffffff,%ebx
        mov             %ebx,h0

        add             $0x20,m
        dec             %rcx
        jnz             .Ldoblock2

        pop             %r13
        pop             %r12
        pop             %rbx
.Lblock2_ret:
        ret
ENDPROC(poly1305_2block_sse2)