2 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
4 * Copyright (C) 2015 Martin Willi
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
12 #include <linux/linkage.h>
14 .section .rodata.cst16.ANMASK, "aM", @progbits, 16
	# ANMASK: 0x3ffffff (26 one-bits) in each 64-bit lane. This is the
	# "& 0x3ffffff" limb mask referenced by the routines in this file.
16 ANMASK: .octa 0x0000000003ffffff0000000003ffffff
18 .section .rodata.cst16.ORMASK, "aM", @progbits, 16
	# ORMASK: 1 << 24 in each 64-bit lane. It matches the "| (1 << 24)"
	# term in the h44/hc4 comments of the routines in this file.
20 ORMASK: .octa 0x00000000010000000000000001000000
53 ENTRY(poly1305_block_sse2)
54 # %rdi: Accumulator h[5]
55 # %rsi: 16 byte input block m
56 # %rdx: Poly1305 key r[5]
	# Single-block step: per the comments below, m is added into h as five
	# 26-bit limbs, the sum is multiplied by r (s1..s4 standing in for
	# r1..r4 * 5), and the result d0..d4 is carried back into h0..h4.
59 # This single block variant tries to improve performance by doing two
60 # multiplications in parallel using SSE instructions. There is quite
61 # some quadword packing involved, hence the speedup is marginal.
	# eax = eax * 5 (eax + eax*4). There are four of these, presumably one
	# for each of s1..s4 = r1..r4 * 5 -- confirm against the loads and
	# stores surrounding each lea.
	# NOTE(review): 32-bit base/index registers in 64-bit mode need an
	# address-size prefix; (%rax,%rax,4),%eax would yield the same 32-bit
	# result without it.
69 lea (%eax,%eax,4),%eax
72 lea (%eax,%eax,4),%eax
75 lea (%eax,%eax,4),%eax
78 lea (%eax,%eax,4),%eax
	# mask = ANMASK, i.e. 0x3ffffff in each 64-bit lane.
81 movdqa ANMASK(%rip),mask
84 # h01 = [0, h1, 0, h0]
85 # h23 = [0, h3, 0, h2]
86 # h44 = [0, h4, 0, h4]
96 # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
103 # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
111 # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
119 # t1[0] = h0 * r0 + h2 * s3
120 # t1[1] = h1 * s4 + h3 * s2
130 # t2[0] = h0 * r1 + h2 * s4
131 # t2[1] = h1 * r0 + h3 * s3
147 # d0 = t1[0] + t1[1] + t3[0]
148 # d1 = t2[0] + t2[1] + t3[1]
158 # t1[0] = h0 * r2 + h2 * r0
159 # t1[1] = h1 * r1 + h3 * s4
169 # t2[0] = h0 * r3 + h2 * r1
170 # t2[1] = h1 * r2 + h3 * r0
186 # d2 = t1[0] + t1[1] + t3[0]
187 # d3 = t2[0] + t2[1] + t3[1]
197 # t1[0] = h0 * r4 + h2 * r2
198 # t1[1] = h1 * r3 + h3 * r1
211 # d4 = t1[0] + t1[1] + t3[0]
222 # h0 = d0 & 0x3ffffff
230 # h1 = d1 & 0x3ffffff
239 # h2 = d2 & 0x3ffffff
248 # h3 = d3 & 0x3ffffff
253 # h0 += (d4 >> 26) * 5
	# rax = rax * 5 using 64-bit registers: the "* 5" of the carry out of
	# d4 that wraps around into h0.
256 lea (%rax,%rax,4),%rax
258 # h4 = d4 & 0x3ffffff
267 # h0 = h0 & 0x3ffffff
279 ENDPROC(poly1305_block_sse2)
304 ENTRY(poly1305_2block_sse2)
305 # %rdi: Accumulator h[5]
306 # %rsi: Input m; each doubleblock covers 32 bytes, m[0-31] (see hc0..hc4)
307 # %rdx: Poly1305 key r[5]
308 # %rcx: Doubleblock count
309 # %r8: Poly1305 derived key r^2 u[5]
311 # This two-block variant further improves performance by using loop
312 # unrolled block processing. This is more straightforward and does
313 # less byte shuffling, but requires a second Poly1305 key r^2:
314 # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
	# In the lane comments below, s1..s4 = r1..r4 * 5 and v1..v4 = u1..u4 * 5.
325 # combine r1,u1 and s1=r1*5,v1=u1*5
333 # combine r2,u2 and s2=r2*5,v2=u2*5
341 # combine r3,u3 and s3=r3*5,v3=u3*5
349 # combine r4,u4 and s4=r4*5,v4=u4*5
358 # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
	# Keep the low 26 bits of each 64-bit lane (ANMASK = 0x3ffffff per lane).
362 pand ANMASK(%rip),hc0
365 # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
	# Same 26-bit lane mask, applied to hc1.
370 pand ANMASK(%rip),hc1
373 # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
	# Same 26-bit lane mask, applied to hc2.
378 pand ANMASK(%rip),hc2
381 # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
	# Same 26-bit lane mask, applied to hc3.
386 pand ANMASK(%rip),hc3
389 # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
398 # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
401 # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
405 # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
409 # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
413 # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
423 # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
426 # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
430 # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
434 # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
438 # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
448 # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
451 # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
455 # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
459 # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
463 # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
473 # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
476 # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
480 # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
484 # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
488 # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
498 # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
501 # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
505 # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
509 # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
513 # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
523 # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
524 # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
525 # amount. Careful: we must not assume the carry bits 'd0 >> 26',
526 # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
527 # integers. It's true in a single-block implementation, but not here.
533 # h0 = d0 & 0x3ffffff
541 # h1 = d1 & 0x3ffffff
550 # h2 = d2 & 0x3ffffff
559 # h3 = d3 & 0x3ffffff
564 # h0 += (d4 >> 26) * 5
	# rax = rax * 5 using 64-bit registers, since this carry may not fit
	# in 32 bits (see the reduction note above).
567 lea (%rax,%rax,4),%rax
569 # h4 = d4 & 0x3ffffff
578 # h0 = h0 & 0x3ffffff
590 ENDPROC(poly1305_2block_sse2)