2 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
4 * Copyright (C) 2015 Martin Willi
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
12 #include <linux/linkage.h>
# ANMASK holds 0x3ffffff in each of its two 64-bit lanes, i.e. the low
# twenty-six bits set. It is used below to cut values down to one limb
# per lane.
# NOTE(review): ANMASK is used below as a memory operand of movdqa and
# pand, which need 16-byte alignment. The section and alignment
# directives are not visible in this excerpt - confirm.
17 ANMASK: .octa 0x0000000003ffffff0000000003ffffff
# ORMASK holds 1 << 24 in each of its two 64-bit lanes, matching the
# (1 << 24) term in the h44 and hc4 comments below. Its use is not
# visible in this excerpt.
18 ORMASK: .octa 0x00000000010000000000000001000000
51 ENTRY(poly1305_block_sse2)
# Single block Poly1305 step. Going by the step comments in this body,
# the 16 byte block is split into five 26-bit limbs and added to h, the
# sum is multiplied by r, and the products d0..d4 are carried back into
# the limbs h0..h4.
# NOTE(review): only the comments and a handful of instructions of this
# function are visible in this excerpt; notes below are limited to that.
52 # %rdi: Accumulator h[5]
53 # %rsi: 16 byte input block m
54 # %rdx: Poly1305 key r[5]
57 # This single block variant tries to improve performance by doing two
58 # multiplications in parallel using SSE instructions. There is quite
59 # some quadword packing involved, hence the speedup is marginal.
# Each of the four lea instructions computes eax + eax * 4 = eax * 5.
# Presumably they derive s1..s4 = r1..r4 * 5 (the loads and stores around
# them are not visible here - confirm). The factor 5 stems from the
# Poly1305 modulus (2^130 - 5): five 26-bit limbs hold 130 bits, and a
# partial product above that range wraps around multiplied by 5.
# NOTE(review): the 32-bit base/index registers need an address-size
# prefix in 64-bit code; (%rax,%rax,4) as the source would produce the
# same low 32 bits without it.
67 lea (%eax,%eax,4),%eax
70 lea (%eax,%eax,4),%eax
73 lea (%eax,%eax,4),%eax
76 lea (%eax,%eax,4),%eax
# Fetch the limb mask: ANMASK holds 0x3ffffff in both 64-bit lanes. The
# name mask is presumably a register alias defined outside this excerpt.
# NOTE(review): movdqa needs a 16-byte aligned memory operand; the
# alignment of ANMASK is not visible in this excerpt - confirm.
79 movdqa ANMASK(%rip),mask
# Accumulator limbs as packed operands. Going by the sums below, h01 and
# h23 pair two limbs each so that two products are formed per multiply.
82 # h01 = [0, h1, 0, h0]
83 # h23 = [0, h3, 0, h2]
84 # h44 = [0, h4, 0, h4]
# Add the message block. Limb k starts at bit 26 * k of the block, which
# is byte 3 bit 2, byte 6 bit 4, byte 9 bit 6 and byte 13 bit 0; hence
# the overlapping 4 byte reads and the shifts by 2, 4, 6 and 8.
94 # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
101 # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
# The (1 << 24) term sets bit 104 + 24 = 128 of the block, i.e. the 2^128
# pad bit that RFC 7539 adds to a full 16 byte block.
109 # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
# Limb products for d0 and d1. The sN operands are presumably rN * 5, as
# spelled out in the comments of poly1305_2block_sse2 (wrapped-around
# terms). The t3 terms used in the sums are not defined in this excerpt;
# presumably they hold the h4 products - confirm.
117 # t1[0] = h0 * r0 + h2 * s3
118 # t1[1] = h1 * s4 + h3 * s2
128 # t2[0] = h0 * r1 + h2 * s4
129 # t2[1] = h1 * r0 + h3 * s3
145 # d0 = t1[0] + t1[1] + t3[0]
146 # d1 = t2[0] + t2[1] + t3[1]
# Same scheme for d2 and d3.
156 # t1[0] = h0 * r2 + h2 * r0
157 # t1[1] = h1 * r1 + h3 * s4
167 # t2[0] = h0 * r3 + h2 * r1
168 # t2[1] = h1 * r2 + h3 * r0
184 # d2 = t1[0] + t1[1] + t3[0]
185 # d3 = t2[0] + t2[1] + t3[1]
# Top limb d4: the products whose limb indices sum to 4.
195 # t1[0] = h0 * r4 + h2 * r2
196 # t1[1] = h1 * r3 + h3 * r1
209 # d4 = t1[0] + t1[1] + t3[0]
# Carry propagation: every limb keeps its low 26 bits. The carries into
# the next limb are not visible in this excerpt.
220 # h0 = d0 & 0x3ffffff
228 # h1 = d1 & 0x3ffffff
237 # h2 = d2 & 0x3ffffff
246 # h3 = d3 & 0x3ffffff
# The carry out of d4 has weight 2^130 and is folded into h0 times 5; the
# lea forms that product as eax + eax * 4.
251 # h0 += (d4 >> 26) * 5
254 lea (%eax,%eax,4),%eax
256 # h4 = d4 & 0x3ffffff
265 # h0 = h0 & 0x3ffffff
277 ENDPROC(poly1305_block_sse2)
302 ENTRY(poly1305_2block_sse2)
# Two-block Poly1305 step using both the key r and u = r^2; see the
# formula in the comment below.
# NOTE(review): only the comments and a handful of instructions of this
# function are visible in this excerpt; notes below are limited to that.
303 # %rdi: Accumulator h[5]
304 # %rsi: Input m, two consecutive 16 byte blocks (m[0-31]) per double block
305 # %rdx: Poly1305 key r[5]
306 # %rcx: Doubleblock count
307 # %r8: Poly1305 derived key r^2 u[5]
309 # This two-block variant further improves performance by using loop
310 # unrolled block processing. This is more straightforward and does
311 # less byte shuffling, but requires a second Poly1305 key r^2:
312 # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
# Derivation: two single steps give ((h + m1) * r + m2) * r, which
# expands to (h + m1) * r^2 + m2 * r, so the two blocks can be multiplied
# side by side, one per 64-bit lane.
# The combine steps pair matching limbs of r and u, and of their
# multiples by five s and v, presumably into one packed operand per limb
# (the instructions are not visible in this excerpt).
323 # combine r1,u1 and s1=r1*5,v1=u1*5
331 # combine r2,u2 and s2=r2*5,v2=u2*5
339 # combine r3,u3 and s3=r3*5,v3=u3*5
347 # combine r4,u4 and s4=r4*5,v4=u4*5
# Message limbs. The left entry holds the limb of the second block
# m[16-31], the right entry holds h plus the limb of the first block.
# Each pand clears everything above bit 25 in both 64-bit lanes.
# NOTE(review): the notation h0 + m[0-3] & 0x3ffffff presumably means
# h0 + (m[0-3] & 0x3ffffff); the additions are not visible here - confirm.
# NOTE(review): pand with a memory operand needs ANMASK to be 16-byte
# aligned; its alignment directive is not visible in this excerpt.
356 # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
360 pand ANMASK(%rip),hc0
363 # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
368 pand ANMASK(%rip),hc1
371 # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
376 pand ANMASK(%rip),hc2
379 # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
384 pand ANMASK(%rip),hc3
# The (1<<24) term sets bit 104 + 24 = 128 of each block, i.e. the 2^128
# pad bit that RFC 7539 adds to a full 16 byte block.
387 # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
# Multiplication. In the t1 comments element [1] is multiplied by r or s
# and element [0] by u or v; per the formula above, [1] would then be the
# second block and [0] the sum of h and the first block.
# Each group of five products builds one result limb. The first group
# has limb indices summing to 0 mod 5 (d0), the next groups give d1..d4;
# the dN assignments themselves are not visible in this excerpt.
396 # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
399 # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
403 # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
407 # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
411 # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
421 # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
424 # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
428 # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
432 # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
436 # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
446 # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
449 # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
453 # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
457 # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
461 # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
471 # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
474 # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
478 # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
482 # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
486 # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
496 # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
499 # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
503 # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
507 # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
511 # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
# Carry propagation: every limb keeps its low 26 bits. The carries into
# the next limb are not visible in this excerpt.
525 # h0 = d0 & 0x3ffffff
533 # h1 = d1 & 0x3ffffff
542 # h2 = d2 & 0x3ffffff
551 # h3 = d3 & 0x3ffffff
# The carry out of d4 has weight 2^130 and is folded into h0 times 5; the
# lea forms that product as eax + eax * 4.
# NOTE(review): the 32-bit base/index registers need an address-size
# prefix in 64-bit code; (%rax,%rax,4) as the source would produce the
# same low 32 bits without it.
556 # h0 += (d4 >> 26) * 5
559 lea (%eax,%eax,4),%eax
561 # h4 = d4 & 0x3ffffff
570 # h0 = h0 & 0x3ffffff
582 ENDPROC(poly1305_2block_sse2)