mm-only debug patch...
[mmotm.git] / arch / x86 / crypto / salsa20-x86_64-asm_64.S
blob6214a9b09706c86f5894fff26061687d3684de17
# enter ECRYPT_encrypt_bytes
.text
.p2align 5
.globl ECRYPT_encrypt_bytes
# ECRYPT_encrypt_bytes(x, m, out, bytes) - function entry and frame setup.
# Arguments as they arrive:
#   rdi = x      state block (arg1)
#   rsi = m      input bytes (arg2)
#   rdx = out    output bytes (arg3)
#   rcx = bytes  byte count (arg4)
# Leaves through ._done straight away when bytes is zero.
# NOTE(review): all 64 bits of rcx are used as the count.  Should the C
# prototype declare bytes as a 32-bit type, the upper half of rcx is not
# guaranteed to be zero - confirm against the callers.
ECRYPT_encrypt_bytes:
        # Carve out a 32-byte-aligned frame of 256 + (rsp mod 32) bytes.
        # r11 ends up holding that byte count; ._done adds it back to rsp.
        mov     %rsp,%r11
        and     $-32,%rsp
        sub     $256,%rsp
        sub     %rsp,%r11
        # Move the arguments into their working registers.  The order
        # matters: each source is read before it gets overwritten.  m is
        # already in rsi and stays there.
        mov     %rdi,%r8                # x
        mov     %rdx,%rdi               # out
        mov     %rcx,%rdx               # bytes
        # An empty request has nothing to encrypt.
        test    %rdx,%rdx
        jz      ._done
        # comment:fp stack unchanged by fallthrough
# start:
._start:
        # Spill the frame size (r11) and the callee-saved registers used by
        # this routine; ._bytesatleast64 reloads them from the same slots.
        mov     %r11,0(%rsp)
        mov     %r12,8(%rsp)
        mov     %r13,16(%rsp)
        mov     %r14,24(%rsp)
        mov     %r15,32(%rsp)
        mov     %rbx,40(%rsp)
        mov     %rbp,48(%rsp)
        # Snapshot the 64 bytes at x into the frame as eight 64-bit words:
        # slot jN at 56 + 4*N(%rsp) holds state words N and N+1.
        mov     0(%r8),%rcx             # in0
        mov     %rcx,56(%rsp)           # j0
        mov     8(%r8),%r9              # in2
        mov     %r9,64(%rsp)            # j2
        mov     16(%r8),%rax            # in4
        mov     %rax,72(%rsp)           # j4
        mov     24(%r8),%r10            # in6
        mov     %r10,80(%rsp)           # j6
        mov     32(%r8),%r11            # in8
        mov     %r11,88(%rsp)           # j8
        mov     40(%r8),%r12            # in10
        mov     %r12,96(%rsp)           # j10
        mov     48(%r8),%r13            # in12
        mov     %r13,104(%rsp)          # j12
        mov     56(%r8),%r14            # in14
        mov     %r14,112(%rsp)          # j14
        # Keep the state pointer itself; r8 is reused as a working word.
        mov     %r8,120(%rsp)           # x_backup
# bytesatleast1:
# Top of the per-block loop: rdx = bytes left, rsi = m, rdi = out.
._bytesatleast1:
        # A full 64-byte block is processed in place.
        cmp     $64,%rdx
        jae     ._nocopy
        # Short final block: remember the real destination (ctarget), copy
        # the remaining input into the scratch block at 192(%rsp), and make
        # both m and out point at that scratch block.
        mov     %rdi,128(%rsp)          # ctarget = out
        lea     192(%rsp),%rdi          # out = &tmp
        mov     %rdx,%rcx               # i = bytes
        rep     movsb                   # while (i) { *out++ = *m++; --i }
        lea     192(%rsp),%rsi          # m = &tmp
        mov     %rsi,%rdi               # out = &tmp
        # comment:fp stack unchanged by fallthrough
#   nocopy:
# Save the block pointers and byte count, then unpack the saved state
# words j0..j14 into the sixteen working words x0..x15.
._nocopy:
        mov     %rdi,136(%rsp)          # out_backup
        mov     %rsi,144(%rsp)          # m_backup
        mov     %rdx,152(%rsp)          # bytes_backup
        # Even words take the whole 64-bit slot (the upper half is carried
        # along unchanged).  Odd words are read from the upper four bytes
        # of the slot, which zero-extends them into the full register.
        movq    56(%rsp),%rdx           # x0 = j0
        movl    60(%rsp),%edi           # x1 = j0 >> 32
        movq    64(%rsp),%rcx           # x2 = j2
        movl    68(%rsp),%esi           # x3 = j2 >> 32
        movq    72(%rsp),%r9            # x4 = j4
        movl    76(%rsp),%r8d           # x5 = j4 >> 32 ...
        movq    %r8,160(%rsp)           # ... kept in its stack slot
        movq    80(%rsp),%rax           # x6 = j6
        movl    84(%rsp),%r8d           # x7 = j6 >> 32
        movq    88(%rsp),%r11           # x8 = j8
        movl    92(%rsp),%r10d          # x9 = j8 >> 32
        movq    96(%rsp),%r13           # x10 = j10 ...
        movq    %r13,168(%rsp)          # ... kept in its stack slot
        movl    100(%rsp),%r12d         # x11 = j10 >> 32
        movq    104(%rsp),%r14          # x12 = j12
        movl    108(%rsp),%r13d         # x13 = j12 >> 32
        movq    112(%rsp),%rbx          # x14 = j14
        movl    116(%rsp),%r15d         # x15 = j14 >> 32 ...
        movq    %r15,176(%rsp)          # ... kept in its stack slot
        # Round counter; the loop below subtracts 4 per pass.
        mov     $20,%r15d               # i = 20
#   mainloop:
# Homes of the sixteen working words while the loop runs:
#   x0=rdx   x1=rdi   x2=rcx          x3=rsi
#   x4=r9    x5=160(%rsp)   x6=rax    x7=r8
#   x8=r11   x9=r10   x10=168(%rsp)   x11=r12
#   x12=r14  x13=r13  x14=rbx         x15=176(%rsp)
# r15 and rbp serve in turn as scratch and as the temporary register
# home of x5, x10 and x15.  The round counter i lives at 184(%rsp).

# One step: dst ^= (s1 + s2) <<< rot, rotating within 32 bits.  The 32-bit
# rol clears the upper half of the temporary, so the 64-bit xor can only
# change the low 32 bits of dst.
.macro salsa20_step s1, s2, dst, rot, t64, t32
        lea     (\s1,\s2),\t64
        rol     $\rot,\t32
        xor     \t64,\dst
.endm

# One quarter-round over the words (y0, y1, y2, y3).
.macro salsa20_qr y0, y1, y2, y3, t64, t32
        salsa20_step \y3, \y0, \y1, 7, \t64, \t32
        salsa20_step \y0, \y1, \y2, 9, \t64, \t32
        salsa20_step \y1, \y2, \y3, 13, \t64, \t32
        salsa20_step \y2, \y3, \y0, 18, \t64, \t32
.endm

# Two independent quarter-rounds, interleaved step by step, sharing rbp
# as the temporary.
.macro salsa20_qr2 a0, a1, a2, a3, b0, b1, b2, b3
        salsa20_step \a3, \a0, \a1, 7, %rbp, %ebp
        salsa20_step \b3, \b0, \b1, 7, %rbp, %ebp
        salsa20_step \a0, \a1, \a2, 9, %rbp, %ebp
        salsa20_step \b0, \b1, \b2, 9, %rbp, %ebp
        salsa20_step \a1, \a2, \a3, 13, %rbp, %ebp
        salsa20_step \b1, \b2, \b3, 13, %rbp, %ebp
        salsa20_step \a2, \a3, \a0, 18, %rbp, %ebp
        salsa20_step \b2, \b3, \b0, 18, %rbp, %ebp
.endm

# One full round of four quarter-rounds.  The first word of the second,
# third and fourth quarter-round is always x5, x10 and x15 respectively,
# fetched from and returned to its stack slot around its quarter-round.
.macro salsa20_round a0, a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3
        movq    160(%rsp),%r15
        salsa20_qr2 \a0, \a1, \a2, \a3, %r15, \b1, \b2, \b3
        movq    168(%rsp),%rbp
        movq    %r15,160(%rsp)
        salsa20_qr %rbp, \c1, \c2, \c3, %r15, %r15d
        movq    176(%rsp),%r15
        movq    %rbp,168(%rsp)
        salsa20_qr %r15, \d1, \d2, \d3, %rbp, %ebp
        movq    %r15,176(%rsp)
.endm

._mainloop:
        # r15 is needed as scratch, so park the round counter.
        movq    %r15,184(%rsp)
        # Each pass runs four rounds: two column/row pairs.
        .rept   2
        # column round: (x0,x4,x8,x12) (x5,x9,x13,x1) (x10,x14,x2,x6) (x15,x3,x7,x11)
        salsa20_round %rdx, %r9, %r11, %r14, %r10, %r13, %rdi, %rbx, %rcx, %rax, %rsi, %r8, %r12
        # row round: (x0,x1,x2,x3) (x5,x6,x7,x4) (x10,x11,x8,x9) (x15,x12,x13,x14)
        salsa20_round %rdx, %rdi, %rcx, %rsi, %rax, %r8, %r9, %r12, %r11, %r10, %r14, %r13, %rbx
        .endr
        # i -= 4; go round again while i is still above zero.
        movq    184(%rsp),%r15
        sub     $4,%r15
        ja      ._mainloop
        .purgem salsa20_round
        .purgem salsa20_qr2
        .purgem salsa20_qr
        .purgem salsa20_step
#   feed-forward, output and tail handling for one 64-byte block:
# The rounds are finished.  Add the saved input words back into the
# working words, XOR the result with the input block, write it out, bump
# the block counter and decide whether more input remains.

# Feed-forward for one even/odd word pair whose saved input is the 64-bit
# slot at joff(%rsp).  Both words are summed mod 2^32, then packed into
# lo64 with the even word in the low half and the odd word in the high
# half.  hi64 is left holding the odd sum shifted up by 32.
.macro salsa20_feedfwd lo32, lo64, hi32, hi64, joff
        addl    \joff(%rsp),\lo32
        addl    \joff+4(%rsp),\hi32
        shl     $32,\hi64
        add     \hi64,\lo64
.endm

# XOR eight keystream bytes with the input at off(%rsi) and store the
# result at off(%rdi).  The read comes before the write of the same
# offset.
.macro salsa20_xor_out reg, off
        xorq    \off(%rsi),\reg
        movq    \reg,\off(%rdi)
.endm

        salsa20_feedfwd %ecx, %rcx, %esi, %rsi, 64      # x2/x3   += j2
        salsa20_feedfwd %eax, %rax, %r8d, %r8, 80       # x6/x7   += j6
        salsa20_feedfwd %r11d, %r11, %r10d, %r10, 88    # x8/x9   += j8
        salsa20_feedfwd %r14d, %r14, %r13d, %r13, 104   # x12/x13 += j12
        salsa20_feedfwd %edx, %rdx, %edi, %rdi, 56      # x0/x1   += j0
        # x5, x10 and x15 come back from their stack slots; rdi and r8
        # are free again because x1 and x7 were merged just above.
        movq    160(%rsp),%rdi                          # x5
        salsa20_feedfwd %r9d, %r9, %edi, %rdi, 72       # x4/x5   += j4
        movq    168(%rsp),%r8                           # x10
        salsa20_feedfwd %r8d, %r8, %r12d, %r12, 96      # x10/x11 += j10
        movq    176(%rsp),%rdi                          # x15
        salsa20_feedfwd %ebx, %rbx, %edi, %rdi, 112     # x14/x15 += j14

        # out[0..63] = m[0..63] ^ keystream
        movq    136(%rsp),%rdi          # out = out_backup
        movq    144(%rsp),%rsi          # m = m_backup
        salsa20_xor_out %rdx, 0
        salsa20_xor_out %rcx, 8
        salsa20_xor_out %r9, 16
        salsa20_xor_out %rax, 24
        salsa20_xor_out %r11, 32
        salsa20_xor_out %r8, 40
        salsa20_xor_out %r14, 48
        salsa20_xor_out %rbx, 56

        movq    152(%rsp),%rdx          # bytes = bytes_backup
        # Advance the 64-bit block counter held in the j8 slot.
        movq    88(%rsp),%rcx
        add     $1,%rcx
        movq    %rcx,88(%rsp)
        # More than one block left: loop.  Exactly one block: finished.
        cmp     $64,%rdx
        ja      ._bytesatleast65
        je      ._bytesatleast64
        # Fewer than 64 bytes: the block was produced in the scratch
        # buffer, so copy just the wanted bytes to the real destination.
        mov     %rdi,%rsi               # m = out
        movq    128(%rsp),%rdi          # out = ctarget
        mov     %rdx,%rcx               # i = bytes
        rep     movsb                   # while (i) { *out++ = *m++; --i }
        .purgem salsa20_feedfwd
        .purgem salsa20_xor_out
        # comment:fp stack unchanged by fallthrough
#     bytesatleast64:
# Last block finished: restore the saved registers and store the advanced
# counter back into the state block, then fall into ._done.
._bytesatleast64:
        # Reload the callee-saved registers and the frame size (r11) from
        # the slots they were spilled to at ._start.
        mov     48(%rsp),%rbp
        mov     40(%rsp),%rbx
        mov     32(%rsp),%r15
        mov     24(%rsp),%r14
        mov     16(%rsp),%r13
        mov     8(%rsp),%r12
        mov     0(%rsp),%r11
        # *(uint64 *) (x + 32) = j8, the updated block counter.  rdi and
        # rsi keep these values; ._done copies them into rax and rdx.
        mov     120(%rsp),%rdi          # x = x_backup
        mov     88(%rsp),%rsi           # in8 = j8
        mov     %rsi,32(%rdi)
        # comment:fp stack unchanged by fallthrough
#     done:
# Common exit.  r11 must hold the frame size computed at function entry.
._done:
        # Hand rdi and rsi back in rax and rdx, release the frame, return.
        mov     %rdi,%rax
        mov     %rsi,%rdx
        add     %r11,%rsp
        ret
#   bytesatleast65:
# More than 64 bytes were left: step past the block just written and go
# back for the next one.
._bytesatleast65:
        lea     64(%rdi),%rdi           # out += 64
        lea     64(%rsi),%rsi           # m += 64
        sub     $64,%rdx                # bytes -= 64
        # comment:fp stack unchanged by jump
        jmp     ._bytesatleast1
# enter ECRYPT_keysetup
.text
.p2align 5
.globl ECRYPT_keysetup
# ECRYPT_keysetup(x, k, kbits) - load a key and the matching constants
# into the state block at x.
#   rdi = x      state block (arg1); bytes 0..23 and 40..63 are written
#   rsi = k      key bytes (arg2); 16 are read, 32 when kbits >= 256
#   rdx = kbits  key size selector (arg3), compared unsigned against 256
# On return rax = x and rdx = the first constant word.  rcx, rsi, r8 and
# r9 are overwritten.  This routine uses no stack memory, so it sets up
# no frame.
# NOTE(review): the compare looks at all 64 bits of rdx.  Should the C
# prototype declare kbits as a 32-bit type, the upper half of rdx is not
# guaranteed to be zero - confirm against the callers.
ECRYPT_keysetup:
        # The first 16 key bytes always fill state bytes 4..19.
        movq    0(%rsi),%r8
        movq    8(%rsi),%r9
        movq    %r8,4(%rdi)
        movq    %r9,12(%rdi)
        cmp     $256,%rdx
        jb      ._kbits128
#   kbits256:
._kbits256:
        # State bytes 44..59 take the second half of the key.
        movq    16(%rsi),%rdx
        movq    24(%rsi),%rsi
        movq    %rdx,44(%rdi)
        movq    %rsi,52(%rdi)
        # Middle constant words, ASCII "nd 3" and "2-by".
        mov     $857760878,%edx         # in4
        mov     $2036477234,%ecx        # in10
        jmp     ._keysetupdone
#   kbits128:
._kbits128:
        # State bytes 44..59 repeat the same 16 key bytes.
        movq    0(%rsi),%rdx
        movq    8(%rsi),%rsi
        movq    %rdx,44(%rdi)
        movq    %rsi,52(%rdi)
        # Middle constant words, ASCII "nd 1" and "6-by".
        mov     $824206446,%edx         # in4
        mov     $2036477238,%ecx        # in10
#   keysetupdone:
._keysetupdone:
        # Outer constant words are the same for both key sizes,
        # ASCII "expa" and "te k".
        mov     $1634760805,%esi        # in0
        mov     $1797285236,%r8d        # in14
        movl    %esi,0(%rdi)
        movl    %edx,20(%rdi)
        movl    %ecx,40(%rdi)
        movl    %r8d,60(%rdi)
        # leave
        mov     %rdi,%rax
        mov     %rsi,%rdx
        ret
# enter ECRYPT_ivsetup
.text
.p2align 5
.globl ECRYPT_ivsetup
# ECRYPT_ivsetup(x, iv) - store an 8-byte IV in the state block at x and
# zero the 8 bytes that follow it.
#   rdi = x   state block (arg1); bytes 24..39 are written
#   rsi = iv  IV bytes (arg2); 8 bytes are read
# On return rax = x and rdx = the 8 IV bytes just loaded.  rsi and r8 are
# overwritten.  This routine uses no stack memory, so it sets up no frame.
ECRYPT_ivsetup:
        movq    0(%rsi),%rsi            # in6 = *(uint64 *) (iv + 0)
        xor     %r8d,%r8d               # in8 = 0
        movq    %rsi,24(%rdi)           # *(uint64 *) (x + 24) = in6
        movq    %r8,32(%rdi)            # *(uint64 *) (x + 32) = in8
        # leave
        mov     %rdi,%rax
        mov     %rsi,%rdx
        ret