2 * x86-64 optimized assembler MD5 implementation
4 * Author: Marc Bevand, 2004
6 * This code was placed in the public domain by the author. The original
7 * publication can be found at:
9 * https://www.zorinaq.com/papers/md5-amd64.html
12 * No modifications were made aside from changing the function and file names.
13 * The MD5_CTX structure as expected here (from OpenSSL) is binary compatible
14 * with the md_context used by rsync, for the fields accessed.
16 * Benchmarks (in MB/s) C ASM
17 * - Intel Atom D2700 302 334
18 * - Intel i7-7700hq 351 376
19 * - AMD ThreadRipper 2950x 728 784
21 * The original code was also incorporated into OpenSSL. It has since been
22 * modified there. Those changes have not been made here due to licensing
23 * incompatibilities. Benchmarks of those changes on the above CPUs did not
24 * show any significant difference in performance, though.
28 #include "md-defines.h"
/* Everything down to the matching #endif is assembled only when USE_OPENSSL
 * is not defined and CSUM_CHUNK equals 64. */
30 #if !defined USE_OPENSSL && CSUM_CHUNK == 64
/* Make the md5_process_asm symbol visible to the linker. */
35 .globl md5_process_asm
/* Counterpart of the "pop %r13" near the end of the routine. */
40 push %r13 # not really useful (r13 is unused)
44 # rdi = arg #1 (ctx, MD5_CTX pointer)
45 # rsi = arg #2 (ptr, data pointer)
46 # rdx = arg #3 (nbr, number of 16-word blocks to process)
/* ctx is moved to rbp because rdi is reused two instructions later to hold
 * the end-of-data pointer. */
47 mov %rdi, %rbp # rbp = ctx
/* One block is 16 32-bit words = 64 bytes, hence the shift by 6. */
48 shl $6, %rdx # rdx = nbr in bytes
49 lea (%rsi,%rdx), %rdi # rdi = end
/* Load the four 32-bit state words found at byte offsets 0, 4, 8 and 12 of
 * ctx.  From here on: eax = A, ebx = B, ecx = C, edx = D. */
50 mov 0*4(%rbp), %eax # eax = ctx->A
51 mov 1*4(%rbp), %ebx # ebx = ctx->B
52 mov 2*4(%rbp), %ecx # ecx = ctx->C
53 mov 3*4(%rbp), %edx # edx = ctx->D
/* With nbr == 0 the end pointer equals ptr, so the whole block loop is
 * skipped by branching forward to local label 1. */
61 cmp %rdi, %rsi # cmp end with ptr
62 je 1f # jmp if ptr == end
64 # BEGIN of loop over 16-word blocks
65 2: # save old values of A, B, C, D
/*
 * Round 1: 16 steps using X[0]..X[15] in order, rotate counts 7, 12, 17, 22.
 * Each step computes
 *     dst = x + ((dst + F(x,y,z) + X[k] + Const) <<< s)
 * with F(x,y,z) = z ^ (x & (y ^ z)), built in r11d by the xor/and/xor
 * sequence.  r10d holds the message word X[k]; the lea adds Const and X[k]
 * to dst in a single instruction.  The roles dst/x/y/z rotate through
 * eax/ebx/ecx/edx from step to step.  Instructions tagged "(NEXT STEP)"
 * set up r10d/r11d for the following step.
 */
70 mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */
71 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
72 xor %ecx, %r11d /* y ^ ... */
73 lea -680876936(%eax,%r10d),%eax /* Const + dst + ... */
74 and %ebx, %r11d /* x & ... */
75 xor %edx, %r11d /* z ^ ... */
76 mov 1*4(%rsi),%r10d /* (NEXT STEP) X[1] */
77 add %r11d, %eax /* dst += ... */
78 rol $7, %eax /* dst <<< s */
79 mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */
80 add %ebx, %eax /* dst += x */
81 xor %ebx, %r11d /* y ^ ... */
82 lea -389564586(%edx,%r10d),%edx /* Const + dst + ... */
83 and %eax, %r11d /* x & ... */
84 xor %ecx, %r11d /* z ^ ... */
85 mov 2*4(%rsi),%r10d /* (NEXT STEP) X[2] */
86 add %r11d, %edx /* dst += ... */
87 rol $12, %edx /* dst <<< s */
88 mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */
89 add %eax, %edx /* dst += x */
90 xor %eax, %r11d /* y ^ ... */
91 lea 606105819(%ecx,%r10d),%ecx /* Const + dst + ... */
92 and %edx, %r11d /* x & ... */
93 xor %ebx, %r11d /* z ^ ... */
94 mov 3*4(%rsi),%r10d /* (NEXT STEP) X[3] */
95 add %r11d, %ecx /* dst += ... */
96 rol $17, %ecx /* dst <<< s */
97 mov %eax, %r11d /* (NEXT STEP) z' = %eax */
98 add %edx, %ecx /* dst += x */
99 xor %edx, %r11d /* y ^ ... */
100 lea -1044525330(%ebx,%r10d),%ebx /* Const + dst + ... */
101 and %ecx, %r11d /* x & ... */
102 xor %eax, %r11d /* z ^ ... */
103 mov 4*4(%rsi),%r10d /* (NEXT STEP) X[4] */
104 add %r11d, %ebx /* dst += ... */
105 rol $22, %ebx /* dst <<< s */
106 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
107 add %ecx, %ebx /* dst += x */
108 xor %ecx, %r11d /* y ^ ... */
109 lea -176418897(%eax,%r10d),%eax /* Const + dst + ... */
110 and %ebx, %r11d /* x & ... */
111 xor %edx, %r11d /* z ^ ... */
112 mov 5*4(%rsi),%r10d /* (NEXT STEP) X[5] */
113 add %r11d, %eax /* dst += ... */
114 rol $7, %eax /* dst <<< s */
115 mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */
116 add %ebx, %eax /* dst += x */
117 xor %ebx, %r11d /* y ^ ... */
118 lea 1200080426(%edx,%r10d),%edx /* Const + dst + ... */
119 and %eax, %r11d /* x & ... */
120 xor %ecx, %r11d /* z ^ ... */
121 mov 6*4(%rsi),%r10d /* (NEXT STEP) X[6] */
122 add %r11d, %edx /* dst += ... */
123 rol $12, %edx /* dst <<< s */
124 mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */
125 add %eax, %edx /* dst += x */
126 xor %eax, %r11d /* y ^ ... */
127 lea -1473231341(%ecx,%r10d),%ecx /* Const + dst + ... */
128 and %edx, %r11d /* x & ... */
129 xor %ebx, %r11d /* z ^ ... */
130 mov 7*4(%rsi),%r10d /* (NEXT STEP) X[7] */
131 add %r11d, %ecx /* dst += ... */
132 rol $17, %ecx /* dst <<< s */
133 mov %eax, %r11d /* (NEXT STEP) z' = %eax */
134 add %edx, %ecx /* dst += x */
135 xor %edx, %r11d /* y ^ ... */
136 lea -45705983(%ebx,%r10d),%ebx /* Const + dst + ... */
137 and %ecx, %r11d /* x & ... */
138 xor %eax, %r11d /* z ^ ... */
139 mov 8*4(%rsi),%r10d /* (NEXT STEP) X[8] */
140 add %r11d, %ebx /* dst += ... */
141 rol $22, %ebx /* dst <<< s */
142 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
143 add %ecx, %ebx /* dst += x */
144 xor %ecx, %r11d /* y ^ ... */
145 lea 1770035416(%eax,%r10d),%eax /* Const + dst + ... */
146 and %ebx, %r11d /* x & ... */
147 xor %edx, %r11d /* z ^ ... */
148 mov 9*4(%rsi),%r10d /* (NEXT STEP) X[9] */
149 add %r11d, %eax /* dst += ... */
150 rol $7, %eax /* dst <<< s */
151 mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */
152 add %ebx, %eax /* dst += x */
153 xor %ebx, %r11d /* y ^ ... */
154 lea -1958414417(%edx,%r10d),%edx /* Const + dst + ... */
155 and %eax, %r11d /* x & ... */
156 xor %ecx, %r11d /* z ^ ... */
157 mov 10*4(%rsi),%r10d /* (NEXT STEP) X[10] */
158 add %r11d, %edx /* dst += ... */
159 rol $12, %edx /* dst <<< s */
160 mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */
161 add %eax, %edx /* dst += x */
162 xor %eax, %r11d /* y ^ ... */
163 lea -42063(%ecx,%r10d),%ecx /* Const + dst + ... */
164 and %edx, %r11d /* x & ... */
165 xor %ebx, %r11d /* z ^ ... */
166 mov 11*4(%rsi),%r10d /* (NEXT STEP) X[11] */
167 add %r11d, %ecx /* dst += ... */
168 rol $17, %ecx /* dst <<< s */
169 mov %eax, %r11d /* (NEXT STEP) z' = %eax */
170 add %edx, %ecx /* dst += x */
171 xor %edx, %r11d /* y ^ ... */
172 lea -1990404162(%ebx,%r10d),%ebx /* Const + dst + ... */
173 and %ecx, %r11d /* x & ... */
174 xor %eax, %r11d /* z ^ ... */
175 mov 12*4(%rsi),%r10d /* (NEXT STEP) X[12] */
176 add %r11d, %ebx /* dst += ... */
177 rol $22, %ebx /* dst <<< s */
178 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
179 add %ecx, %ebx /* dst += x */
180 xor %ecx, %r11d /* y ^ ... */
181 lea 1804603682(%eax,%r10d),%eax /* Const + dst + ... */
182 and %ebx, %r11d /* x & ... */
183 xor %edx, %r11d /* z ^ ... */
184 mov 13*4(%rsi),%r10d /* (NEXT STEP) X[13] */
185 add %r11d, %eax /* dst += ... */
186 rol $7, %eax /* dst <<< s */
187 mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */
188 add %ebx, %eax /* dst += x */
189 xor %ebx, %r11d /* y ^ ... */
190 lea -40341101(%edx,%r10d),%edx /* Const + dst + ... */
191 and %eax, %r11d /* x & ... */
192 xor %ecx, %r11d /* z ^ ... */
193 mov 14*4(%rsi),%r10d /* (NEXT STEP) X[14] */
194 add %r11d, %edx /* dst += ... */
195 rol $12, %edx /* dst <<< s */
196 mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */
197 add %eax, %edx /* dst += x */
198 xor %eax, %r11d /* y ^ ... */
199 lea -1502002290(%ecx,%r10d),%ecx /* Const + dst + ... */
200 and %edx, %r11d /* x & ... */
201 xor %ebx, %r11d /* z ^ ... */
202 mov 15*4(%rsi),%r10d /* (NEXT STEP) X[15] */
203 add %r11d, %ecx /* dst += ... */
204 rol $17, %ecx /* dst <<< s */
205 mov %eax, %r11d /* (NEXT STEP) z' = %eax */
206 add %edx, %ecx /* dst += x */
207 xor %edx, %r11d /* y ^ ... */
208 lea 1236535329(%ebx,%r10d),%ebx /* Const + dst + ... */
209 and %ecx, %r11d /* x & ... */
210 xor %eax, %r11d /* z ^ ... */
/* NOTE: last step of round 1.  The X[0] value loaded into r10d here is never
 * read: round 2 starts by loading X[1] into r10d. */
211 mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */
212 add %r11d, %ebx /* dst += ... */
213 rol $22, %ebx /* dst <<< s */
/* NOTE: round 2 copies edx into r11d again before r11d is read, so this copy
 * is redundant. */
214 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
215 add %ecx, %ebx /* dst += x */
/*
 * Round 2: 16 steps, rotate counts 5, 9, 14, 20.  Message words are used in
 * the order 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12.
 * Each step computes
 *     dst = x + ((dst + G(x,y,z) + X[k] + Const) <<< s)
 * with G(x,y,z) = (x & z) | (y & ~z).  Two temporaries are needed here:
 * r11d becomes y & ~z and r12d becomes x & z, then they are or-ed into r12d.
 * Both are re-seeded with the next step's z before the current step ends.
 */
216 mov 1*4(%rsi), %r10d /* (NEXT STEP) X[1] */
217 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
218 mov %edx, %r12d /* (NEXT STEP) z' = %edx */
219 not %r11d /* not z */
220 lea -165796510(%eax,%r10d),%eax /* Const + dst + ... */
221 and %ebx, %r12d /* x & z */
222 and %ecx, %r11d /* y & (not z) */
223 mov 6*4(%rsi),%r10d /* (NEXT STEP) X[6] */
224 or %r11d, %r12d /* (y & (not z)) | (x & z) */
225 mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */
226 add %r12d, %eax /* dst += ... */
227 mov %ecx, %r12d /* (NEXT STEP) z' = %ecx */
228 rol $5, %eax /* dst <<< s */
229 add %ebx, %eax /* dst += x */
230 not %r11d /* not z */
231 lea -1069501632(%edx,%r10d),%edx /* Const + dst + ... */
232 and %eax, %r12d /* x & z */
233 and %ebx, %r11d /* y & (not z) */
234 mov 11*4(%rsi),%r10d /* (NEXT STEP) X[11] */
235 or %r11d, %r12d /* (y & (not z)) | (x & z) */
236 mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */
237 add %r12d, %edx /* dst += ... */
238 mov %ebx, %r12d /* (NEXT STEP) z' = %ebx */
239 rol $9, %edx /* dst <<< s */
240 add %eax, %edx /* dst += x */
241 not %r11d /* not z */
242 lea 643717713(%ecx,%r10d),%ecx /* Const + dst + ... */
243 and %edx, %r12d /* x & z */
244 and %eax, %r11d /* y & (not z) */
245 mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */
246 or %r11d, %r12d /* (y & (not z)) | (x & z) */
247 mov %eax, %r11d /* (NEXT STEP) z' = %eax */
248 add %r12d, %ecx /* dst += ... */
249 mov %eax, %r12d /* (NEXT STEP) z' = %eax */
250 rol $14, %ecx /* dst <<< s */
251 add %edx, %ecx /* dst += x */
252 not %r11d /* not z */
253 lea -373897302(%ebx,%r10d),%ebx /* Const + dst + ... */
254 and %ecx, %r12d /* x & z */
255 and %edx, %r11d /* y & (not z) */
256 mov 5*4(%rsi),%r10d /* (NEXT STEP) X[5] */
257 or %r11d, %r12d /* (y & (not z)) | (x & z) */
258 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
259 add %r12d, %ebx /* dst += ... */
260 mov %edx, %r12d /* (NEXT STEP) z' = %edx */
261 rol $20, %ebx /* dst <<< s */
262 add %ecx, %ebx /* dst += x */
263 not %r11d /* not z */
264 lea -701558691(%eax,%r10d),%eax /* Const + dst + ... */
265 and %ebx, %r12d /* x & z */
266 and %ecx, %r11d /* y & (not z) */
267 mov 10*4(%rsi),%r10d /* (NEXT STEP) X[10] */
268 or %r11d, %r12d /* (y & (not z)) | (x & z) */
269 mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */
270 add %r12d, %eax /* dst += ... */
271 mov %ecx, %r12d /* (NEXT STEP) z' = %ecx */
272 rol $5, %eax /* dst <<< s */
273 add %ebx, %eax /* dst += x */
274 not %r11d /* not z */
275 lea 38016083(%edx,%r10d),%edx /* Const + dst + ... */
276 and %eax, %r12d /* x & z */
277 and %ebx, %r11d /* y & (not z) */
278 mov 15*4(%rsi),%r10d /* (NEXT STEP) X[15] */
279 or %r11d, %r12d /* (y & (not z)) | (x & z) */
280 mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */
281 add %r12d, %edx /* dst += ... */
282 mov %ebx, %r12d /* (NEXT STEP) z' = %ebx */
283 rol $9, %edx /* dst <<< s */
284 add %eax, %edx /* dst += x */
285 not %r11d /* not z */
286 lea -660478335(%ecx,%r10d),%ecx /* Const + dst + ... */
287 and %edx, %r12d /* x & z */
288 and %eax, %r11d /* y & (not z) */
289 mov 4*4(%rsi),%r10d /* (NEXT STEP) X[4] */
290 or %r11d, %r12d /* (y & (not z)) | (x & z) */
291 mov %eax, %r11d /* (NEXT STEP) z' = %eax */
292 add %r12d, %ecx /* dst += ... */
293 mov %eax, %r12d /* (NEXT STEP) z' = %eax */
294 rol $14, %ecx /* dst <<< s */
295 add %edx, %ecx /* dst += x */
296 not %r11d /* not z */
297 lea -405537848(%ebx,%r10d),%ebx /* Const + dst + ... */
298 and %ecx, %r12d /* x & z */
299 and %edx, %r11d /* y & (not z) */
300 mov 9*4(%rsi),%r10d /* (NEXT STEP) X[9] */
301 or %r11d, %r12d /* (y & (not z)) | (x & z) */
302 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
303 add %r12d, %ebx /* dst += ... */
304 mov %edx, %r12d /* (NEXT STEP) z' = %edx */
305 rol $20, %ebx /* dst <<< s */
306 add %ecx, %ebx /* dst += x */
307 not %r11d /* not z */
308 lea 568446438(%eax,%r10d),%eax /* Const + dst + ... */
309 and %ebx, %r12d /* x & z */
310 and %ecx, %r11d /* y & (not z) */
311 mov 14*4(%rsi),%r10d /* (NEXT STEP) X[14] */
312 or %r11d, %r12d /* (y & (not z)) | (x & z) */
313 mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */
314 add %r12d, %eax /* dst += ... */
315 mov %ecx, %r12d /* (NEXT STEP) z' = %ecx */
316 rol $5, %eax /* dst <<< s */
317 add %ebx, %eax /* dst += x */
318 not %r11d /* not z */
319 lea -1019803690(%edx,%r10d),%edx /* Const + dst + ... */
320 and %eax, %r12d /* x & z */
321 and %ebx, %r11d /* y & (not z) */
322 mov 3*4(%rsi),%r10d /* (NEXT STEP) X[3] */
323 or %r11d, %r12d /* (y & (not z)) | (x & z) */
324 mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */
325 add %r12d, %edx /* dst += ... */
326 mov %ebx, %r12d /* (NEXT STEP) z' = %ebx */
327 rol $9, %edx /* dst <<< s */
328 add %eax, %edx /* dst += x */
329 not %r11d /* not z */
330 lea -187363961(%ecx,%r10d),%ecx /* Const + dst + ... */
331 and %edx, %r12d /* x & z */
332 and %eax, %r11d /* y & (not z) */
333 mov 8*4(%rsi),%r10d /* (NEXT STEP) X[8] */
334 or %r11d, %r12d /* (y & (not z)) | (x & z) */
335 mov %eax, %r11d /* (NEXT STEP) z' = %eax */
336 add %r12d, %ecx /* dst += ... */
337 mov %eax, %r12d /* (NEXT STEP) z' = %eax */
338 rol $14, %ecx /* dst <<< s */
339 add %edx, %ecx /* dst += x */
340 not %r11d /* not z */
341 lea 1163531501(%ebx,%r10d),%ebx /* Const + dst + ... */
342 and %ecx, %r12d /* x & z */
343 and %edx, %r11d /* y & (not z) */
344 mov 13*4(%rsi),%r10d /* (NEXT STEP) X[13] */
345 or %r11d, %r12d /* (y & (not z)) | (x & z) */
346 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
347 add %r12d, %ebx /* dst += ... */
348 mov %edx, %r12d /* (NEXT STEP) z' = %edx */
349 rol $20, %ebx /* dst <<< s */
350 add %ecx, %ebx /* dst += x */
351 not %r11d /* not z */
352 lea -1444681467(%eax,%r10d),%eax /* Const + dst + ... */
353 and %ebx, %r12d /* x & z */
354 and %ecx, %r11d /* y & (not z) */
355 mov 2*4(%rsi),%r10d /* (NEXT STEP) X[2] */
356 or %r11d, %r12d /* (y & (not z)) | (x & z) */
357 mov %ecx, %r11d /* (NEXT STEP) z' = %ecx */
358 add %r12d, %eax /* dst += ... */
359 mov %ecx, %r12d /* (NEXT STEP) z' = %ecx */
360 rol $5, %eax /* dst <<< s */
361 add %ebx, %eax /* dst += x */
362 not %r11d /* not z */
363 lea -51403784(%edx,%r10d),%edx /* Const + dst + ... */
364 and %eax, %r12d /* x & z */
365 and %ebx, %r11d /* y & (not z) */
366 mov 7*4(%rsi),%r10d /* (NEXT STEP) X[7] */
367 or %r11d, %r12d /* (y & (not z)) | (x & z) */
368 mov %ebx, %r11d /* (NEXT STEP) z' = %ebx */
369 add %r12d, %edx /* dst += ... */
370 mov %ebx, %r12d /* (NEXT STEP) z' = %ebx */
371 rol $9, %edx /* dst <<< s */
372 add %eax, %edx /* dst += x */
373 not %r11d /* not z */
374 lea 1735328473(%ecx,%r10d),%ecx /* Const + dst + ... */
375 and %edx, %r12d /* x & z */
376 and %eax, %r11d /* y & (not z) */
377 mov 12*4(%rsi),%r10d /* (NEXT STEP) X[12] */
378 or %r11d, %r12d /* (y & (not z)) | (x & z) */
379 mov %eax, %r11d /* (NEXT STEP) z' = %eax */
380 add %r12d, %ecx /* dst += ... */
381 mov %eax, %r12d /* (NEXT STEP) z' = %eax */
382 rol $14, %ecx /* dst <<< s */
383 add %edx, %ecx /* dst += x */
384 not %r11d /* not z */
385 lea -1926607734(%ebx,%r10d),%ebx /* Const + dst + ... */
386 and %ecx, %r12d /* x & z */
387 and %edx, %r11d /* y & (not z) */
/* NOTE: last step of round 2.  The three "(NEXT STEP)" instructions in this
 * step have no effect on the result: round 3 begins by loading X[5] into
 * r10d and ecx into r11d, and r12d is not read by rounds 3 and 4. */
388 mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */
389 or %r11d, %r12d /* (y & (not z)) | (x & z) */
390 mov %edx, %r11d /* (NEXT STEP) z' = %edx */
391 add %r12d, %ebx /* dst += ... */
392 mov %edx, %r12d /* (NEXT STEP) z' = %edx */
393 rol $20, %ebx /* dst <<< s */
394 add %ecx, %ebx /* dst += x */
/*
 * Round 3: 16 steps, rotate counts 4, 11, 16, 23.  Message words are used in
 * the order 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2.
 * Each step computes
 *     dst = x + ((dst + H(x,y,z) + X[k] + Const) <<< s)
 * with H(x,y,z) = x ^ y ^ z.  r11d is seeded with y and then xor-ed with z
 * and x; only r10d and r11d are used as temporaries in this round.
 */
395 mov 5*4(%rsi), %r10d /* (NEXT STEP) X[5] */
396 mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */
397 lea -378558(%eax,%r10d),%eax /* Const + dst + ... */
398 mov 8*4(%rsi),%r10d /* (NEXT STEP) X[8] */
399 xor %edx, %r11d /* z ^ ... */
400 xor %ebx, %r11d /* x ^ ... */
401 add %r11d, %eax /* dst += ... */
402 rol $4, %eax /* dst <<< s */
403 mov %ebx, %r11d /* (NEXT STEP) y' = %ebx */
404 add %ebx, %eax /* dst += x */
405 lea -2022574463(%edx,%r10d),%edx /* Const + dst + ... */
406 mov 11*4(%rsi),%r10d /* (NEXT STEP) X[11] */
407 xor %ecx, %r11d /* z ^ ... */
408 xor %eax, %r11d /* x ^ ... */
409 add %r11d, %edx /* dst += ... */
410 rol $11, %edx /* dst <<< s */
411 mov %eax, %r11d /* (NEXT STEP) y' = %eax */
412 add %eax, %edx /* dst += x */
413 lea 1839030562(%ecx,%r10d),%ecx /* Const + dst + ... */
414 mov 14*4(%rsi),%r10d /* (NEXT STEP) X[14] */
415 xor %ebx, %r11d /* z ^ ... */
416 xor %edx, %r11d /* x ^ ... */
417 add %r11d, %ecx /* dst += ... */
418 rol $16, %ecx /* dst <<< s */
419 mov %edx, %r11d /* (NEXT STEP) y' = %edx */
420 add %edx, %ecx /* dst += x */
421 lea -35309556(%ebx,%r10d),%ebx /* Const + dst + ... */
422 mov 1*4(%rsi),%r10d /* (NEXT STEP) X[1] */
423 xor %eax, %r11d /* z ^ ... */
424 xor %ecx, %r11d /* x ^ ... */
425 add %r11d, %ebx /* dst += ... */
426 rol $23, %ebx /* dst <<< s */
427 mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */
428 add %ecx, %ebx /* dst += x */
429 lea -1530992060(%eax,%r10d),%eax /* Const + dst + ... */
430 mov 4*4(%rsi),%r10d /* (NEXT STEP) X[4] */
431 xor %edx, %r11d /* z ^ ... */
432 xor %ebx, %r11d /* x ^ ... */
433 add %r11d, %eax /* dst += ... */
434 rol $4, %eax /* dst <<< s */
435 mov %ebx, %r11d /* (NEXT STEP) y' = %ebx */
436 add %ebx, %eax /* dst += x */
437 lea 1272893353(%edx,%r10d),%edx /* Const + dst + ... */
438 mov 7*4(%rsi),%r10d /* (NEXT STEP) X[7] */
439 xor %ecx, %r11d /* z ^ ... */
440 xor %eax, %r11d /* x ^ ... */
441 add %r11d, %edx /* dst += ... */
442 rol $11, %edx /* dst <<< s */
443 mov %eax, %r11d /* (NEXT STEP) y' = %eax */
444 add %eax, %edx /* dst += x */
445 lea -155497632(%ecx,%r10d),%ecx /* Const + dst + ... */
446 mov 10*4(%rsi),%r10d /* (NEXT STEP) X[10] */
447 xor %ebx, %r11d /* z ^ ... */
448 xor %edx, %r11d /* x ^ ... */
449 add %r11d, %ecx /* dst += ... */
450 rol $16, %ecx /* dst <<< s */
451 mov %edx, %r11d /* (NEXT STEP) y' = %edx */
452 add %edx, %ecx /* dst += x */
453 lea -1094730640(%ebx,%r10d),%ebx /* Const + dst + ... */
454 mov 13*4(%rsi),%r10d /* (NEXT STEP) X[13] */
455 xor %eax, %r11d /* z ^ ... */
456 xor %ecx, %r11d /* x ^ ... */
457 add %r11d, %ebx /* dst += ... */
458 rol $23, %ebx /* dst <<< s */
459 mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */
460 add %ecx, %ebx /* dst += x */
461 lea 681279174(%eax,%r10d),%eax /* Const + dst + ... */
462 mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */
463 xor %edx, %r11d /* z ^ ... */
464 xor %ebx, %r11d /* x ^ ... */
465 add %r11d, %eax /* dst += ... */
466 rol $4, %eax /* dst <<< s */
467 mov %ebx, %r11d /* (NEXT STEP) y' = %ebx */
468 add %ebx, %eax /* dst += x */
469 lea -358537222(%edx,%r10d),%edx /* Const + dst + ... */
470 mov 3*4(%rsi),%r10d /* (NEXT STEP) X[3] */
471 xor %ecx, %r11d /* z ^ ... */
472 xor %eax, %r11d /* x ^ ... */
473 add %r11d, %edx /* dst += ... */
474 rol $11, %edx /* dst <<< s */
475 mov %eax, %r11d /* (NEXT STEP) y' = %eax */
476 add %eax, %edx /* dst += x */
477 lea -722521979(%ecx,%r10d),%ecx /* Const + dst + ... */
478 mov 6*4(%rsi),%r10d /* (NEXT STEP) X[6] */
479 xor %ebx, %r11d /* z ^ ... */
480 xor %edx, %r11d /* x ^ ... */
481 add %r11d, %ecx /* dst += ... */
482 rol $16, %ecx /* dst <<< s */
483 mov %edx, %r11d /* (NEXT STEP) y' = %edx */
484 add %edx, %ecx /* dst += x */
485 lea 76029189(%ebx,%r10d),%ebx /* Const + dst + ... */
486 mov 9*4(%rsi),%r10d /* (NEXT STEP) X[9] */
487 xor %eax, %r11d /* z ^ ... */
488 xor %ecx, %r11d /* x ^ ... */
489 add %r11d, %ebx /* dst += ... */
490 rol $23, %ebx /* dst <<< s */
491 mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */
492 add %ecx, %ebx /* dst += x */
493 lea -640364487(%eax,%r10d),%eax /* Const + dst + ... */
494 mov 12*4(%rsi),%r10d /* (NEXT STEP) X[12] */
495 xor %edx, %r11d /* z ^ ... */
496 xor %ebx, %r11d /* x ^ ... */
497 add %r11d, %eax /* dst += ... */
498 rol $4, %eax /* dst <<< s */
499 mov %ebx, %r11d /* (NEXT STEP) y' = %ebx */
500 add %ebx, %eax /* dst += x */
501 lea -421815835(%edx,%r10d),%edx /* Const + dst + ... */
502 mov 15*4(%rsi),%r10d /* (NEXT STEP) X[15] */
503 xor %ecx, %r11d /* z ^ ... */
504 xor %eax, %r11d /* x ^ ... */
505 add %r11d, %edx /* dst += ... */
506 rol $11, %edx /* dst <<< s */
507 mov %eax, %r11d /* (NEXT STEP) y' = %eax */
508 add %eax, %edx /* dst += x */
509 lea 530742520(%ecx,%r10d),%ecx /* Const + dst + ... */
510 mov 2*4(%rsi),%r10d /* (NEXT STEP) X[2] */
511 xor %ebx, %r11d /* z ^ ... */
512 xor %edx, %r11d /* x ^ ... */
513 add %r11d, %ecx /* dst += ... */
514 rol $16, %ecx /* dst <<< s */
515 mov %edx, %r11d /* (NEXT STEP) y' = %edx */
516 add %edx, %ecx /* dst += x */
517 lea -995338651(%ebx,%r10d),%ebx /* Const + dst + ... */
/* NOTE: last step of round 3.  Round 4 starts by loading X[0] into r10d
 * itself, so this load is duplicated there. */
518 mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */
519 xor %eax, %r11d /* z ^ ... */
520 xor %ecx, %r11d /* x ^ ... */
521 add %r11d, %ebx /* dst += ... */
522 rol $23, %ebx /* dst <<< s */
/* NOTE: r11d is overwritten with 0xffffffff at the start of round 4 before
 * it is read, so this copy has no effect on the result. */
523 mov %ecx, %r11d /* (NEXT STEP) y' = %ecx */
524 add %ecx, %ebx /* dst += x */
/*
 * Round 4: 16 steps, rotate counts 6, 10, 15, 21.  Message words are used in
 * the order 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9.
 * Each step computes
 *     dst = x + ((dst + I(x,y,z) + X[k] + Const) <<< s)
 * with I(x,y,z) = y ^ (x | ~z).  The complement ~z is produced by loading
 * 0xffffffff into r11d and xor-ing it with z; x is then or-ed in and y
 * xor-ed in.
 */
525 mov 0*4(%rsi), %r10d /* (NEXT STEP) X[0] */
526 mov $0xffffffff, %r11d
527 xor %edx, %r11d /* (NEXT STEP) not z' = not %edx*/
528 lea -198630844(%eax,%r10d),%eax /* Const + dst + ... */
529 or %ebx, %r11d /* x | ... */
530 xor %ecx, %r11d /* y ^ ... */
531 add %r11d, %eax /* dst += ... */
532 mov 7*4(%rsi),%r10d /* (NEXT STEP) X[7] */
533 mov $0xffffffff, %r11d
534 rol $6, %eax /* dst <<< s */
535 xor %ecx, %r11d /* (NEXT STEP) not z' = not %ecx */
536 add %ebx, %eax /* dst += x */
537 lea 1126891415(%edx,%r10d),%edx /* Const + dst + ... */
538 or %eax, %r11d /* x | ... */
539 xor %ebx, %r11d /* y ^ ... */
540 add %r11d, %edx /* dst += ... */
541 mov 14*4(%rsi),%r10d /* (NEXT STEP) X[14] */
542 mov $0xffffffff, %r11d
543 rol $10, %edx /* dst <<< s */
544 xor %ebx, %r11d /* (NEXT STEP) not z' = not %ebx */
545 add %eax, %edx /* dst += x */
546 lea -1416354905(%ecx,%r10d),%ecx /* Const + dst + ... */
547 or %edx, %r11d /* x | ... */
548 xor %eax, %r11d /* y ^ ... */
549 add %r11d, %ecx /* dst += ... */
550 mov 5*4(%rsi),%r10d /* (NEXT STEP) X[5] */
551 mov $0xffffffff, %r11d
552 rol $15, %ecx /* dst <<< s */
553 xor %eax, %r11d /* (NEXT STEP) not z' = not %eax */
554 add %edx, %ecx /* dst += x */
555 lea -57434055(%ebx,%r10d),%ebx /* Const + dst + ... */
556 or %ecx, %r11d /* x | ... */
557 xor %edx, %r11d /* y ^ ... */
558 add %r11d, %ebx /* dst += ... */
559 mov 12*4(%rsi),%r10d /* (NEXT STEP) X[12] */
560 mov $0xffffffff, %r11d
561 rol $21, %ebx /* dst <<< s */
562 xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */
563 add %ecx, %ebx /* dst += x */
564 lea 1700485571(%eax,%r10d),%eax /* Const + dst + ... */
565 or %ebx, %r11d /* x | ... */
566 xor %ecx, %r11d /* y ^ ... */
567 add %r11d, %eax /* dst += ... */
568 mov 3*4(%rsi),%r10d /* (NEXT STEP) X[3] */
569 mov $0xffffffff, %r11d
570 rol $6, %eax /* dst <<< s */
571 xor %ecx, %r11d /* (NEXT STEP) not z' = not %ecx */
572 add %ebx, %eax /* dst += x */
573 lea -1894986606(%edx,%r10d),%edx /* Const + dst + ... */
574 or %eax, %r11d /* x | ... */
575 xor %ebx, %r11d /* y ^ ... */
576 add %r11d, %edx /* dst += ... */
577 mov 10*4(%rsi),%r10d /* (NEXT STEP) X[10] */
578 mov $0xffffffff, %r11d
579 rol $10, %edx /* dst <<< s */
580 xor %ebx, %r11d /* (NEXT STEP) not z' = not %ebx */
581 add %eax, %edx /* dst += x */
582 lea -1051523(%ecx,%r10d),%ecx /* Const + dst + ... */
583 or %edx, %r11d /* x | ... */
584 xor %eax, %r11d /* y ^ ... */
585 add %r11d, %ecx /* dst += ... */
586 mov 1*4(%rsi),%r10d /* (NEXT STEP) X[1] */
587 mov $0xffffffff, %r11d
588 rol $15, %ecx /* dst <<< s */
589 xor %eax, %r11d /* (NEXT STEP) not z' = not %eax */
590 add %edx, %ecx /* dst += x */
591 lea -2054922799(%ebx,%r10d),%ebx /* Const + dst + ... */
592 or %ecx, %r11d /* x | ... */
593 xor %edx, %r11d /* y ^ ... */
594 add %r11d, %ebx /* dst += ... */
595 mov 8*4(%rsi),%r10d /* (NEXT STEP) X[8] */
596 mov $0xffffffff, %r11d
597 rol $21, %ebx /* dst <<< s */
598 xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */
599 add %ecx, %ebx /* dst += x */
600 lea 1873313359(%eax,%r10d),%eax /* Const + dst + ... */
601 or %ebx, %r11d /* x | ... */
602 xor %ecx, %r11d /* y ^ ... */
603 add %r11d, %eax /* dst += ... */
604 mov 15*4(%rsi),%r10d /* (NEXT STEP) X[15] */
605 mov $0xffffffff, %r11d
606 rol $6, %eax /* dst <<< s */
607 xor %ecx, %r11d /* (NEXT STEP) not z' = not %ecx */
608 add %ebx, %eax /* dst += x */
609 lea -30611744(%edx,%r10d),%edx /* Const + dst + ... */
610 or %eax, %r11d /* x | ... */
611 xor %ebx, %r11d /* y ^ ... */
612 add %r11d, %edx /* dst += ... */
613 mov 6*4(%rsi),%r10d /* (NEXT STEP) X[6] */
614 mov $0xffffffff, %r11d
615 rol $10, %edx /* dst <<< s */
616 xor %ebx, %r11d /* (NEXT STEP) not z' = not %ebx */
617 add %eax, %edx /* dst += x */
618 lea -1560198380(%ecx,%r10d),%ecx /* Const + dst + ... */
619 or %edx, %r11d /* x | ... */
620 xor %eax, %r11d /* y ^ ... */
621 add %r11d, %ecx /* dst += ... */
622 mov 13*4(%rsi),%r10d /* (NEXT STEP) X[13] */
623 mov $0xffffffff, %r11d
624 rol $15, %ecx /* dst <<< s */
625 xor %eax, %r11d /* (NEXT STEP) not z' = not %eax */
626 add %edx, %ecx /* dst += x */
627 lea 1309151649(%ebx,%r10d),%ebx /* Const + dst + ... */
628 or %ecx, %r11d /* x | ... */
629 xor %edx, %r11d /* y ^ ... */
630 add %r11d, %ebx /* dst += ... */
631 mov 4*4(%rsi),%r10d /* (NEXT STEP) X[4] */
632 mov $0xffffffff, %r11d
633 rol $21, %ebx /* dst <<< s */
634 xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */
635 add %ecx, %ebx /* dst += x */
636 lea -145523070(%eax,%r10d),%eax /* Const + dst + ... */
637 or %ebx, %r11d /* x | ... */
638 xor %ecx, %r11d /* y ^ ... */
639 add %r11d, %eax /* dst += ... */
640 mov 11*4(%rsi),%r10d /* (NEXT STEP) X[11] */
641 mov $0xffffffff, %r11d
642 rol $6, %eax /* dst <<< s */
643 xor %ecx, %r11d /* (NEXT STEP) not z' = not %ecx */
644 add %ebx, %eax /* dst += x */
645 lea -1120210379(%edx,%r10d),%edx /* Const + dst + ... */
646 or %eax, %r11d /* x | ... */
647 xor %ebx, %r11d /* y ^ ... */
648 add %r11d, %edx /* dst += ... */
649 mov 2*4(%rsi),%r10d /* (NEXT STEP) X[2] */
650 mov $0xffffffff, %r11d
651 rol $10, %edx /* dst <<< s */
652 xor %ebx, %r11d /* (NEXT STEP) not z' = not %ebx */
653 add %eax, %edx /* dst += x */
654 lea 718787259(%ecx,%r10d),%ecx /* Const + dst + ... */
655 or %edx, %r11d /* x | ... */
656 xor %eax, %r11d /* y ^ ... */
657 add %r11d, %ecx /* dst += ... */
658 mov 9*4(%rsi),%r10d /* (NEXT STEP) X[9] */
659 mov $0xffffffff, %r11d
660 rol $15, %ecx /* dst <<< s */
661 xor %eax, %r11d /* (NEXT STEP) not z' = not %eax */
662 add %edx, %ecx /* dst += x */
663 lea -343485551(%ebx,%r10d),%ebx /* Const + dst + ... */
664 or %ecx, %r11d /* x | ... */
665 xor %edx, %r11d /* y ^ ... */
666 add %r11d, %ebx /* dst += ... */
/* NOTE(review): this is the final step of the block, so there is no next
 * step to prepare.  The X[0] load and the two r11d instructions tagged
 * "(NEXT STEP)" below appear to be unused: the top of the loop reloads r10d
 * and r11d before reading them.  Confirm against the code that follows
 * before relying on this. */
667 mov 0*4(%rsi),%r10d /* (NEXT STEP) X[0] */
668 mov $0xffffffff, %r11d
669 rol $21, %ebx /* dst <<< s */
670 xor %edx, %r11d /* (NEXT STEP) not z' = not %edx */
671 add %ecx, %ebx /* dst += x */
672 # add old values of A, B, C, D
/* Advance ptr by one 64-byte block and branch back to local label 2 while
 * ptr is still below end (jb is the unsigned "below" comparison). */
679 add $64, %rsi # ptr += 64
680 cmp %rdi, %rsi # cmp end with ptr
681 jb 2b # jmp if ptr < end
682 # END of loop over 16-word blocks
/* Store the four working registers back into ctx at byte offsets 0, 4, 8
 * and 12, the same slots they were loaded from before the loop. */
684 mov %eax, 0*4(%rbp) # ctx->A = A
685 mov %ebx, 1*4(%rbp) # ctx->B = B
686 mov %ecx, 2*4(%rbp) # ctx->C = C
687 mov %edx, 3*4(%rbp) # ctx->D = D
/* Counterpart of the "push %r13" at the top of the routine. */
691 pop %r13 # not really useful (r13 is unused)
/* Closes the "#if !defined USE_OPENSSL && CSUM_CHUNK == 64" guard. */
697 #endif /* !USE_OPENSSL ... */