Roll src/third_party/skia 26fc1bf:f559de4
[chromium-blink-merge.git] / third_party / boringssl / win-x86 / crypto / bn / x86-mont.asm
blobde7b949927258f9d725c73fa60bb278232f2dcd1
; Select the code section according to the assembler output format:
;   obj   -> 32-bit "code" segment, 64-byte aligned
;   win32 -> .text, 64-byte aligned (plus the @feat.00 handling below)
;   other -> plain .text
1 %ifidn __OUTPUT_FORMAT__,obj
2 section code use32 class=code align=64
3 %elifidn __OUTPUT_FORMAT__,win32
4 %ifdef __YASM_VERSION_ID__
5 %if __YASM_VERSION_ID__ < 01010000h
6 %error yasm version 1.1.0 or later needed.
7 %endif
8 ; Yasm automatically includes @feat.00 and complains about redefining it.
9 ; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
10 %else
; Assemblers other than Yasm: define @feat.00 explicitly.
11 $@feat.00 equ 1
12 %endif
13 section .text code align=64
14 %else
15 section .text code
16 %endif
;-----------------------------------------------------------------------
; _bn_mul_mont -- entry point, frame construction and SSE2 dispatch.
;
; Stack arguments (32-bit, read relative to esp after the four pushes):
;   [20+esp] arg0   [24+esp] arg1   [28+esp] arg2
;   [32+esp] arg3   [36+esp] arg4 (pointer, dereferenced once below)
;   [40+esp] arg5 (word count, called "num" in these comments)
; NOTE(review): presumably OpenSSL's bn_mul_mont(rp, ap, bp, np, n0, num);
; confirm against the C prototype.
;
; Returns eax = 0 without doing any work when num < 4 (signed compare).
; Saves ebp, ebx, esi, edi; they are restored at L$000just_leave.
;
; Scratch frame built here (offsets from the new esp):
;   [4]  arg0   [8] arg1   [12] arg2   [16] arg3   [20] *arg4
;   [24] caller's esp (value right after the pushes)
;   [32...] temporary vector used by the loops that follow
;-----------------------------------------------------------------------
17 ;extern _OPENSSL_ia32cap_P
18 global _bn_mul_mont
19 align 16
20 _bn_mul_mont:
21 L$_bn_mul_mont_begin:
22 push ebp
23 push ebx
24 push esi
25 push edi
; eax = 0 is the return value for the early-exit path.
26 xor eax,eax
27 mov edi,DWORD [40+esp]
28 cmp edi,4
29 jl NEAR L$000just_leave
; esi -> arg0, edx -> arg1, ebp = esp to restore on exit.
30 lea esi,[20+esp]
31 lea edx,[24+esp]
32 mov ebp,esp
; Reserve (num+2)*4 + 32 bytes below the current stack pointer.
; edi is negated only for the address computation and ends up as num+2.
33 add edi,2
34 neg edi
35 lea esp,[edi*4+esp-32]
36 neg edi
; Adjust esp relative to the argument block: first by the distance
; modulo 2048, then by a further 2048 depending on bit 11 of
; (edx xor esp).
; NOTE(review): looks like a cache-aliasing avoidance measure between the
; scratch frame and the caller's data -- confirm against the generator.
37 mov eax,esp
38 sub eax,edx
39 and eax,2047
40 sub esp,eax
41 xor edx,esp
42 and edx,2048
43 xor edx,2048
44 sub esp,edx
; Align the scratch frame to 64 bytes.
45 and esp,-64
; Copy the five arguments into the frame; arg4 is a pointer whose target
; word is stored rather than the pointer itself.
46 mov eax,DWORD [esi]
47 mov ebx,DWORD [4+esi]
48 mov ecx,DWORD [8+esi]
49 mov edx,DWORD [12+esi]
50 mov esi,DWORD [16+esi]
51 mov esi,DWORD [esi]
52 mov DWORD [4+esp],eax
53 mov DWORD [8+esp],ebx
54 mov DWORD [12+esp],ecx
55 mov DWORD [16+esp],edx
56 mov DWORD [20+esp],esi
; ebx = (num+2)-3 = num-1, the index of the last word.
57 lea ebx,[edi-3]
58 mov DWORD [24+esp],ebp
; Test bit 26 of the first dword of _OPENSSL_ia32cap_P; when it is clear
; take the integer-only path, otherwise fall through to the MMX/SSE2 code.
59 lea eax,[_OPENSSL_ia32cap_P]
60 bt DWORD [eax],26
61 jnc NEAR L$001non_sse2
; ---- SSE2 path, first pass (i = 0) ------------------------------------
; Legend for the comments (frame slots filled by the prologue):
;   a = [8+esp], b = [12+esp], n = [16+esp], n0 = [20+esp],
;   tp = scratch vector starting at [32+esp], num-1 is held in ebx.
; Register roles in this pass:
;   mm7 = 0x00000000FFFFFFFF (low-word mask)
;   mm4 = b[0], mm5 = multiplier derived from n0 (low 32 bits used)
;   mm2 = running sum / carry of the a[j]*b[0] products
;   mm3 = running sum / carry of the n[j]*mm5 products plus low words
;   edx = outer index i, ecx = inner index j
62 mov eax,-1
63 movd mm7,eax
64 mov esi,DWORD [8+esp]
65 mov edi,DWORD [12+esp]
66 mov ebp,DWORD [16+esp]
67 xor edx,edx
68 xor ecx,ecx
69 movd mm4,DWORD [edi]
70 movd mm5,DWORD [esi]
71 movd mm3,DWORD [ebp]
; mm5 = a[0]*b[0]; keep a copy in mm2 and its low word in mm0.
72 pmuludq mm5,mm4
73 movq mm2,mm5
74 movq mm0,mm5
75 pand mm0,mm7
; mm5 = low32(a[0]*b[0]) * n0; only its low 32 bits are used as the
; multiplier for the n[] words from here on.
76 pmuludq mm5,[20+esp]
77 pmuludq mm3,mm5
78 paddq mm3,mm0
; Preload n[1] and a[1], then shift both accumulators down to their carries.
79 movd mm1,DWORD [4+ebp]
80 movd mm0,DWORD [4+esi]
81 psrlq mm2,32
82 psrlq mm3,32
83 inc ecx
84 align 16
; Loop for j = 1 .. num-2: accumulate a[j]*b[0] and n[j]*mm5, store the
; low word of the combined sum to tp[j-1] and keep the carries.
85 L$0021st:
86 pmuludq mm0,mm4
87 pmuludq mm1,mm5
88 paddq mm2,mm0
89 paddq mm3,mm1
90 movq mm0,mm2
91 pand mm0,mm7
92 movd mm1,DWORD [4+ecx*4+ebp]
93 paddq mm3,mm0
94 movd mm0,DWORD [4+ecx*4+esi]
95 psrlq mm2,32
; [28+ecx*4+esp] is tp[ecx-1].
96 movd DWORD [28+ecx*4+esp],mm3
97 psrlq mm3,32
98 lea ecx,[1+ecx]
99 cmp ecx,ebx
100 jl NEAR L$0021st
; Final word (j = num-1): same step without the lookahead loads.
101 pmuludq mm0,mm4
102 pmuludq mm1,mm5
103 paddq mm2,mm0
104 paddq mm3,mm1
105 movq mm0,mm2
106 pand mm0,mm7
107 paddq mm3,mm0
108 movd DWORD [28+ecx*4+esp],mm3
109 psrlq mm2,32
110 psrlq mm3,32
; Sum of both carries is written as a 64-bit value over tp[num-1], tp[num].
111 paddq mm3,mm2
112 movq [32+ebx*4+esp],mm3
; i = 1 for the outer loop that follows.
113 inc edx
; ---- SSE2 path, outer loop over b[i], i = 1 .. num-1 ------------------
; On entry: esi = a, edi = b, ebp = n, edx = i, ebx = num-1,
; mm7 = low-word mask, tp = scratch vector at [32+esp], n0 = [20+esp].
; Each iteration adds a[]*b[i] and n[]*mm5 to tp and stores the result
; one word lower (low word of each sum goes to tp[j-1]).
114 L$003outer:
115 xor ecx,ecx
116 movd mm4,DWORD [edx*4+edi]
117 movd mm5,DWORD [esi]
118 movd mm6,DWORD [32+esp]
119 movd mm3,DWORD [ebp]
; mm5 = a[0]*b[i] + tp[0]; copies go to mm0 (low word) and mm2 (carry).
120 pmuludq mm5,mm4
121 paddq mm5,mm6
122 movq mm0,mm5
123 movq mm2,mm5
124 pand mm0,mm7
; mm5 = low32(sum) * n0, used as the multiplier for the n[] words.
125 pmuludq mm5,[20+esp]
126 pmuludq mm3,mm5
127 paddq mm3,mm0
; Preload tp[1], n[1], a[1]; reduce accumulators to their carries.
128 movd mm6,DWORD [36+esp]
129 movd mm1,DWORD [4+ebp]
130 movd mm0,DWORD [4+esi]
131 psrlq mm2,32
132 psrlq mm3,32
133 paddq mm2,mm6
134 inc ecx
; ebx becomes a down-counter (num-2) for the inner loop; it is restored
; from ecx once the loop finishes.
135 dec ebx
; Inner loop over j: mm2 accumulates a[j]*b[i] + tp[j], mm3 accumulates
; n[j]*mm5 plus the low word of mm2.
136 L$004inner:
137 pmuludq mm0,mm4
138 pmuludq mm1,mm5
139 paddq mm2,mm0
140 paddq mm3,mm1
141 movq mm0,mm2
142 movd mm6,DWORD [36+ecx*4+esp]
143 pand mm0,mm7
144 movd mm1,DWORD [4+ecx*4+ebp]
145 paddq mm3,mm0
146 movd mm0,DWORD [4+ecx*4+esi]
147 psrlq mm2,32
148 movd DWORD [28+ecx*4+esp],mm3
149 psrlq mm3,32
150 paddq mm2,mm6
; The jnz below tests the flags from this dec; lea does not modify flags.
151 dec ebx
152 lea ecx,[1+ecx]
153 jnz NEAR L$004inner
; ecx is num-1 here; restore ebx to that value.
154 mov ebx,ecx
; Final word of this row, without lookahead loads.
155 pmuludq mm0,mm4
156 pmuludq mm1,mm5
157 paddq mm2,mm0
158 paddq mm3,mm1
159 movq mm0,mm2
160 pand mm0,mm7
161 paddq mm3,mm0
162 movd DWORD [28+ecx*4+esp],mm3
163 psrlq mm2,32
164 psrlq mm3,32
; Fold both carries and the old tp[num] into tp[num-1], tp[num] (64-bit store).
165 movd mm6,DWORD [36+ebx*4+esp]
166 paddq mm3,mm2
167 paddq mm3,mm6
168 movq [32+ebx*4+esp],mm3
; Next i; continue while i <= num-1 (signed compare).
169 lea edx,[1+edx]
170 cmp edx,ebx
171 jle NEAR L$003outer
; Leave MMX state before rejoining the integer code.
172 emms
173 jmp NEAR L$005common_tail
174 align 16
; ---- Integer-only path: setup and squaring detection ------------------
; On entry ebx = num-1. Frame slots (filled by the prologue):
;   a = [8+esp], b = [12+esp]; tp = scratch vector at [32+esp].
; Branches to L$006bn_sqr_mont when a == b (same pointer) and num is even;
; otherwise prepares the first a[]*b[0] multiplication pass.
175 L$001non_sse2:
176 mov esi,DWORD [8+esp]
; ebp = num, reduced to its low bit below.
177 lea ebp,[1+ebx]
178 mov edi,DWORD [12+esp]
179 xor ecx,ecx
180 mov edx,esi
181 and ebp,1
; edx = a - b (pointer difference).
182 sub edx,edi
; eax = b + 4*num, the address one past the last word of b.
183 lea eax,[4+ebx*4+edi]
; ZF is set only if (num & 1) == 0 and a == b.
184 or ebp,edx
; edi = b[0]; mov leaves the flags from the "or" intact for the jz.
185 mov edi,DWORD [edi]
186 jz NEAR L$006bn_sqr_mont
; Multiplication path: keep the end-of-b pointer in [28+esp].
187 mov DWORD [28+esp],eax
; eax = a[0], edx = 0 (initial carry) for the loop that follows.
188 mov eax,DWORD [esi]
189 xor edx,edx
190 align 16
; ---- Integer path: tp[] = a[] * b[0] ----------------------------------
; On entry: esi = a, edi = b[0], eax = a[0], edx = 0 (carry), ecx = 0,
; ebx = num-1. tp is the scratch vector at [32+esp].
; Loop: ebp = carry + low(a[j]*b[0]); edx = high word plus carry-out;
; the result is stored after ecx is incremented, so [28+ecx*4+esp] is tp[j].
191 L$007mull:
192 mov ebp,edx
193 mul edi
194 add ebp,eax
195 lea ecx,[1+ecx]
196 adc edx,0
197 mov eax,DWORD [ecx*4+esi]
198 cmp ecx,ebx
199 mov DWORD [28+ecx*4+esp],ebp
200 jl NEAR L$007mull
; Last word a[num-1]*b[0], interleaved with the setup of the reduction:
; edi = [20+esp] * tp[0] (low 32 bits), esi = [16+esp] (modulus pointer
; stored by the prologue).
201 mov ebp,edx
202 mul edi
203 mov edi,DWORD [20+esp]
204 add eax,ebp
205 mov esi,DWORD [16+esp]
206 adc edx,0
207 imul edi,DWORD [32+esp]
; tp[num-1] = low word, tp[num] = carry, tp[num+1] = 0.
208 mov DWORD [32+ebx*4+esp],eax
209 xor ecx,ecx
210 mov DWORD [36+ebx*4+esp],edx
211 mov DWORD [40+ebx*4+esp],ecx
; First reduction word: n[0]*edi + tp[0]. Only the carry into edx is kept;
; eax is overwritten with n[1] before the adc (mov preserves CF).
212 mov eax,DWORD [esi]
213 mul edi
214 add eax,DWORD [32+esp]
215 mov eax,DWORD [4+esi]
216 adc edx,0
217 inc ecx
218 jmp NEAR L$0082ndmadd
219 align 16
; ---- Integer path: tp[] += a[] * b[i] ---------------------------------
; On entry: esi = a, edi = b[i], eax = a[0], edx = 0 (carry), ecx = 0,
; ebx = num-1. tp is the scratch vector at [32+esp].
; Loop: ebp = carry + tp[j] + low(a[j]*b[i]); edx collects the high word
; and both carry-outs. The store uses the incremented ecx, so
; [28+ecx*4+esp] is tp[j] (updated in place).
220 L$0091stmadd:
221 mov ebp,edx
222 mul edi
223 add ebp,DWORD [32+ecx*4+esp]
224 lea ecx,[1+ecx]
225 adc edx,0
226 add ebp,eax
227 mov eax,DWORD [ecx*4+esi]
228 adc edx,0
229 cmp ecx,ebx
230 mov DWORD [28+ecx*4+esp],ebp
231 jl NEAR L$0091stmadd
; Last word (j = num-1), interleaved with the setup of the reduction:
; edi = [20+esp] * tp[0] (low 32 bits), esi = [16+esp] (modulus pointer).
232 mov ebp,edx
233 mul edi
234 add eax,DWORD [32+ebx*4+esp]
235 mov edi,DWORD [20+esp]
236 adc edx,0
237 mov esi,DWORD [16+esp]
238 add ebp,eax
239 adc edx,0
240 imul edi,DWORD [32+esp]
; tp[num-1] = ebp; tp[num] += carry, with the carry-out of that addition
; captured in ecx and written to tp[num+1] (the mov instructions between
; the add and the adc leave CF unchanged).
241 xor ecx,ecx
242 add edx,DWORD [36+ebx*4+esp]
243 mov DWORD [32+ebx*4+esp],ebp
244 adc ecx,0
245 mov eax,DWORD [esi]
246 mov DWORD [36+ebx*4+esp],edx
247 mov DWORD [40+ebx*4+esp],ecx
; First reduction word: n[0]*edi + tp[0]; only the carry into edx is kept.
248 mul edi
249 add eax,DWORD [32+esp]
250 mov eax,DWORD [4+esi]
251 adc edx,0
252 mov ecx,1
253 align 16
; ---- Integer path: reduction step, tp[] = (tp[] + n[]*edi) >> 32 ------
; On entry: esi = modulus pointer, edi = multiplier, eax = n[1],
; edx = carry from word 0, ecx = 1, ebx = num-1.
; tp is the scratch vector at [32+esp].
; Loop: ebp = carry + tp[j] + low(n[j]*edi). The store uses the
; incremented ecx at offset 24, so the sum lands in tp[j-1]: the vector is
; shifted down by one word.
254 L$0082ndmadd:
255 mov ebp,edx
256 mul edi
257 add ebp,DWORD [32+ecx*4+esp]
258 lea ecx,[1+ecx]
259 adc edx,0
260 add ebp,eax
261 mov eax,DWORD [ecx*4+esi]
262 adc edx,0
263 cmp ecx,ebx
264 mov DWORD [24+ecx*4+esp],ebp
265 jl NEAR L$0082ndmadd
; Last word (j = num-1) goes to tp[num-2].
266 mov ebp,edx
267 mul edi
268 add ebp,DWORD [32+ebx*4+esp]
269 adc edx,0
270 add ebp,eax
271 adc edx,0
272 mov DWORD [28+ebx*4+esp],ebp
; Fold the final carry into the top words: tp[num-1] = edx + tp[num],
; tp[num] = tp[num+1] + carry-out. Interleaved with advancing the b
; pointer kept in [12+esp] by one word.
273 xor eax,eax
274 mov ecx,DWORD [12+esp]
275 add edx,DWORD [36+ebx*4+esp]
276 adc eax,DWORD [40+ebx*4+esp]
277 lea ecx,[4+ecx]
278 mov DWORD [32+ebx*4+esp],edx
; [28+esp] holds the end-of-b pointer stored by the non-SSE2 setup code.
279 cmp ecx,DWORD [28+esp]
280 mov DWORD [36+ebx*4+esp],eax
281 je NEAR L$005common_tail
; More words of b remain: load the next one, reset a / index / carry and
; run another multiply-accumulate pass.
282 mov edi,DWORD [ecx]
283 mov esi,DWORD [8+esp]
284 mov DWORD [12+esp],ecx
285 xor ecx,ecx
286 xor edx,edx
287 mov eax,DWORD [esi]
288 jmp NEAR L$0091stmadd
289 align 16
; ---- Squaring path (reached when both input pointers are equal and the
; word count is even) ----------------------------------------------------
; On entry: esi = a, edi = a[0], ebx = num-1, ecx = 0.
; Frame slots are repurposed: [esp] = num-1, [12+esp] = current index i.
290 L$006bn_sqr_mont:
291 mov DWORD [esp],ebx
292 mov DWORD [12+esp],ecx
; tp[0] = low(a[0]^2).
293 mov eax,edi
294 mul edi
295 mov DWORD [32+esp],eax
; Split the high word: edx = high >> 1 is the carry fed to the loop,
; ebx = high & 1 is the bit re-inserted when results are doubled.
296 mov ebx,edx
297 shr edx,1
298 and ebx,1
299 inc ecx
300 align 16
; ---- Squaring path, first row: doubled cross products a[j]*a[0] -------
; On entry: esi = a, edi = a[0], ecx = 1, edx = carry, ebx = low bit
; carried between words, [esp] = num-1. tp = scratch vector at [32+esp].
; Loop: eax = a[j]*a[0] + carry; the stored word is 2*eax + ebx and the bit
; shifted out (eax >> 31) becomes the next ebx. The store uses the
; incremented ecx, so [28+ecx*4+esp] is tp[j].
301 L$010sqr:
302 mov eax,DWORD [ecx*4+esi]
303 mov ebp,edx
304 mul edi
305 add eax,ebp
306 lea ecx,[1+ecx]
307 adc edx,0
308 lea ebp,[eax*2+ebx]
309 shr eax,31
310 cmp ecx,DWORD [esp]
311 mov ebx,eax
312 mov DWORD [28+ecx*4+esp],ebp
313 jl NEAR L$010sqr
; Last word (j = num-1), interleaved with the reduction setup:
; edi = [20+esp] * tp[0] (low 32 bits), esi = [16+esp] (modulus pointer).
314 mov eax,DWORD [ecx*4+esi]
315 mov ebp,edx
316 mul edi
317 add eax,ebp
318 mov edi,DWORD [20+esp]
319 adc edx,0
320 mov esi,DWORD [16+esp]
321 lea ebp,[eax*2+ebx]
322 imul edi,DWORD [32+esp]
323 shr eax,31
; tp[num-1] = doubled low word, tp[num] = doubled high word plus the bit
; shifted out of the low word, tp[num+1] = top bit of the high word.
324 mov DWORD [32+ecx*4+esp],ebp
325 lea ebp,[edx*2+eax]
326 mov eax,DWORD [esi]
327 shr edx,31
328 mov DWORD [36+ecx*4+esp],ebp
329 mov DWORD [40+ecx*4+esp],edx
; First reduction word: n[0]*edi + tp[0]; only the carry into edx is kept.
; ebx is reloaded with num-1 (mov does not disturb CF before the adc).
330 mul edi
331 add eax,DWORD [32+esp]
332 mov ebx,ecx
333 adc edx,0
334 mov eax,DWORD [4+esi]
335 mov ecx,1
336 align 16
; ---- Squaring path: reduction step, unrolled by two -------------------
; On entry: esi = modulus pointer, edi = multiplier, eax = n[1],
; edx = carry from word 0, ecx = 1, ebx = num-1, [12+esp] = index i,
; [esp] = num-1. tp = scratch vector at [32+esp].
; Each half computes carry + tp[j] + low(n[j]*edi) and stores it one word
; lower (tp[j-1]), shifting the vector down by one word.
337 L$0113rdmadd:
; First half: word j = ecx, stored to tp[ecx-1].
338 mov ebp,edx
339 mul edi
340 add ebp,DWORD [32+ecx*4+esp]
341 adc edx,0
342 add ebp,eax
343 mov eax,DWORD [4+ecx*4+esi]
344 adc edx,0
345 mov DWORD [28+ecx*4+esp],ebp
; Second half: word j+1; ecx advances by 2, so [24+ecx*4+esp] is tp[j].
346 mov ebp,edx
347 mul edi
348 add ebp,DWORD [36+ecx*4+esp]
349 lea ecx,[2+ecx]
350 adc edx,0
351 add ebp,eax
352 mov eax,DWORD [ecx*4+esi]
353 adc edx,0
354 cmp ecx,ebx
355 mov DWORD [24+ecx*4+esp],ebp
356 jl NEAR L$0113rdmadd
; Last word (j = num-1) goes to tp[num-2].
357 mov ebp,edx
358 mul edi
359 add ebp,DWORD [32+ebx*4+esp]
360 adc edx,0
361 add ebp,eax
362 adc edx,0
363 mov DWORD [28+ebx*4+esp],ebp
; Fold the carry into the top words (tp[num-1], tp[num]) and reload the
; index i and the input pointer.
364 mov ecx,DWORD [12+esp]
365 xor eax,eax
366 mov esi,DWORD [8+esp]
367 add edx,DWORD [36+ebx*4+esp]
368 adc eax,DWORD [40+ebx*4+esp]
369 mov DWORD [32+ebx*4+esp],edx
; Finished once i == num-1.
370 cmp ecx,ebx
371 mov DWORD [36+ebx*4+esp],eax
372 je NEAR L$005common_tail
; Next row: i = i+1, edi = a[i]; add a[i]^2 (low word) into tp[i] and keep
; the high word plus carry in edx.
373 mov edi,DWORD [4+ecx*4+esi]
374 lea ecx,[1+ecx]
375 mov eax,edi
376 mov DWORD [12+esp],ecx
377 mul edi
378 add eax,DWORD [32+ecx*4+esp]
379 adc edx,0
380 mov DWORD [32+ecx*4+esp],eax
381 xor ebp,ebp
; If i is the last index there are no cross products left; lea advances
; ecx without changing the flags tested by the je.
382 cmp ecx,ebx
383 lea ecx,[1+ecx]
384 je NEAR L$012sqrlast
; Split the carry as in the first row: edx = carry >> 1, ebx = carry & 1.
385 mov ebx,edx
386 shr edx,1
387 and ebx,1
388 align 16
; ---- Squaring path: tp[j] += 2 * a[j]*a[i] for j = i+1 .. num-1 -------
; On entry: esi = a, edi = a[i], ecx = i+1, edx = carry, ebx = bit carried
; between words, [esp] = num-1. tp = scratch vector at [32+esp].
; Loop: eax = a[j]*a[i] + carry; ebp = 2*eax + tp[j] + ebx; the bits that
; overflow (eax >> 31 plus both carry-outs) become the next ebx.
389 L$013sqradd:
390 mov eax,DWORD [ecx*4+esi]
391 mov ebp,edx
392 mul edi
393 add eax,ebp
; lea doubles eax without touching CF, which the adc below still needs.
394 lea ebp,[eax*1+eax]
395 adc edx,0
396 shr eax,31
397 add ebp,DWORD [32+ecx*4+esp]
398 lea ecx,[1+ecx]
399 adc eax,0
400 add ebp,ebx
401 adc eax,0
402 cmp ecx,DWORD [esp]
; Store uses the incremented ecx: [28+ecx*4+esp] is tp[j].
403 mov DWORD [28+ecx*4+esp],ebp
404 mov ebx,eax
405 jle NEAR L$013sqradd
; Double the final carry: edx = 2*edx + ebx, ebp = bits that overflow.
406 mov ebp,edx
407 add edx,edx
408 shr ebp,31
409 add edx,ebx
410 adc ebp,0
; ---- Squaring path: finish a row and set up the next reduction --------
; On entry: ecx = num, edx:ebp = carry out of the row just computed.
; tp = scratch vector at [32+esp].
411 L$012sqrlast:
; edi = [20+esp] * tp[0] (low 32 bits), esi = [16+esp] (modulus pointer).
412 mov edi,DWORD [20+esp]
413 mov esi,DWORD [16+esp]
414 imul edi,DWORD [32+esp]
; tp[num] += edx, tp[num+1] = ebp + carry-out (mov preserves CF).
415 add edx,DWORD [32+ecx*4+esp]
416 mov eax,DWORD [esi]
417 adc ebp,0
418 mov DWORD [32+ecx*4+esp],edx
419 mov DWORD [36+ecx*4+esp],ebp
; First reduction word: n[0]*edi + tp[0]; only the carry into edx is kept.
; ebx is restored to num-1 with lea, which leaves CF intact for the adc.
420 mul edi
421 add eax,DWORD [32+esp]
422 lea ebx,[ecx-1]
423 adc edx,0
424 mov ecx,1
425 mov eax,DWORD [4+esi]
426 jmp NEAR L$0113rdmadd
427 align 16
; ---- Common tail: conditional final subtraction and result copy -------
; On entry: ebx = num-1; the scratch vector tp at [32+esp] holds num+1
; words. Frame slots: [4+esp] = result pointer (first argument),
; [16+esp] = modulus pointer, [24+esp] = stack pointer to restore.
; Falls through into the exit sequence with eax = 1.
428 L$005common_tail:
429 mov ebp,DWORD [16+esp]
430 mov edi,DWORD [4+esp]
431 lea esi,[32+esp]
432 mov eax,DWORD [esi]
433 mov ecx,ebx
; Clears edx (index) and also CF, so the first sbb starts without borrow.
434 xor edx,edx
435 align 16
; result[j] = tp[j] - n[j] - borrow for j = 0 .. num-1. dec, mov and lea
; do not modify CF, so the borrow chain survives the loop control; jge
; tests the sign/overflow flags left by dec ecx.
436 L$014sub:
437 sbb eax,DWORD [edx*4+ebp]
438 mov DWORD [edx*4+edi],eax
439 dec ecx
440 mov eax,DWORD [4+edx*4+esi]
441 lea edx,[1+edx]
442 jge NEAR L$014sub
; eax = tp[num] - borrow, used below as the selection mask.
; NOTE(review): expected to be either 0 or all-ones -- confirm.
443 sbb eax,0
444 align 16
; For j = num-1 down to 0, without branching on the data:
;   result[j] = ((tp[j] xor result[j]) and eax) xor result[j]
; i.e. tp[j] where mask bits are 1, the subtracted value where they are 0.
; Each tp[j] is then overwritten with ecx, which is -1 after the loop above.
445 L$015copy:
446 mov edx,DWORD [ebx*4+esi]
447 mov ebp,DWORD [ebx*4+edi]
448 xor edx,ebp
449 and edx,eax
450 xor edx,ebp
451 mov DWORD [ebx*4+esi],ecx
452 mov DWORD [ebx*4+edi],edx
453 dec ebx
454 jge NEAR L$015copy
; Drop the scratch frame and report success.
455 mov esp,DWORD [24+esp]
456 mov eax,1
; ---- Exit sequence ----------------------------------------------------
; Reached with eax already set: 0 from the early-exit branch taken when the
; word count is below 4, or 1 after the common tail has restored esp.
; Restores the callee-saved registers in the reverse order of the pushes at
; function entry and returns to the caller. The ret is required: without
; it execution would run into the data bytes that follow this label.
457 L$000just_leave:
458 pop edi
459 pop esi
460 pop ebx
461 pop ebp
462 ret
; NUL-terminated ASCII identification string:
; "Montgomery Multiplication for x86, CRYPTOGAMS by <appro@openssl.org>"
463 db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
464 db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
465 db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
466 db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
467 db 111,114,103,62,0
; _OPENSSL_ia32cap_P is declared as a 16-byte common symbol in .bss
; instead of being imported with extern. The function reads its first
; dword to decide between the SSE2 and integer-only code paths.
468 segment .bss
469 common _OPENSSL_ia32cap_P 16