r418: Sole edit by using shift key
[cinelerra_cv/mob.git] / mpeg2enc / dist2_mmx.s
blob0f41fd8271b9247c00d011b2011803247f77cb96
2 ; dist2_mmx.s: mmX optimized squared distance sum
4 ; Original believed to be Copyright (C) 2000 Brent Byeler
; This program is free software; you can redistribute it and/or
7 ; modify it under the terms of the GNU General Public License
8 ; as published by the Free Software Foundation; either version 2
9 ; of the License, or (at your option) any later version.
11 ; This program is distributed in the hope that it will be useful,
12 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ; GNU General Public License for more details.
16 ; You should have received a copy of the GNU General Public License
17 ; along with this program; if not, write to the Free Software
18 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
; total squared difference between two (16*h) blocks
; including optional half pel interpolation of blk1 (hx,hy)
; blk1,blk2: addresses of top left pels of both blocks
; lx:        distance (in bytes) of vertically adjacent pels
; hx,hy:     flags for horizontal and/or vertical interpolation
; h:         height of block (usually 8 or 16)
; mmX version

global dist2_mmx
; int dist2_mmx(unsigned char *blk1, unsigned char *blk2,
;               int lx, int hx, int hy, int h)
;
; Arguments are read from the stack relative to ebp:
;   [ebp+8]  blk1     [ebp+12] blk2     [ebp+16] lx
;   [ebp+20] hx       [ebp+24] hy       [ebp+28] h
;
; Register roles established by the entry code below:
;   esi = lx
;   edi = h (remaining row count)
;   eax = hx, edx = hy while the interpolation variant is selected;
;         on the no-interpolation path eax = blk1 and ebx = blk2
;   mm5 = running sum of squared differences (two dword lanes)
;   mm7 = 0 (zero source for byte -> word unpacking)
;
; ebx, ecx, edx, esi, edi and ebp are pushed here; the code at d2exit
; folds mm5 into eax (the return value) and pops them again.

;;
;; private constants needed
;;

SECTION .data
align 16
twos:
        dw      2
        dw      2
        dw      2
        dw      2

; Switch back to the code section. Without this directive everything that
; follows would be assembled into .data, which is writable and is mapped
; non-executable on systems that enforce NX.
SECTION .text
align 32
dist2_mmx:
        push    ebp                     ; save caller's frame pointer
        mov     ebp, esp                ; ebp -> saved ebp; args start at [ebp+8]
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi

        mov     esi, [ebp+16]           ; lx
        mov     eax, [ebp+20]           ; hx
        mov     edx, [ebp+24]           ; hy
        mov     edi, [ebp+28]           ; h

        pxor    mm5, mm5                ; sum = 0 (must be valid even if we exit at once)
        test    edi, edi
        jle     near d2exit             ; h <= 0: nothing to sum, return 0

        pxor    mm7, mm7                ; mm7 = 0 for the unpack instructions

        ; Any interpolation requested?  If so, let d2is10 sort out which kind.
        test    eax, eax
        jne     near d2is10             ; hx != 0
        test    edx, edx
        jne     near d2is10             ; hy != 0

        ; hx == 0 and hy == 0: plain block difference.
        mov     eax, [ebp+8]            ; eax = blk1
        mov     ebx, [ebp+12]           ; ebx = blk2
        jmp     d2top00                 ; skip the alignment padding
; ---------------------------------------------------------------------
; d2top00: row loop for the no-interpolation case.
;   In:  eax = current row of blk1, ebx = current row of blk2,
;        esi = lx, edi = rows remaining (> 0), mm7 = 0, mm5 = running sum
;   Each pass adds the 16 squared pel differences of one row to mm5,
;   then advances both row pointers by lx.
; ---------------------------------------------------------------------
align 32
d2top00:
        ; ---- pels 0..7 ----
        movq        mm0, [eax]
        movq        mm2, [ebx]
        movq        mm1, mm0
        movq        mm3, mm2
        punpcklbw   mm0, mm7            ; blk1 pels 0..3 as words
        punpcklbw   mm2, mm7            ; blk2 pels 0..3 as words
        punpckhbw   mm1, mm7            ; blk1 pels 4..7 as words
        punpckhbw   mm3, mm7            ; blk2 pels 4..7 as words

        psubw       mm0, mm2
        psubw       mm1, mm3
        pmaddwd     mm0, mm0            ; square the differences, add adjacent pairs
        pmaddwd     mm1, mm1
        paddd       mm0, mm1

        ; ---- pels 8..15 ----
        movq        mm1, [eax+8]
        movq        mm3, [ebx+8]
        movq        mm2, mm1
        movq        mm4, mm3
        punpcklbw   mm1, mm7
        punpcklbw   mm3, mm7
        punpckhbw   mm2, mm7
        punpckhbw   mm4, mm7

        psubw       mm1, mm3
        psubw       mm2, mm4
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        paddd       mm1, mm2

        paddd       mm0, mm1            ; mm0 = this row's partial sums

        ; The row total stays in MMX: it is accumulated in mm5 and only
        ; reduced to a scalar once, at d2exit.
        paddd       mm5, mm0

        add         eax, esi            ; next row of blk1
        add         ebx, esi            ; next row of blk2
        dec         edi
        jg          d2top00
        jmp         d2exit
; ---------------------------------------------------------------------
; d2is10: taken when at least one of hx/hy is set (eax = hx, edx = hy).
; Handles hx != 0 && hy == 0 (horizontal half-pel); everything else is
; passed on to d2is01.
; ---------------------------------------------------------------------
d2is10:
        test        eax, eax
        je          near d2is01         ; hx == 0
        test        edx, edx
        jne         near d2is01         ; hy != 0

        mov         eax, [ebp+8]        ; eax = blk1
        mov         ebx, [ebp+12]       ; ebx = blk2

        ; Build the rounding constant: mm6 = 0 - (-1) = 1 in every word.
        pcmpeqw     mm1, mm1            ; mm1 = all ones (-1 per word)
        pxor        mm6, mm6
        psubw       mm6, mm1            ; mm6 = 1,1,1,1 and stays so for the whole loop
        jmp         d2top10             ; skip the alignment padding

; d2top10: one row per pass.  Every blk1 pel is averaged with its right
; neighbour, (a + b + 1) >> 1, before blk2 is subtracted.
;   eax = blk1 row, ebx = blk2 row, esi = lx, edi = rows remaining,
;   mm5 = running sum, mm6 = 1 per word, mm7 = 0
align 32
d2top10:
        ; ---- pels 0..7 ----
        movq        mm0, [eax]
        movq        mm2, [eax+1]        ; same row, shifted one pel to the right
        movq        mm1, mm0
        movq        mm3, mm2
        punpcklbw   mm0, mm7
        punpcklbw   mm2, mm7
        punpckhbw   mm1, mm7
        punpckhbw   mm3, mm7
        paddw       mm0, mm6            ; + 1 (rounding)
        paddw       mm1, mm6
        paddw       mm0, mm2            ; + right neighbour
        paddw       mm1, mm3
        psrlw       mm0, 1
        psrlw       mm1, 1

        movq        mm2, [ebx]
        movq        mm3, mm2
        punpcklbw   mm2, mm7
        punpckhbw   mm3, mm7

        psubw       mm0, mm2
        psubw       mm1, mm3
        pmaddwd     mm0, mm0            ; squared differences, pairwise summed
        pmaddwd     mm1, mm1
        paddd       mm0, mm1

        ; ---- pels 8..15 ----
        movq        mm1, [eax+8]
        movq        mm3, [eax+9]
        movq        mm2, mm1
        movq        mm4, mm3
        punpcklbw   mm1, mm7
        punpcklbw   mm3, mm7
        punpckhbw   mm2, mm7
        punpckhbw   mm4, mm7
        paddw       mm1, mm6
        paddw       mm2, mm6
        paddw       mm1, mm3
        paddw       mm2, mm4
        psrlw       mm1, 1
        psrlw       mm2, 1

        movq        mm3, [ebx+8]
        movq        mm4, mm3
        punpcklbw   mm3, mm7
        punpckhbw   mm4, mm7

        psubw       mm1, mm3
        psubw       mm2, mm4
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        paddd       mm1, mm2

        paddd       mm0, mm1            ; mm0 = this row's partial sums
        paddd       mm5, mm0            ; accumulate in mm5; reduced to a scalar at d2exit

        add         eax, esi            ; next row of blk1
        add         ebx, esi            ; next row of blk2
        dec         edi
        jg          near d2top10

        jmp         d2exit
; ---------------------------------------------------------------------
; d2is01: still eax = hx, edx = hy here.
; Handles hx == 0 && hy != 0 (vertical half-pel); everything else is
; passed on to d2is11.
; ---------------------------------------------------------------------
d2is01:
        test        eax, eax
        jne         near d2is11         ; hx != 0
        test        edx, edx
        je          near d2is11         ; hy == 0

        mov         eax, [ebp+8]        ; eax = blk1, current row
        mov         edx, [ebp+12]       ; edx = blk2
        mov         ebx, eax
        add         ebx, esi            ; ebx = blk1 + lx, the row below

        ; Rounding constant: mm6 = 0 - (-1) = 1 in every word.
        pcmpeqw     mm1, mm1
        pxor        mm6, mm6
        psubw       mm6, mm1
        jmp         d2top01             ; skip the alignment padding

; d2top01: one row per pass.  Every blk1 pel is averaged with the pel
; directly below it, (a + b + 1) >> 1, before blk2 is subtracted.
;   eax = blk1 row, ebx = blk1 row below, edx = blk2 row, esi = lx,
;   edi = rows remaining, mm5 = running sum, mm6 = 1 per word, mm7 = 0
align 32
d2top01:
        ; ---- pels 0..7 ----
        movq        mm0, [eax]
        movq        mm2, [ebx]
        movq        mm1, mm0
        movq        mm3, mm2
        punpcklbw   mm0, mm7
        punpcklbw   mm2, mm7
        punpckhbw   mm1, mm7
        punpckhbw   mm3, mm7
        paddw       mm0, mm6            ; + 1 (rounding)
        paddw       mm1, mm6
        paddw       mm0, mm2            ; + pel below
        paddw       mm1, mm3
        psrlw       mm0, 1
        psrlw       mm1, 1

        movq        mm2, [edx]
        movq        mm3, mm2
        punpcklbw   mm2, mm7
        punpckhbw   mm3, mm7

        psubw       mm0, mm2
        psubw       mm1, mm3

        pmaddwd     mm0, mm0            ; squared differences, pairwise summed
        pmaddwd     mm1, mm1
        paddd       mm0, mm1

        ; ---- pels 8..15 ----
        movq        mm1, [eax+8]
        movq        mm3, [ebx+8]
        movq        mm2, mm1
        movq        mm4, mm3
        punpcklbw   mm1, mm7
        punpcklbw   mm3, mm7
        punpckhbw   mm2, mm7
        punpckhbw   mm4, mm7
        paddw       mm1, mm6
        paddw       mm2, mm6
        paddw       mm1, mm3
        paddw       mm2, mm4
        psrlw       mm1, 1
        psrlw       mm2, 1

        movq        mm3, [edx+8]
        movq        mm4, mm3
        punpcklbw   mm3, mm7
        punpckhbw   mm4, mm7

        psubw       mm1, mm3
        psubw       mm2, mm4

        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        paddd       mm0, mm1
        paddd       mm0, mm2            ; mm0 = this row's partial sums

        paddd       mm5, mm0            ; accumulate in mm5; reduced to a scalar at d2exit

        ; Slide the two-row window down by one row.
        mov         eax, ebx            ; current row <- row below (= old eax + lx)
        add         edx, esi            ; next row of blk2
        add         ebx, esi            ; new row below
        dec         edi
        jg          near d2top01
        jmp         d2exit
; ---------------------------------------------------------------------
; d2is11: the remaining case, interpolation in both directions.
; Each blk1 pel is replaced by (a + b + c + d + 2) >> 2, where a,b are
; the pel and its right neighbour and c,d the same two pels one row down.
; ---------------------------------------------------------------------
d2is11:
        mov         eax, [ebp+8]        ; eax = blk1, current row
        mov         edx, [ebp+12]       ; edx = blk2
        mov         ebx, eax
        add         ebx, esi            ; ebx = blk1 + lx, the row below
        jmp         d2top11             ; skip the alignment padding

; d2top11: one row per pass.
;   eax = blk1 row, ebx = blk1 row below, edx = blk2 row, esi = lx,
;   edi = rows remaining, mm5 = running sum, mm7 = 0
;   mm6 is scratch here and is reloaded from [twos] before each rounding step.
align 32
d2top11:
        ; ---- pels 0..7: current row, pel + right neighbour ----
        movq        mm0, [eax]
        movq        mm2, [eax+1]
        movq        mm1, mm0
        movq        mm3, mm2
        punpcklbw   mm0, mm7
        punpcklbw   mm2, mm7
        punpckhbw   mm1, mm7
        punpckhbw   mm3, mm7
        paddw       mm0, mm2
        paddw       mm1, mm3

        ; ---- pels 0..7: row below, pel + right neighbour ----
        movq        mm2, [ebx]
        movq        mm4, [ebx+1]
        movq        mm3, mm2
        movq        mm6, mm4
        punpcklbw   mm2, mm7
        punpcklbw   mm4, mm7
        punpckhbw   mm3, mm7
        punpckhbw   mm6, mm7
        paddw       mm2, mm4
        paddw       mm3, mm6
        paddw       mm0, mm2            ; four-pel sums, low half
        paddw       mm1, mm3            ; four-pel sums, high half

        movq        mm6, [twos]         ; mm6 = 2,2,2,2 (rounding for the /4)
        paddw       mm0, mm6
        paddw       mm1, mm6
        psrlw       mm0, 2
        psrlw       mm1, 2

        movq        mm2, [edx]
        movq        mm3, mm2
        punpcklbw   mm2, mm7
        punpckhbw   mm3, mm7

        psubw       mm0, mm2
        psubw       mm1, mm3
        pmaddwd     mm0, mm0            ; squared differences, pairwise summed
        pmaddwd     mm1, mm1
        paddd       mm0, mm1

        ; ---- pels 8..15: accumulate the four taps in mm1 (low) / mm2 (high) ----
        movq        mm1, [eax+8]
        movq        mm2, mm1
        punpcklbw   mm1, mm7
        punpckhbw   mm2, mm7

        movq        mm3, [eax+9]
        movq        mm4, mm3
        punpcklbw   mm3, mm7
        punpckhbw   mm4, mm7
        paddw       mm1, mm3
        paddw       mm2, mm4

        movq        mm3, [ebx+8]
        movq        mm4, mm3
        punpcklbw   mm3, mm7
        punpckhbw   mm4, mm7
        paddw       mm1, mm3
        paddw       mm2, mm4

        movq        mm3, [ebx+9]
        movq        mm4, mm3
        punpcklbw   mm3, mm7
        punpckhbw   mm4, mm7
        paddw       mm1, mm3
        paddw       mm2, mm4

        movq        mm6, [twos]         ; rounding term again
        paddw       mm1, mm6
        paddw       mm2, mm6

        psrlw       mm1, 2
        psrlw       mm2, 2

        movq        mm3, [edx+8]
        movq        mm4, mm3
        punpcklbw   mm3, mm7
        punpckhbw   mm4, mm7

        psubw       mm1, mm3
        psubw       mm2, mm4
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        paddd       mm1, mm2

        paddd       mm0, mm1            ; mm0 = this row's partial sums

        paddd       mm5, mm0            ; accumulate in mm5 (not mm6, which is scratch here)

        ; Slide the two-row window down by one row.
        mov         eax, ebx            ; current row <- row below
        add         ebx, esi            ; new row below
        add         edx, esi            ; next row of blk2
        dec         edi
        jg          near d2top11
        ; loop finished: execution falls through into the code that follows
; ---------------------------------------------------------------------
; d2exit: common exit path.
;   In:  mm5 = two dword partial sums
;   Out: eax = low dword + high dword of mm5 (32-bit wrap-around add)
; Pops the registers in the reverse of the order
; ebp, ebx, ecx, edx, esi, edi in which the entry code pushed them.
; ---------------------------------------------------------------------
d2exit:
        movq        mm0, mm5
        psrlq       mm0, 32             ; mm0 low dword = high dword of the sum
        paddd       mm5, mm0            ; mm5 low dword = low + high
        movd        eax, mm5            ; return value
        emms                            ; leave MMX state so the FPU is usable again

        pop         edi
        pop         esi
        pop         edx
        pop         ecx
        pop         ebx
        pop         ebp                 ; restore caller's frame pointer
        ret
; total squared difference between two (8*h) blocks
; blk1,blk2: addresses of top left pels of both blocks
; lx:        distance (in bytes) of vertically adjacent pels
; h:         height of block (usually 4, or 8)
; mmX version

global dist2_22_mmx
; int dist2_22_mmx(unsigned char *blk1, unsigned char *blk2,
;                  int lx, int h)
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx, [ebp+20] h
;
; Register roles:
;   eax = current row of blk1
;   ebx = current row of blk2
;   esi = lx
;   edi = rows remaining
;   mm5 = running sum (two dword lanes), mm7 = 0
;
; Leaves through d2exit, so the pushes below must stay in the order
; ebp, ebx, ecx, edx, esi, edi that d2exit pops.

align 32
dist2_22_mmx:
        push        ebp                 ; save caller's frame pointer
        mov         ebp, esp
        push        ebx
        push        ecx
        push        edx
        push        esi
        push        edi

        mov         esi, [ebp+16]       ; lx
        mov         edi, [ebp+20]       ; h

        pxor        mm5, mm5            ; sum = 0
        test        edi, edi
        jle         near d2exit         ; h <= 0: return 0

        pxor        mm7, mm7            ; zero source for unpacking

        mov         eax, [ebp+8]        ; blk1
        mov         ebx, [ebp+12]       ; blk2
        jmp         d2top22             ; skip the alignment padding

; d2top22: one 8-pel row per pass.
align 32
d2top22:
        movq        mm0, [eax]
        movq        mm2, [ebx]
        movq        mm1, mm0
        movq        mm3, mm2
        punpcklbw   mm0, mm7            ; blk1 pels 0..3 as words
        punpcklbw   mm2, mm7            ; blk2 pels 0..3 as words
        punpckhbw   mm1, mm7            ; blk1 pels 4..7 as words
        punpckhbw   mm3, mm7            ; blk2 pels 4..7 as words

        psubw       mm0, mm2
        psubw       mm1, mm3
        pmaddwd     mm0, mm0            ; squared differences, pairwise summed
        pmaddwd     mm1, mm1
        paddd       mm5, mm0
        paddd       mm5, mm1

        add         eax, esi            ; next row of blk1
        add         ebx, esi            ; next row of blk2
        dec         edi
        jg          d2top22
        jmp         d2exit
; total squared difference between interpolation of two (8*h) blocks and
; another 8*h block
; blk1f,blk1b: addresses of top left pels of the two blocks that are averaged
; blk2:        address of top left pel of the block they are compared with
; lx:          distance (in bytes) of vertically adjacent pels
; h:           height of block (usually 4, or 8)
; mmX version

global bdist2_22_mmx
; int bdist2_22_mmx(unsigned char *blk1f, unsigned char *blk1b,
;                   unsigned char *blk2,
;                   int lx, int h)
;
; Stack arguments: [ebp+8] blk1f, [ebp+12] blk1b, [ebp+16] blk2,
;                  [ebp+20] lx,   [ebp+24] h
;
; Register roles:
;   eax = current row of blk1f
;   ebx = current row of blk1b
;   ecx = current row of blk2
;   esi = lx
;   edi = rows remaining
;   mm5 = running sum (two dword lanes), mm7 = 0
;
; Leaves through d2exit, so the pushes below must stay in the order
; ebp, ebx, ecx, edx, esi, edi that d2exit pops.

align 32
bdist2_22_mmx:
        push        ebp                 ; save caller's frame pointer
        mov         ebp, esp
        push        ebx
        push        ecx
        push        edx
        push        esi
        push        edi

        mov         esi, [ebp+20]       ; lx
        mov         edi, [ebp+24]       ; h

        pxor        mm5, mm5            ; sum = 0
        test        edi, edi
        jle         near d2exit         ; h <= 0: return 0

        pxor        mm7, mm7            ; zero source for unpacking

        mov         eax, [ebp+8]        ; blk1f
        mov         ebx, [ebp+12]       ; blk1b
        mov         ecx, [ebp+16]       ; blk2
        jmp         bd2top22            ; skip the alignment padding

; bd2top22: one 8-pel row per pass; sums ((blk1f + blk1b) >> 1 - blk2)^2.
; NOTE(review): unlike the half-pel loops in dist2_mmx, no +1 rounding term
; is added before the shift here - confirm this matches the reference
; C implementation.
align 32
bd2top22:
        movq        mm0, [eax]
        movq        mm4, [ebx]
        movq        mm2, [ecx]
        movq        mm1, mm0
        movq        mm6, mm4
        movq        mm3, mm2
        punpcklbw   mm0, mm7            ; blk1f pels 0..3
        punpckhbw   mm1, mm7            ; blk1f pels 4..7
        punpcklbw   mm4, mm7            ; blk1b pels 0..3
        punpckhbw   mm6, mm7            ; blk1b pels 4..7
        punpcklbw   mm2, mm7            ; blk2  pels 0..3
        punpckhbw   mm3, mm7            ; blk2  pels 4..7

        ; low half
        paddw       mm0, mm4
        psrlw       mm0, 1              ; (blk1f + blk1b) >> 1
        psubw       mm0, mm2
        pmaddwd     mm0, mm0            ; squared differences, pairwise summed

        ; high half
        paddw       mm1, mm6
        psrlw       mm1, 1
        psubw       mm1, mm3
        pmaddwd     mm1, mm1

        paddd       mm5, mm0
        paddd       mm5, mm1

        add         eax, esi            ; next row of blk1f
        add         ebx, esi            ; next row of blk1b
        add         ecx, esi            ; next row of blk2
        dec         edi
        jg          bd2top22
        jmp         d2exit