;*****************************************************************************
;* SIMD-optimized motion compensation estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_1
cextern pb_80

SECTION .text
%macro DIFF_PIXELS_1 4
    movh        %1, %3
    movh        %2, %4
    punpcklbw   %2, %1
    punpcklbw   %1, %1
    psubw       %1, %2
%endmacro

; %1=const uint8_t *pix1, %2=const uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro

%macro ABS1_SUM 3
    ABS1        %1, %2
    paddusw     %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2        %1, %2, %3, %4
    paddusw     %5, %1
    paddusw     %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2        m0, m1, m8, m9
    ABS2_SUM    m2, m3, m8, m9, m0, m1
    ABS2_SUM    m4, m5, m8, m9, m0, m1
    ABS2_SUM    m6, m7, m8, m9, m0, m1
    paddusw     m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova      [%1], m7
    ABS1        m0, m7
    ABS1        m1, m7
    ABS1_SUM    m2, m7, m0
    ABS1_SUM    m3, m7, m1
    ABS1_SUM    m4, m7, m0
    ABS1_SUM    m5, m7, m1
    ABS1_SUM    m6, m7, m0
    mova        m2, [%1]
    ABS1_SUM    m2, m7, m1
    paddusw     m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
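; (Rough worked bound, added for clarity: pixel differences are at most 255 in
;  magnitude, so the Frobenius norm of the 8x8 difference block is at most
;  8*255 = 2040; the unnormalized 8x8 Hadamard transform scales that norm by 8,
;  and Cauchy-Schwarz over the 64 coefficients then bounds the sum of absolute
;  values by 8 * 8 * 2040 = 130560 -- hence "about 100k", comfortably above the
;  0xFFFF point where the paddusw accumulation saturates.)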
%macro HSUM 3
%if cpuflag(sse2)
    movhlps     %2, %1
    paddusw     %1, %2
    pshuflw     %2, %1, 0xE
    paddusw     %1, %2
    pshuflw     %2, %1, 0x1
    paddusw     %1, %2
    movd        %3, %1
%elif cpuflag(mmxext)
    pshufw      %2, %1, 0xE
    paddusw     %1, %2
    pshufw      %2, %1, 0x1
    paddusw     %1, %2
    movd        %3, %1
%elif cpuflag(mmx)
    mova        %2, %1
    psrlq       %1, 32
    paddusw     %1, %2
    mova        %2, %1
    psrlq       %1, 16
    paddusw     %1, %2
    movd        %3, %1
%endif
%endmacro

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova %2, [%1+mmsize*0]
    mova %3, [%1+mmsize*1]
    mova %4, [%1+mmsize*2]
    mova %5, [%1+mmsize*3]
%endmacro
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB        rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD        rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB        rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov        r5d, eax

    add         r1, 8
    add         r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add        r5d, eax

    cmp        r4d, 16
    jne .done

    lea         r1, [r1+r3*8-8]
    lea         r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add        r5d, eax

    add         r1, 8
    add         r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add        r5d, eax

.done:
    mov        eax, r5d
%ifndef m8
    ADD        rsp, pad
%endif
    RET
%endmacro
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W    0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W    0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8     rsp+gprsize
    HSUM            m0, m1, eax
    and            eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, const uint8_t *src1,
;                               const uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; variant can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling func)
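;
; For reference, one 8x8 call computes a Hadamard SATD: transform the pixel
; difference block with an 8x8 Hadamard and sum the absolute coefficients.
; Illustrative scalar sketch (hypothetical helper names, equivalent only up to
; coefficient ordering; this is not FFmpeg's actual C fallback):
;
;   static void hadamard8_1d(int16_t v[8])   // unnormalized 8-point Hadamard
;   {
;       for (int step = 1; step < 8; step *= 2)
;           for (int i = 0; i < 8; i += 2 * step)
;               for (int j = i; j < i + step; j++) {
;                   int a = v[j], b = v[j + step];
;                   v[j]        = a + b;
;                   v[j + step] = a - b;
;               }
;   }
;
;   int hadamard8_diff_ref(const uint8_t *src1, const uint8_t *src2,
;                          ptrdiff_t stride)
;   {
;       int16_t d[8][8], col[8];
;       int sum = 0;
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++)
;               d[y][x] = src1[y * stride + x] - src2[y * stride + x];
;       for (int y = 0; y < 8; y++)
;           hadamard8_1d(d[y]);               // transform rows
;       for (int x = 0; x < 8; x++) {
;           for (int y = 0; y < 8; y++)
;               col[y] = d[y][x];
;           hadamard8_1d(col);                // transform columns
;           for (int y = 0; y < 8; y++)
;               sum += abs(col[y]);
;       }
;       return sum;
;   }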
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0, 1, 2, 3, 7
    STORE4          rsp+gprsize, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4, 5, 6, 7, 0
    STORE4          rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0, 1, 2, 3, 7
    STORE4          rsp+gprsize+0x20, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4, 5, 6, 7, 0

    LOAD4           rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize+0x60
    mova [rsp+gprsize+0x60], m0

    LOAD4           rsp+gprsize     , m0, m1, m2, m3
    LOAD4           rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize
    paddusw         m0, [rsp+gprsize+0x60]

    HSUM            m0, m1, eax
    and            rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
HADAMARD8_DIFF
%endif

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

; int ff_sse*_*(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
;               ptrdiff_t line_size, int h)
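; Illustrative scalar equivalent of the sum-of-squared-errors computed below
; (a sketch added for clarity; the MpegEncContext pointer is unused and
; "sse16_ref" is a hypothetical name):
;
;   int sse16_ref(const uint8_t *pix1, const uint8_t *pix2,
;                 ptrdiff_t line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++) {    // 8 for the sse8 variant
;               int d = pix1[x] - pix2[x];
;               sum += d * d;
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }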

%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr       hd, 1
%endif
    pxor      m0, m0              ; mm0 = 0
    pxor      m7, m7              ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [pix1q]         ; m1 = pix1[0][0-15], [0-7] for mmx
    movu      m2, [pix2q]         ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu      m3, [pix1q+lsizeq]  ; m3 = pix1[1][0-15], [0-7] for mmx
    movu      m4, [pix2q+lsizeq]  ; m4 = pix2[1][0-15], [0-7] for mmx
%else ; %1 / 2 == mmsize; mmx only
    mova      m3, [pix1q+8]       ; m3 = pix1[0][8-15]
    mova      m4, [pix2q+8]       ; m4 = pix2[0][8-15]
%endif

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
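    ; (worked example of the trick, added for clarity: for unsigned bytes
    ;  a = 3, b = 10:  a -sat b = 0,  b -sat a = 7,  and 0 | 7 = 7 = |a - b|;
    ;  one of the two saturating subtractions is always zero, so ORing them
    ;  gives the absolute difference without any signed arithmetic)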
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0              ; mm1 now spread over (mm1, mm2)
    punpcklbw m3, m0              ; mm4 now spread over (mm3, mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

%if %1 == mmsize
    lea       pix1q, [pix1q + 2*lsizeq]
    lea       pix2q, [pix2q + 2*lsizeq]
%else
    add       pix1q, lsizeq
    add       pix2q, lsizeq
%endif
    dec       hd
    jnz .next2lines

    HADDD     m7, m1
    movd      eax, m7             ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16

;-----------------------------------------------
;int ff_sum_abs_dctelem(const int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops
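; Scalar sketch of the operation, added for clarity (illustrative name, not
; FFmpeg's C reference):
;
;   int sum_abs_dctelem_ref(const int16_t *block)
;   {
;       int sum = 0;
;       for (int i = 0; i < 64; i++)
;           sum += abs(block[i]);
;       return sum & 0xFFFF;   // the asm accumulates with saturating word adds
;   }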

%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor      m0, m0
    pxor      m1, m1
%assign %%i 0
%rep %2
    mova      m2, [blockq+mmsize*(0+%%i)]
    mova      m3, [blockq+mmsize*(1+%%i)]
    mova      m4, [blockq+mmsize*(2+%%i)]
    mova      m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM  m2, m6, m0
    ABS1_SUM  m3, m6, m1
    ABS1_SUM  m4, m6, m0
    ABS1_SUM  m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw   m0, m1
    HSUM      m0, m1, eax
    and      eax, 0xFFFF
    RET
%endmacro

INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2

;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(const uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
; %1 = 8/16. %2-5=m#
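;
; Roughly, hf_noise sums the vertical variation of the horizontal gradient,
; i.e. |(p[y][x] - p[y][x+1]) - (p[y+1][x] - p[y+1][x+1])| over the block.
; Illustrative scalar model of the 8-wide case (an assumption for clarity,
; derived from the code below rather than taken from FFmpeg's C reference):
;
;   int hf_noise8_model(const uint8_t *pix, ptrdiff_t lsize, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++)
;           for (int x = 0; x < 7; x++) {
;               int d0 = pix[ y      * lsize + x] - pix[ y      * lsize + x + 1];
;               int d1 = pix[(y + 1) * lsize + x] - pix[(y + 1) * lsize + x + 1];
;               sum += abs(d0 - d1);
;           }
;       return sum;
;   }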
%macro HF_NOISE_PART1 5
    mova      m%2, [pix1q]
%if %1 == 8
    mova      m%3, m%2
    psllq     m%2, 8
    psrlq     m%3, 8
    psrlq     m%2, 8
%else
    mova      m%3, [pix1q+1]
%endif
    mova      m%4, m%2
    mova      m%5, m%3
    punpcklbw m%2, m7
    punpcklbw m%3, m7
    punpckhbw m%4, m7
    punpckhbw m%5, m7
    psubw     m%2, m%3
    psubw     m%4, m%5
%endmacro

; %1-4 = m#
%macro HF_NOISE_PART2 4
    psubw     m%1, m%3
    psubw     m%2, m%4
    pxor       m3, m3
    pxor       m1, m1
    pcmpgtw    m3, m%1
    pcmpgtw    m1, m%2
    pxor      m%1, m3
    pxor      m%2, m1
    psubw     m%1, m3
    psubw     m%2, m1
    paddw     m%2, m%1
    paddw      m6, m%2
%endmacro
; %1 = 8/16
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    sub        hd, 2
    pxor       m7, m7
    pxor       m6, m6
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2     4, 5, 0, 2
    add     pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add     pix1q, lsizeq
    sub        hd, 2
    jne .loop

    mova       m0, m6
    punpcklwd  m0, m7
    punpckhwd  m6, m7
    paddd      m6, m0
    mova       m0, m6
    psrlq      m6, 32
    paddd      m0, m6
    movd      eax, m0   ; eax = result of hf_noise8;
    RET                 ; return eax;
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16

;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
;%1 = 8/16
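; Illustrative scalar equivalent (a sketch added for clarity; v is unused and
; "sad16_ref" is a hypothetical name):
;
;   int sad16_ref(const uint8_t *pix1, const uint8_t *pix2,
;                 ptrdiff_t stride, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++)      // 8 for the sad8 variant
;               sum += abs(pix1[x] - pix2[x]);
;           pix1 += stride;
;           pix2 += stride;
;       }
;       return sum;
;   }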
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    movu      m2, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m2, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+strideq*2]
    lea    pix2q, [pix2q+strideq*2]
    movu      m0, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m0, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m0
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m0, m2
    paddw     m2, m0
%endif
    movd     eax, m2
    RET
%endmacro
INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16

;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
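; Same as plain SAD, but the reference block is sampled at a horizontal
; half-pel position: each pix2 byte is averaged with its right neighbour using
; pavgb rounding. Illustrative scalar sketch (hypothetical name):
;
;   int sad16_x2_ref(const uint8_t *pix1, const uint8_t *pix2,
;                    ptrdiff_t stride, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++) {    // 8 for the 8-wide variant
;               int ref = (pix2[x] + pix2[x + 1] + 1) >> 1;   // pavgb rounding
;               sum += abs(pix1[x] - ref);
;           }
;           pix1 += stride;
;           pix2 += stride;
;       }
;       return sum;
;   }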
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    movu      m0, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m0, m3
    pavgb     m2, m4
%else
    pavgb     m0, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m0, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m1, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m1, m3
    pavgb     m2, m4
%else
    pavgb     m1, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro
INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16

;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
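; Like sad%1_x2 above, but the reference is interpolated at a vertical half-pel
; position; in the scalar sketch the inner expression becomes
;
;       int ref = (pix2[x] + pix2[x + stride] + 1) >> 1;      // pavgb rounding
;       sum += abs(pix1[x] - ref);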
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3
%if %1 != mmsize
    movu      m4, [pix2q+8]
    movu      m5, [pix2q+strideq+8]
    movu      m6, [pix2q+2*strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    add    pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
    mova      m4, m6
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro
INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16

;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
;%1 = 8/16
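; The exact 2-D half-pel reference would be
;
;       ref = (p[x] + p[x+1] + p[x+stride] + p[x+stride+1] + 2) >> 2
;
; Doing that in bytes is awkward, so the code below nests two pavgb averages
; (horizontal, then vertical). pavgb rounds up, and nesting it would round up
; twice, so pb_1 is first subtracted (saturating) from the shared middle row;
; the result can still differ from the exact formula by 1, hence "approx".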
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    mova      m4, [pb_1]
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    movu      m2, [pix2q+2*strideq+1]
    pavgb     m1, m5
    pavgb     m0, m6
    pavgb     m3, m2
%else
    pavgb     m1, [pix2q+1]
    pavgb     m0, [pix2q+strideq+1]
    pavgb     m3, [pix2q+2*strideq+1]
%endif
    psubusb   m0, m4
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
    mova      m1, m3
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    movu      m7, [pix2q+2*strideq+8]
    pavgb     m5, [pix2q+1+8]
    pavgb     m6, [pix2q+strideq+1+8]
    pavgb     m7, [pix2q+2*strideq+1+8]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    add    pix2q, strideq
    sub       hd, 2

align 16
.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m2, [pix2q]
    movu      m3, [pix2q+strideq]
%if mmsize == 16
    movu      m5, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    pavgb     m2, m5
    pavgb     m3, m6
%else
    pavgb     m2, [pix2q+1]
    pavgb     m3, [pix2q+strideq+1]
%endif
    psubusb   m2, m4
    pavgb     m1, m2
    pavgb     m2, m3
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
    mova      m1, m3
%if %1 != mmsize
    movu      m6, [pix2q+8]
    movu      m7, [pix2q+strideq+8]
    pavgb     m6, [pix2q+8+1]
    pavgb     m7, [pix2q+strideq+8+1]
    psubusb   m6, m4
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
    mova      m5, m7
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro
INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16

;--------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
;                  ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
; %1 = 8/16
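;
; Scalar sketch, added for clarity (pix2 and v are unused by this variant;
; "vsad_intra16_ref" is a hypothetical name):
;
;   int vsad_intra16_ref(const uint8_t *pix1, ptrdiff_t line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++) {
;           for (int x = 0; x < 16; x++)      // 8 for the 8-wide variant
;               sum += abs(pix1[x] - pix1[x + line_size]);
;           pix1 += line_size;
;       }
;       return sum;
;   }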
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova      m0, [pix1q]
%if %1 == mmsize
    mova      m2, [pix1q+lsizeq]
    psadbw    m0, m2
%else
    mova      m2, [pix1q+lsizeq]
    mova      m3, [pix1q+8]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m0, m2
    psadbw    m3, m4
    paddw     m0, m3
%endif
    sub       hd, 2

.loop:
    lea    pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
    mova      m1, [pix1q]
    psadbw    m2, m1
    paddw     m0, m2
    mova      m2, [pix1q+lsizeq]
    psadbw    m1, m2
    paddw     m0, m1
%else
    mova      m1, [pix1q]
    mova      m3, [pix1q+8]
    psadbw    m2, m1
    psadbw    m4, m3
    paddw     m0, m2
    paddw     m0, m4
    mova      m2, [pix1q+lsizeq]
    mova      m4, [pix1q+lsizeq+8]
    psadbw    m1, m2
    psadbw    m3, m4
    paddw     m0, m1
    paddw     m0, m3
%endif
    sub       hd, 2
    jg .loop

%if mmsize == 16
    pshufd    m1, m0, 0xe
    paddd     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro
INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
INIT_XMM sse2
VSAD_INTRA 16

;---------------------------------------------------------------------
;int ff_vsad_approx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
;                   ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
; %1 = 8/16
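;
; The row-to-row residual difference is signed, but psadbw only handles
; unsigned bytes; XORing each wrapped byte residual with 0x80 (pb_80) rebiases
; it so psadbw yields |residual(y) - residual(y+1)|, with byte overflow simply
; ignored -- hence "approx". Illustrative scalar model (an assumption for
; clarity, not FFmpeg's C reference):
;
;   int vsad16_approx_model(const uint8_t *pix1, const uint8_t *pix2,
;                           ptrdiff_t line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++) {
;           for (int x = 0; x < 16; x++) {    // 8 for the 8-wide variant
;               int d0 = (uint8_t)(pix1[x]             - pix2[x])             ^ 0x80;
;               int d1 = (uint8_t)(pix1[x + line_size] - pix2[x + line_size]) ^ 0x80;
;               sum += abs(d0 - d1);
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }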
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova   m1, [pb_80]
    mova   m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova   m4, [pix1q+lsizeq]
%if mmsize == 16
    movu   m3, [pix2q]
    movu   m2, [pix2q+lsizeq]
    psubb  m0, m3
    psubb  m4, m2
%else
    psubb  m0, [pix2q]
    psubb  m4, [pix2q+lsizeq]
%endif
    pxor   m0, m1
    pxor   m4, m1
    psadbw m0, m4
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m0, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m0, m1
    pxor   m3, m1
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m0, m4
    psadbw m3, m5
    paddw  m0, m3
%endif
    sub    hd, 2

.loop:
    lea    pix1q, [pix1q + 2*lsizeq]
    lea    pix2q, [pix2q + 2*lsizeq]
    mova   m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu   m3, [pix2q]
    psubb  m2, m3
%else
    psubb  m2, [pix2q]
%endif
    pxor   m2, m1
    psadbw m4, m2
    paddw  m0, m4
    mova   m4, [pix1q+lsizeq]
    movu   m3, [pix2q+lsizeq]
    psubb  m4, m3
    pxor   m4, m1
    psadbw m2, m4
    paddw  m0, m2
%else ; vsad16_mmxext
    mova   m3, [pix1q+8]
    psubb  m2, [pix2q]
    psubb  m3, [pix2q+8]
    pxor   m2, m1
    pxor   m3, m1
    psadbw m4, m2
    psadbw m5, m3
    paddw  m0, m4
    paddw  m0, m5
    mova   m4, [pix1q+lsizeq]
    mova   m5, [pix1q+lsizeq+8]
    psubb  m4, [pix2q+lsizeq]
    psubb  m5, [pix2q+lsizeq+8]
    pxor   m4, m1
    pxor   m5, m1
    psadbw m2, m4
    psadbw m3, m5
    paddw  m0, m2
    paddw  m0, m3
%endif
    sub    hd, 2
    jg .loop

%if mmsize == 16
    pshufd m1, m0, 0xe
    paddd  m0, m1
%endif
    movd  eax, m0
    RET
%endmacro
INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16