;*****************************************************************************
;* SIMD-optimized motion compensation estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .text
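; DIFF_PIXELS_1 leaves pix1[i] - pix2[i] as eight signed words in %1 without
; zero-extending both inputs separately: interleaving pix2 with pix1 forms
; words b + 256*a, interleaving pix1 with itself forms words 257*a, and the
; word subtraction leaves a - b.
; Worked example: a = 3, b = 200: 3*257 - (200 + 3*256) = 771 - 968 = -197.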
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro
; %1=const uint8_t *pix1, %2=const uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro
%macro HADAMARD8 0
    SUMSUB_BADC       w, 0, 1, 2, 3
    SUMSUB_BADC       w, 4, 5, 6, 7
    SUMSUB_BADC       w, 0, 2, 1, 3
    SUMSUB_BADC       w, 4, 6, 5, 7
    SUMSUB_BADC       w, 0, 4, 1, 5
    SUMSUB_BADC       w, 2, 6, 3, 7
%endmacro
%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro
%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m7, [%1]
    ABS1_SUM        m7, m1, m0
    paddusw         m0, m1
%endmacro
; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
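; (Back-of-the-envelope check of the "about 100k": with random +/-255
; differences each of the 64 coefficients averages 255*8*sqrt(2/pi) ~= 1627
; in magnitude, so the total is ~104000. A uniform difference concentrates
; everything in the DC coefficient and only reaches 64*255 = 16320.)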
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro
%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro
%macro LOAD4 5
    mova %2, [%1+mmsize*0]
    mova %3, [%1+mmsize*1]
    mova %4, [%1+mmsize*2]
    mova %5, [%1+mmsize*3]
%endmacro
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD            rsp, pad
%endif
    RET
cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    cmp            r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM                         m0, m1, eax
    and                         eax, 0xFFFF
    ret
hadamard8_16_wrapper %1, 3
%elif cpuflag(mmxext)
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, const uint8_t *src1,
;                               const uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this four times (2x2); that is also why we access
; rsp+gprsize everywhere: it is the rsp of the calling function
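; What one hadamard8x8_diff call computes, as a reference sketch: take the
; 8x8 byte difference d[y][x] = src1[y][x] - src2[y][x], apply an
; unnormalized 8x8 Walsh-Hadamard transform to rows and then to columns
; (the HADAMARD8 butterflies, with a transpose between the two passes), and
; return the sum of the absolute values of the 64 coefficients (SATD).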
hadamard8x8_diff %+ SUFFIX:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0
    LOAD4               rsp+gprsize, m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32      rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]
    HSUM                         m0, m1, eax
    and                         eax, 0xFFFF
    ret
hadamard8_16_wrapper 0, 14
%endif
%endmacro

%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
HADAMARD8_DIFF
%endif

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
; int ff_sse*_*(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
;               ptrdiff_t line_size, int h)
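; Reference sketch of what the kernels below compute (C-like, simplified;
; w is 8 or 16 depending on the variant, and only pix1/pix2/line_size/h of
; the arguments are used):
;     int sum = 0;
;     for (int y = 0; y < h; y++)
;         for (int x = 0; x < w; x++) {
;             int d = pix1[y * line_size + x] - pix2[y * line_size + x];
;             sum += d * d;
;         }
;     return sum;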
%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr       hd, 1
%endif
    pxor      m0, m0                  ; mm0 = 0
    pxor      m7, m7                  ; mm7 holds the sum
.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [pix1q]             ; m1 = pix1[0][0-15], [0-7] for mmx
    movu      m2, [pix2q]             ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu      m3, [pix1q+lsizeq]      ; m3 = pix1[1][0-15], [0-7] for mmx
    movu      m4, [pix2q+lsizeq]      ; m4 = pix2[1][0-15], [0-7] for mmx
%else ; %1 / 2 == mmsize; mmx only
    mova      m3, [pix1q+8]           ; m3 = pix1[0][8-15]
    mova      m4, [pix2q+8]           ; m4 = pix2[0][8-15]
%endif
    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
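    ; (worked example: a = 3, b = 10; a - b saturates to 0, b - a = 7, and
    ;  0 | 7 = 7 = |a - b|; this works because one of the two saturating
    ;  differences is always 0)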
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0                  ; mm1 now spread over (mm1,mm2)
    punpcklbw m3, m0                  ; mm4 now spread over (mm3,mm4)

    ; pmaddwd of a register with itself squares each word and sums
    ; adjacent pairs of squares into dwords
    pmaddwd   m1, m1
    pmaddwd   m2, m2
    pmaddwd   m3, m3
    pmaddwd   m4, m4

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3
%if %1 == mmsize
    lea    pix1q, [pix1q + 2*lsizeq]
    lea    pix2q, [pix2q + 2*lsizeq]
%else
    add    pix1q, lsizeq
    add    pix2q, lsizeq
%endif
    dec       hd
    jnz .next2lines

    HADDD     m7, m1
    movd     eax, m7                  ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16
INIT_XMM sse2
SUM_SQUARED_ERRORS 16
;-----------------------------------------------
;int ff_sum_abs_dctelem(const int16_t *block)
;-----------------------------------------------
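; Reference behaviour (sketch): the sum of |block[i]| over the 64 int16_t
; coefficients of one transformed block:
;     int s = 0;
;     for (int i = 0; i < 64; i++)
;         s += abs(block[i]);
;     return s;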
; %1 = number of xmm registers used
; %2 = number of inline loops
%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor      m0, m0
    pxor      m1, m1
%assign %%i 0
%rep %2
    mova      m2, [blockq+mmsize*(0+%%i)]
    mova      m3, [blockq+mmsize*(1+%%i)]
    mova      m4, [blockq+mmsize*(2+%%i)]
    mova      m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM  m2, m6, m0
    ABS1_SUM  m3, m6, m1
    ABS1_SUM  m4, m6, m0
    ABS1_SUM  m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw   m0, m1
    HSUM      m0, m1, eax
    and       eax, 0xFFFF
    RET
%endmacro

INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2
;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(const uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
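; hf_noise returns a rough measure of the high-frequency content of one plane
; (summed absolute second-order pixel differences). The NSSE comparison built
; on top of it scores a block roughly as
;     sse(pix1, pix2) + |hf_noise(pix1) - hf_noise(pix2)| * nsse_weight
; so blocks that change the amount of fine detail are penalized.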
%macro HF_NOISE_PART1 5
%macro HF_NOISE_PART2 4
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2     4, 5, 0, 2
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    movd      eax, m0                 ; eax = result of hf_noise8;
    RET
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16
;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
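; Reference sketch: the classic sum of absolute differences,
;     sum += abs(pix1[y*stride + x] - pix2[y*stride + x])
; over a w x h block, with w = 8 or 16 selected by the macro parameter.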
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    movu      m2, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m2, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m1
%if %1 != mmsize ; 16-pixel-wide rows on mmx: also do bytes 8-15
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2

.loop:
    lea    pix1q, [pix1q+strideq*2]
    lea    pix2q, [pix2q+strideq*2]
    movu      m0, [pix2q]
    movu      m1, [pix2q+strideq]
    psadbw    m0, [pix1q]
    psadbw    m1, [pix1q+strideq]
    paddw     m2, m0
    paddw     m2, m1
%if %1 != mmsize
    movu      m0, [pix2q+8]
    movu      m1, [pix2q+strideq+8]
    psadbw    m0, [pix1q+8]
    psadbw    m1, [pix1q+strideq+8]
    paddw     m2, m0
    paddw     m2, m1
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m0, m2
    paddw     m2, m0
%endif
    movd     eax, m2
    RET
%endmacro

INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16
;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
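; _x2 compares pix1 against pix2 shifted half a pixel to the right: the
; reference row is avg(pix2[x], pix2[x+1]), computed with pavgb (which
; rounds up), before the psadbw against pix1.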
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    movu      m0, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m0, m3
    pavgb     m2, m4
%else
    pavgb     m0, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m0, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2

.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m1, [pix2q]
    movu      m2, [pix2q+strideq]
%if mmsize == 16
    movu      m3, [pix2q+1]
    movu      m4, [pix2q+strideq+1]
    pavgb     m1, m3
    pavgb     m2, m4
%else
    pavgb     m1, [pix2q+1]
    pavgb     m2, [pix2q+strideq+1]
%endif
    psadbw    m1, [pix1q]
    psadbw    m2, [pix1q+strideq]
    paddw     m0, m1
    paddw     m0, m2
%if %1 != mmsize
    movu      m1, [pix2q+8]
    movu      m2, [pix2q+strideq+8]
    pavgb     m1, [pix2q+9]
    pavgb     m2, [pix2q+strideq+9]
    psadbw    m1, [pix1q+8]
    psadbw    m2, [pix1q+strideq+8]
    paddw     m0, m1
    paddw     m0, m2
%endif
    sub       hd, 2
    jg .loop
%if mmsize == 16
    movhlps   m1, m0
    paddw     m0, m1
%endif
    movd     eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16
;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
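; _y2 is the vertical counterpart of _x2: the reference is
; avg(pix2[x], pix2[x+stride]), so consecutive iterations can reuse rows
; that are already loaded instead of fetching both rows of each pair.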
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    movu      m1, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
    pavgb     m1, m0
    pavgb     m0, m3
    psadbw    m1, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m1
%if %1 != mmsize
    movu      m4, [pix2q+8]
    movu      m5, [pix2q+strideq+8]
    movu      m6, [pix2q+2*strideq+8]
    pavgb     m4, m5
    pavgb     m5, m6
    psadbw    m4, [pix1q+8]
    psadbw    m5, [pix1q+strideq+8]
    paddw     m0, m4
    paddw     m0, m5
%endif
    sub       hd, 2

.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m3, [pix2q+strideq]
    psadbw    m2, [pix1q+strideq]
    movu      m6, [pix2q+strideq+8]
    psadbw    m5, [pix1q+strideq+8]
;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
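; Exact diagonal half-pel interpolation would be (a+b+c+d+2)>>2 per pixel.
; The "approx" variant chains two pavgb operations instead; pavgb rounds up,
; so the result can be off by one for some input combinations, which is
; acceptable for a motion-search cost metric.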
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    movu      m5, [pix2q]
    movu      m0, [pix2q+strideq]
    movu      m3, [pix2q+2*strideq]
%if mmsize == 16
    movu      m4, [pix2q+1]
    movu      m6, [pix2q+strideq+1]
    movu      m2, [pix2q+2*strideq+1]
    pavgb     m5, m4
    pavgb     m0, m6
    pavgb     m3, m2
%else
    pavgb     m5, [pix2q+1]
    pavgb     m0, [pix2q+strideq+1]
    pavgb     m3, [pix2q+2*strideq+1]
%endif
    pavgb     m5, m0
    pavgb     m0, m3
    psadbw    m5, [pix1q]
    psadbw    m0, [pix1q+strideq]
    paddw     m0, m5
%if %1 != mmsize
    movu      m5, [pix2q+8]
    movu      m6, [pix2q+strideq+8]
    movu      m7, [pix2q+2*strideq+8]
    pavgb     m5, [pix2q+1+8]
    pavgb     m6, [pix2q+strideq+1+8]
    pavgb     m7, [pix2q+2*strideq+1+8]
    pavgb     m5, m6
    pavgb     m6, m7
    psadbw    m5, [pix1q+8]
    psadbw    m6, [pix1q+strideq+8]
    paddw     m0, m5
    paddw     m0, m6
%endif
    sub       hd, 2

.loop:
    lea    pix1q, [pix1q+2*strideq]
    lea    pix2q, [pix2q+2*strideq]
    movu      m3, [pix2q+strideq]
    movu      m6, [pix2q+strideq+1]
    pavgb     m3, [pix2q+strideq+1]
    psadbw    m2, [pix1q+strideq]
    movu      m7, [pix2q+strideq+8]
    pavgb     m6, [pix2q+8+1]
    pavgb     m7, [pix2q+strideq+8+1]
    psadbw    m6, [pix1q+strideq+8]
;--------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
;                  ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
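; vsad_intra measures the vertical activity of a single plane:
;     sum += abs(pix1[y*line_size + x] - pix1[(y+1)*line_size + x])
; pix2 is a dummy argument, kept only so all comparison functions share
; one prototype.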
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova      m0, [pix1q]
%if %1 == mmsize ; vsad_intra8_mmxext, vsad_intra16_sse2
    mova      m2, [pix1q+lsizeq]
%else ; vsad_intra16_mmxext
    mova      m2, [pix1q+lsizeq]
    mova      m3, [pix1q+8]
    mova      m4, [pix1q+lsizeq+8]
    lea    pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize ; vsad_intra8_mmxext, vsad_intra16_sse2
    mova      m2, [pix1q+lsizeq]
%else ; vsad_intra16_mmxext
    mova      m2, [pix1q+lsizeq]
    mova      m4, [pix1q+lsizeq+8]
;---------------------------------------------------------------------
;int ff_vsad_approx(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
;                   ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
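; vsad_approx scores the vertical activity of the difference signal, roughly
;     sum += abs((pix1[y][x] - pix2[y][x]) - (pix1[y+1][x] - pix2[y+1][x]))
; The per-pixel differences are kept as bytes (psubb, which wraps on
; overflow) so that consecutive rows can be compared directly with psadbw;
; that 8-bit wraparound is what makes the result approximate.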
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova      m4, [pix1q+lsizeq]
    movu      m2, [pix2q+lsizeq]
    psubb     m4, [pix2q+lsizeq]
%else ; vsad16_mmxext
    mova      m4, [pix1q+lsizeq]
    mova      m5, [pix1q+lsizeq+8]
    psubb     m4, [pix2q+lsizeq]
    psubb     m5, [pix2q+lsizeq+8]
    lea    pix1q, [pix1q + 2*lsizeq]
    lea    pix2q, [pix2q + 2*lsizeq]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova      m4, [pix1q+lsizeq]
    movu      m3, [pix2q+lsizeq]
%else ; vsad16_mmxext
    mova      m4, [pix1q+lsizeq]
    mova      m5, [pix1q+lsizeq+8]
    psubb     m4, [pix2q+lsizeq]
    psubb     m5, [pix2q+lsizeq+8]