/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"

static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}

static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    asm volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
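      /* In scalar terms, for unsigned bytes a and b:
       *     max(a-b,0) | max(b-a,0) == |a-b|
       * since at least one of the two saturated differences is zero;
       * that is exactly what the psubusb/por pairs below compute. */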
187 "psubusb %%mm2,%%mm1\n"
188 "psubusb %%mm4,%%mm3\n"
189 "psubusb %%mm5,%%mm2\n"
190 "psubusb %%mm6,%%mm4\n"
195 /* now convert to 16-bit vectors so we can square them */
199 "punpckhbw %%mm0,%%mm2\n"
200 "punpckhbw %%mm0,%%mm4\n"
201 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
202 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
204 "pmaddwd %%mm2,%%mm2\n"
205 "pmaddwd %%mm4,%%mm4\n"
206 "pmaddwd %%mm1,%%mm1\n"
207 "pmaddwd %%mm3,%%mm3\n"
209 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
210 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
212 "paddd %%mm2,%%mm1\n"
213 "paddd %%mm4,%%mm3\n"
214 "paddd %%mm1,%%mm7\n"
215 "paddd %%mm3,%%mm7\n"
221 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
222 "paddd %%mm7,%%mm1\n"
224 : "+r" (pix1
), "+r" (pix2
), "=r"(tmp
)
225 : "r" ((x86_reg
)line_size
) , "m" (h
)
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* xmm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* xmm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* xmm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* xmm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* xmm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* xmm4 = pix2[1][0-15] */

      /* todo: xmm1-xmm2, xmm3-xmm4 */
      /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n" /* xmm1 now spread over (xmm1,xmm2) */
      "punpcklbw %%xmm0,%%xmm3\n" /* xmm4 now spread over (xmm3,xmm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "dec %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"     /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"     /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((x86_reg)line_size));
    return tmp;
}

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"
405 "movq %%mm0, %%mm1\n"
409 "movq %%mm0, %%mm2\n"
410 "movq %%mm1, %%mm3\n"
411 "punpcklbw %%mm7,%%mm0\n"
412 "punpcklbw %%mm7,%%mm1\n"
413 "punpckhbw %%mm7,%%mm2\n"
414 "punpckhbw %%mm7,%%mm3\n"
415 "psubw %%mm1, %%mm0\n"
416 "psubw %%mm3, %%mm2\n"
417 "psubw %%mm0, %%mm4\n"
418 "psubw %%mm2, %%mm5\n"
419 "pxor %%mm3, %%mm3\n"
420 "pxor %%mm1, %%mm1\n"
421 "pcmpgtw %%mm4, %%mm3\n\t"
422 "pcmpgtw %%mm5, %%mm1\n\t"
423 "pxor %%mm3, %%mm4\n"
424 "pxor %%mm1, %%mm5\n"
425 "psubw %%mm3, %%mm4\n"
426 "psubw %%mm1, %%mm5\n"
427 "paddw %%mm4, %%mm5\n"
428 "paddw %%mm5, %%mm6\n"
433 "movq %%mm4, %%mm1\n"
437 "movq %%mm4, %%mm5\n"
438 "movq %%mm1, %%mm3\n"
439 "punpcklbw %%mm7,%%mm4\n"
440 "punpcklbw %%mm7,%%mm1\n"
441 "punpckhbw %%mm7,%%mm5\n"
442 "punpckhbw %%mm7,%%mm3\n"
443 "psubw %%mm1, %%mm4\n"
444 "psubw %%mm3, %%mm5\n"
445 "psubw %%mm4, %%mm0\n"
446 "psubw %%mm5, %%mm2\n"
447 "pxor %%mm3, %%mm3\n"
448 "pxor %%mm1, %%mm1\n"
449 "pcmpgtw %%mm0, %%mm3\n\t"
450 "pcmpgtw %%mm2, %%mm1\n\t"
451 "pxor %%mm3, %%mm0\n"
452 "pxor %%mm1, %%mm2\n"
453 "psubw %%mm3, %%mm0\n"
454 "psubw %%mm1, %%mm2\n"
455 "paddw %%mm0, %%mm2\n"
456 "paddw %%mm2, %%mm6\n"
462 "movq %%mm6, %%mm0\n"
463 "punpcklwd %%mm7,%%mm0\n"
464 "punpckhwd %%mm7,%%mm6\n"
465 "paddd %%mm0, %%mm6\n"
469 "paddd %%mm6,%%mm0\n"
471 : "+r" (pix1
), "=r"(tmp
)
472 : "r" ((x86_reg
)line_size
) , "g" (h
-2)
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
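
/* Noise-preserving SSE: the plain SSE score is augmented by the (weighted)
 * difference in high-frequency "noise" energy between the two blocks, as
 * measured by hf_noise above, so that modes which merely smooth away
 * texture do not look artificially cheap. */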
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size&7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size&7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"
  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size&7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size&7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    asm volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    asm volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
888 "psubb %%mm4, %%mm3 \n\t" // dst - pred
889 "movq %%mm3, (%3, %0) \n\t"
894 : "r"(src1
), "r"(src2
), "r"(dst
), "r"((x86_reg
)w
)
900 dst
[0]= src2
[0] - mid_pred(l
, src1
[0], (l
+ src1
[0] - lt
)&0xFF);
902 *left_top
= src1
[w
-1];
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"            \n\t"\
    "mov"#m" "#p2", "#t"            \n\t"\
    "punpcklbw "#a", "#t"           \n\t"\
    "punpcklbw "#a", "#a"           \n\t"\
    "psubw     "#t", "#a"           \n\t"
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                     \n\t"\
        "add %4, %2                     \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0            \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0            \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"
#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
/* xmm8 and xmm9 are available on x86-64, so no spill to memory is needed */
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"           \n\t"\
    "movq "#o"+8(%1),  "#b"           \n\t"\
    "movq "#o"+16(%1), "#c"           \n\t"\
    "movq "#o"+24(%1), "#d"           \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+8(%1)            \n\t"\
    "movq "#c", "#o"+16(%1)           \n\t"\
    "movq "#d", "#o"+24(%1)           \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#undef DCT_SAD
#endif

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
1248 "psubw %%mm3, %%mm1 \n"
1249 "psubw %%mm2, %%mm0 \n"
1250 "pmaddwd %%mm1, %%mm1 \n"
1251 "pmaddwd %%mm0, %%mm0 \n"
1252 "paddd %%mm1, %%mm4 \n"
1253 "paddd %%mm0, %%mm4 \n"
1255 "movq %%mm4, %%mm3 \n"
1256 "psrlq $32, %%mm3 \n"
1257 "paddd %%mm3, %%mm4 \n"
1260 :"r"(pix1
), "r"(pix2
)
#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"

/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
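/* e.g. with src = dst = 0x4000 (0.5 in Q15): pmulhw and pmulhrw return
 * 0x1000, the Q15 product scaled by an extra 1/2, while pmulhrsw returns
 * the correctly rounded Q15 product 0x2000 (0.25); the differing
 * SCALE_OFFSET values below compensate for this. */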
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"        \n\t"\
    "paddd "#t", "#a"                \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);

void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & MM_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & MM_SSE2){
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (ENABLE_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#ifdef HAVE_SSSE3
        if(mm_flags & MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}