1 /********************************************************************
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
11 ********************************************************************
16 ********************************************************************/
20 #include "codec_internal.h"
/* 0x0080 replicated into all four 16-bit lanes of a 64-bit MMX word.
   Used by sub8x8_128__mmx (loaded via %[V128]) to subtract 128 from each
   widened pixel.  Must be 8-byte aligned to be a valid movq operand;
   "used" keeps the otherwise asm-only symbol from being discarded. */
25 static const __attribute__ ((aligned(8),used
)) ogg_int64_t V128
= 0x0080008000800080LL
;
/* Scalar pixel-arithmetic helpers (C fallback counterparts of the MMX
   loops below).  Restored: stray line-number artifacts ("27 ", "28 ",
   "29 ") had been fused onto the #define directives, making them
   invalid preprocessor lines. */

/* Truncating (round-toward-zero) average of two pixel values. */
#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
/* Signed difference a - b, computed in int to avoid unsigned wrap. */
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
/* Absolute difference |a - b|; relies on abs() from <stdlib.h>. */
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
32 " movq (%0), %%mm0 \n\t" /* mm0 = 8 pixels from FiltPtr */ \
33 " movq (%1), %%mm1 \n\t" /* mm1 = 8 pixels from ReconPtr */ \
34 " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
35 " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */\
36 /* convert from UINT8 to INT16, using %%mm7 == 0 as zero source */ \
37 " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(low 4 FiltPtr) */ \
38 " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(low 4 ReconPtr) */ \
39 " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(high 4 FiltPtr) */ \
40 " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(high 4 ReconPtr) */ \
41 /* start calculation */ \
42 " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr (low) */ \
43 " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr (high) */ \
44 " movq %%mm0, (%2) \n\t" /* store low 4 INT16 diffs */ \
45 " movq %%mm2, 8(%2) \n\t" /* store high 4 INT16 diffs */ \
46 /* Increment pointers */ \
/* sub8x8: DctInputPtr[i] = FiltPtr[i] - ReconPtr[i] over an 8x8 block,
   widening each UINT8 pixel to INT16 (via SUB_LOOP above).
   PixelsPerLine / ReconPixelsPerLine are the row strides of the two
   source planes.
   NOTE(review): the function's opening brace, the asm loop body,
   output operands and clobber list are not visible here -- the file
   appears truncated; compare with upstream Theora dsp_mmx.c before
   relying on this routine. */
51 static void sub8x8__mmx (unsigned char *FiltPtr
, unsigned char *ReconPtr
,
52 ogg_int16_t
*DctInputPtr
, ogg_uint32_t PixelsPerLine
,
53 ogg_uint32_t ReconPixelsPerLine
)
55 __asm__
__volatile__ (
/* mm7 = 0: zero register for the byte->word unpacks in SUB_LOOP */
58 " pxor %%mm7, %%mm7 \n\t"
70 : "m" (PixelsPerLine
),
71 "m" (ReconPixelsPerLine
)
/* One row of sub8x8_128: subtract 128 (held as four words in %%mm1)
   from 8 pixels at %0 and store 8 INT16 results at %1.
   Expects %%mm7 == 0 for the unpacks. */
76 #define SUB_128_LOOP \
77 " movq (%0), %%mm0 \n\t" /* mm0 = 8 pixels from FiltPtr */ \
78 " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
79 /* convert from UINT8 to INT16 */ \
80 " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(low 4 FiltPtr) */ \
81 " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(high 4 FiltPtr) */ \
82 /* start calculation */ \
83 " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 (low) */ \
84 " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 (high) */ \
85 " movq %%mm0, (%1) \n\t" /* store low 4 INT16 results */ \
86 " movq %%mm2, 8(%1) \n\t" /* store high 4 INT16 results */ \
87 /* Increment pointers */ \
/* sub8x8_128: DctInputPtr[i] = FiltPtr[i] - 128 for an 8x8 block of
   UINT8 pixels (intra blocks have no predictor, so the fixed bias 128
   is removed instead).  PixelsPerLine is the source row stride.
   NOTE(review): opening brace, loop body and output/clobber lists are
   missing here (file truncated) -- verify against upstream. */
92 static void sub8x8_128__mmx (unsigned char *FiltPtr
, ogg_int16_t
*DctInputPtr
,
93 ogg_uint32_t PixelsPerLine
)
95 __asm__
__volatile__ (
/* mm7 = 0 for unpacking; mm1 = 0x0080 in each word lane (V128) */
98 " pxor %%mm7, %%mm7 \n\t"
99 " movq %[V128], %%mm1 \n\t"
110 : "m" (PixelsPerLine
),
/* One row of sub8x8avg2: DctInput = FiltPtr - avg(ReconPtr1, ReconPtr2),
   on INT16-widened pixels; the average is an unsigned truncating
   (sum >> 1) per-word average.  Expects %%mm7 == 0. */
116 #define SUB_AVG2_LOOP \
117 " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ \
118 " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */ \
119 " movq (%2), %%mm4 \n\t" /* mm4 = ReconPtr2 */ \
120 " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */\
121 " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */\
122 " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */\
123 /* convert from UINT8 to INT16 */ \
124 " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ \
125 " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */ \
126 " punpcklbw %%mm7, %%mm4 \n\t" /* mm4 = INT16(ReconPtr2) */ \
127 " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ \
128 " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */ \
129 " punpckhbw %%mm7, %%mm5 \n\t" /* mm5 = INT16(ReconPtr2) */ \
130 /* average ReconPtr1 and ReconPtr2 */ \
131 " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */ \
132 " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */ \
133 " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ \
134 " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ \
135 " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
136 " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
137 " movq %%mm0, (%3) \n\t" /* write answer out */ \
138 " movq %%mm2, 8(%3) \n\t" /* write answer out */ \
139 /* Increment pointers */ \
140 " add $16, %3 \n\t" /* advance DctInput by 8 INT16s */ \
/* sub8x8avg2: DctInputPtr[i] = FiltPtr[i] - avg(ReconPtr1[i],
   ReconPtr2[i]) over an 8x8 block -- the half-pel-interpolated
   predictor case (see SUB_AVG2_LOOP).  Both recon planes share
   ReconPixelsPerLine as their row stride.
   NOTE(review): opening brace, loop body and output/clobber lists are
   missing here (file truncated) -- verify against upstream. */
146 static void sub8x8avg2__mmx (unsigned char *FiltPtr
, unsigned char *ReconPtr1
,
147 unsigned char *ReconPtr2
, ogg_int16_t
*DctInputPtr
,
148 ogg_uint32_t PixelsPerLine
,
149 ogg_uint32_t ReconPixelsPerLine
)
151 __asm__
__volatile__ (
/* mm7 = 0: zero register for the byte->word unpacks */
154 " pxor %%mm7, %%mm7 \n\t"
167 : "m" (PixelsPerLine
),
168 "m" (ReconPixelsPerLine
)
/* row_sad8: for one 8-pixel row, compute the SAD of the low 4 bytes and
   the SAD of the high 4 bytes of |Src1 - Src2|, and return the larger
   of the two (the psubusw/paddw pair below is a branch-free unsigned
   max).  Result is masked to 16 bits.
   NOTE(review): opening brace, output operand list and closing of the
   asm statement are missing here (file truncated). */
173 static ogg_uint32_t
row_sad8__mmx (unsigned char *Src1
, unsigned char *Src2
)
177 __asm__
__volatile__ (
180 " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
181 " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
182 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
183 " movq (%2), %%mm1 \n\t"
/* |A - B| per byte via two saturating subtracts OR'd together */
185 " movq %%mm0, %%mm2 \n\t"
186 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
187 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
188 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
190 " movq %%mm0, %%mm1 \n\t"
192 " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four bytes to higher precision */
193 " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four bytes to higher precision */
/* horizontal add of the four words in each half */
195 " movq %%mm0, %%mm2 \n\t"
196 " movq %%mm1, %%mm3 \n\t"
197 " psrlq $32, %%mm2 \n\t" /* fold and add */
198 " psrlq $32, %%mm3 \n\t"
199 " paddw %%mm2, %%mm0 \n\t"
200 " paddw %%mm3, %%mm1 \n\t"
201 " movq %%mm0, %%mm2 \n\t"
202 " movq %%mm1, %%mm3 \n\t"
203 " psrlq $16, %%mm2 \n\t"
204 " psrlq $16, %%mm3 \n\t"
205 " paddw %%mm2, %%mm0 \n\t"
206 " paddw %%mm3, %%mm1 \n\t"
/* unsigned max of the two half-SADs: max(a,b) = (b -us a) + a */
208 " psubusw %%mm0, %%mm1 \n\t"
209 " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
210 " movd %%mm1, %0 \n\t"
211 " andl $0xffff, %0 \n\t"
/* col_sad8x8: accumulate per-column absolute differences over an 8x8
   block (rows 0-3 into mm4/mm5, rows 4-7 into mm6/mm7), then reduce
   with branch-free unsigned maxima to the largest single-column SAD,
   masked to 16 bits.
   NOTE(review): the stride parameter (operand %3), opening brace, the
   loop labels/branches for the two 4-row loops, and the asm
   output/clobber lists are missing here (file truncated). */
222 static ogg_uint32_t
col_sad8x8__mmx (unsigned char *Src1
, unsigned char *Src2
,
227 __asm__
__volatile__ (
230 " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
231 " pxor %%mm4, %%mm4 \n\t" /* mm4: rows 0-3, low-half column sums */
232 " pxor %%mm5, %%mm5 \n\t" /* mm5: rows 0-3, high-half column sums */
233 " pxor %%mm6, %%mm6 \n\t" /* mm6: rows 4-7, low-half column sums */
234 " pxor %%mm7, %%mm7 \n\t" /* mm7: rows 4-7, high-half column sums */
235 " mov $4, %%edi \n\t" /* 4 rows */
237 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
238 " movq (%2), %%mm1 \n\t" /* take 8 bytes */
240 " movq %%mm0, %%mm2 \n\t"
241 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
242 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
243 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
244 " movq %%mm0, %%mm1 \n\t"
246 " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
247 " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
248 " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
249 " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
250 " add %3, %1 \n\t" /* Inc pointer into the new data */
251 " add %3, %2 \n\t" /* Inc pointer into the new data */
/* second group: rows 4-7, accumulated into mm6/mm7 */
256 " mov $4, %%edi \n\t" /* 4 rows */
258 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
259 " movq (%2), %%mm1 \n\t" /* take 8 bytes */
261 " movq %%mm0, %%mm2 \n\t"
262 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
263 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
264 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
265 " movq %%mm0, %%mm1 \n\t"
267 " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
268 " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
269 " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
270 " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
271 " add %3, %1 \n\t" /* Inc pointer into the new data */
272 " add %3, %2 \n\t" /* Inc pointer into the new data */
/* reduce: unsigned word max via psubusw+paddw, then fold to one word */
277 " psubusw %%mm6, %%mm7 \n\t"
278 " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
279 " psubusw %%mm4, %%mm5 \n\t"
280 " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
281 " psubusw %%mm5, %%mm7 \n\t"
282 " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
283 " movq %%mm7, %%mm6 \n\t"
284 " psrlq $32, %%mm6 \n\t"
285 " psubusw %%mm6, %%mm7 \n\t"
286 " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
287 " movq %%mm7, %%mm6 \n\t"
288 " psrlq $16, %%mm6 \n\t"
289 " psubusw %%mm6, %%mm7 \n\t"
290 " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
291 " movd %%mm7, %0 \n\t"
292 " andl $0xffff, %0 \n\t"
305 " movq (%1), %%mm0 \n\t" /* mm0 = 8 pixels from ptr1 */ \
306 " movq (%2), %%mm1 \n\t" /* mm1 = 8 pixels from ptr2 */ \
307 " movq %%mm0, %%mm2 \n\t" /* keep copy for |A-B| trick */ \
308 " psubusb %%mm1, %%mm0 \n\t" /* A - B */ \
309 " psubusb %%mm2, %%mm1 \n\t" /* B - A */ \
310 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ \
311 " movq %%mm0, %%mm1 \n\t" \
312 " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ \
313 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
314 " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ \
315 " add %3, %1 \n\t" /* Inc pointer into the new data */ \
316 " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ \
317 " add %4, %2 \n\t" /* Inc pointer into ref data */
/* sad8x8: sum of absolute differences between two 8x8 blocks with
   independent row strides.  Word sums accumulate in mm7 (per the SAD
   row macro above), then are folded horizontally; the low 16 bits are
   returned in DiffVal.
   NOTE(review): opening brace, the 8 loop iterations (or loop label),
   the asm output list and the trailing return are missing here (file
   truncated). */
319 static ogg_uint32_t
sad8x8__mmx (unsigned char *ptr1
, ogg_uint32_t stride1
,
320 unsigned char *ptr2
, ogg_uint32_t stride2
)
322 ogg_uint32_t DiffVal
;
324 __asm__
__volatile__ (
326 " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
327 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
/* horizontal fold of the four word sums in mm7 */
336 " movq %%mm7, %%mm0 \n\t"
337 " psrlq $32, %%mm7 \n\t"
338 " paddw %%mm0, %%mm7 \n\t"
339 " movq %%mm7, %%mm0 \n\t"
340 " psrlq $16, %%mm7 \n\t"
341 " paddw %%mm0, %%mm7 \n\t"
342 " movd %%mm7, %0 \n\t"
343 " andl $0xffff, %0 \n\t"
/* sad8x8_thres: thresholded-SAD entry point in the DSP vtable.  This
   MMX version simply delegates to sad8x8__mmx -- the threshold
   argument (its declaration is not visible here; the signature is
   truncated after stride2) is not used for early exit. */
356 static ogg_uint32_t
sad8x8_thres__mmx (unsigned char *ptr1
, ogg_uint32_t stride1
,
357 unsigned char *ptr2
, ogg_uint32_t stride2
,
360 return sad8x8__mmx (ptr1
, stride1
, ptr2
, stride2
);
/* sad8x8_xy2_thres: SAD between a source block and the truncating
   byte-wise average of two reference blocks (half-pel prediction).
   The average avoids byte overflow via the identity
   avg(a,b) = (a & b) + ((a ^ b) >> 1), with 0xfe masking (mm5) so the
   per-byte shift cannot borrow across lanes.
   NOTE(review): the threshold parameter, opening brace, loop
   label/branch, asm output list and trailing return are missing here
   (file truncated). */
363 static ogg_uint32_t
sad8x8_xy2_thres__mmx (unsigned char *SrcData
, ogg_uint32_t SrcStride
,
364 unsigned char *RefDataPtr1
,
365 unsigned char *RefDataPtr2
, ogg_uint32_t RefStride
,
368 ogg_uint32_t DiffVal
;
370 __asm__
__volatile__ (
/* mm5 = 0xfe repeated in every byte: -1 per byte, doubled */
373 " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
374 " paddb %%mm5, %%mm5 \n\t"
376 " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
377 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
378 " mov $8, %%edi \n\t" /* 8 rows */
380 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
382 " movq (%2), %%mm2 \n\t"
383 " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
384 " movq %%mm2, %%mm1 \n\t"
385 " pand %%mm3, %%mm1 \n\t"
386 " pxor %%mm2, %%mm3 \n\t"
387 " pand %%mm5, %%mm3 \n\t"
388 " psrlq $1, %%mm3 \n\t"
389 " paddb %%mm3, %%mm1 \n\t"
391 " movq %%mm0, %%mm2 \n\t"
/* |src - avg| per byte */
393 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
394 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
395 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
396 " movq %%mm0, %%mm1 \n\t"
398 " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
399 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
400 " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
401 " add %4, %1 \n\t" /* Inc pointer into the new data */
402 " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
403 " add %5, %2 \n\t" /* Inc pointer into ref data */
404 " add %5, %3 \n\t" /* Inc pointer into ref data */
/* horizontal fold of the four word sums; keep low 16 bits */
409 " movq %%mm7, %%mm0 \n\t"
410 " psrlq $32, %%mm7 \n\t"
411 " paddw %%mm0, %%mm7 \n\t"
412 " movq %%mm7, %%mm0 \n\t"
413 " psrlq $16, %%mm7 \n\t"
414 " paddw %%mm0, %%mm7 \n\t"
415 " movd %%mm7, %0 \n\t"
416 " andl $0xffff, %0 \n\t"
/* intra8x8_err: population-variance mismatch metric for one 8x8 block:
   returns 64*sum(x^2) - (sum(x))^2, i.e. the variance scaled by 64^2.
   mm5 accumulates word sums of pixels, mm7 accumulates dword sums of
   squares (pmaddwd).  The movsx sign-extends the folded 16-bit XSum.
   NOTE(review): the XSum/XXSum declarations, opening brace, loop
   label/branch and the asm output list are missing here (file
   truncated). */
430 static ogg_uint32_t
intra8x8_err__mmx (unsigned char *DataPtr
, ogg_uint32_t Stride
)
435 __asm__
__volatile__ (
438 " pxor %%mm5, %%mm5 \n\t"
439 " pxor %%mm6, %%mm6 \n\t"
440 " pxor %%mm7, %%mm7 \n\t"
441 " mov $8, %%edi \n\t"
443 " movq (%2), %%mm0 \n\t" /* take 8 bytes */
444 " movq %%mm0, %%mm2 \n\t"
446 " punpcklbw %%mm6, %%mm0 \n\t"
447 " punpckhbw %%mm6, %%mm2 \n\t"
/* XSum += pixels (word lanes) */
449 " paddw %%mm0, %%mm5 \n\t"
450 " paddw %%mm2, %%mm5 \n\t"
/* XXSum += pixels^2 (dword lanes) */
452 " pmaddwd %%mm0, %%mm0 \n\t"
453 " pmaddwd %%mm2, %%mm2 \n\t"
455 " paddd %%mm0, %%mm7 \n\t"
456 " paddd %%mm2, %%mm7 \n\t"
458 " add %3, %2 \n\t" /* Inc pointer into src data */
/* fold XSum to one word, sign-extend, write to %0 */
463 " movq %%mm5, %%mm0 \n\t"
464 " psrlq $32, %%mm5 \n\t"
465 " paddw %%mm0, %%mm5 \n\t"
466 " movq %%mm5, %%mm0 \n\t"
467 " psrlq $16, %%mm5 \n\t"
468 " paddw %%mm0, %%mm5 \n\t"
469 " movd %%mm5, %%edi \n\t"
470 " movsx %%di, %%edi \n\t"
471 " movl %%edi, %0 \n\t"
/* fold XXSum (two dwords) and write to %1 */
473 " movq %%mm7, %%mm0 \n\t"
474 " psrlq $32, %%mm7 \n\t"
475 " paddd %%mm0, %%mm7 \n\t"
476 " movd %%mm7, %1 \n\t"
485 /* Compute population variance as mis-match metric. */
486 return (( (XXSum
<<6) - XSum
*XSum
) );
/* inter8x8_err: population variance of the residual (src - ref) over an
   8x8 block: returns 64*sum(d^2) - (sum(d))^2, where d are signed
   INT16 differences (psubsw).  mm5 holds word sums, mm7 dword sums of
   squares.
   NOTE(review): the XSum/XXSum declarations, opening brace, loop
   label/branch and the asm output list are missing here (file
   truncated). */
489 static ogg_uint32_t
inter8x8_err__mmx (unsigned char *SrcData
, ogg_uint32_t SrcStride
,
490 unsigned char *RefDataPtr
, ogg_uint32_t RefStride
)
495 __asm__
__volatile__ (
498 " pxor %%mm5, %%mm5 \n\t"
499 " pxor %%mm6, %%mm6 \n\t"
500 " pxor %%mm7, %%mm7 \n\t"
501 " mov $8, %%edi \n\t"
503 " movq (%2), %%mm0 \n\t" /* take 8 bytes */
504 " movq (%3), %%mm1 \n\t"
505 " movq %%mm0, %%mm2 \n\t"
506 " movq %%mm1, %%mm3 \n\t"
508 " punpcklbw %%mm6, %%mm0 \n\t"
509 " punpcklbw %%mm6, %%mm1 \n\t"
510 " punpckhbw %%mm6, %%mm2 \n\t"
511 " punpckhbw %%mm6, %%mm3 \n\t"
/* signed differences src - ref */
513 " psubsw %%mm1, %%mm0 \n\t"
514 " psubsw %%mm3, %%mm2 \n\t"
516 " paddw %%mm0, %%mm5 \n\t"
517 " paddw %%mm2, %%mm5 \n\t"
519 " pmaddwd %%mm0, %%mm0 \n\t"
520 " pmaddwd %%mm2, %%mm2 \n\t"
522 " paddd %%mm0, %%mm7 \n\t"
523 " paddd %%mm2, %%mm7 \n\t"
525 " add %4, %2 \n\t" /* Inc pointer into src data */
526 " add %5, %3 \n\t" /* Inc pointer into ref data */
/* fold XSum, sign-extend, then fold XXSum */
531 " movq %%mm5, %%mm0 \n\t"
532 " psrlq $32, %%mm5 \n\t"
533 " paddw %%mm0, %%mm5 \n\t"
534 " movq %%mm5, %%mm0 \n\t"
535 " psrlq $16, %%mm5 \n\t"
536 " paddw %%mm0, %%mm5 \n\t"
537 " movd %%mm5, %%edi \n\t"
538 " movsx %%di, %%edi \n\t"
539 " movl %%edi, %0 \n\t"
541 " movq %%mm7, %%mm0 \n\t"
542 " psrlq $32, %%mm7 \n\t"
543 " paddd %%mm0, %%mm7 \n\t"
544 " movd %%mm7, %1 \n\t"
555 /* Compute and return population variance as mis-match metric. */
556 return (( (XXSum
<<6) - XSum
*XSum
));
/* inter8x8_err_xy2: population variance of the residual against the
   truncating byte average of two reference blocks (half-pel
   prediction): returns 64*sum(d^2) - (sum(d))^2.  Uses the same
   (a & b) + ((a ^ b & 0xfe) >> 1) averaging trick as
   sad8x8_xy2_thres__mmx, with the 0xfe mask kept in mm4.
   NOTE(review): the XSum/XXSum declarations, opening brace, loop
   label/branch and the asm output list are missing here (file
   truncated). */
559 static ogg_uint32_t
inter8x8_err_xy2__mmx (unsigned char *SrcData
, ogg_uint32_t SrcStride
,
560 unsigned char *RefDataPtr1
,
561 unsigned char *RefDataPtr2
, ogg_uint32_t RefStride
)
566 __asm__
__volatile__ (
/* mm4 = 0xfe in every byte */
569 " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
570 " paddb %%mm4, %%mm4 \n\t"
571 " pxor %%mm5, %%mm5 \n\t"
572 " pxor %%mm6, %%mm6 \n\t"
573 " pxor %%mm7, %%mm7 \n\t"
574 " mov $8, %%edi \n\t"
576 " movq (%2), %%mm0 \n\t" /* take 8 bytes */
578 " movq (%3), %%mm2 \n\t"
579 " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
580 " movq %%mm2, %%mm1 \n\t"
581 " pand %%mm3, %%mm1 \n\t"
582 " pxor %%mm2, %%mm3 \n\t"
583 " pand %%mm4, %%mm3 \n\t"
584 " psrlq $1, %%mm3 \n\t"
585 " paddb %%mm3, %%mm1 \n\t"
587 " movq %%mm0, %%mm2 \n\t"
588 " movq %%mm1, %%mm3 \n\t"
590 " punpcklbw %%mm6, %%mm0 \n\t"
591 " punpcklbw %%mm6, %%mm1 \n\t"
592 " punpckhbw %%mm6, %%mm2 \n\t"
593 " punpckhbw %%mm6, %%mm3 \n\t"
/* signed differences src - avg(ref1, ref2) */
595 " psubsw %%mm1, %%mm0 \n\t"
596 " psubsw %%mm3, %%mm2 \n\t"
598 " paddw %%mm0, %%mm5 \n\t"
599 " paddw %%mm2, %%mm5 \n\t"
601 " pmaddwd %%mm0, %%mm0 \n\t"
602 " pmaddwd %%mm2, %%mm2 \n\t"
604 " paddd %%mm0, %%mm7 \n\t"
605 " paddd %%mm2, %%mm7 \n\t"
607 " add %5, %2 \n\t" /* Inc pointer into src data */
608 " add %6, %3 \n\t" /* Inc pointer into ref data */
609 " add %6, %4 \n\t" /* Inc pointer into ref data */
/* fold XSum, sign-extend, then fold XXSum */
614 " movq %%mm5, %%mm0 \n\t"
615 " psrlq $32, %%mm5 \n\t"
616 " paddw %%mm0, %%mm5 \n\t"
617 " movq %%mm5, %%mm0 \n\t"
618 " psrlq $16, %%mm5 \n\t"
619 " paddw %%mm0, %%mm5 \n\t"
620 " movd %%mm5, %%edi \n\t"
621 " movsx %%di, %%edi \n\t"
622 " movl %%edi, %0 \n\t"
624 " movq %%mm7, %%mm0 \n\t"
625 " psrlq $32, %%mm7 \n\t"
626 " paddd %%mm0, %%mm7 \n\t"
627 " movd %%mm7, %1 \n\t"
639 /* Compute and return population variance as mis-match metric. */
640 return (( (XXSum
<<6) - XSum
*XSum
));
/* Restore FPU state after MMX use.  Presumably executes "emms" -- the
   instruction itself is not visible here (file truncated); confirm
   against upstream dsp_mmx.c. */
643 static void restore_fpu (void)
645 __asm__
__volatile__ (
/* Install the MMX implementations above into the DSP function table.
   Called at codec init when MMX is available; each vtable slot is
   overwritten with its accelerated counterpart.
   NOTE(review): the function's braces are not visible here (file
   truncated). */
650 void dsp_mmx_init(DspFunctions
*funcs
)
652 TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
653 funcs
->restore_fpu
= restore_fpu
;
654 funcs
->sub8x8
= sub8x8__mmx
;
655 funcs
->sub8x8_128
= sub8x8_128__mmx
;
656 funcs
->sub8x8avg2
= sub8x8avg2__mmx
;
657 funcs
->row_sad8
= row_sad8__mmx
;
658 funcs
->col_sad8x8
= col_sad8x8__mmx
;
659 funcs
->sad8x8
= sad8x8__mmx
;
660 funcs
->sad8x8_thres
= sad8x8_thres__mmx
;
661 funcs
->sad8x8_xy2_thres
= sad8x8_xy2_thres__mmx
;
662 funcs
->intra8x8_err
= intra8x8_err__mmx
;
663 funcs
->inter8x8_err
= inter8x8_err__mmx
;
664 funcs
->inter8x8_err_xy2
= inter8x8_err_xy2__mmx
;