/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id$

 ********************************************************************/
#include <stdlib.h>

#include "codec_internal.h"
#include "dsp.h"

#if defined(USE_ASM)

static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;

#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
#define SUB_LOOP \
"  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */ \
"  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr */ \
"  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */ \
"  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */ \
  /* convert from UINT8 to INT16 */ \
"  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */ \
"  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr) */ \
"  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */ \
"  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr) */ \
  /* start calculation */ \
"  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr */ \
"  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr */ \
"  movq        %%mm0, (%2)      \n\t" /* write answer out */ \
"  movq        %%mm2, 8(%2)     \n\t" /* write answer out */ \
  /* increment pointers */ \
"  add         $16, %2          \n\t" \
"  add         %3, %0           \n\t" \
"  add         %4, %1           \n\t"
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                         ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
                         ogg_uint32_t ReconPixelsPerLine)
{
  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm7, %%mm7     \n\t"
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    SUB_LOOP
    : "+r" (FiltPtr),
      "+r" (ReconPtr),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      "m" (ReconPixelsPerLine)
    : "memory"
  );
}
#define SUB_128_LOOP \
"  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */ \
"  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */ \
  /* convert from UINT8 to INT16 */ \
"  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */ \
"  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */ \
  /* start calculation */ \
"  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */ \
"  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */ \
"  movq        %%mm0, (%1)      \n\t" /* write answer out */ \
"  movq        %%mm2, 8(%1)     \n\t" /* write answer out */ \
  /* increment pointers */ \
"  add         $16, %1          \n\t" \
"  add         %2, %0           \n\t"
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
                             ogg_uint32_t PixelsPerLine)
{
  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm7, %%mm7     \n\t"
    "  movq        %[V128], %%mm1   \n\t"
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    SUB_128_LOOP
    : "+r" (FiltPtr),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      [V128] "m" (V128)
    : "memory"
  );
}
#define SUB_AVG2_LOOP \
"  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */ \
"  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */ \
"  movq        (%2), %%mm4      \n\t" /* mm4 = ReconPtr2 */ \
"  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */ \
"  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */ \
"  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */ \
  /* convert from UINT8 to INT16 */ \
"  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */ \
"  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1) */ \
"  punpcklbw   %%mm7, %%mm4     \n\t" /* mm4 = INT16(ReconPtr2) */ \
"  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */ \
"  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1) */ \
"  punpckhbw   %%mm7, %%mm5     \n\t" /* mm5 = INT16(ReconPtr2) */ \
  /* average ReconPtr1 and ReconPtr2 */ \
"  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */ \
"  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */ \
"  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ \
"  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ \
"  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
"  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ \
"  movq        %%mm0, (%3)      \n\t" /* write answer out */ \
"  movq        %%mm2, 8(%3)     \n\t" /* write answer out */ \
  /* increment pointers */ \
"  add         $16, %3          \n\t" \
"  add         %4, %0           \n\t" \
"  add         %5, %1           \n\t" \
"  add         %5, %2           \n\t"
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
                             unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
                             ogg_uint32_t PixelsPerLine,
                             ogg_uint32_t ReconPixelsPerLine)
{
  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm7, %%mm7     \n\t"
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    SUB_AVG2_LOOP
    : "+r" (FiltPtr),
      "+r" (ReconPtr1),
      "+r" (ReconPtr2),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      "m" (ReconPixelsPerLine)
    : "memory"
  );
}
static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm6, %%mm6     \n\t" /* zero out mm6 for unpack */
    "  pxor        %%mm7, %%mm7     \n\t" /* zero out mm7 for unpack */
    "  movq        (%1), %%mm0      \n\t" /* take 8 bytes */
    "  movq        (%2), %%mm1      \n\t"

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb     %%mm2, %%mm1     \n\t" /* B - A */
    "  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */

    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t" /* unpack low four bytes to higher precision */
    "  punpckhbw   %%mm7, %%mm1     \n\t" /* unpack high four bytes to higher precision */

    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"
    "  psrlq       $32, %%mm2       \n\t" /* fold and add */
    "  psrlq       $32, %%mm3       \n\t"
    "  paddw       %%mm2, %%mm0     \n\t"
    "  paddw       %%mm3, %%mm1     \n\t"
    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"
    "  psrlq       $16, %%mm2       \n\t"
    "  psrlq       $16, %%mm3       \n\t"
    "  paddw       %%mm2, %%mm0     \n\t"
    "  paddw       %%mm3, %%mm1     \n\t"

    "  psubusw     %%mm0, %%mm1     \n\t"
    "  paddw       %%mm0, %%mm1     \n\t" /* mm1 = max(mm1, mm0) */
    "  movd        %%mm1, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

    : "=m" (MaxSad),
      "+r" (Src1),
      "+r" (Src2)
    :
    : "memory"
  );

  return MaxSad;
}
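
/* What the fold-and-max above computes, as a scalar sketch (hypothetical
   reference, compiled out): the SADs of bytes 0..3 and bytes 4..7 are
   formed separately and the larger half-row sum is returned: */
#if 0
static ogg_uint32_t row_sad8__c_ref (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t SadLow = 0, SadHigh = 0;
  int i;
  for (i = 0; i < 4; i++) SadLow += DSP_OP_ABS_DIFF(Src1[i], Src2[i]);
  for (i = 4; i < 8; i++) SadHigh += DSP_OP_ABS_DIFF(Src1[i], Src2[i]);
  return SadLow > SadHigh ? SadLow : SadHigh;
}
#endif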
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
                                     ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm3, %%mm3     \n\t" /* zero out mm3 for unpack */
    "  pxor        %%mm4, %%mm4     \n\t" /* mm4 low sum (rows 0-3) */
    "  pxor        %%mm5, %%mm5     \n\t" /* mm5 high sum (rows 0-3) */
    "  pxor        %%mm6, %%mm6     \n\t" /* mm6 low sum (rows 4-7) */
    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 high sum (rows 4-7) */
    "  mov         $4, %%edi        \n\t" /* 4 rows */
    "1:                             \n\t"
    "  movq        (%1), %%mm0      \n\t" /* take 8 bytes */
    "  movq        (%2), %%mm1      \n\t" /* take 8 bytes */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb     %%mm2, %%mm1     \n\t" /* B - A */
    "  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t" /* unpack to higher precision for accumulation */
    "  paddw       %%mm0, %%mm4     \n\t" /* accumulate difference... */
    "  punpckhbw   %%mm3, %%mm1     \n\t" /* unpack high four bytes to higher precision */
    "  paddw       %%mm1, %%mm5     \n\t" /* accumulate difference... */
    "  add         %3, %1           \n\t" /* Inc pointer into the new data */
    "  add         %3, %2           \n\t" /* Inc pointer into the new data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  mov         $4, %%edi        \n\t" /* 4 rows */
    "2:                             \n\t"
    "  movq        (%1), %%mm0      \n\t" /* take 8 bytes */
    "  movq        (%2), %%mm1      \n\t" /* take 8 bytes */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb     %%mm2, %%mm1     \n\t" /* B - A */
    "  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t" /* unpack to higher precision for accumulation */
    "  paddw       %%mm0, %%mm6     \n\t" /* accumulate difference... */
    "  punpckhbw   %%mm3, %%mm1     \n\t" /* unpack high four bytes to higher precision */
    "  paddw       %%mm1, %%mm7     \n\t" /* accumulate difference... */
    "  add         %3, %1           \n\t" /* Inc pointer into the new data */
    "  add         %3, %2           \n\t" /* Inc pointer into the new data */

    "  dec         %%edi            \n\t"
    "  jnz         2b               \n\t"

    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t" /* mm7 = max(mm7, mm6) */
    "  psubusw     %%mm4, %%mm5     \n\t"
    "  paddw       %%mm4, %%mm5     \n\t" /* mm5 = max(mm5, mm4) */
    "  psubusw     %%mm5, %%mm7     \n\t"
    "  paddw       %%mm5, %%mm7     \n\t" /* mm7 = max(mm5, mm7) */
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $32, %%mm6       \n\t"
    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t" /* mm7 = max(mm7, mm7 >> 32) */
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $16, %%mm6       \n\t"
    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t" /* mm7 = max(mm7, mm7 >> 16) */
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

    : "=r" (MaxSad),
      "+r" (Src1),
      "+r" (Src2)
    : "r" (stride)
    : "memory", "edi"
  );

  return MaxSad;
}
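
/* Scalar sketch (hypothetical reference, compiled out): each column's
   absolute differences are summed over the top four and bottom four rows
   separately, and the largest of the sixteen 4-row column sums is
   returned: */
#if 0
static ogg_uint32_t col_sad8x8__c_ref (unsigned char *Src1, unsigned char *Src2,
                                       ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad = 0;
  int half, row, col;
  for (half = 0; half < 2; half++) {       /* rows 0-3, then rows 4-7 */
    ogg_uint32_t ColSad[8] = { 0 };
    for (row = 0; row < 4; row++) {
      for (col = 0; col < 8; col++)
        ColSad[col] += DSP_OP_ABS_DIFF(Src1[col], Src2[col]);
      Src1 += stride;
      Src2 += stride;
    }
    for (col = 0; col < 8; col++)
      if (ColSad[col] > MaxSad) MaxSad = ColSad[col];
  }
  return MaxSad;
}
#endif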
#define SAD_LOOP \
"  movq        (%1), %%mm0      \n\t" /* take 8 bytes */ \
"  movq        (%2), %%mm1      \n\t" \
"  movq        %%mm0, %%mm2     \n\t" \
"  psubusb     %%mm1, %%mm0     \n\t" /* A - B */ \
"  psubusb     %%mm2, %%mm1     \n\t" /* B - A */ \
"  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */ \
"  movq        %%mm0, %%mm1     \n\t" \
"  punpcklbw   %%mm6, %%mm0     \n\t" /* unpack to higher precision for accumulation */ \
"  paddw       %%mm0, %%mm7     \n\t" /* accumulate difference... */ \
"  punpckhbw   %%mm6, %%mm1     \n\t" /* unpack high four bytes to higher precision */ \
"  add         %3, %1           \n\t" /* Inc pointer into the new data */ \
"  paddw       %%mm1, %%mm7     \n\t" /* accumulate difference... */ \
"  add         %4, %2           \n\t" /* Inc pointer into ref data */
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                                 unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"
    "  pxor        %%mm6, %%mm6     \n\t" /* zero out mm6 for unpack */
    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 contains the result */
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    SAD_LOOP
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $16, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

    : "=m" (DiffVal),
      "+r" (ptr1),
      "+r" (ptr2)
    : "r" (stride1),
      "r" (stride2)
    : "memory"
  );

  return DiffVal;
}
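
/* Scalar sketch of the plain 8x8 SAD (hypothetical reference, compiled
   out); note the worst case, 64 * 255 = 16320, still fits in the 16-bit
   word lanes the MMX version accumulates in: */
#if 0
static ogg_uint32_t sad8x8__c_ref (unsigned char *ptr1, ogg_uint32_t stride1,
                                   unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t sad = 0;
  int i, j;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      sad += DSP_OP_ABS_DIFF(ptr1[j], ptr2[j]);
    ptr1 += stride1;
    ptr2 += stride2;
  }
  return sad;
}
#endif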
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                                       unsigned char *ptr2, ogg_uint32_t stride2,
                                       ogg_uint32_t thres)
{
  /* The early-out threshold is ignored; the full 8x8 SAD is computed. */
  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
}
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                           unsigned char *RefDataPtr1,
                                           unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                           ogg_uint32_t thres)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pcmpeqd     %%mm5, %%mm5     \n\t" /* fefefefefefefefe in mm5 */
    "  paddb       %%mm5, %%mm5     \n\t"

    "  pxor        %%mm6, %%mm6     \n\t" /* zero out mm6 for unpack */
    "  pxor        %%mm7, %%mm7     \n\t" /* mm7 contains the result */
    "  mov         $8, %%edi        \n\t" /* 8 rows */
    "1:                             \n\t"
    "  movq        (%1), %%mm0      \n\t" /* take 8 bytes */

    "  movq        (%2), %%mm2      \n\t"
    "  movq        (%3), %%mm3      \n\t" /* take average of mm2 and mm3 */
    "  movq        %%mm2, %%mm1     \n\t"
    "  pand        %%mm3, %%mm1     \n\t"
    "  pxor        %%mm2, %%mm3     \n\t"
    "  pand        %%mm5, %%mm3     \n\t"
    "  psrlq       $1, %%mm3        \n\t"
    "  paddb       %%mm3, %%mm1     \n\t"

    "  movq        %%mm0, %%mm2     \n\t"

    "  psubusb     %%mm1, %%mm0     \n\t" /* A - B */
    "  psubusb     %%mm2, %%mm1     \n\t" /* B - A */
    "  por         %%mm1, %%mm0     \n\t" /* the OR of the two gives the abs difference */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t" /* unpack to higher precision for accumulation */
    "  paddw       %%mm0, %%mm7     \n\t" /* accumulate difference... */
    "  punpckhbw   %%mm6, %%mm1     \n\t" /* unpack high four bytes to higher precision */
    "  add         %4, %1           \n\t" /* Inc pointer into the new data */
    "  paddw       %%mm1, %%mm7     \n\t" /* accumulate difference... */
    "  add         %5, %2           \n\t" /* Inc pointer into ref data */
    "  add         %5, %3           \n\t" /* Inc pointer into ref data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $16, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

    : "=m" (DiffVal),
      "+r" (SrcData),
      "+r" (RefDataPtr1),
      "+r" (RefDataPtr2)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* As with sad8x8_thres__mmx, the early-out threshold is not used. */
  return DiffVal;
}
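
/* The pavgb-free byte average used above relies on the identity
   (a + b) / 2 == (a & b) + (((a ^ b) & 0xfe) >> 1), which never carries
   out of a byte lane. A scalar sketch of the trick (hypothetical helper,
   compiled out): */
#if 0
static unsigned char avg2_bytes (unsigned char a, unsigned char b)
{
  /* a & b keeps the bits common to both; (a ^ b) >> 1 contributes half of
     the differing bits; the & 0xfe before the shift models the packed
     case, where it stops a bit shifting in from the neighbouring lane. */
  return (unsigned char)((a & b) + (((a ^ b) & 0xfe) >> 1));
}
#endif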
static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm5, %%mm5     \n\t"
    "  pxor        %%mm6, %%mm6     \n\t"
    "  pxor        %%mm7, %%mm7     \n\t"
    "  mov         $8, %%edi        \n\t"
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t" /* take 8 bytes */
    "  movq        %%mm0, %%mm2     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"

    "  paddw       %%mm0, %%mm5     \n\t" /* accumulate X */
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t" /* accumulate X^2 */
    "  pmaddwd     %%mm2, %%mm2     \n\t"

    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %3, %2           \n\t" /* Inc pointer into src data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t" /* sign-extend the 16-bit sum */
    "  movl        %%edi, %0        \n\t"

    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

    : "=r" (XSum),
      "=r" (XXSum),
      "+r" (DataPtr)
    : "r" (Stride)
    : "edi", "memory"
  );

  /* Compute population variance as mis-match metric. */
  return ((XXSum<<6) - XSum*XSum);
}
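
/* Why (XXSum<<6) - XSum*XSum measures variance: with N = 64 samples,
   sigma^2 = XXSum/N - (XSum/N)^2, so
   N^2 * sigma^2 = N*XXSum - XSum^2 = (XXSum<<6) - XSum*XSum.
   The 64^2 scale factor is the same for every block, so the metric still
   orders blocks by population variance. */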
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                       unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pxor        %%mm5, %%mm5     \n\t"
    "  pxor        %%mm6, %%mm6     \n\t"
    "  pxor        %%mm7, %%mm7     \n\t"
    "  mov         $8, %%edi        \n\t"
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t" /* take 8 bytes */
    "  movq        (%3), %%mm1      \n\t"
    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"
    "  punpcklbw   %%mm6, %%mm1     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"
    "  punpckhbw   %%mm6, %%mm3     \n\t"

    "  psubsw      %%mm1, %%mm0     \n\t"
    "  psubsw      %%mm3, %%mm2     \n\t"

    "  paddw       %%mm0, %%mm5     \n\t" /* accumulate X */
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t" /* accumulate X^2 */
    "  pmaddwd     %%mm2, %%mm2     \n\t"

    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %4, %2           \n\t" /* Inc pointer into src data */
    "  add         %5, %3           \n\t" /* Inc pointer into ref data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t" /* sign-extend the 16-bit sum */
    "  movl        %%edi, %0        \n\t"

    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

    : "=m" (XSum),
      "=m" (XXSum),
      "+r" (SrcData),
      "+r" (RefDataPtr)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  return ((XXSum<<6) - XSum*XSum);
}
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                           unsigned char *RefDataPtr1,
                                           unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  .p2align 4                   \n\t"

    "  pcmpeqd     %%mm4, %%mm4     \n\t" /* fefefefefefefefe in mm4 */
    "  paddb       %%mm4, %%mm4     \n\t"
    "  pxor        %%mm5, %%mm5     \n\t"
    "  pxor        %%mm6, %%mm6     \n\t"
    "  pxor        %%mm7, %%mm7     \n\t"
    "  mov         $8, %%edi        \n\t"
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t" /* take 8 bytes */

    "  movq        (%3), %%mm2      \n\t"
    "  movq        (%4), %%mm3      \n\t" /* take average of mm2 and mm3 */
    "  movq        %%mm2, %%mm1     \n\t"
    "  pand        %%mm3, %%mm1     \n\t"
    "  pxor        %%mm2, %%mm3     \n\t"
    "  pand        %%mm4, %%mm3     \n\t"
    "  psrlq       $1, %%mm3        \n\t"
    "  paddb       %%mm3, %%mm1     \n\t"

    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"
    "  punpcklbw   %%mm6, %%mm1     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"
    "  punpckhbw   %%mm6, %%mm3     \n\t"

    "  psubsw      %%mm1, %%mm0     \n\t"
    "  psubsw      %%mm3, %%mm2     \n\t"

    "  paddw       %%mm0, %%mm5     \n\t" /* accumulate X */
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t" /* accumulate X^2 */
    "  pmaddwd     %%mm2, %%mm2     \n\t"

    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %5, %2           \n\t" /* Inc pointer into src data */
    "  add         %6, %3           \n\t" /* Inc pointer into ref data */
    "  add         %6, %4           \n\t" /* Inc pointer into ref data */

    "  dec         %%edi            \n\t"
    "  jnz         1b               \n\t"

    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t" /* sign-extend the 16-bit sum */
    "  movl        %%edi, %0        \n\t"

    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

    : "=m" (XSum),
      "=m" (XXSum),
      "+r" (SrcData),
      "+r" (RefDataPtr1),
      "+r" (RefDataPtr2)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  return ((XXSum<<6) - XSum*XSum);
}
static void restore_fpu (void)
{
  __asm__ __volatile__ (
    "  emms \n\t" /* clear the MMX state so the x87 FPU can be used again */
  );
}
void dsp_mmx_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_32 mmx dsp functions.\n");
  funcs->restore_fpu = restore_fpu;
  funcs->sub8x8 = sub8x8__mmx;
  funcs->sub8x8_128 = sub8x8_128__mmx;
  funcs->sub8x8avg2 = sub8x8avg2__mmx;
  funcs->row_sad8 = row_sad8__mmx;
  funcs->col_sad8x8 = col_sad8x8__mmx;
  funcs->sad8x8 = sad8x8__mmx;
  funcs->sad8x8_thres = sad8x8_thres__mmx;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
  funcs->intra8x8_err = intra8x8_err__mmx;
  funcs->inter8x8_err = inter8x8_err__mmx;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
}
#endif /* USE_ASM */