/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************/
#include "codec_internal.h"
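
/* One row of an 8x8 SAD: psadbw sums the absolute differences of eight byte
 * pairs into a single word, which is accumulated in mm7; %3 and %4 are the
 * row strides used to advance the two pointers. */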
#define SAD_MMXEXT_LOOP \
  "  movq (%1), %%mm0             \n\t" /* take 8 bytes */ \
  "  movq (%2), %%mm1             \n\t" \
  "  psadbw %%mm1, %%mm0          \n\t" \
  "  add %3, %1                   \n\t" /* Inc pointer into the new data */ \
  "  paddw %%mm0, %%mm7           \n\t" /* accumulate difference... */ \
  "  add %4, %2                   \n\t" /* Inc pointer into ref data */
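
/* Full 8x8 SAD between a source block (ptr1/stride1) and a reference block
 * (ptr2/stride2).  The leading rows go through SAD_MMXEXT_LOOP; the final row
 * is handled inline so the pointers are not advanced past the block. */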
static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
                                    unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    "  pxor %%mm7, %%mm7            \n\t" /* mm7 contains the result */
    SAD_MMXEXT_LOOP
    SAD_MMXEXT_LOOP
    SAD_MMXEXT_LOOP
    SAD_MMXEXT_LOOP
    SAD_MMXEXT_LOOP
    SAD_MMXEXT_LOOP
    SAD_MMXEXT_LOOP
    "  movq (%1), %%mm0             \n\t" /* take 8 bytes (last row, no pointer increment) */
    "  movq (%2), %%mm1             \n\t"
    "  psadbw %%mm1, %%mm0          \n\t"
    "  paddw %%mm0, %%mm7           \n\t" /* accumulate difference... */
    "  movd %%mm7, %0               \n\t"
     : "=r" (DiffVal), "+r" (ptr1), "+r" (ptr2)
     : "r" (stride1), "r" (stride2)
     : "memory");

  return DiffVal;
}
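
/* Same per-row SAD step as SAD_MMXEXT_LOOP, kept as a separate macro for the
 * thresholded variant below. */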
#define SAD_TRES_LOOP \
  "  movq (%1), %%mm0             \n\t" /* take 8 bytes */ \
  "  movq (%2), %%mm1             \n\t" \
  "  psadbw %%mm1, %%mm0          \n\t" \
  "  add %3, %1                   \n\t" /* Inc pointer into the new data */ \
  "  paddw %%mm0, %%mm7           \n\t" /* accumulate difference... */ \
  "  add %4, %2                   \n\t" /* Inc pointer into ref data */
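
/* 8x8 SAD with a threshold argument.  This MMXEXT version appears to ignore
 * the threshold and simply accumulates all eight rows. */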
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
                                          unsigned char *ptr2, ogg_uint32_t stride2,
                                          ogg_uint32_t thres) /* thres: assumed threshold parameter, unused here */
{
  ogg_uint32_t DiffVal;
  __asm__ __volatile__ (
    "  pxor %%mm7, %%mm7            \n\t" /* mm7 contains the result */
    SAD_TRES_LOOP
    SAD_TRES_LOOP
    SAD_TRES_LOOP
    SAD_TRES_LOOP
    SAD_TRES_LOOP
    SAD_TRES_LOOP
    SAD_TRES_LOOP
    SAD_TRES_LOOP
    "  movd %%mm7, %0               \n\t"
     : "=r" (DiffVal), "+r" (ptr1), "+r" (ptr2)
     : "r" (stride1), "r" (stride2)
     : "memory");
  return DiffVal;
}
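
/* One row of a half-pel SAD: pavgb averages the two reference rows (%2 and %3)
 * before psadbw compares the result against the source row (%1). */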
#define SAD_XY2_TRES \
  "  movq (%1), %%mm0             \n\t" /* take 8 bytes */ \
  "  movq (%2), %%mm1             \n\t" \
  "  movq (%3), %%mm2             \n\t" \
  "  pavgb %%mm2, %%mm1           \n\t" \
  "  psadbw %%mm1, %%mm0          \n\t" \
  "  add %4, %1                   \n\t" /* Inc pointer into the new data */ \
  "  paddw %%mm0, %%mm7           \n\t" /* accumulate difference... */ \
  "  add %5, %2                   \n\t" /* Inc pointer into ref data */ \
  "  add %5, %3                   \n\t" /* Inc pointer into ref data */
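
/* 8x8 SAD of the source block against the byte-wise average of two reference
 * predictors (the xy2 half-pel case).  As above, the trailing threshold
 * argument (assumed here) is not consulted. */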
static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                              unsigned char *RefDataPtr1,
                                              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                              ogg_uint32_t thres) /* thres: assumed threshold parameter, unused here */
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    "  pxor %%mm7, %%mm7            \n\t" /* mm7 contains the result */
    SAD_XY2_TRES
    SAD_XY2_TRES
    SAD_XY2_TRES
    SAD_XY2_TRES
    SAD_XY2_TRES
    SAD_XY2_TRES
    SAD_XY2_TRES
    SAD_XY2_TRES
    "  movd %%mm7, %0               \n\t"
     : "=m" (DiffVal), "+r" (SrcData), "+r" (RefDataPtr1), "+r" (RefDataPtr2)
     : "m" (SrcStride), "m" (RefStride)
     : "memory");

  return DiffVal;
}
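
/* SAD of a single 8-pixel row, computed as two 4-byte psadbw halves; the
 * larger of the two partial sums is returned. */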
static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    "  movd (%1), %%mm0             \n\t" /* low 4 bytes of each row */
    "  movd (%2), %%mm1             \n\t"
    "  psadbw %%mm0, %%mm1          \n\t"
    "  movd 4(%1), %%mm2            \n\t" /* high 4 bytes of each row */
    "  movd 4(%2), %%mm3            \n\t"
    "  psadbw %%mm2, %%mm3          \n\t"
    "  pmaxsw %%mm1, %%mm3          \n\t" /* keep the larger of the two partial SADs */
    "  movd %%mm3, %0               \n\t"
    "  andl $0xffff, %0             \n\t"
     : "=r" (MaxSad), "+r" (Src1), "+r" (Src2)
     :
     : "memory");

  return MaxSad;
}
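
/* Per-column absolute differences over an 8x8 block: the first four rows are
 * accumulated into mm4/mm5, the last four into mm6/mm7, and the largest
 * per-column sum is folded down with pmaxsw and returned. */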
static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
                                        ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    "  pxor %%mm3, %%mm3            \n\t" /* zero out mm3 for unpack */
    "  pxor %%mm4, %%mm4            \n\t" /* mm4 low sum */
    "  pxor %%mm5, %%mm5            \n\t" /* mm5 high sum */
    "  pxor %%mm6, %%mm6            \n\t" /* mm6 low sum */
    "  pxor %%mm7, %%mm7            \n\t" /* mm7 high sum */
    "  mov $4, %%edi                \n\t" /* 4 rows */
    "1:                             \n\t"
    "  movq (%1), %%mm0             \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1             \n\t" /* take 8 bytes */
    "  movq %%mm0, %%mm2            \n\t"
    "  psubusb %%mm1, %%mm0         \n\t" /* A - B */
    "  psubusb %%mm2, %%mm1         \n\t" /* B - A */
    "  por %%mm1, %%mm0             \n\t" /* and or gives abs difference */
    "  movq %%mm0, %%mm1            \n\t"
    "  punpcklbw %%mm3, %%mm0       \n\t" /* unpack to higher precision for accumulation */
    "  paddw %%mm0, %%mm4           \n\t" /* accumulate difference... */
    "  punpckhbw %%mm3, %%mm1       \n\t" /* unpack high four bytes to higher precision */
    "  paddw %%mm1, %%mm5           \n\t" /* accumulate difference... */
    "  add %3, %1                   \n\t" /* Inc pointer into the new data */
    "  add %3, %2                   \n\t" /* Inc pointer into the new data */
    "  dec %%edi                    \n\t"
    "  jnz 1b                       \n\t"

    "  mov $4, %%edi                \n\t" /* 4 rows */
    "2:                             \n\t"
    "  movq (%1), %%mm0             \n\t" /* take 8 bytes */
    "  movq (%2), %%mm1             \n\t" /* take 8 bytes */
    "  movq %%mm0, %%mm2            \n\t"
    "  psubusb %%mm1, %%mm0         \n\t" /* A - B */
    "  psubusb %%mm2, %%mm1         \n\t" /* B - A */
    "  por %%mm1, %%mm0             \n\t" /* and or gives abs difference */
    "  movq %%mm0, %%mm1            \n\t"
    "  punpcklbw %%mm3, %%mm0       \n\t" /* unpack to higher precision for accumulation */
    "  paddw %%mm0, %%mm6           \n\t" /* accumulate difference... */
    "  punpckhbw %%mm3, %%mm1       \n\t" /* unpack high four bytes to higher precision */
    "  paddw %%mm1, %%mm7           \n\t" /* accumulate difference... */
    "  add %3, %1                   \n\t" /* Inc pointer into the new data */
    "  add %3, %2                   \n\t" /* Inc pointer into the new data */
    "  dec %%edi                    \n\t"
    "  jnz 2b                       \n\t"

    "  pmaxsw %%mm6, %%mm7          \n\t" /* element-wise max of the column sums */
    "  pmaxsw %%mm4, %%mm5          \n\t"
    "  pmaxsw %%mm5, %%mm7          \n\t"
    "  movq %%mm7, %%mm6            \n\t" /* fold the remaining words down to one max */
    "  psrlq $32, %%mm6             \n\t"
    "  pmaxsw %%mm6, %%mm7          \n\t"
    "  movq %%mm7, %%mm6            \n\t"
    "  psrlq $16, %%mm6             \n\t"
    "  pmaxsw %%mm6, %%mm7          \n\t"
    "  movd %%mm7, %0               \n\t"
    "  andl $0xffff, %0             \n\t"
     : "=r" (MaxSad), "+r" (Src1), "+r" (Src2)
     : "r" (stride)
     : "memory", "%edi");

  return MaxSad;
}
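
/* Sum (XSum) and sum of squares (XXSum) of the differences between the source
 * block and the pavgb average of two reference predictors, combined below into
 * a variance-style mismatch metric. */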
static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                              unsigned char *RefDataPtr1,
                                              unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  pxor %%mm4, %%mm4            \n\t"
    "  pxor %%mm5, %%mm5            \n\t"
    "  pxor %%mm6, %%mm6            \n\t"
    "  pxor %%mm7, %%mm7            \n\t"
    "  mov $8, %%edi                \n\t"
    "1:                             \n\t"
    "  movq (%2), %%mm0             \n\t" /* take 8 bytes */
    "  movq (%3), %%mm2             \n\t"
    "  movq (%4), %%mm1             \n\t" /* take average of mm2 and mm1 */
    "  pavgb %%mm2, %%mm1           \n\t"

    "  movq %%mm0, %%mm2            \n\t"
    "  movq %%mm1, %%mm3            \n\t"

    "  punpcklbw %%mm6, %%mm0       \n\t"
    "  punpcklbw %%mm4, %%mm1       \n\t"
    "  punpckhbw %%mm6, %%mm2       \n\t"
    "  punpckhbw %%mm4, %%mm3       \n\t"

    "  psubsw %%mm1, %%mm0          \n\t"
    "  psubsw %%mm3, %%mm2          \n\t"

    "  paddw %%mm0, %%mm5           \n\t"
    "  paddw %%mm2, %%mm5           \n\t"

    "  pmaddwd %%mm0, %%mm0         \n\t"
    "  pmaddwd %%mm2, %%mm2         \n\t"

    "  paddd %%mm0, %%mm7           \n\t"
    "  paddd %%mm2, %%mm7           \n\t"

    "  add %5, %2                   \n\t" /* Inc pointer into src data */
    "  add %6, %3                   \n\t" /* Inc pointer into ref data */
    "  add %6, %4                   \n\t" /* Inc pointer into ref data */
    "  dec %%edi                    \n\t"
    "  jnz 1b                       \n\t"

    "  movq %%mm5, %%mm0            \n\t" /* fold the four signed word sums in mm5 */
    "  psrlq $32, %%mm5             \n\t"
    "  paddw %%mm0, %%mm5           \n\t"
    "  movq %%mm5, %%mm0            \n\t"
    "  psrlq $16, %%mm5             \n\t"
    "  paddw %%mm0, %%mm5           \n\t"
    "  movd %%mm5, %%edi            \n\t"
    "  movsx %%di, %%edi            \n\t" /* sign-extend the 16-bit sum of differences */
    "  movl %%edi, %0               \n\t"

    "  movq %%mm7, %%mm0            \n\t" /* fold the two dword sums of squares in mm7 */
    "  psrlq $32, %%mm7             \n\t"
    "  paddd %%mm0, %%mm7           \n\t"
    "  movd %%mm7, %1               \n\t"
     : "=m" (XSum), "=m" (XXSum),
       "+r" (SrcData), "+r" (RefDataPtr1), "+r" (RefDataPtr2)
     : "m" (SrcStride), "m" (RefStride)
     : "memory", "%edi");

  /* Compute and return population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum ));
}
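
/* Hook the MMXEXT implementations into the shared DSP function table. */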
void dsp_mmxext_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_32 mmxext dsp functions.\n");
  funcs->row_sad8 = row_sad8__mmxext;
  funcs->col_sad8x8 = col_sad8x8__mmxext;
  funcs->sad8x8 = sad8x8__mmxext;
  funcs->sad8x8_thres = sad8x8_thres__mmxext;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
}