1 /********************************************************************
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
11 ********************************************************************
16 ********************************************************************/
20 #include "codec_internal.h"
/* Alias for unsigned long long. The functions below cast their ogg_uint32_t
 * stride arguments to this type before passing them as asm input operands. */
23 typedef unsigned long long ogg_uint64_t
;
/*
 * sad8x8__mmxext: sum of absolute differences between 8-byte rows read
 * through ptr1 and ptr2, accumulated in mm7 and moved out through asm
 * operand 0.
 *
 * NOTE(review): this listing looks incomplete. The leading number on each
 * line appears to be an original line number, and those numbers skip
 * (32 to 35, 40 to 43, 47 to 52). The function braces, any local
 * declarations, any assembler repeat or loop directives, the output operand
 * list, the clobber list and the return statement are not visible here.
 * Confirm against the complete file before changing this function.
 */
25 static ogg_uint32_t
sad8x8__mmxext (unsigned char *ptr1
, ogg_uint32_t stride1
,
26 unsigned char *ptr2
, ogg_uint32_t stride2
)
30 __asm__
__volatile__ (
32 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
/* Row step: load 8 bytes from each block; psadbw leaves their sum of
 * absolute differences in mm0, which is added to mm7. Both pointers then
 * advance by operands 3 and 4 (presumably stride1 and stride2, the only
 * input operands shown - TODO confirm once the outputs are visible). */
35 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
36 " movq (%2), %%mm1 \n\t"
37 " psadbw %%mm1, %%mm0 \n\t"
38 " add %3, %1 \n\t" /* Inc pointer into the new data */
39 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
40 " add %4, %2 \n\t" /* Inc pointer into ref data */
/* Last visible row: same load / psadbw / accumulate, with no pointer
 * advance afterwards. */
43 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
44 " movq (%2), %%mm1 \n\t"
45 " psadbw %%mm1, %%mm0 \n\t"
46 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
/* Move the low 32 bits of the accumulator to operand 0. */
47 " movd %%mm7, %0 \n\t"
/* Input operands: both strides, cast to ogg_uint64_t. */
52 : "r" ((ogg_uint64_t
)stride1
),
53 "r" ((ogg_uint64_t
)stride2
)
/*
 * sad8x8_thres__mmxext: sum of absolute differences between 8-byte rows
 * read through ptr1 and ptr2, accumulated in mm7 and moved out through asm
 * operand 0.
 *
 * NOTE(review): this listing looks incomplete. The parameter list ends on a
 * dangling comma after stride2, so at least one further parameter is not
 * visible (the name suggests a threshold - TODO confirm). The embedded
 * original line numbers also skip (61 to 66, 76 to 79, 79 to 84), so the
 * braces, any repeat or loop directives, the output operands, the clobber
 * list and the return statement are missing from this view. No use of a
 * threshold is visible in the asm shown here.
 */
60 static ogg_uint32_t
sad8x8_thres__mmxext (unsigned char *ptr1
, ogg_uint32_t stride1
,
61 unsigned char *ptr2
, ogg_uint32_t stride2
,
66 __asm__
__volatile__ (
68 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
/* Row step: psadbw of 8 bytes from each block, added to mm7; the pointers
 * advance by operands 3 and 4 (presumably stride1 and stride2 - TODO
 * confirm). */
71 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
72 " movq (%2), %%mm1 \n\t"
73 " psadbw %%mm1, %%mm0 \n\t"
74 " add %3, %1 \n\t" /* Inc pointer into the new data */
75 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
76 " add %4, %2 \n\t" /* Inc pointer into ref data */
/* Move the low 32 bits of the accumulator to operand 0. */
79 " movd %%mm7, %0 \n\t"
/* Input operands: both strides, cast to ogg_uint64_t. */
84 : "r" ((ogg_uint64_t
)stride1
),
85 "r" ((ogg_uint64_t
)stride2
)
/*
 * sad8x8_xy2_thres__mmxext: sum of absolute differences between 8-byte
 * source rows and the byte-wise average (pavgb) of two reference rows,
 * accumulated in mm7 and moved out through asm operand 0.
 *
 * NOTE(review): this listing looks incomplete. The parameter list ends on a
 * dangling comma after RefStride, so at least one further parameter is not
 * visible (the name suggests a threshold - TODO confirm). The embedded
 * original line numbers skip (94 to 99, 101 to 103, 107 to 109, 112 to 115,
 * 115 to 120), so the braces, any repeat or loop directives, the output
 * operands, the clobber list and the return statement are missing from this
 * view.
 */
92 static ogg_uint32_t
sad8x8_xy2_thres__mmxext (unsigned char *SrcData
, ogg_uint32_t SrcStride
,
93 unsigned char *RefDataPtr1
,
94 unsigned char *RefDataPtr2
, ogg_uint32_t RefStride
,
99 __asm__
__volatile__ (
101 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
/* Row step: operand 1 supplies the row compared against the average of the
 * rows at operands 2 and 3 (presumably SrcData, RefDataPtr1 and RefDataPtr2
 * in that order - TODO confirm against the output operand list). */
103 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
104 " movq (%2), %%mm1 \n\t"
105 " movq (%3), %%mm2 \n\t"
106 " pavgb %%mm2, %%mm1 \n\t"
107 " psadbw %%mm1, %%mm0 \n\t"
/* Operand 4 advances the first pointer; operand 5 advances both of the
 * averaged pointers by the same amount. */
109 " add %4, %1 \n\t" /* Inc pointer into the new data */
110 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
111 " add %5, %2 \n\t" /* Inc pointer into ref data */
112 " add %5, %3 \n\t" /* Inc pointer into ref data */
/* Move the low 32 bits of the accumulator to operand 0. */
115 " movd %%mm7, %0 \n\t"
/* Input operands: both strides, cast to ogg_uint64_t. */
120 : "r" ((ogg_uint64_t
)SrcStride
),
121 "r" ((ogg_uint64_t
)RefStride
)
/*
 * row_sad8__mmxext: computes two sums of absolute differences, one over
 * bytes 0..3 and one over bytes 4..7 of the two rows addressed by operands
 * 1 and 2, and moves the larger of the two out through operand 0.
 *
 * NOTE(review): this listing looks incomplete. The embedded original line
 * numbers skip (128 to 132 to 135, 140 to 142) and the listing stops at
 * 144, so the braces, the whole operand and clobber section and the return
 * statement are not visible. Operands 1 and 2 are presumably Src1 and Src2
 * - TODO confirm.
 */
128 static ogg_uint32_t
row_sad8__mmxext (unsigned char *Src1
, unsigned char *Src2
)
132 __asm__
__volatile__ (
/* movd loads 4 bytes and zeroes the upper half of the register, so each
 * psadbw below sums absolute differences over 4 bytes only. */
135 " movd (%1), %%mm0 \n\t"
136 " movd (%2), %%mm1 \n\t"
137 " psadbw %%mm0, %%mm1 \n\t"
/* Same again for the bytes at offset 4. */
138 " movd 4(%1), %%mm2 \n\t"
139 " movd 4(%2), %%mm3 \n\t"
140 " psadbw %%mm2, %%mm3 \n\t"
/* Keep the larger of the two half-row sums (signed word maximum). */
142 " pmaxsw %%mm1, %%mm3 \n\t"
143 " movd %%mm3, %0 \n\t"
/* Keep only the low 16 bits of the result. */
144 " andl $0xffff, %0 \n\t"
/*
 * col_sad8x8__mmxext: accumulates per-column absolute differences between
 * the rows addressed by operands 1 and 2 into four word-wide accumulators
 * (mm4/mm5 in the first phase, mm6/mm7 in the second), then reduces them to
 * the single largest 16-bit column sum, which is moved out through
 * operand 0.
 *
 * NOTE(review): this listing looks incomplete. The parameter list ends on a
 * dangling comma after Src2; the operand section refers to a name stride,
 * so that is presumably the missing parameter - TODO confirm. The embedded
 * original line numbers skip (156 to 161 to 164, 169 to 171, 185 to 190,
 * 206 to 211, 221 to 226), so the braces, the loop labels and the
 * decrement/branch that would consume the rdi counter, the output operands,
 * the clobber list and the return statement are not visible.
 * NOTE(review): rdi is written inside the asm; confirm that the clobber
 * list in the full file names it.
 */
156 static ogg_uint32_t
col_sad8x8__mmxext (unsigned char *Src1
, unsigned char *Src2
,
161 __asm__
__volatile__ (
164 " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
165 " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
166 " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
167 " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
168 " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
/* First phase: rdi is set to 4; the code that uses it as a counter is not
 * visible in this listing. */
169 " mov $4, %%rdi \n\t" /* 4 rows */
171 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
172 " movq (%2), %%mm1 \n\t" /* take 8 bytes */
/* Absolute difference per byte: the two unsigned saturating subtractions
 * leave zero on whichever side would go negative, so OR-ing them gives
 * the absolute difference. */
174 " movq %%mm0, %%mm2 \n\t"
175 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
176 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
177 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
178 " movq %%mm0, %%mm1 \n\t"
180 " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
181 " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
182 " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
183 " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
/* Both pointers advance by the same operand 3. */
184 " add %3, %1 \n\t" /* Inc pointer into the new data */
185 " add %3, %2 \n\t" /* Inc pointer into the new data */
/* Second phase: identical row step, accumulating into mm6 (low columns)
 * and mm7 (high columns) instead of mm4 and mm5. */
190 " mov $4, %%rdi \n\t" /* 4 rows */
192 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
193 " movq (%2), %%mm1 \n\t" /* take 8 bytes */
195 " movq %%mm0, %%mm2 \n\t"
196 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
197 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
198 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
199 " movq %%mm0, %%mm1 \n\t"
201 " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
202 " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
203 " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
204 " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
205 " add %3, %1 \n\t" /* Inc pointer into the new data */
206 " add %3, %2 \n\t" /* Inc pointer into the new data */
/* Reduction: lane-wise signed word maximum across the four accumulators,
 * leaving the result in mm7. */
211 " pmaxsw %%mm6, %%mm7 \n\t"
212 " pmaxsw %%mm4, %%mm5 \n\t"
213 " pmaxsw %%mm5, %%mm7 \n\t"
/* Horizontal maximum over the four word lanes of mm7: fold the upper 32
 * bits onto the lower, then the upper 16 onto the lower 16. */
214 " movq %%mm7, %%mm6 \n\t"
215 " psrlq $32, %%mm6 \n\t"
216 " pmaxsw %%mm6, %%mm7 \n\t"
217 " movq %%mm7, %%mm6 \n\t"
218 " psrlq $16, %%mm6 \n\t"
219 " pmaxsw %%mm6, %%mm7 \n\t"
/* Move the result out and keep only the low word, which holds the
 * overall maximum. */
220 " movd %%mm7, %0 \n\t"
221 " andl $0xffff, %0 \n\t"
/* Input operand: stride, cast to ogg_uint64_t. */
226 : "r" ((ogg_uint64_t
)stride
)
/*
 * inter8x8_err_xy2__mmxext: for 8-byte rows, subtracts the byte-wise
 * average (pavgb) of two reference rows from a source row, accumulating the
 * differences in mm5 and their squares in mm7. The two totals are moved out
 * through operands 0 and 1 and combined as (XXSum<<6) - XSum*XSum.
 *
 * NOTE(review): this listing looks incomplete. The embedded original line
 * numbers skip (235 to 240 to 243, 247 to 249, 277 to 282, 295 to 302,
 * 303 to 307), so the braces, the declarations of XSum and XXSum, the loop
 * label and the decrement/branch that would consume the rdi counter, the
 * output operands and the clobber list are not visible. Operands 0 and 1
 * are presumably XSum and XXSum, and operands 2, 3 and 4 presumably
 * SrcData, RefDataPtr1 and RefDataPtr2 - TODO confirm.
 * NOTE(review): rdi/edi is written inside the asm; confirm that the clobber
 * list in the full file names it.
 */
233 static ogg_uint32_t
inter8x8_err_xy2__mmxext (unsigned char *SrcData
, ogg_uint32_t SrcStride
,
234 unsigned char *RefDataPtr1
,
235 unsigned char *RefDataPtr2
, ogg_uint32_t RefStride
)
240 __asm__
__volatile__ (
/* mm4 and mm6 stay zero in the visible code and serve as the zero source
 * for the byte-to-word unpacks; mm5 and mm7 are the two accumulators. */
243 " pxor %%mm4, %%mm4 \n\t"
244 " pxor %%mm5, %%mm5 \n\t"
245 " pxor %%mm6, %%mm6 \n\t"
246 " pxor %%mm7, %%mm7 \n\t"
/* rdi is set to 8; the code that uses it as a counter is not visible in
 * this listing. */
247 " mov $8, %%rdi \n\t"
249 " movq (%2), %%mm0 \n\t" /* take 8 bytes */
251 " movq (%3), %%mm2 \n\t"
252 " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
253 " pavgb %%mm2, %%mm1 \n\t"
/* Duplicate both rows so the low and high four bytes can be widened to
 * words separately. */
255 " movq %%mm0, %%mm2 \n\t"
256 " movq %%mm1, %%mm3 \n\t"
258 " punpcklbw %%mm6, %%mm0 \n\t"
259 " punpcklbw %%mm4, %%mm1 \n\t"
260 " punpckhbw %%mm6, %%mm2 \n\t"
261 " punpckhbw %%mm4, %%mm3 \n\t"
/* Signed word differences: row at operand 2 minus the averaged rows. */
263 " psubsw %%mm1, %%mm0 \n\t"
264 " psubsw %%mm3, %%mm2 \n\t"
/* Running sum of the differences, four word lanes in mm5. */
266 " paddw %%mm0, %%mm5 \n\t"
267 " paddw %%mm2, %%mm5 \n\t"
/* pmaddwd of a register with itself squares each word and adds adjacent
 * pairs, giving two dword partial sums of squares. */
269 " pmaddwd %%mm0, %%mm0 \n\t"
270 " pmaddwd %%mm2, %%mm2 \n\t"
/* Running sum of squares, two dword lanes in mm7. */
272 " paddd %%mm0, %%mm7 \n\t"
273 " paddd %%mm2, %%mm7 \n\t"
275 " add %5, %2 \n\t" /* Inc pointer into src data */
276 " add %6, %3 \n\t" /* Inc pointer into ref data */
277 " add %6, %4 \n\t" /* Inc pointer into ref data */
/* Horizontal add of the four word lanes of mm5 into its low word. */
282 " movq %%mm5, %%mm0 \n\t"
283 " psrlq $32, %%mm5 \n\t"
284 " paddw %%mm0, %%mm5 \n\t"
285 " movq %%mm5, %%mm0 \n\t"
286 " psrlq $16, %%mm5 \n\t"
287 " paddw %%mm0, %%mm5 \n\t"
/* Sign-extend the 16-bit total to 32 bits before storing it in
 * operand 0. */
288 " movd %%mm5, %%edi \n\t"
289 " movsx %%di, %%edi \n\t"
290 " movl %%edi, %0 \n\t"
/* Horizontal add of the two dword lanes of mm7 into operand 1. */
292 " movq %%mm7, %%mm0 \n\t"
293 " psrlq $32, %%mm7 \n\t"
294 " paddd %%mm0, %%mm7 \n\t"
295 " movd %%mm7, %1 \n\t"
/* Input operands: both strides, cast to ogg_uint64_t. */
302 : "r" ((ogg_uint64_t
)SrcStride
),
303 "r" ((ogg_uint64_t
)RefStride
)
307 /* Compute and return population variance as mis-match metric. */
/* The expression is 64*XXSum - XSum*XSum, i.e. the variance scaled by
 * 64*64 if 64 samples were accumulated (8 bytes per row; the row count
 * depends on loop control not visible here - TODO confirm). */
308 return (( (XXSum
<<6) - XSum
*XSum
));
311 void dsp_mmxext_init(DspFunctions
*funcs
)
313 TH_DEBUG("enabling accerated x86_64 mmxext dsp functions.\n");
314 funcs
->row_sad8
= row_sad8__mmxext
;
315 funcs
->col_sad8x8
= col_sad8x8__mmxext
;
316 funcs
->sad8x8
= sad8x8__mmxext
;
317 funcs
->sad8x8_thres
= sad8x8_thres__mmxext
;
318 funcs
->sad8x8_xy2_thres
= sad8x8_xy2_thres__mmxext
;
319 funcs
->inter8x8_err_xy2
= inter8x8_err_xy2__mmxext
;