Add Russian translation provided by Валерий Крувялис <valkru@mail.ru>
[xiph-mirror.git] / theora-old / lib / x86_32 / dsp_mmxext.c
blobff8d60c0a7d2c1f63af9e4728c175316b06f2957
1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************
13 function:
14 last mod: $Id$
16 ********************************************************************/
18 #include <stdlib.h>
20 #include "codec_internal.h"
21 #include "dsp.h"
23 #if defined(USE_ASM)
25 #define SAD_MMXEXT_LOOP \
26 " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
27 " movq (%2), %%mm1 \n\t" \
28 " psadbw %%mm1, %%mm0 \n\t" \
29 " add %3, %1 \n\t" /* Inc pointer into the new data */ \
30 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
31 " add %4, %2 \n\t" /* Inc pointer into ref data */
34 static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
35 unsigned char *ptr2, ogg_uint32_t stride2)
37 ogg_uint32_t DiffVal;
39 __asm__ __volatile__ (
40 " .p2align 4 \n\t"
41 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
43 SAD_MMXEXT_LOOP
44 SAD_MMXEXT_LOOP
45 SAD_MMXEXT_LOOP
46 SAD_MMXEXT_LOOP
47 SAD_MMXEXT_LOOP
48 SAD_MMXEXT_LOOP
49 SAD_MMXEXT_LOOP
51 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
52 " movq (%2), %%mm1 \n\t"
53 " psadbw %%mm1, %%mm0 \n\t"
54 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
55 " movd %%mm7, %0 \n\t"
57 : "=r" (DiffVal),
58 "+r" (ptr1),
59 "+r" (ptr2)
60 : "r" (stride1),
61 "r" (stride2)
62 : "memory"
65 return DiffVal;
68 #define SAD_TRES_LOOP \
69 " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
70 " movq (%2), %%mm1 \n\t" \
71 " psadbw %%mm1, %%mm0 \n\t" \
72 " add %3, %1 \n\t" /* Inc pointer into the new data */ \
73 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
74 " add %4, %2 \n\t" /* Inc pointer into ref data */
77 static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
78 unsigned char *ptr2, ogg_uint32_t stride2,
79 ogg_uint32_t thres)
81 ogg_uint32_t DiffVal;
83 __asm__ __volatile__ (
84 " .p2align 4 \n\t"
85 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
87 SAD_TRES_LOOP
88 SAD_TRES_LOOP
89 SAD_TRES_LOOP
90 SAD_TRES_LOOP
91 SAD_TRES_LOOP
92 SAD_TRES_LOOP
93 SAD_TRES_LOOP
94 SAD_TRES_LOOP
96 " movd %%mm7, %0 \n\t"
98 : "=r" (DiffVal),
99 "+r" (ptr1),
100 "+r" (ptr2)
101 : "r" (stride1),
102 "r" (stride2)
103 : "memory"
106 return DiffVal;
109 #define SAD_XY2_TRES \
110 " movq (%1), %%mm0 \n\t" /* take 8 bytes */ \
111 " movq (%2), %%mm1 \n\t" \
112 " movq (%3), %%mm2 \n\t" \
113 " pavgb %%mm2, %%mm1 \n\t" \
114 " psadbw %%mm1, %%mm0 \n\t" \
116 " add %4, %1 \n\t" /* Inc pointer into the new data */ \
117 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ \
118 " add %5, %2 \n\t" /* Inc pointer into ref data */ \
119 " add %5, %3 \n\t" /* Inc pointer into ref data */
122 static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
123 unsigned char *RefDataPtr1,
124 unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
125 ogg_uint32_t thres)
127 ogg_uint32_t DiffVal;
129 __asm__ __volatile__ (
130 " .p2align 4 \n\t"
131 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
132 SAD_XY2_TRES
133 SAD_XY2_TRES
134 SAD_XY2_TRES
135 SAD_XY2_TRES
136 SAD_XY2_TRES
137 SAD_XY2_TRES
138 SAD_XY2_TRES
139 SAD_XY2_TRES
141 " movd %%mm7, %0 \n\t"
142 : "=m" (DiffVal),
143 "+r" (SrcData),
144 "+r" (RefDataPtr1),
145 "+r" (RefDataPtr2)
146 : "m" (SrcStride),
147 "m" (RefStride)
148 : "memory"
151 return DiffVal;
154 static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
156 ogg_uint32_t MaxSad;
158 __asm__ __volatile__ (
159 " .p2align 4 \n\t"
161 " movd (%1), %%mm0 \n\t"
162 " movd (%2), %%mm1 \n\t"
163 " psadbw %%mm0, %%mm1 \n\t"
164 " movd 4(%1), %%mm2 \n\t"
165 " movd 4(%2), %%mm3 \n\t"
166 " psadbw %%mm2, %%mm3 \n\t"
168 " pmaxsw %%mm1, %%mm3 \n\t"
169 " movd %%mm3, %0 \n\t"
170 " andl $0xffff, %0 \n\t"
172 : "=m" (MaxSad),
173 "+r" (Src1),
174 "+r" (Src2)
176 : "memory"
179 return MaxSad;
182 static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
183 ogg_uint32_t stride)
185 ogg_uint32_t MaxSad;
187 __asm__ __volatile__ (
188 " .p2align 4 \n\t"
190 " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
191 " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
192 " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
193 " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
194 " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
195 " mov $4, %%edi \n\t" /* 4 rows */
196 "1: \n\t"
197 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
198 " movq (%2), %%mm1 \n\t" /* take 8 bytes */
200 " movq %%mm0, %%mm2 \n\t"
201 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
202 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
203 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
204 " movq %%mm0, %%mm1 \n\t"
206 " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
207 " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
208 " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
209 " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
210 " add %3, %1 \n\t" /* Inc pointer into the new data */
211 " add %3, %2 \n\t" /* Inc pointer into the new data */
213 " dec %%edi \n\t"
214 " jnz 1b \n\t"
216 " mov $4, %%edi \n\t" /* 4 rows */
217 "2: \n\t"
218 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
219 " movq (%2), %%mm1 \n\t" /* take 8 bytes */
221 " movq %%mm0, %%mm2 \n\t"
222 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
223 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
224 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
225 " movq %%mm0, %%mm1 \n\t"
227 " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
228 " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
229 " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
230 " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
231 " add %3, %1 \n\t" /* Inc pointer into the new data */
232 " add %3, %2 \n\t" /* Inc pointer into the new data */
234 " dec %%edi \n\t"
235 " jnz 2b \n\t"
237 " pmaxsw %%mm6, %%mm7 \n\t"
238 " pmaxsw %%mm4, %%mm5 \n\t"
239 " pmaxsw %%mm5, %%mm7 \n\t"
240 " movq %%mm7, %%mm6 \n\t"
241 " psrlq $32, %%mm6 \n\t"
242 " pmaxsw %%mm6, %%mm7 \n\t"
243 " movq %%mm7, %%mm6 \n\t"
244 " psrlq $16, %%mm6 \n\t"
245 " pmaxsw %%mm6, %%mm7 \n\t"
246 " movd %%mm7, %0 \n\t"
247 " andl $0xffff, %0 \n\t"
249 : "=r" (MaxSad),
250 "+r" (Src1),
251 "+r" (Src2)
252 : "r" (stride)
253 : "memory", "edi"
256 return MaxSad;
259 static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
260 unsigned char *RefDataPtr1,
261 unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
263 ogg_uint32_t XSum;
264 ogg_uint32_t XXSum;
266 __asm__ __volatile__ (
267 " .p2align 4 \n\t"
269 " pxor %%mm4, %%mm4 \n\t"
270 " pxor %%mm5, %%mm5 \n\t"
271 " pxor %%mm6, %%mm6 \n\t"
272 " pxor %%mm7, %%mm7 \n\t"
273 " mov $8, %%edi \n\t"
274 "1: \n\t"
275 " movq (%2), %%mm0 \n\t" /* take 8 bytes */
277 " movq (%3), %%mm2 \n\t"
278 " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
279 " pavgb %%mm2, %%mm1 \n\t"
281 " movq %%mm0, %%mm2 \n\t"
282 " movq %%mm1, %%mm3 \n\t"
284 " punpcklbw %%mm6, %%mm0 \n\t"
285 " punpcklbw %%mm4, %%mm1 \n\t"
286 " punpckhbw %%mm6, %%mm2 \n\t"
287 " punpckhbw %%mm4, %%mm3 \n\t"
289 " psubsw %%mm1, %%mm0 \n\t"
290 " psubsw %%mm3, %%mm2 \n\t"
292 " paddw %%mm0, %%mm5 \n\t"
293 " paddw %%mm2, %%mm5 \n\t"
295 " pmaddwd %%mm0, %%mm0 \n\t"
296 " pmaddwd %%mm2, %%mm2 \n\t"
298 " paddd %%mm0, %%mm7 \n\t"
299 " paddd %%mm2, %%mm7 \n\t"
301 " add %5, %2 \n\t" /* Inc pointer into src data */
302 " add %6, %3 \n\t" /* Inc pointer into ref data */
303 " add %6, %4 \n\t" /* Inc pointer into ref data */
305 " dec %%edi \n\t"
306 " jnz 1b \n\t"
308 " movq %%mm5, %%mm0 \n\t"
309 " psrlq $32, %%mm5 \n\t"
310 " paddw %%mm0, %%mm5 \n\t"
311 " movq %%mm5, %%mm0 \n\t"
312 " psrlq $16, %%mm5 \n\t"
313 " paddw %%mm0, %%mm5 \n\t"
314 " movd %%mm5, %%edi \n\t"
315 " movsx %%di, %%edi \n\t"
316 " movl %%edi, %0 \n\t"
318 " movq %%mm7, %%mm0 \n\t"
319 " psrlq $32, %%mm7 \n\t"
320 " paddd %%mm0, %%mm7 \n\t"
321 " movd %%mm7, %1 \n\t"
323 : "=m" (XSum),
324 "=m" (XXSum),
325 "+r" (SrcData),
326 "+r" (RefDataPtr1),
327 "+r" (RefDataPtr2)
328 : "m" (SrcStride),
329 "m" (RefStride)
330 : "edi", "memory"
333 /* Compute and return population variance as mis-match metric. */
334 return (( (XXSum<<6) - XSum*XSum ));
337 void dsp_mmxext_init(DspFunctions *funcs)
339 TH_DEBUG("enabling accelerated x86_32 mmxext dsp functions.\n");
340 funcs->row_sad8 = row_sad8__mmxext;
341 funcs->col_sad8x8 = col_sad8x8__mmxext;
342 funcs->sad8x8 = sad8x8__mmxext;
343 funcs->sad8x8_thres = sad8x8_thres__mmxext;
344 funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
345 funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
348 #endif /* USE_ASM */