Add Russian translation provided by Валерий Крувялис <valkru@mail.ru>
[xiph-mirror.git] / theora-old / lib / x86_64 / dsp_mmxext.c
blobc27142a24599aa18cbb8b304066e0a71e6aacd7f
1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************
13 function:
14 last mod: $Id$
16 ********************************************************************/
18 #include <stdlib.h>
20 #include "codec_internal.h"
21 #include "dsp.h"
23 typedef unsigned long long ogg_uint64_t; /* used below to widen the 32-bit stride arguments before they are bound to asm register operands and added to pointers */
25 static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
26 unsigned char *ptr2, ogg_uint32_t stride2)
28 ogg_uint32_t DiffVal;
30 __asm__ __volatile__ (
31 " .balign 16 \n\t"
32 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
34 ".rept 7 \n\t"
35 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
36 " movq (%2), %%mm1 \n\t"
37 " psadbw %%mm1, %%mm0 \n\t"
38 " add %3, %1 \n\t" /* Inc pointer into the new data */
39 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
40 " add %4, %2 \n\t" /* Inc pointer into ref data */
41 ".endr \n\t"
43 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
44 " movq (%2), %%mm1 \n\t"
45 " psadbw %%mm1, %%mm0 \n\t"
46 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
47 " movd %%mm7, %0 \n\t"
49 : "=r" (DiffVal),
50 "+r" (ptr1),
51 "+r" (ptr2)
52 : "r" ((ogg_uint64_t)stride1),
53 "r" ((ogg_uint64_t)stride2)
54 : "memory"
57 return DiffVal;
60 static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
61 unsigned char *ptr2, ogg_uint32_t stride2,
62 ogg_uint32_t thres)
64 ogg_uint32_t DiffVal;
66 __asm__ __volatile__ (
67 " .balign 16 \n\t"
68 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
70 ".rept 8 \n\t"
71 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
72 " movq (%2), %%mm1 \n\t"
73 " psadbw %%mm1, %%mm0 \n\t"
74 " add %3, %1 \n\t" /* Inc pointer into the new data */
75 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
76 " add %4, %2 \n\t" /* Inc pointer into ref data */
77 ".endr \n\t"
79 " movd %%mm7, %0 \n\t"
81 : "=r" (DiffVal),
82 "+r" (ptr1),
83 "+r" (ptr2)
84 : "r" ((ogg_uint64_t)stride1),
85 "r" ((ogg_uint64_t)stride2)
86 : "memory"
89 return DiffVal;
92 static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
93 unsigned char *RefDataPtr1,
94 unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
95 ogg_uint32_t thres)
97 ogg_uint32_t DiffVal;
99 __asm__ __volatile__ (
100 " .balign 16 \n\t"
101 " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
102 ".rept 8 \n\t"
103 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
104 " movq (%2), %%mm1 \n\t"
105 " movq (%3), %%mm2 \n\t"
106 " pavgb %%mm2, %%mm1 \n\t"
107 " psadbw %%mm1, %%mm0 \n\t"
109 " add %4, %1 \n\t" /* Inc pointer into the new data */
110 " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
111 " add %5, %2 \n\t" /* Inc pointer into ref data */
112 " add %5, %3 \n\t" /* Inc pointer into ref data */
113 ".endr \n\t"
115 " movd %%mm7, %0 \n\t"
116 : "=m" (DiffVal),
117 "+r" (SrcData),
118 "+r" (RefDataPtr1),
119 "+r" (RefDataPtr2)
120 : "r" ((ogg_uint64_t)SrcStride),
121 "r" ((ogg_uint64_t)RefStride)
122 : "memory"
125 return DiffVal;
128 static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
130 ogg_uint32_t MaxSad;
132 __asm__ __volatile__ (
133 " .balign 16 \n\t"
135 " movd (%1), %%mm0 \n\t"
136 " movd (%2), %%mm1 \n\t"
137 " psadbw %%mm0, %%mm1 \n\t"
138 " movd 4(%1), %%mm2 \n\t"
139 " movd 4(%2), %%mm3 \n\t"
140 " psadbw %%mm2, %%mm3 \n\t"
142 " pmaxsw %%mm1, %%mm3 \n\t"
143 " movd %%mm3, %0 \n\t"
144 " andl $0xffff, %0 \n\t"
146 : "=m" (MaxSad),
147 "+r" (Src1),
148 "+r" (Src2)
150 : "memory"
153 return MaxSad;
156 static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
157 ogg_uint32_t stride)
159 ogg_uint32_t MaxSad;
161 __asm__ __volatile__ (
162 " .balign 16 \n\t"
164 " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
165 " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
166 " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
167 " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
168 " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
169 " mov $4, %%rdi \n\t" /* 4 rows */
170 "1: \n\t"
171 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
172 " movq (%2), %%mm1 \n\t" /* take 8 bytes */
174 " movq %%mm0, %%mm2 \n\t"
175 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
176 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
177 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
178 " movq %%mm0, %%mm1 \n\t"
180 " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
181 " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
182 " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
183 " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
184 " add %3, %1 \n\t" /* Inc pointer into the new data */
185 " add %3, %2 \n\t" /* Inc pointer into the new data */
187 " dec %%rdi \n\t"
188 " jnz 1b \n\t"
190 " mov $4, %%rdi \n\t" /* 4 rows */
191 "2: \n\t"
192 " movq (%1), %%mm0 \n\t" /* take 8 bytes */
193 " movq (%2), %%mm1 \n\t" /* take 8 bytes */
195 " movq %%mm0, %%mm2 \n\t"
196 " psubusb %%mm1, %%mm0 \n\t" /* A - B */
197 " psubusb %%mm2, %%mm1 \n\t" /* B - A */
198 " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
199 " movq %%mm0, %%mm1 \n\t"
201 " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
202 " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
203 " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
204 " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
205 " add %3, %1 \n\t" /* Inc pointer into the new data */
206 " add %3, %2 \n\t" /* Inc pointer into the new data */
208 " dec %%rdi \n\t"
209 " jnz 2b \n\t"
211 " pmaxsw %%mm6, %%mm7 \n\t"
212 " pmaxsw %%mm4, %%mm5 \n\t"
213 " pmaxsw %%mm5, %%mm7 \n\t"
214 " movq %%mm7, %%mm6 \n\t"
215 " psrlq $32, %%mm6 \n\t"
216 " pmaxsw %%mm6, %%mm7 \n\t"
217 " movq %%mm7, %%mm6 \n\t"
218 " psrlq $16, %%mm6 \n\t"
219 " pmaxsw %%mm6, %%mm7 \n\t"
220 " movd %%mm7, %0 \n\t"
221 " andl $0xffff, %0 \n\t"
223 : "=r" (MaxSad),
224 "+r" (Src1),
225 "+r" (Src2)
226 : "r" ((ogg_uint64_t)stride)
227 : "memory", "rdi"
230 return MaxSad;
233 static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
234 unsigned char *RefDataPtr1,
235 unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
237 ogg_uint64_t XSum;
238 ogg_uint64_t XXSum;
240 __asm__ __volatile__ (
241 " .balign 16 \n\t"
243 " pxor %%mm4, %%mm4 \n\t"
244 " pxor %%mm5, %%mm5 \n\t"
245 " pxor %%mm6, %%mm6 \n\t"
246 " pxor %%mm7, %%mm7 \n\t"
247 " mov $8, %%rdi \n\t"
248 "1: \n\t"
249 " movq (%2), %%mm0 \n\t" /* take 8 bytes */
251 " movq (%3), %%mm2 \n\t"
252 " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */
253 " pavgb %%mm2, %%mm1 \n\t"
255 " movq %%mm0, %%mm2 \n\t"
256 " movq %%mm1, %%mm3 \n\t"
258 " punpcklbw %%mm6, %%mm0 \n\t"
259 " punpcklbw %%mm4, %%mm1 \n\t"
260 " punpckhbw %%mm6, %%mm2 \n\t"
261 " punpckhbw %%mm4, %%mm3 \n\t"
263 " psubsw %%mm1, %%mm0 \n\t"
264 " psubsw %%mm3, %%mm2 \n\t"
266 " paddw %%mm0, %%mm5 \n\t"
267 " paddw %%mm2, %%mm5 \n\t"
269 " pmaddwd %%mm0, %%mm0 \n\t"
270 " pmaddwd %%mm2, %%mm2 \n\t"
272 " paddd %%mm0, %%mm7 \n\t"
273 " paddd %%mm2, %%mm7 \n\t"
275 " add %5, %2 \n\t" /* Inc pointer into src data */
276 " add %6, %3 \n\t" /* Inc pointer into ref data */
277 " add %6, %4 \n\t" /* Inc pointer into ref data */
279 " dec %%rdi \n\t"
280 " jnz 1b \n\t"
282 " movq %%mm5, %%mm0 \n\t"
283 " psrlq $32, %%mm5 \n\t"
284 " paddw %%mm0, %%mm5 \n\t"
285 " movq %%mm5, %%mm0 \n\t"
286 " psrlq $16, %%mm5 \n\t"
287 " paddw %%mm0, %%mm5 \n\t"
288 " movd %%mm5, %%edi \n\t"
289 " movsx %%di, %%edi \n\t"
290 " movl %%edi, %0 \n\t"
292 " movq %%mm7, %%mm0 \n\t"
293 " psrlq $32, %%mm7 \n\t"
294 " paddd %%mm0, %%mm7 \n\t"
295 " movd %%mm7, %1 \n\t"
297 : "=m" (XSum),
298 "=m" (XXSum),
299 "+r" (SrcData),
300 "+r" (RefDataPtr1),
301 "+r" (RefDataPtr2)
302 : "r" ((ogg_uint64_t)SrcStride),
303 "r" ((ogg_uint64_t)RefStride)
304 : "rdi", "memory"
307 /* Compute and return population variance as mis-match metric. */
308 return (( (XXSum<<6) - XSum*XSum ));
311 void dsp_mmxext_init(DspFunctions *funcs)
313 TH_DEBUG("enabling accerated x86_64 mmxext dsp functions.\n");
314 funcs->row_sad8 = row_sad8__mmxext;
315 funcs->col_sad8x8 = col_sad8x8__mmxext;
316 funcs->sad8x8 = sad8x8__mmxext;
317 funcs->sad8x8_thres = sad8x8_thres__mmxext;
318 funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
319 funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;