Add Russian translation provided by Валерий Крувялис <valkru@mail.ru>
[xiph-mirror.git] / theora-old / lib / x86_64 / dsp_mmx.c
bloba14fe2a455e199acbd6e87ff827430ea4dd3a8d6
1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************
13 function:
14 last mod: $Id$
16 ********************************************************************/
18 #include <stdlib.h>
20 #include "codec_internal.h"
21 #include "dsp.h"
/* 64-bit unsigned type used for inline-asm operands: x86_64 "r"
   constraints want register-width integers for the stride adds below.
   NOTE(review): presumably the ogg headers do not already define
   ogg_uint64_t on this platform — confirm against codec_internal.h. */
typedef unsigned long long ogg_uint64_t;

/* Four packed INT16 copies of 128, 8-byte aligned so movq can load it;
   "used" keeps the symbol alive since only asm references it. */
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;

/* Scalar reference forms of the pixel ops (not referenced by the MMX
   paths in this chunk; kept for parity with the C dsp implementation). */
#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
31 static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
32 ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
33 ogg_uint32_t ReconPixelsPerLine)
35 __asm__ __volatile__ (
36 " .balign 16 \n\t"
38 " pxor %%mm7, %%mm7 \n\t"
40 ".rept 8 \n\t"
41 " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
42 " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
43 " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
44 " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
45 /* convert from UINT8 to INT16 */
46 " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
47 " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
48 " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
49 " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
50 /* start calculation */
51 " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
52 " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
53 " movq %%mm0, (%2) \n\t" /* write answer out */
54 " movq %%mm2, 8(%2) \n\t" /* write answer out */
55 /* Increment pointers */
56 " add $16, %2 \n\t"
57 " add %3, %0 \n\t"
58 " add %4, %1 \n\t"
59 ".endr \n\t"
61 : "+r" (FiltPtr),
62 "+r" (ReconPtr),
63 "+r" (DctInputPtr)
64 : "r" ((ogg_uint64_t)PixelsPerLine),
65 "r" ((ogg_uint64_t)ReconPixelsPerLine)
66 : "memory"
70 static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
71 ogg_uint32_t PixelsPerLine)
73 ogg_uint64_t ppl = PixelsPerLine;
75 __asm__ __volatile__ (
76 " .balign 16 \n\t"
78 " pxor %%mm7, %%mm7 \n\t"
79 " movq %[V128], %%mm1 \n\t"
81 ".rept 8 \n\t"
82 " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
83 " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
84 /* convert from UINT8 to INT16 */
85 " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
86 " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
87 /* start calculation */
88 " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
89 " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
90 " movq %%mm0, (%1) \n\t" /* write answer out */
91 " movq %%mm2, 8(%1) \n\t" /* write answer out */
92 /* Increment pointers */
93 " add $16, %1 \n\t"
94 " add %2, %0 \n\t"
95 ".endr \n\t"
97 : "+r" (FiltPtr),
98 "+r" (DctInputPtr)
99 : "r" (ppl), /* gcc bug? a cast won't work here, e.g. (ogg_uint64_t)PixelsPerLine */
100 [V128] "m" (V128)
101 : "memory"
105 static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
106 unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
107 ogg_uint32_t PixelsPerLine,
108 ogg_uint32_t ReconPixelsPerLine)
110 __asm__ __volatile__ (
111 " .balign 16 \n\t"
113 " pxor %%mm7, %%mm7 \n\t"
115 ".rept 8 \n\t"
116 " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
117 " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
118 " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */
119 " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
120 " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
121 " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
122 /* convert from UINT8 to INT16 */
123 " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
124 " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */
125 " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */
126 " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
127 " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */
128 " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */
129 /* average ReconPtr1 and ReconPtr2 */
130 " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
131 " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
132 " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
133 " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
134 " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
135 " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
136 " movq %%mm0, (%3) \n\t" /* write answer out */
137 " movq %%mm2, 8(%3) \n\t" /* write answer out */
138 /* Increment pointers */
139 " add $16, %3 \n\t"
140 " add %4, %0 \n\t"
141 " add %5, %1 \n\t"
142 " add %5, %2 \n\t"
143 ".endr \n\t"
145 : "+r" (FiltPtr),
146 "+r" (ReconPtr1),
147 "+r" (ReconPtr2),
148 "+r" (DctInputPtr)
149 : "r" ((ogg_uint64_t)PixelsPerLine),
150 "r" ((ogg_uint64_t)ReconPixelsPerLine)
151 : "memory"
155 static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
157 ogg_uint64_t XSum;
158 ogg_uint64_t XXSum;
160 __asm__ __volatile__ (
161 " .balign 16 \n\t"
163 " pxor %%mm5, %%mm5 \n\t"
164 " pxor %%mm6, %%mm6 \n\t"
165 " pxor %%mm7, %%mm7 \n\t"
166 " mov $8, %%rdi \n\t"
167 "1: \n\t"
168 " movq (%2), %%mm0 \n\t" /* take 8 bytes */
169 " movq %%mm0, %%mm2 \n\t"
171 " punpcklbw %%mm6, %%mm0 \n\t"
172 " punpckhbw %%mm6, %%mm2 \n\t"
174 " paddw %%mm0, %%mm5 \n\t"
175 " paddw %%mm2, %%mm5 \n\t"
177 " pmaddwd %%mm0, %%mm0 \n\t"
178 " pmaddwd %%mm2, %%mm2 \n\t"
180 " paddd %%mm0, %%mm7 \n\t"
181 " paddd %%mm2, %%mm7 \n\t"
183 " add %3, %2 \n\t" /* Inc pointer into src data */
185 " dec %%rdi \n\t"
186 " jnz 1b \n\t"
188 " movq %%mm5, %%mm0 \n\t"
189 " psrlq $32, %%mm5 \n\t"
190 " paddw %%mm0, %%mm5 \n\t"
191 " movq %%mm5, %%mm0 \n\t"
192 " psrlq $16, %%mm5 \n\t"
193 " paddw %%mm0, %%mm5 \n\t"
194 " movd %%mm5, %%rdi \n\t"
195 " movsx %%di, %%rdi \n\t"
196 " mov %%rdi, %0 \n\t"
198 " movq %%mm7, %%mm0 \n\t"
199 " psrlq $32, %%mm7 \n\t"
200 " paddd %%mm0, %%mm7 \n\t"
201 " movd %%mm7, %1 \n\t"
203 : "=r" (XSum),
204 "=r" (XXSum),
205 "+r" (DataPtr)
206 : "r" ((ogg_uint64_t)Stride)
207 : "rdi", "memory"
210 /* Compute population variance as mis-match metric. */
211 return (( (XXSum<<6) - XSum*XSum ) );
214 static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
215 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
217 ogg_uint64_t XSum;
218 ogg_uint64_t XXSum;
220 __asm__ __volatile__ (
221 " .balign 16 \n\t"
223 " pxor %%mm5, %%mm5 \n\t"
224 " pxor %%mm6, %%mm6 \n\t"
225 " pxor %%mm7, %%mm7 \n\t"
226 " mov $8, %%rdi \n\t"
227 "1: \n\t"
228 " movq (%2), %%mm0 \n\t" /* take 8 bytes */
229 " movq (%3), %%mm1 \n\t"
230 " movq %%mm0, %%mm2 \n\t"
231 " movq %%mm1, %%mm3 \n\t"
233 " punpcklbw %%mm6, %%mm0 \n\t"
234 " punpcklbw %%mm6, %%mm1 \n\t"
235 " punpckhbw %%mm6, %%mm2 \n\t"
236 " punpckhbw %%mm6, %%mm3 \n\t"
238 " psubsw %%mm1, %%mm0 \n\t"
239 " psubsw %%mm3, %%mm2 \n\t"
241 " paddw %%mm0, %%mm5 \n\t"
242 " paddw %%mm2, %%mm5 \n\t"
244 " pmaddwd %%mm0, %%mm0 \n\t"
245 " pmaddwd %%mm2, %%mm2 \n\t"
247 " paddd %%mm0, %%mm7 \n\t"
248 " paddd %%mm2, %%mm7 \n\t"
250 " add %4, %2 \n\t" /* Inc pointer into src data */
251 " add %5, %3 \n\t" /* Inc pointer into ref data */
253 " dec %%rdi \n\t"
254 " jnz 1b \n\t"
256 " movq %%mm5, %%mm0 \n\t"
257 " psrlq $32, %%mm5 \n\t"
258 " paddw %%mm0, %%mm5 \n\t"
259 " movq %%mm5, %%mm0 \n\t"
260 " psrlq $16, %%mm5 \n\t"
261 " paddw %%mm0, %%mm5 \n\t"
262 " movd %%mm5, %%rdi \n\t"
263 " movsx %%di, %%rdi \n\t"
264 " mov %%rdi, %0 \n\t"
266 " movq %%mm7, %%mm0 \n\t"
267 " psrlq $32, %%mm7 \n\t"
268 " paddd %%mm0, %%mm7 \n\t"
269 " movd %%mm7, %1 \n\t"
271 : "=m" (XSum),
272 "=m" (XXSum),
273 "+r" (SrcData),
274 "+r" (RefDataPtr)
275 : "r" ((ogg_uint64_t)SrcStride),
276 "r" ((ogg_uint64_t)RefStride)
277 : "rdi", "memory"
280 /* Compute and return population variance as mis-match metric. */
281 return (( (XXSum<<6) - XSum*XSum ));
/* Reset the x87/MMX state (emms) after a run of MMX dsp calls so that
 * subsequent floating-point code sees a clean FPU tag word.  Installed
 * into DspFunctions as the restore_fpu hook. */
static void restore_fpu (void)
{
  __asm__ __volatile__ (
    "  emms \n\t"
  );
}
291 void dsp_mmx_init(DspFunctions *funcs)
293 TH_DEBUG("setting accelerated x86_64 mmx dsp functions.\n");
294 funcs->restore_fpu = restore_fpu;
295 funcs->sub8x8 = sub8x8__mmx;
296 funcs->sub8x8_128 = sub8x8_128__mmx;
297 funcs->sub8x8avg2 = sub8x8avg2__mmx;
298 funcs->intra8x8_err = intra8x8_err__mmx;
299 funcs->inter8x8_err = inter8x8_err__mmx;