Add Russian translation provided by Валерий Крувялис <valkru@mail.ru>
[xiph-mirror.git] / theora-old / lib / x86_32 / fdct_mmx.c
blobcd70a48837a10c294fc4be52293f255600a8921f
1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2001 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************/
13 /* mmx fdct implementation */
14 /* $Id$ */
16 #include "theora/theora.h"
17 #include "codec_internal.h"
18 #include "dsp.h"
20 #if defined(USE_ASM)
/* Fixed-point DCT multipliers for the MMX code below.
 *
 * Each 64-bit constant holds one 16-bit value replicated in all four words
 * and is 8-byte aligned; the constants are handed to the inline asm in
 * fdct_short__mmx as "m" (memory) operands named %[xC1S7] .. %[xC7S1].
 *
 * The 16-bit values match 65536 * cos(k*pi/16) for k = 1..7 (for example
 * 0xb505 = 46341 ~ 65536/sqrt(2)), i.e. xC<k>S<8-k> is cos(k*pi/16) =
 * sin((8-k)*pi/16) scaled by 2^16.
 *
 * xC1S7 .. xC5S3 are >= 0x8000 and therefore negative when read as signed
 * words; that is why the pmulhw sequences using them add the multiplicand
 * back afterwards.  xC6S2 and xC7S1 are positive and need no correction.
 *
 * The `used` attribute asks the compiler to emit the objects even if it sees
 * no C-level reference to them. */
22 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
23 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
24 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
25 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
26 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
27 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
28 static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;
30 /* execute stage 1 of forward DCT
 *
 * Fdct_mmx expands to a run of adjacent string literals meant to be placed
 * inside an __asm__ statement.
 *
 *   ip0..ip7  Eight quadword memory operands.  They are stringified with #
 *             and pasted into the instruction text, so they must be written
 *             as AT&T operands such as 16(%1).  Each is treated as four
 *             packed signed 16-bit words; the four word lanes are processed
 *             independently, so one expansion performs four 8-point
 *             transforms at once.  The results are stored back over
 *             ip0..ip7 (in place).
 *   temp      Quadword scratch memory operand; it keeps is07 - is34 while
 *             mm0 is reused, and is reloaded for the ip2/ip6 rotation.
 *
 * The enclosing asm statement must provide the named operands %[xC1S7] ..
 * %[xC7S1].  All of mm0..mm7 are overwritten.
 *
 * Fixed-point idiom used throughout: pmulhw yields the high 16 bits of the
 * signed 16x16 product.  For the constants that are negative as signed words
 * this is (c * x) - x, so x is added back with paddw.  A copy of x shifted
 * with "psrlw $15" (1 for negative words, 0 otherwise) is then added; the
 * comments below call this step "Truncated".
 */
31 #define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
32 " movq " #ip0 ", %%mm0 \n\t" \
33 " movq " #ip1 ", %%mm1 \n\t" \
34 " movq " #ip3 ", %%mm2 \n\t" \
35 " movq " #ip5 ", %%mm3 \n\t" \
36 " movq %%mm0, %%mm4 \n\t" \
37 " movq %%mm1, %%mm5 \n\t" \
38 " movq %%mm2, %%mm6 \n\t" \
39 " movq %%mm3, %%mm7 \n\t" \
41 " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \
42 " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \
43 " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \
44 " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \
45 " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \
46 " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \
48 " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \
50 " paddsw %%mm2, %%mm2 \n\t" \
52 " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \
54 " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \
55 " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
56 " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
57 " paddsw %%mm3, %%mm3 \n\t" \
58 " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + is56 = is1256 */ \
60 " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
61 /* ------------------------------------------------------------------- */ \
62 " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \
63 " paddsw %%mm7, %%mm7 \n\t" \
64 " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \
65 /* ------------------------------------------------------------------- */ \
66 " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \
67 " paddsw %%mm3, %%mm3 \n\t" \
69 " movq %%mm2, %%mm0 \n\t" /* make a copy */ \
70 " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \
72 " pmulhw %[xC4S4], %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
73 " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
74 " psrlw $15, %%mm2 \n\t" \
75 " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \
77 " movq %%mm3, %%mm2 \n\t" \
78 " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \
80 " movq %%mm3, %%mm0 \n\t" \
81 " pmulhw %[xC4S4], %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
83 " psrlw $15, %%mm2 \n\t" \
84 " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
85 " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
87 " movq %%mm3," #ip0 " \n\t" \
88 /* ------------------------------------------------------------------- */ \
89 " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \
90 " pmulhw %[xC2S6], %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
92 " movq " #temp ", %%mm2 \n\t" \
93 " movq %%mm2, %%mm0 \n\t" \
95 " psrlw $15, %%mm2 \n\t" /* mm2 = sign bit of irot_input_y */ \
96 " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y */ \
98 " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
99 " movq %%mm5, %%mm0 \n\t" \
101 " movq %%mm5, %%mm2 \n\t" \
102 " pmulhw %[xC6S2], %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \
104 " psrlw $15, %%mm2 \n\t" \
105 " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
107 " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \
108 " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \
110 " movq %%mm5, %%mm0 \n\t" \
111 " movq %%mm5, %%mm2 \n\t" \
113 " pmulhw %[xC2S6], %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
114 " psrlw $15, %%mm2 \n\t" \
116 " movq " #temp ", %%mm3 \n\t" \
117 " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \
119 " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
120 " movq %%mm3, %%mm2 \n\t" \
122 " pmulhw %[xC6S2], %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \
123 " psrlw $15, %%mm2 \n\t" \
125 " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
126 " psubsw %%mm5, %%mm3 \n\t" \
128 " movq %%mm3," #ip6 " \n\t" \
129 /* ------------------------------------------------------------------- */ \
130 " movq %[xC4S4], %%mm0 \n\t" \
131 " movq %%mm1, %%mm2 \n\t" \
132 " movq %%mm1, %%mm3 \n\t" \
134 " pmulhw %%mm0, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
135 " psrlw $15, %%mm2 \n\t" \
137 " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
138 " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
140 " movq %%mm7, %%mm2 \n\t" \
141 " movq %%mm7, %%mm3 \n\t" \
143 " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
144 " psrlw $15, %%mm2 \n\t" \
146 " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
147 " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
148 /* ------------------------------------------------------------------- */ \
149 " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \
150 " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \
152 " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + icommon_product2 ) */ \
153 " paddsw %%mm6, %%mm6 \n\t" \
154 " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \
156 " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \
157 " paddsw %%mm1, %%mm1 \n\t" \
158 " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \
159 /* ------------------------------------------------------------------- */ \
160 " movq %[xC1S7], %%mm7 \n\t" \
161 " movq %%mm1, %%mm2 \n\t" \
163 " movq %%mm1, %%mm3 \n\t" \
164 " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
166 " movq %[xC7S1], %%mm7 \n\t" \
167 " psrlw $15, %%mm2 \n\t" \
169 " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \
170 " paddw %%mm2, %%mm1 \n\t" /* Truncated */ \
172 " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \
173 " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
175 " movq %%mm0, %%mm5 \n\t" \
176 " movq %%mm0, %%mm2 \n\t" \
178 " movq %[xC1S7], %%mm7 \n\t" \
179 " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
181 " movq %[xC7S1], %%mm7 \n\t" \
182 " psrlw $15, %%mm2 \n\t" \
184 " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \
185 " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
187 " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \
188 " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
190 " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
191 " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 (irot_input_y is the negated sum built in mm0 above) */ \
193 " movq %%mm1," #ip1 " \n\t" \
194 " movq %%mm3," #ip7 " \n\t" \
195 /* ------------------------------------------------------------------- */ \
196 " movq %[xC3S5], %%mm0 \n\t" \
197 " movq %[xC5S3], %%mm1 \n\t" \
199 " movq %%mm6, %%mm5 \n\t" \
200 " movq %%mm6, %%mm7 \n\t" \
202 " movq %%mm4, %%mm2 \n\t" \
203 " movq %%mm4, %%mm3 \n\t" \
205 " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
206 " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
208 " psrlw $15, %%mm2 \n\t" \
209 " psrlw $15, %%mm5 \n\t" \
211 " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \
212 " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \
214 " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \
215 " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \
217 " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \
218 " movq %%mm4," #ip3 " \n\t" \
220 " movq %%mm3, %%mm4 \n\t" \
221 " movq %%mm7, %%mm6 \n\t" \
223 " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
224 " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
226 " paddw %%mm2, %%mm4 \n\t" \
227 " paddw %%mm5, %%mm6 \n\t" \
229 " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \
230 " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
232 " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
233 " movq %%mm3," #ip5 " \n\t"
/* Transpose_mmx expands to asm string literals that transpose two
 * independent 4x4 blocks of 16-bit words.
 *
 *   ip0..ip3  Quadword memory operands holding rows a, b, c, d; their
 *             transpose is stored to op0..op3 (op0 = d0 c0 b0 a0, ...).
 *   ip4..ip7  Quadword memory operands holding rows e, f, g, h; their
 *             transpose is stored to op4..op7 (op4 = h0 g0 f0 e0, ...).
 *
 * Operands are stringified with # and pasted into the instruction text, so
 * they must be written as AT&T memory operands.  Register comments list the
 * four words from most significant to least significant.
 *
 * op0 and op1 are also used as scratch: rows a and b are parked there while
 * the e..h half is transposed, then reloaded.  The first store (to op1)
 * happens after every input except ip7 has been loaded; all remaining stores
 * follow the ip7 load.  All of mm0..mm7 are overwritten.
 */
235 #define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
236 op0,op1,op2,op3,op4,op5,op6,op7) \
237 " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
238 " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e3 e2 e1 e0 */ \
239 " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b3 b2 b1 b0 */ \
240 " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f3 f2 f1 f0 */ \
241 " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c3 c2 c1 c0 */ \
242 " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g3 g2 g1 g0 */ \
243 " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d3 d2 d1 d0 */ \
244 " movq %%mm1," #op1 " \n\t" /* save b3 b2 b1 b0 */ \
245 " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h3 h2 h1 h0 */ \
246 /* Transpose 2x8 block */ \
247 " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
248 " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
249 " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
250 " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
251 " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
252 " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
253 " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
254 " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
255 " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
256 " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \
257 " movq %%mm4," #op4 " \n\t" \
258 " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \
259 " movq %%mm5," #op5 " \n\t" \
260 " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
261 " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \
262 " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
263 " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \
264 " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
265 " movq %%mm6," #op7 " \n\t" \
266 " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \
267 " movq %%mm1," #op6 " \n\t" \
268 " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \
269 " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \
270 " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \
271 " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \
272 " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
273 " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
274 " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \
275 " movq %%mm0," #op0 " \n\t" \
276 " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \
277 " movq %%mm1," #op1 " \n\t" \
278 " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
279 " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
280 " movq %%mm4," #op3 " \n\t" \
281 " movq %%mm2," #op2 " \n\t"
284 /* This performs a 2D Forward DCT on an 8x8 block with short
 coefficients. We try to do the truncation to match the C
 version.

 InputData   8x8 block of ogg_int16_t, accessed as 128 contiguous bytes
             (offsets 0..120 from the pointer, 16 bytes per row).  Only
             read by the asm.
 OutputData  128-byte destination, same layout.  It is written by the
             first two passes and then transformed in place by the last
             two.

 The asm statement is four Transpose_mmx + Fdct_mmx pairs:
   1. offsets 0..56 (rows 0-3): both 4x4 halves are transposed from
      InputData into OutputData, then Fdct_mmx runs over those eight
      quadwords of OutputData.
   2. offsets 64..120 (rows 4-7): same as pass 1.
   3. offsets 0,16,..,112 of OutputData (first four words of every
      row): transposed and transformed in place.
   4. offsets 8,24,..,120 of OutputData (last four words of every
      row): transposed and transformed in place.
 Operand %2 (temp) points at the aligned stack buffer align_tmp and is
 passed to Fdct_mmx as its scratch quadword; only (%2) itself is
 referenced.  "emms" is issued once at the end of the sequence.

 NOTE(review): the clobber list names only "memory" although the macros
 overwrite mm0..mm7, and both pointers are declared "+r" although the
 visible asm does not appear to modify them - confirm this is
 intentional for the supported compilers. */
287 static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
289 ogg_int64_t __attribute__((aligned(8))) align_tmp[16];
290 ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
292 __asm__ __volatile__ (
293 " .p2align 4 \n\t"
295 * Input data is an 8x8 block. To make processing of the data more efficent
296 * we will transpose the block of data to two 4x8 blocks???
298 Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
299 (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
300 Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
302 Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
303 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
304 Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
306 Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
307 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
308 Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
310 Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
311 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
312 Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
314 " emms \n\t"
316 : "+r" (InputData),
317 "+r" (OutputData)
318 : "r" (temp),
319 [xC1S7] "m" (xC1S7), /* gcc 3.1+ allows named asm parameters */
320 [xC2S6] "m" (xC2S6),
321 [xC3S5] "m" (xC3S5),
322 [xC4S4] "m" (xC4S4),
323 [xC5S3] "m" (xC5S3),
324 [xC6S2] "m" (xC6S2),
325 [xC7S1] "m" (xC7S1)
326 : "memory"
330 /* install our implementation in the function table
 *
 * Emits a debug message through TH_DEBUG and sets funcs->fdct_short to
 * fdct_short__mmx.  funcs is dereferenced unconditionally, so it must not
 * be NULL.  This function only exists when USE_ASM is defined. */
331 void dsp_mmx_fdct_init(DspFunctions *funcs)
333 TH_DEBUG("enabling accelerated x86_32 mmx fdct function.\n");
334 funcs->fdct_short = fdct_short__mmx;
337 #endif /* USE_ASM */