1 /********************************************************************
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
11 ********************************************************************/
13 /* mmx fdct implementation for x86_64 */
16 #include "theora/theora.h"
17 #include "codec_internal.h"
/*
 * Fixed-point multipliers used by the MMX forward DCT below.
 *
 * Each 64-bit constant repeats one 16-bit word four times, so a single
 * 64-bit load puts the same multiplier in all four word lanes of an MMX
 * register.  Read as an unsigned fraction of 65536, the word in xCkSj is
 * cos(k*pi/16), which equals sin(j*pi/16) because k + j == 8.
 *
 * Every word except those of xC6S2 and xC7S1 has its top bit set, so a
 * signed multiply sees it as (c - 1.0); the code that uses those constants
 * adds the multiplicand back after pmulhw.
 *
 * aligned(8) keeps each constant on an 8-byte boundary; used keeps the
 * object emitted even when the compiler sees no direct C reference to it.
 */
20 static const __attribute__ ((aligned(8),used
)) ogg_int64_t xC1S7
= 0x0fb15fb15fb15fb15LL /* 0xfb15 / 65536 ~= 0.98079 = cos(1*pi/16) */
;
21 static const __attribute__ ((aligned(8),used
)) ogg_int64_t xC2S6
= 0x0ec83ec83ec83ec83LL /* 0xec83 / 65536 ~= 0.92388 = cos(2*pi/16) */
;
22 static const __attribute__ ((aligned(8),used
)) ogg_int64_t xC3S5
= 0x0d4dbd4dbd4dbd4dbLL /* 0xd4db / 65536 ~= 0.83147 = cos(3*pi/16) */
;
23 static const __attribute__ ((aligned(8),used
)) ogg_int64_t xC4S4
= 0x0b505b505b505b505LL /* 0xb505 / 65536 ~= 0.70711 = cos(4*pi/16) */
;
24 static const __attribute__ ((aligned(8),used
)) ogg_int64_t xC5S3
= 0x08e3a8e3a8e3a8e3aLL /* 0x8e3a / 65536 ~= 0.55557 = cos(5*pi/16) */
;
25 static const __attribute__ ((aligned(8),used
)) ogg_int64_t xC6S2
= 0x061f861f861f861f8LL /* 0x61f8 / 65536 ~= 0.38269 = cos(6*pi/16) */
;
26 static const __attribute__ ((aligned(8),used
)) ogg_int64_t xC7S1
= 0x031f131f131f131f1LL /* 0x31f1 / 65536 ~= 0.19508 = cos(7*pi/16) */
;
28 #if defined(__MINGW32__) || defined(__CYGWIN__) || \
29 defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
/*
 * Fdct_mmx(ip0..ip7, temp): one forward-DCT pass over four 16-bit lanes.
 *
 * Expands to a run of string literals (MMX instructions) meant to be pasted
 * into an __asm__ statement.  The arguments are stringified into the
 * instruction text, so each must be an assembler memory operand such as
 * 16(%1).
 *
 *  - ip0..ip7: eight 8-byte operands, each processed as four signed 16-bit
 *    words.  The eight results are stored back over the same operands, so
 *    the pass works in place.
 *  - temp: 8-byte scratch operand; it holds is07 - is34 while mm0 is reused.
 *
 * All of mm0..mm7 are overwritten.  The enclosing asm statement must supply
 * the named operands %[xC1S7], %[xC2S6], %[xC3S5], %[xC4S4], %[xC5S3],
 * %[xC6S2] and %[xC7S1].
 *
 * Multiply idiom: pmulhw keeps the high 16 bits of a signed 16x16 product.
 * For constants whose top bit is set the raw result is (c - 1.0) * x, so x
 * is added back with paddw.  Each "psrlw $15" / paddw pair labelled
 * "Truncate(d)" adds 1 to every lane whose input was negative.
 */
35 /* execute stage 1 of forward DCT */
36 #define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \
37 " movq " #ip0 ", %%mm0 \n\t" \
38 " movq " #ip1 ", %%mm1 \n\t" \
39 " movq " #ip3 ", %%mm2 \n\t" \
40 " movq " #ip5 ", %%mm3 \n\t" \
41 " movq %%mm0, %%mm4 \n\t" \
42 " movq %%mm1, %%mm5 \n\t" \
43 " movq %%mm2, %%mm6 \n\t" \
44 " movq %%mm3, %%mm7 \n\t" \
46 " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \
47 " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \
48 " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \
49 " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \
50 " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \
51 " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \
53 " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \
55 " paddsw %%mm2, %%mm2 \n\t" \
57 " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \
59 " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \
60 " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \
61 " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \
62 " paddsw %%mm3, %%mm3 \n\t" \
63 " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + is56 = is1256 */ \
65 " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \
66 /* ------------------------------------------------------------------- */ \
67 " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \
68 " paddsw %%mm7, %%mm7 \n\t" \
69 " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \
70 /* ------------------------------------------------------------------- */ \
71 " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \
72 " paddsw %%mm3, %%mm3 \n\t" \
74 " movq %%mm2, %%mm0 \n\t" /* make a copy */ \
75 " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \
77 " pmulhw %[xC4S4], %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
78 " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
79 " psrlw $15, %%mm2 \n\t" /* mm2 = 1 in each lane whose input was negative, else 0 */ \
80 " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \
82 " movq %%mm3, %%mm2 \n\t" \
83 " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \
85 " movq %%mm3, %%mm0 \n\t" \
86 " pmulhw %[xC4S4], %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
88 " psrlw $15, %%mm2 \n\t" \
89 " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \
90 " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \
92 " movq %%mm3," #ip0 " \n\t" \
93 /* ------------------------------------------------------------------- */ \
94 " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \
95 " pmulhw %[xC2S6], %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
97 " movq " #temp ", %%mm2 \n\t" \
98 " movq %%mm2, %%mm0 \n\t" \
100 " psrlw $15, %%mm2 \n\t" /* mm2 = sign bits of irot_input_y */ \
101 " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y */ \
103 " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
104 " movq %%mm5, %%mm0 \n\t" \
106 " movq %%mm5, %%mm2 \n\t" \
107 " pmulhw %[xC6S2], %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \
109 " psrlw $15, %%mm2 \n\t" \
110 " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
112 " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \
113 " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \
115 " movq %%mm5, %%mm0 \n\t" \
116 " movq %%mm5, %%mm2 \n\t" \
118 " pmulhw %[xC2S6], %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
119 " psrlw $15, %%mm2 \n\t" \
121 " movq " #temp ", %%mm3 \n\t" \
122 " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \
124 " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
125 " movq %%mm3, %%mm2 \n\t" \
127 " pmulhw %[xC6S2], %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \
128 " psrlw $15, %%mm2 \n\t" \
130 " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
131 " psubsw %%mm5, %%mm3 \n\t" \
133 " movq %%mm3," #ip6 " \n\t" \
134 /* ------------------------------------------------------------------- */ \
135 " movq %[xC4S4], %%mm0 \n\t" \
136 " movq %%mm1, %%mm2 \n\t" \
137 " movq %%mm1, %%mm3 \n\t" \
139 " pmulhw %%mm0, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
140 " psrlw $15, %%mm2 \n\t" \
142 " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
143 " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \
145 " movq %%mm7, %%mm2 \n\t" \
146 " movq %%mm7, %%mm3 \n\t" \
148 " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
149 " psrlw $15, %%mm2 \n\t" \
151 " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
152 " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \
153 /* ------------------------------------------------------------------- */ \
154 " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \
155 " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \
157 " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
158 " paddsw %%mm6, %%mm6 \n\t" \
159 " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \
161 " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \
162 " paddsw %%mm1, %%mm1 \n\t" \
163 " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \
164 /* ------------------------------------------------------------------- */ \
165 " movq %[xC1S7], %%mm7 \n\t" \
166 " movq %%mm1, %%mm2 \n\t" \
168 " movq %%mm1, %%mm3 \n\t" \
169 " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
171 " movq %[xC7S1], %%mm7 \n\t" \
172 " psrlw $15, %%mm2 \n\t" \
174 " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \
175 " paddw %%mm2, %%mm1 \n\t" /* Truncated */ \
177 " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \
178 " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \
180 " movq %%mm0, %%mm5 \n\t" \
181 " movq %%mm0, %%mm2 \n\t" \
183 " movq %[xC1S7], %%mm7 \n\t" \
184 " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
186 " movq %[xC7S1], %%mm7 \n\t" \
187 " psrlw $15, %%mm2 \n\t" \
189 " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \
190 " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \
192 " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \
193 " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \
195 " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
196 " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 (irot_input_y here is the negated sum held in mm0) */ \
198 " movq %%mm1," #ip1 " \n\t" \
199 " movq %%mm3," #ip7 " \n\t" \
200 /* ------------------------------------------------------------------- */ \
201 " movq %[xC3S5], %%mm0 \n\t" \
202 " movq %[xC5S3], %%mm1 \n\t" \
204 " movq %%mm6, %%mm5 \n\t" \
205 " movq %%mm6, %%mm7 \n\t" \
207 " movq %%mm4, %%mm2 \n\t" \
208 " movq %%mm4, %%mm3 \n\t" \
210 " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
211 " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
213 " psrlw $15, %%mm2 \n\t" \
214 " psrlw $15, %%mm5 \n\t" \
216 " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \
217 " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \
219 " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \
220 " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \
222 " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \
223 " movq %%mm4," #ip3 " \n\t" \
225 " movq %%mm3, %%mm4 \n\t" \
226 " movq %%mm7, %%mm6 \n\t" \
228 " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
229 " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
231 " paddw %%mm2, %%mm4 \n\t" \
232 " paddw %%mm5, %%mm6 \n\t" \
234 " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \
235 " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \
237 " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \
238 " movq %%mm3," #ip5 " \n\t"
/*
 * Transpose_mmx(ip0..ip7, op0..op7): transpose two 4x4 groups of 16-bit
 * words.
 *
 * Expands to a run of string literals (MMX instructions) for use inside an
 * __asm__ statement; every argument is stringified and must be an assembler
 * memory operand naming 8 bytes (four words).
 *
 *  - ip0..ip3 are rows a, b, c, d of the first group; op0..op3 receive its
 *    columns (op0 = a0 b0 c0 d0, ..., op3 = a3 b3 c3 d3).
 *  - ip4..ip7 are rows e, f, g, h of the second group; op4..op7 receive its
 *    columns in the same way.
 *
 * op0 and op1 double as spill slots for rows a and b.  op1 is written before
 * ip7 has been loaded, so op1 must not overlap ip7; op0 is written only
 * after all eight inputs are in registers.  Calling with opN == ipN (in
 * place) is consistent with that ordering.  All of mm0..mm7 are overwritten.
 */
240 #define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \
241 op0,op1,op2,op3,op4,op5,op6,op7) \
242 " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \
243 " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e0 e1 e2 e3 */ \
244 " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \
245 " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f0 f1 f2 f3 */ \
246 " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \
247 " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g0 g1 g2 g3 */ \
248 " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \
249 " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \
250 " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \
251 /* Transpose 2x8 block */ \
252 " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \
253 " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \
254 " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \
255 " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \
256 " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \
257 " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \
258 " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \
259 " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
260 " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
261 " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \
262 " movq %%mm4," #op4 " \n\t" \
263 " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \
264 " movq %%mm5," #op5 " \n\t" \
265 " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
266 " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \
267 " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
268 " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \
269 " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \
270 " movq %%mm6," #op7 " \n\t" \
271 " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \
272 " movq %%mm1," #op6 " \n\t" \
273 " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \
274 " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \
275 " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \
276 " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \
277 " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
278 " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
279 " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \
280 " movq %%mm0," #op0 " \n\t" \
281 " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \
282 " movq %%mm1," #op1 " \n\t" \
283 " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
284 " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
285 " movq %%mm4," #op3 " \n\t" \
286 " movq %%mm2," #op2 " \n\t"
289 /* This performs a 2D Forward DCT on an 8x8 block with short
290 coefficients. We try to do the truncation to match the C version. */
/*
 * fdct_short__mmx(InputData, OutputData): runs four Transpose_mmx + Fdct_mmx
 * passes over a 128-byte block of 16-bit values.
 *
 * The asm operand list is only partly visible here.  From the way the
 * operands are used, %0 is only read, %1 is written by the first two
 * transposes and then transformed in place, and %2 is the scratch slot
 * handed to Fdct_mmx -- presumably InputData, OutputData and temp
 * respectively; TODO confirm against the constraint list.
 *
 * Pass layout (byte offsets):
 *   1. %0 bytes 0..63    -> transpose into %1 bytes 0..63,   DCT in place
 *   2. %0 bytes 64..127  -> transpose into %1 bytes 64..127, DCT in place
 *   3. %1 offsets 0,16,...,112 -> transpose in place, DCT in place
 *   4. %1 offsets 8,24,...,120 -> transpose in place, DCT in place
 */
292 static void fdct_short__mmx ( ogg_int16_t
*InputData
, ogg_int16_t
*OutputData
)
/* 8-byte-aligned scratch storage; temp is the same memory viewed as 16-bit words. */
294 ogg_int64_t
__attribute__((aligned(8))) align_tmp
[16];
295 ogg_int16_t
*const temp
= (ogg_int16_t
*)align_tmp
;
297 __asm__
__volatile__ (
300 * Input data is an 8x8 block. To make processing of the data more efficent
301 * we will transpose the block of data to two 4x8 blocks???
/* Pass 1: first 64 bytes of %0, transposed into %1 and transformed there. */
303 Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0),
304 (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1))
305 Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2))
/* Pass 2: second 64 bytes of %0, same treatment. */
307 Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
308 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
309 Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
/* Pass 3: in place on %1, the 8-byte groups at offsets 0,16,...,112. */
311 Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
312 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
313 Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
/* Pass 4: in place on %1, the 8-byte groups at offsets 8,24,...,120. */
315 Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
316 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
317 Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
/* Named "m" operand: lets the macros refer to the constant as %[xC1S7]. */
324 [xC1S7
] "m" (xC1S7
), /* gcc 3.1+ allows named asm parameters */
335 /* Install our implementation in the function table: emits a TH_DEBUG
message, then points funcs->fdct_short at fdct_short__mmx. */
336 void dsp_mmx_fdct_init(DspFunctions
*funcs
)
338 TH_DEBUG("enabling accelerated x86_64 mmx fdct function.\n");
339 funcs
->fdct_short
= fdct_short__mmx
;