1 /********************************************************************
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
11 ********************************************************************
16 ********************************************************************/
20 #include "codec_internal.h"
/* 64-bit unsigned type used to cast 32-bit stride values up to register
 * width for the x86_64 "r" asm constraints below. */
typedef unsigned long long ogg_uint64_t;

/* 0x0080 replicated into four 16-bit lanes: subtracting it re-centres
 * unsigned 8-bit pixels around zero (range -128..127) for the DCT input.
 * aligned(8) allows a movq load; "used" keeps the optimizer from
 * discarding it, since it is only referenced from inline asm. */
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;

/* Scalar reference helpers for pixel averaging / differencing
 * (C fallbacks / documentation of what the MMX code computes). */
#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
/* sub8x8__mmx: DctInput[i] = (INT16)Filt[i] - (INT16)Recon[i] for an
 * 8x8 block.  Eight bytes of each source row are widened UINT8 -> INT16
 * by unpacking against a zeroed mm7, subtracted, and 16 bytes of INT16
 * results are stored per row.  %0=FiltPtr, %1=ReconPtr, %2=DctInputPtr;
 * the strides are cast to 64 bits for the register constraints.
 *
 * NOTE(review): this extraction is missing the function's opening brace,
 * the per-row pointer-increment asm and row-loop directives, the output
 * operand list, the clobber list, and the closing of both the asm
 * statement and the function -- restore from upstream before building. */
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                         ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
                         ogg_uint32_t ReconPixelsPerLine)
  __asm__ __volatile__ (
    " pxor %%mm7, %%mm7 \n\t"  /* mm7 = 0, zero half for the unpacks */
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
    " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
    " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */
    /* start calculation */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
    " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
    " movq %%mm0, (%2) \n\t" /* write answer out */
    " movq %%mm2, 8(%2) \n\t" /* write answer out */
    /* Increment pointers */
    /* NOTE(review): increment/loop asm and the output (":") section are
     * not visible here; only the input operands survive below. */
     : "r" ((ogg_uint64_t)PixelsPerLine),
       "r" ((ogg_uint64_t)ReconPixelsPerLine)
/* sub8x8_128__mmx: DctInput[i] = (INT16)Filt[i] - 128 for an 8x8 block
 * (intra case: no reconstruction reference, pixels are re-centred around
 * zero instead).  mm1 holds V128 = 4 x 0x0080 loaded via the named
 * asm operand %[V128]; %0=FiltPtr, %1=DctInputPtr.
 *
 * NOTE(review): the opening brace, the row loop / pointer increments,
 * the output operands, the [V128] input binding, the clobbers, and the
 * statement/function closers are missing from this extraction -- restore
 * from upstream before building. */
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
                             ogg_uint32_t PixelsPerLine)
  /* Stride pre-widened to 64 bits in a local (see gcc note on the
   * constraint below). */
  ogg_uint64_t ppl = PixelsPerLine;

  __asm__ __volatile__ (
    " pxor %%mm7, %%mm7 \n\t"  /* mm7 = 0, zero half for the unpacks */
    " movq %[V128], %%mm1 \n\t" /* mm1 = four 16-bit lanes of 128 */
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
    /* start calculation */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
    " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
    " movq %%mm0, (%1) \n\t" /* write answer out */
    " movq %%mm2, 8(%1) \n\t" /* write answer out */
    /* Increment pointers */
    : "r" (ppl), /* gcc bug? a cast won't work here, e.g. (ogg_uint64_t)PixelsPerLine */
/* sub8x8avg2__mmx: DctInput[i] = (INT16)Filt[i] -
 * (((INT16)Recon1[i] + (INT16)Recon2[i]) >> 1) for an 8x8 block --
 * the half-pel case, where the prediction is the average of two
 * reconstruction references.  The average is done in 16-bit lanes after
 * widening, so the 8-bit sums cannot overflow.  %0=FiltPtr, %1=ReconPtr1,
 * %2=ReconPtr2, %3=DctInputPtr.
 *
 * NOTE(review): the opening brace, row loop / pointer increments, output
 * operands, clobbers, and the statement/function closers are missing from
 * this extraction -- restore from upstream before building. */
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
                             unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
                             ogg_uint32_t PixelsPerLine,
                             ogg_uint32_t ReconPixelsPerLine)
  __asm__ __volatile__ (
    " pxor %%mm7, %%mm7 \n\t"  /* mm7 = 0, zero half for the unpacks */
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
    " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
    " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */
    " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */
    " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */
    " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */
    " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */
    /* average ReconPtr1 and ReconPtr2 */
    " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
    " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
    " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
    " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
    " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
    " movq %%mm0, (%3) \n\t" /* write answer out */
    " movq %%mm2, 8(%3) \n\t" /* write answer out */
    /* Increment pointers */
    /* NOTE(review): increment/loop asm and the output (":") section are
     * not visible here; only the input operands survive below. */
     : "r" ((ogg_uint64_t)PixelsPerLine),
       "r" ((ogg_uint64_t)ReconPixelsPerLine)
/* intra8x8_err__mmx: intra-block mismatch metric over an 8x8 block.
 * Accumulates the pixel sum (XSum, via paddw into mm5) and the sum of
 * squared pixels (XXSum, via pmaddwd into mm7) over 8 rows of 8 bytes,
 * then reduces each accumulator horizontally.  The movsx sign-extends
 * the 16-bit lane sum before storing it into XSum.  Returns
 * 64*XXSum - XSum*XSum, i.e. the population variance scaled by 64*64.
 * %0/%1 receive XSum/XXSum, %2 walks DataPtr, %3 is the 64-bit Stride.
 *
 * NOTE(review): this extraction is missing the opening brace, the local
 * declarations of XSum/XXSum, the row-loop label / dec / jnz around the
 * accumulation, the output-constraint and clobber lists (rdi is
 * clobbered), and the asm-statement closer -- restore from upstream. */
static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
  __asm__ __volatile__ (
    " pxor %%mm5, %%mm5 \n\t"  /* mm5 = running pixel sum */
    " pxor %%mm6, %%mm6 \n\t"  /* mm6 = 0, zero half for the unpacks */
    " pxor %%mm7, %%mm7 \n\t"  /* mm7 = running sum of squares */
    " mov $8, %%rdi \n\t"      /* row counter */
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */
    " movq %%mm0, %%mm2 \n\t"
    " punpcklbw %%mm6, %%mm0 \n\t"
    " punpckhbw %%mm6, %%mm2 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"
    " pmaddwd %%mm0, %%mm0 \n\t"  /* squares, pairwise-summed to 32 bits */
    " pmaddwd %%mm2, %%mm2 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"
    " add %3, %2 \n\t" /* Inc pointer into src data */
    /* Horizontal reduction of the four 16-bit lanes of mm5 into XSum. */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%rdi \n\t"
    " movsx %%di, %%rdi \n\t"  /* sign-extend the 16-bit lane sum */
    " mov %%rdi, %0 \n\t"
    /* Horizontal reduction of the two 32-bit lanes of mm7 into XXSum. */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"
     : "r" ((ogg_uint64_t)Stride)
  /* Compute population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum) );
/* inter8x8_err__mmx: inter-block mismatch metric over an 8x8 block.
 * Same shape as intra8x8_err__mmx, but the accumulated quantity is the
 * per-pixel difference Src - Ref (computed with psubsw after widening):
 * mm5 gathers the sum of differences (XSum), mm7 the sum of squared
 * differences (XXSum, via pmaddwd).  Returns 64*XXSum - XSum*XSum, the
 * population variance of the differences scaled by 64*64.
 * %0/%1 receive XSum/XXSum, %2/%3 walk Src/Ref, %4/%5 are the strides.
 *
 * NOTE(review): this extraction is missing the opening brace, the local
 * declarations of XSum/XXSum, the row-loop label / dec / jnz, the
 * output-constraint and clobber lists (rdi is clobbered), and the
 * asm-statement closer -- restore from upstream before building. */
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                       unsigned char *RefDataPtr, ogg_uint32_t RefStride)
  __asm__ __volatile__ (
    " pxor %%mm5, %%mm5 \n\t"  /* mm5 = running sum of differences */
    " pxor %%mm6, %%mm6 \n\t"  /* mm6 = 0, zero half for the unpacks */
    " pxor %%mm7, %%mm7 \n\t"  /* mm7 = running sum of squared differences */
    " mov $8, %%rdi \n\t"      /* row counter */
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */
    " movq (%3), %%mm1 \n\t"
    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"
    " punpcklbw %%mm6, %%mm0 \n\t"
    " punpcklbw %%mm6, %%mm1 \n\t"
    " punpckhbw %%mm6, %%mm2 \n\t"
    " punpckhbw %%mm6, %%mm3 \n\t"
    " psubsw %%mm1, %%mm0 \n\t"  /* src - ref, saturating 16-bit */
    " psubsw %%mm3, %%mm2 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"
    " pmaddwd %%mm0, %%mm0 \n\t"  /* squares, pairwise-summed to 32 bits */
    " pmaddwd %%mm2, %%mm2 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"
    " add %4, %2 \n\t" /* Inc pointer into src data */
    " add %5, %3 \n\t" /* Inc pointer into ref data */
    /* Horizontal reduction of the four 16-bit lanes of mm5 into XSum. */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%rdi \n\t"
    " movsx %%di, %%rdi \n\t"  /* sign-extend the 16-bit lane sum */
    " mov %%rdi, %0 \n\t"
    /* Horizontal reduction of the two 32-bit lanes of mm7 into XXSum. */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"
     : "r" ((ogg_uint64_t)SrcStride),
       "r" ((ogg_uint64_t)RefStride)
  /* Compute and return population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum));
/* restore_fpu: return the x87 FPU to a usable state after MMX code has
 * run (MMX registers alias the x87 stack, so floating-point code cannot
 * execute safely until the MMX state is cleared).
 * NOTE(review): the asm body -- presumably a single "emms" -- and the
 * statement/function closers are not visible in this extraction; verify
 * against upstream. */
static void restore_fpu (void)
  __asm__ __volatile__ (
/* dsp_mmx_init: install the x86_64 MMX implementations above into the
 * encoder's DSP function table, overriding the C defaults.
 * NOTE(review): the function's braces are not visible in this
 * extraction -- verify against upstream. */
void dsp_mmx_init(DspFunctions *funcs)
  TH_DEBUG("setting accelerated x86_64 mmx dsp functions.\n");
  funcs->restore_fpu = restore_fpu;
  funcs->sub8x8 = sub8x8__mmx;
  funcs->sub8x8_128 = sub8x8_128__mmx;
  funcs->sub8x8avg2 = sub8x8avg2__mmx;
  funcs->intra8x8_err = intra8x8_err__mmx;
  funcs->inter8x8_err = inter8x8_err__mmx;