Add Russian translation provided by Валерий Крувялис <valkru@mail.ru>
[xiph-mirror.git] / theora-old / lib / x86_32 / dct_decode_mmx.c
blob17e3049e6c98b080f594c3bd5e2fe328b61225f1
1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************
13 function:
14 last mod: $Id: dsp_mmx.c 12440 2007-02-06 16:36:26Z j $
16 ********************************************************************/
18 #include <stdlib.h>
20 #include "codec_internal.h"
22 #if defined(USE_ASM)
/* Packed constants used as 64-bit memory operands by the MMX asm below.
   They are referenced only by symbol name from inside the asm strings (via
   MANGLE), so the compiler cannot see the use; the `used` attribute keeps
   them from being discarded, and aligned(8) places each on an 8-byte
   boundary. */
/* V3: four 16-bit lanes each holding 3 (PMULLW operand: multiply by 3). */
24 static const __attribute__((aligned(8),used)) ogg_int64_t V3= 0x0003000300030003LL;
/* V804: four 16-bit lanes each holding 0x0804.  It is added right before an
   arithmetic shift right by 3, so it contributes the rounding term 4 plus
   0x0800, and 0x0800>>3 == 256 is the table-index bias that the asm operand
   `BoundingValuePtr-256` cancels out. */
25 static const __attribute__((aligned(8),used)) ogg_int64_t V804= 0x0804080408040804LL;
/* MANGLE(x) expands to the assembler-level name of C symbol x as a string
   literal, for pasting into inline-asm text.  On __APPLE__ an underscore is
   prepended (presumably because that toolchain prefixes C symbols with '_');
   elsewhere the name is used unchanged. */
27 #if defined(__APPLE__)
28 #define MANGLE(x) "_"#x
29 #else
30 #define MANGLE(x) #x
31 #endif
/*
 * FilterHoriz__mmx: rewrites the two middle pixels of a 4-pixel-wide span on
 * each of 8 consecutive rows.
 *
 *   PixelPtr          address of the first of the 4 pixels on the first row;
 *                     bytes [1] and [2] of each row are overwritten.
 *   LineLength        byte distance from one row to the next.
 *   BoundingValuePtr  table of 16-bit values indexed by the filter value
 *                     (the asm is handed BoundingValuePtr-256 and biases the
 *                     index by +256, see V804).
 *
 * Per row, with pix[] the 4 bytes at the start of that row:
 *   f      = (pix[0] - pix[3] + 3*(pix[2] - pix[1]) + 4) >> 3
 *   v      = BoundingValuePtr[f]
 *   pix[1] = pix[1] + v, saturated to 0..255
 *   pix[2] = pix[2] - v, saturated to 0..255
 *
 * OC_LOOP_H_4x4 handles 4 rows: it loads 4 bytes from each row, transposes
 * them so every MMX register holds one column as four 16-bit lanes, does the
 * arithmetic above on all 4 rows at once, and stores 2 bytes back per row.
 * The macro is expanded twice, with PixelPtr advanced 4 rows in between.
 *
 * NOTE(review): the asm overwrites mm0-mm7 but the clobber list names only
 * esi, edi, eax and memory - confirm that is acceptable for the compilers in
 * use.
 * NOTE(review): pextrw/pinsrw are not part of the base MMX instruction set
 * (they arrived with SSE / extended MMX) - confirm the caller only installs
 * this routine on CPUs that have them.
 * NOTE(review): in this copy of the file the embedded line numbering skips
 * the lines that would close the asm statement and the function body.
 */
33 static void FilterHoriz__mmx(unsigned char * PixelPtr,
34 ogg_int32_t LineLength,
35 ogg_int16_t *BoundingValuePtr){
37 #define OC_LOOP_H_4x4 \
38 __asm__ __volatile__( \
39 "lea (%1,%1,2),%%esi\n" /* esi = ystride*3 */ \
40 "movd (%0), %%mm0\n" /* 0 0 0 0 3 2 1 0 */ \
41 "movd (%0,%1),%%mm1\n" /* 0 0 0 0 7 6 5 4 */ \
42 "movd (%0,%1,2),%%mm2\n" /* 0 0 0 0 b a 9 8 */ \
43 "movd (%0,%%esi),%%mm3\n" /* 0 0 0 0 f e d c */ \
44 "punpcklbw %%mm1,%%mm0\n" /* mm0 = 7 3 6 2 5 1 4 0 */ \
45 "punpcklbw %%mm3,%%mm2\n" /* mm2 = f b e a d 9 c 8 */ \
46 "movq %%mm0,%%mm1\n" /* mm1 = 7 3 6 2 5 1 4 0 */ \
47 "punpcklwd %%mm2,%%mm1\n" /* mm1 = d 9 5 1 c 8 4 0 */ \
48 "punpckhwd %%mm2,%%mm0\n" /* mm0 = f b 7 3 e a 6 2 */ \
49 "pxor %%mm7,%%mm7\n" \
50 "movq %%mm1,%%mm5\n" /* mm5 = d 9 5 1 c 8 4 0 */ \
51 "punpckhbw %%mm7,%%mm5\n" /* mm5 = 0 d 0 9 0 5 0 1 = pix[1]*/ \
52 "punpcklbw %%mm7,%%mm1\n" /* mm1 = 0 c 0 8 0 4 0 0 = pix[0]*/ \
53 "movq %%mm0,%%mm3\n" /* mm3 = f b 7 3 e a 6 2 */ \
54 "punpckhbw %%mm7,%%mm3\n" /* mm3 = 0 f 0 b 0 7 0 3 = pix[3]*/ \
55 "punpcklbw %%mm7,%%mm0\n" /* mm0 = 0 e 0 a 0 6 0 2 = pix[2]*/ \
57 "psubw %%mm3,%%mm1\n" /* mm1 = pix[0]-pix[3] mm1 - mm3 */ \
58 "movq %%mm0,%%mm7\n" /* mm7 = pix[2]*/ \
59 "psubw %%mm5,%%mm0\n" /* mm0 = pix[2]-pix[1] mm0 - mm5*/ \
60 "PMULLW "MANGLE(V3)",%%mm0\n" /* *3 */ \
61 "paddw %%mm0,%%mm1\n" /* mm1 = unrounded f for each of the 4 rows (four 16-bit lanes) */ \
62 "paddw "MANGLE(V804)",%%mm1\n"/* add 4 */ /* add 256 after shift */ \
63 "psraw $3,%%mm1\n" /* >>3 */ \
64 " pextrw $0,%%mm1,%%esi\n" /* In MM1 we have 4 f coefs (16bits) */ \
65 " pextrw $1,%%mm1,%%edi\n" /* now perform MM4 = *(_bv+ f) */ \
66 " pinsrw $0,(%2,%%esi,2),%%mm4\n" \
67 " pextrw $2,%%mm1,%%esi\n" \
68 " pinsrw $1,(%2,%%edi,2),%%mm4\n" \
69 " pextrw $3,%%mm1,%%edi\n" \
70 " pinsrw $2,(%2,%%esi,2),%%mm4\n" \
71 " pinsrw $3,(%2,%%edi,2),%%mm4\n" /* new f vals loaded */ \
72 "pxor %%mm0,%%mm0\n" \
73 " paddw %%mm4,%%mm5\n" /*(pix[1]+f);*/ \
74 " psubw %%mm4,%%mm7\n" /* (pix[2]-f); */ \
75 " packuswb %%mm0,%%mm5\n" /* mm5 = x x x x newpix1 */ \
76 " packuswb %%mm0,%%mm7\n" /* mm7 = x x x x newpix2 */ \
77 " punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \
78 " movd %%mm5,%%eax\n" /* eax = newpix21 */ \
79 " movw %%ax,1(%0)\n" \
80 " psrlq $32,%%mm5\n" /* why is so big stall here ? */ \
81 " shrl $16,%%eax\n" \
82 " lea 1(%0,%1,2),%%edi\n" \
83 " movw %%ax,1(%0,%1,1)\n" \
84 " movd %%mm5,%%eax\n" /* eax = newpix21 high part */ \
85 " lea (%1,%1,2),%%esi\n" \
86 " movw %%ax,(%%edi)\n" \
87 " shrl $16,%%eax\n" \
88 " movw %%ax,1(%0,%%esi)\n" \
89 : \
90 : "r" (PixelPtr), "r" (LineLength), "r" (BoundingValuePtr-256) \
91 : "esi", "edi" , "memory", "eax" \
94 OC_LOOP_H_4x4 /* rows 0..3 */
95 PixelPtr += LineLength*4; /* step down to row 4 */
96 OC_LOOP_H_4x4 /* rows 4..7 */
97 __asm__ __volatile__("emms\n"); /* clear the MMX state once both halves are done */
/*
 * FilterVert__mmx: rewrites 8 horizontally adjacent pixels on each of two
 * vertically adjacent rows, using the rows directly above and below them.
 *
 *   PixelPtr          address of the first pixel on the lower of the two
 *                     rewritten rows; rows PixelPtr-LineLength and PixelPtr
 *                     are written, rows PixelPtr-2*LineLength and
 *                     PixelPtr+LineLength are only read.
 *   LineLength        byte distance from one row to the next.
 *   BoundingValuePtr  table of 16-bit values indexed by the filter value
 *                     (the asm is handed BoundingValuePtr-256 and biases the
 *                     index by +256, see V804).
 *
 * Per column x in 0..7, with p = PixelPtr - 2*LineLength (asm operand %0)
 * and s = LineLength:
 *   f         = (p[x] - p[x+3*s] + 3*(p[x+2*s] - p[x+s]) + 4) >> 3
 *   v         = BoundingValuePtr[f]
 *   p[x+s]    = p[x+s]   + v, saturated to 0..255
 *   p[x+2*s]  = p[x+2*s] - v, saturated to 0..255
 *
 * The 8 columns are processed as two groups of four 16-bit lanes (a "low"
 * and a "high" register per quantity).
 *
 * NOTE(review): the asm overwrites mm0-mm7 but the clobber list names only
 * esi, edi and memory - confirm that is acceptable for the compilers in use.
 * NOTE(review): pextrw/pinsrw are not part of the base MMX instruction set
 * (they arrived with SSE / extended MMX) - confirm the caller only installs
 * this routine on CPUs that have them.
 * NOTE(review): in this copy of the file the embedded line numbering skips
 * the lines that would hold the empty output-operand list and close the asm
 * statement and the function body.
 */
100 static void FilterVert__mmx(unsigned char * PixelPtr,
101 ogg_int32_t LineLength,
102 ogg_int16_t *BoundingValuePtr){
103 __asm__ __volatile__(
/* --- difference of the two outer rows, widened to 16 bits --- */
104 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
105 "movq (%0),%%mm7\n" /* mm7 = pix[0..7] */
106 "lea (%1,%1,2),%%esi\n" /* esi = ystride*3 */
107 "movq (%0,%%esi),%%mm4\n" /* mm4 = pix[0..7+ystride*3] */
108 "movq %%mm7,%%mm6\n" /* mm6 = pix[0..7] */
109 "punpcklbw %%mm0,%%mm6\n" /* expand unsigned pix[0..3] to 16 bits */
110 "movq %%mm4,%%mm5\n"
111 "punpckhbw %%mm0,%%mm7\n" /* expand unsigned pix[4..7] to 16 bits */
112 "punpcklbw %%mm0,%%mm4\n" /* expand other arrays too */
113 "punpckhbw %%mm0,%%mm5\n"
114 "psubw %%mm4,%%mm6\n" /* mm6 = mm6 - mm4 */
115 "psubw %%mm5,%%mm7\n" /* mm7 = mm7 - mm5 */
116 /* mm7:mm6 = _p[0]-_p[ystride*3] */
/* --- difference of the two inner rows; mm1 keeps the raw bytes of row 2 --- */
117 "movq (%0,%1),%%mm4\n" /* mm4 = pix[0..7+ystride] */
118 "movq %%mm4,%%mm5\n"
119 "movq (%0,%1,2),%%mm2\n" /* mm2 = pix[0..7+ystride*2] */
120 "movq %%mm2,%%mm3\n"
121 "movq %%mm2,%%mm1\n" //ystride*2
122 "punpckhbw %%mm0,%%mm5\n"
123 "punpcklbw %%mm0,%%mm4\n"
124 "punpckhbw %%mm0,%%mm3\n"
125 "punpcklbw %%mm0,%%mm2\n"
126 "psubw %%mm5,%%mm3\n"
127 "psubw %%mm4,%%mm2\n"
128 /* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */
/* --- combine, round, shift: mm3:mm2 become the biased table indices --- */
129 "PMULLW "MANGLE(V3)",%%mm3\n" /* *3 */
130 "PMULLW "MANGLE(V3)",%%mm2\n" /* *3 */
131 "paddw %%mm7,%%mm3\n" /* highpart */
132 "paddw %%mm6,%%mm2\n" /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]); */
133 "paddw "MANGLE(V804)",%%mm3\n" /* add 4 */ /* add 256 after shift */
134 "paddw "MANGLE(V804)",%%mm2\n" /* add 4 */ /* add 256 after shift */
135 "psraw $3,%%mm3\n" /* >>3 f coefs high */
136 "psraw $3,%%mm2\n" /* >>3 f coefs low */
/* --- table lookup, one 16-bit lane at a time: low four into mm6, high four into mm7 --- */
138 " pextrw $0,%%mm2,%%esi\n" /* In MM3:MM2 we have f coefs (16bits) */
139 " pextrw $1,%%mm2,%%edi\n" /* now perform MM7:MM6 = *(_bv+ f) */
140 " pinsrw $0,(%2,%%esi,2),%%mm6\n"
141 " pinsrw $1,(%2,%%edi,2),%%mm6\n"
143 " pextrw $2,%%mm2,%%esi\n"
144 " pextrw $3,%%mm2,%%edi\n"
145 " pinsrw $2,(%2,%%esi,2),%%mm6\n"
146 " pinsrw $3,(%2,%%edi,2),%%mm6\n"
148 " pextrw $0,%%mm3,%%esi\n"
149 " pextrw $1,%%mm3,%%edi\n"
150 " pinsrw $0,(%2,%%esi,2),%%mm7\n"
151 " pinsrw $1,(%2,%%edi,2),%%mm7\n"
153 " pextrw $2,%%mm3,%%esi\n"
154 " pextrw $3,%%mm3,%%edi\n"
155 " pinsrw $2,(%2,%%esi,2),%%mm7\n"
156 " pinsrw $3,(%2,%%edi,2),%%mm7\n" //MM7 MM6 f=*(_bv+(f+4>>3));
/* --- apply: row 1 gets +value, row 2 gets -value; pack with unsigned saturation and store --- */
158 "paddw %%mm6,%%mm4\n" /* (pix[ystride]+f); */
159 "paddw %%mm7,%%mm5\n" /* (pix[ystride]+f); */
160 "movq %%mm1,%%mm2\n"
161 "punpcklbw %%mm0,%%mm1\n"
162 "punpckhbw %%mm0,%%mm2\n" //[ystride*2]
163 "psubw %%mm6,%%mm1\n" /* (pix[ystride*2]-f); */
164 "psubw %%mm7,%%mm2\n" /* (pix[ystride*2]-f); */
165 "packuswb %%mm2,%%mm1\n"
166 "packuswb %%mm5,%%mm4\n"
167 "movq %%mm1,(%0,%1,2)\n" /* pix[ystride*2]= */
168 "movq %%mm4,(%0,%1)\n" /* pix[ystride]= */
169 "emms\n" /* clear the MMX state before returning to C code */
/* operands: %0 = two rows above PixelPtr, %1 = row stride, %2 = table base biased by -256 */
171 : "r" (PixelPtr-2*LineLength), "r" (LineLength), "r" (BoundingValuePtr-256)
172 : "esi", "edi" , "memory"
/*
 * dsp_mmx_dct_decode_init: points the FilterVert and FilterHoriz entries of
 * the caller-supplied function table at the MMX routines defined in this
 * file, after emitting a message through TH_DEBUG.
 *
 *   funcs  function table to update; it is dereferenced unconditionally, so
 *          it must not be NULL.
 *
 * No CPU-feature check is made here; presumably the caller performs it
 * before calling - TODO confirm.
 * NOTE(review): in this copy of the file the embedded line numbering skips
 * the lines that would open and close the function body.
 */
176 /* install our implementation in the function table */
177 void dsp_mmx_dct_decode_init(DspFunctions *funcs)
179 TH_DEBUG("enabling accelerated x86_32 mmx dct decode functions.\n");
180 funcs->FilterVert = FilterVert__mmx;
181 funcs->FilterHoriz = FilterHoriz__mmx;
184 #endif /* USE_ASM */