r1009: Move the dependencies to newer package names
[cinelerra_cv/mob.git] / quicktime / ffmpeg / libavcodec / i386 / simple_idct_mmx.c
blob92a366f2175130d77862c3bbac1c9e9bfac828f6
1 /*
2 * Simple IDCT MMX
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 #include "../dsputil.h"
21 #include "../simple_idct.h"
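/*
 * Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7;
 * the integer constants C0..C7 below are derived from these.
 *   i=0: 23170.475006
 *   i=1: 22725.260826
 *   i=2: 21406.727617
 *   i=3: 19265.545870
 *   i=4: 16384.000000
 *   i=5: 12872.826198
 *   i=6:  8866.956905
 *   i=7:  4520.335430
 */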
/*
 * Fixed-point cosine factors: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), rounded to
 * the nearest integer (i.e. "+ 0.5" then truncated).  C4 is the one exception:
 * the active branch uses 16383, one less than the exact value 16384
 * (the "- 0.5" variant); the exact value is kept in the disabled branch.
 */
#define C0 23170 /* i = 0 */
#define C1 22725 /* i = 1 */
#define C2 21407 /* i = 2 */
#define C3 19266 /* i = 3 */
#if 0
#define C4 16384 /* i = 4, exact */
#else
#define C4 16383 /* i = 4, exact value minus one */
#endif
#define C5 12873 /* i = 5 */
#define C6 8867  /* i = 6 */
#define C7 4520  /* i = 7 */

/* Right shifts applied after the row pass and the column pass respectively. */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
49 static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
50 static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
52 static const int16_t __attribute__((aligned(8))) coeffs[]= {
53 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
54 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
55 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
56 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
57 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
58 // 0, 0, 0, 0,
59 // 0, 0, 0, 0,
61 C4, C4, C4, C4,
62 C4, -C4, C4, -C4,
64 C2, C6, C2, C6,
65 C6, -C2, C6, -C2,
67 C1, C3, C1, C3,
68 C5, C7, C5, C7,
70 C3, -C7, C3, -C7,
71 -C1, -C5, -C1, -C5,
73 C5, -C1, C5, -C1,
74 C7, C3, C7, C3,
76 C7, -C5, C7, -C5,
77 C3, -C1, C3, -C1
80 #if 0
81 static void unused_var_killer(){
82 int a= wm1010 + d40000;
83 temp[0]=a;
86 static void inline idctCol (int16_t * col, int16_t *input)
88 #undef C0
89 #undef C1
90 #undef C2
91 #undef C3
92 #undef C4
93 #undef C5
94 #undef C6
95 #undef C7
96 int a0, a1, a2, a3, b0, b1, b2, b3;
97 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
107 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
108 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
109 return;
112 col[8*0] = input[8*0 + 0];
113 col[8*1] = input[8*2 + 0];
114 col[8*2] = input[8*0 + 1];
115 col[8*3] = input[8*2 + 1];
116 col[8*4] = input[8*4 + 0];
117 col[8*5] = input[8*6 + 0];
118 col[8*6] = input[8*4 + 1];
119 col[8*7] = input[8*6 + 1];
121 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
122 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
123 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
124 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
126 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
127 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
128 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
129 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
131 col[8*0] = (a0 + b0) >> COL_SHIFT;
132 col[8*1] = (a1 + b1) >> COL_SHIFT;
133 col[8*2] = (a2 + b2) >> COL_SHIFT;
134 col[8*3] = (a3 + b3) >> COL_SHIFT;
135 col[8*4] = (a3 - b3) >> COL_SHIFT;
136 col[8*5] = (a2 - b2) >> COL_SHIFT;
137 col[8*6] = (a1 - b1) >> COL_SHIFT;
138 col[8*7] = (a0 - b0) >> COL_SHIFT;
141 static void inline idctRow (int16_t * output, int16_t * input)
143 int16_t row[8];
145 int a0, a1, a2, a3, b0, b1, b2, b3;
146 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155 row[0] = input[0];
156 row[2] = input[1];
157 row[4] = input[4];
158 row[6] = input[5];
159 row[1] = input[8];
160 row[3] = input[9];
161 row[5] = input[12];
162 row[7] = input[13];
164 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
165 row[0] = row[1] = row[2] = row[3] = row[4] =
166 row[5] = row[6] = row[7] = row[0]<<3;
167 output[0] = row[0];
168 output[2] = row[1];
169 output[4] = row[2];
170 output[6] = row[3];
171 output[8] = row[4];
172 output[10] = row[5];
173 output[12] = row[6];
174 output[14] = row[7];
175 return;
178 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
179 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
180 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
181 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
183 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
184 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
185 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
186 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
188 row[0] = (a0 + b0) >> ROW_SHIFT;
189 row[1] = (a1 + b1) >> ROW_SHIFT;
190 row[2] = (a2 + b2) >> ROW_SHIFT;
191 row[3] = (a3 + b3) >> ROW_SHIFT;
192 row[4] = (a3 - b3) >> ROW_SHIFT;
193 row[5] = (a2 - b2) >> ROW_SHIFT;
194 row[6] = (a1 - b1) >> ROW_SHIFT;
195 row[7] = (a0 - b0) >> ROW_SHIFT;
197 output[0] = row[0];
198 output[2] = row[1];
199 output[4] = row[2];
200 output[6] = row[3];
201 output[8] = row[4];
202 output[10] = row[5];
203 output[12] = row[6];
204 output[14] = row[7];
206 #endif
208 static inline void idct(int16_t *block)
210 int64_t __attribute__((aligned(8))) align_tmp[16];
211 int16_t * const temp= (int16_t*)align_tmp;
213 asm volatile(
214 #if 0 //Alternative, simpler variant
216 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
217 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
218 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
219 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
220 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
221 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
222 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
223 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
224 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
225 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
226 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
227 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
228 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
229 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
230 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
231 #rounder ", %%mm4 \n\t"\
232 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
233 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
234 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
235 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
236 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
237 #rounder ", %%mm0 \n\t"\
238 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
239 "paddd %%mm0, %%mm0 \n\t" \
240 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
241 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
242 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
243 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
244 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
245 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
246 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
247 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
248 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
249 "psrad $" #shift ", %%mm7 \n\t"\
250 "psrad $" #shift ", %%mm4 \n\t"\
251 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
252 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
253 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
254 "psrad $" #shift ", %%mm1 \n\t"\
255 "psrad $" #shift ", %%mm2 \n\t"\
256 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
257 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
258 "movq %%mm7, " #dst " \n\t"\
259 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
260 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
261 "movq %%mm2, 24+" #dst " \n\t"\
262 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
263 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
264 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
265 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
266 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
267 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
268 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
269 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
270 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
271 "psrad $" #shift ", %%mm2 \n\t"\
272 "psrad $" #shift ", %%mm0 \n\t"\
273 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
274 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
275 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
276 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
277 "psrad $" #shift ", %%mm6 \n\t"\
278 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
279 "movq %%mm2, 8+" #dst " \n\t"\
280 "psrad $" #shift ", %%mm4 \n\t"\
281 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
282 "movq %%mm4, 16+" #dst " \n\t"\
284 #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
285 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
286 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
287 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
288 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
289 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
290 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
291 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
292 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
293 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
294 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
295 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
296 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
297 #rounder ", %%mm4 \n\t"\
298 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
299 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
300 #rounder ", %%mm0 \n\t"\
301 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
302 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
303 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
304 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
305 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
306 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
307 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
308 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
309 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
310 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
311 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
312 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
313 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
314 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
315 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
316 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
317 "psrad $" #shift ", %%mm7 \n\t"\
318 "psrad $" #shift ", %%mm4 \n\t"\
319 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
320 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
321 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
322 "psrad $" #shift ", %%mm0 \n\t"\
323 "psrad $" #shift ", %%mm2 \n\t"\
324 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
325 "movd %%mm7, " #dst " \n\t"\
326 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
327 "movd %%mm0, 16+" #dst " \n\t"\
328 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
329 "movd %%mm2, 96+" #dst " \n\t"\
330 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
331 "movd %%mm4, 112+" #dst " \n\t"\
332 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
333 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
334 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
335 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
336 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
337 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
338 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
339 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
340 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
341 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
342 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
343 "psrad $" #shift ", %%mm2 \n\t"\
344 "psrad $" #shift ", %%mm5 \n\t"\
345 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
346 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
347 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
348 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
349 "psrad $" #shift ", %%mm6 \n\t"\
350 "psrad $" #shift ", %%mm4 \n\t"\
351 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
352 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
353 "movd %%mm2, 32+" #dst " \n\t"\
354 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
355 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
356 "movd %%mm6, 48+" #dst " \n\t"\
357 "movd %%mm4, 64+" #dst " \n\t"\
358 "movd %%mm5, 80+" #dst " \n\t"\
361 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
363 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
364 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
365 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
366 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
367 "pand %%mm0, %%mm4 \n\t"\
368 "por %%mm1, %%mm4 \n\t"\
369 "por %%mm2, %%mm4 \n\t"\
370 "por %%mm3, %%mm4 \n\t"\
371 "packssdw %%mm4,%%mm4 \n\t"\
372 "movd %%mm4, %%eax \n\t"\
373 "orl %%eax, %%eax \n\t"\
374 "jz 1f \n\t"\
375 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
376 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
377 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
378 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
379 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
380 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
381 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
382 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
383 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
384 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
385 #rounder ", %%mm4 \n\t"\
386 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
387 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
388 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
389 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
390 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
391 #rounder ", %%mm0 \n\t"\
392 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
393 "paddd %%mm0, %%mm0 \n\t" \
394 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
395 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
396 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
397 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
398 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
399 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
400 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
401 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
402 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
403 "psrad $" #shift ", %%mm7 \n\t"\
404 "psrad $" #shift ", %%mm4 \n\t"\
405 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
406 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
407 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
408 "psrad $" #shift ", %%mm1 \n\t"\
409 "psrad $" #shift ", %%mm2 \n\t"\
410 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
411 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
412 "movq %%mm7, " #dst " \n\t"\
413 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
414 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
415 "movq %%mm2, 24+" #dst " \n\t"\
416 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
417 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
418 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
419 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
420 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
421 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
422 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
423 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
424 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
425 "psrad $" #shift ", %%mm2 \n\t"\
426 "psrad $" #shift ", %%mm0 \n\t"\
427 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
428 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
429 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
430 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
431 "psrad $" #shift ", %%mm6 \n\t"\
432 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
433 "movq %%mm2, 8+" #dst " \n\t"\
434 "psrad $" #shift ", %%mm4 \n\t"\
435 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
436 "movq %%mm4, 16+" #dst " \n\t"\
437 "jmp 2f \n\t"\
438 "1: \n\t"\
439 "pslld $16, %%mm0 \n\t"\
440 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
441 "psrad $13, %%mm0 \n\t"\
442 "packssdw %%mm0, %%mm0 \n\t"\
443 "movq %%mm0, " #dst " \n\t"\
444 "movq %%mm0, 8+" #dst " \n\t"\
445 "movq %%mm0, 16+" #dst " \n\t"\
446 "movq %%mm0, 24+" #dst " \n\t"\
447 "2: \n\t"
450 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
451 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
452 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
456 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
461 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
462 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
463 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
464 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
465 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
467 #else
469 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
470 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
471 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
472 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
473 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
474 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
475 "pand %%mm0, %%mm4 \n\t"\
476 "por %%mm1, %%mm4 \n\t"\
477 "por %%mm2, %%mm4 \n\t"\
478 "por %%mm3, %%mm4 \n\t"\
479 "packssdw %%mm4,%%mm4 \n\t"\
480 "movd %%mm4, %%eax \n\t"\
481 "orl %%eax, %%eax \n\t"\
482 "jz 1f \n\t"\
483 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
484 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
485 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
486 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
487 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
488 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
489 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
490 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
491 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
492 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
493 #rounder ", %%mm4 \n\t"\
494 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
495 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
496 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
497 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
498 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
499 #rounder ", %%mm0 \n\t"\
500 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
501 "paddd %%mm0, %%mm0 \n\t" \
502 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
503 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
504 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
505 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
506 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
507 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
508 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
509 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
510 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
511 "psrad $" #shift ", %%mm7 \n\t"\
512 "psrad $" #shift ", %%mm4 \n\t"\
513 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
514 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
515 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
516 "psrad $" #shift ", %%mm1 \n\t"\
517 "psrad $" #shift ", %%mm2 \n\t"\
518 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
519 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
520 "movq %%mm7, " #dst " \n\t"\
521 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
522 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
523 "movq %%mm2, 24+" #dst " \n\t"\
524 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
525 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
526 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
527 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
528 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
529 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
530 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
531 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
532 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
533 "psrad $" #shift ", %%mm2 \n\t"\
534 "psrad $" #shift ", %%mm0 \n\t"\
535 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
536 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
537 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
538 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
539 "psrad $" #shift ", %%mm6 \n\t"\
540 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
541 "movq %%mm2, 8+" #dst " \n\t"\
542 "psrad $" #shift ", %%mm4 \n\t"\
543 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
544 "movq %%mm4, 16+" #dst " \n\t"\
545 "jmp 2f \n\t"\
546 "1: \n\t"\
547 "pslld $16, %%mm0 \n\t"\
548 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
549 "psrad $13, %%mm0 \n\t"\
550 "packssdw %%mm0, %%mm0 \n\t"\
551 "movq %%mm0, " #dst " \n\t"\
552 "movq %%mm0, 8+" #dst " \n\t"\
553 "movq %%mm0, 16+" #dst " \n\t"\
554 "movq %%mm0, 24+" #dst " \n\t"\
555 "2: \n\t"
557 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
558 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
559 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
560 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
561 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
562 "movq %%mm0, %%mm4 \n\t"\
563 "por %%mm1, %%mm4 \n\t"\
564 "por %%mm2, %%mm4 \n\t"\
565 "por %%mm3, %%mm4 \n\t"\
566 "packssdw %%mm4,%%mm4 \n\t"\
567 "movd %%mm4, %%eax \n\t"\
568 "orl %%eax, %%eax \n\t"\
569 "jz " #bt " \n\t"\
570 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
571 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
572 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
573 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
574 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
575 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
576 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
577 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
578 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
579 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
580 #rounder ", %%mm4 \n\t"\
581 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
582 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
583 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
584 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
585 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
586 #rounder ", %%mm0 \n\t"\
587 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
588 "paddd %%mm0, %%mm0 \n\t" \
589 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
590 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
591 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
592 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
593 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
594 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
595 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
596 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
597 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
598 "psrad $" #shift ", %%mm7 \n\t"\
599 "psrad $" #shift ", %%mm4 \n\t"\
600 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
601 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
602 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
603 "psrad $" #shift ", %%mm1 \n\t"\
604 "psrad $" #shift ", %%mm2 \n\t"\
605 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
606 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
607 "movq %%mm7, " #dst " \n\t"\
608 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
609 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
610 "movq %%mm2, 24+" #dst " \n\t"\
611 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
612 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
613 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
614 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
615 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
616 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
617 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
618 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
619 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
620 "psrad $" #shift ", %%mm2 \n\t"\
621 "psrad $" #shift ", %%mm0 \n\t"\
622 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
623 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
624 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
625 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
626 "psrad $" #shift ", %%mm6 \n\t"\
627 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
628 "movq %%mm2, 8+" #dst " \n\t"\
629 "psrad $" #shift ", %%mm4 \n\t"\
630 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
631 "movq %%mm4, 16+" #dst " \n\t"\
633 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
634 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
635 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
636 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
637 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
638 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
639 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
640 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
641 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
642 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
643 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
644 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
645 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
646 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
647 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
648 #rounder ", %%mm4 \n\t"\
649 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
650 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
651 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
652 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
653 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
654 #rounder ", %%mm0 \n\t"\
655 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
656 "paddd %%mm0, %%mm0 \n\t" \
657 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
658 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
659 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
660 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
661 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
662 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
663 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
664 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
665 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
666 "psrad $" #shift ", %%mm7 \n\t"\
667 "psrad $" #shift ", %%mm4 \n\t"\
668 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
669 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
670 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
671 "psrad $" #shift ", %%mm1 \n\t"\
672 "psrad $" #shift ", %%mm2 \n\t"\
673 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
674 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
675 "movq %%mm7, " #dst " \n\t"\
676 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
677 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
678 "movq %%mm2, 24+" #dst " \n\t"\
679 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
680 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
681 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
682 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
683 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
684 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
685 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
686 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
687 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
688 "psrad $" #shift ", %%mm2 \n\t"\
689 "psrad $" #shift ", %%mm0 \n\t"\
690 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
691 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
692 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
693 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
694 "psrad $" #shift ", %%mm6 \n\t"\
695 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
696 "movq %%mm2, 8+" #dst " \n\t"\
697 "psrad $" #shift ", %%mm4 \n\t"\
698 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
699 "movq %%mm4, 16+" #dst " \n\t"\
701 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
702 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
703 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
707 #undef IDCT
708 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
709 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
710 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
711 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
712 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
713 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
714 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
715 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
716 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
717 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
718 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
719 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
720 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
721 #rounder ", %%mm4 \n\t"\
722 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
723 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
724 #rounder ", %%mm0 \n\t"\
725 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
726 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
727 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
728 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
729 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
730 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
731 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
732 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
733 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
734 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
735 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
736 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
737 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
738 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
739 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
740 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
741 "psrad $" #shift ", %%mm7 \n\t"\
742 "psrad $" #shift ", %%mm4 \n\t"\
743 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
744 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
745 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
746 "psrad $" #shift ", %%mm0 \n\t"\
747 "psrad $" #shift ", %%mm2 \n\t"\
748 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
749 "movd %%mm7, " #dst " \n\t"\
750 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
751 "movd %%mm0, 16+" #dst " \n\t"\
752 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
753 "movd %%mm2, 96+" #dst " \n\t"\
754 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
755 "movd %%mm4, 112+" #dst " \n\t"\
756 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
757 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
758 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
759 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
760 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
761 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
762 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
763 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
764 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
765 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
766 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
767 "psrad $" #shift ", %%mm2 \n\t"\
768 "psrad $" #shift ", %%mm5 \n\t"\
769 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
770 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
771 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
772 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
773 "psrad $" #shift ", %%mm6 \n\t"\
774 "psrad $" #shift ", %%mm4 \n\t"\
775 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
776 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
777 "movd %%mm2, 32+" #dst " \n\t"\
778 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
779 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
780 "movd %%mm6, 48+" #dst " \n\t"\
781 "movd %%mm4, 64+" #dst " \n\t"\
782 "movd %%mm5, 80+" #dst " \n\t"
785 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
786 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
787 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
788 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
789 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
790 "jmp 9f \n\t"
792 "#.balign 16 \n\t"\
793 "4: \n\t"
794 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
795 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
797 #undef IDCT
/*
 * IDCT variant that loads src0, src4 and src5 only; src1 is never
 * referenced, so the "B" (odd) terms are built from src5 alone.
 * Presumably selected when the src1 rows are known to be zero -- confirm
 * against the Z_COND_IDCT branch targets.
 *
 * src0, src4, src5: memory operands for the packed 16-bit inputs (layout as
 *                   given by the R/r annotations on the loads).
 * src1:             accepted for signature compatibility, unused here.
 * dst:              memory operand; eight 32-bit movd stores go to
 *                   dst+0, +16, ..., +112.  A_k+B_k lands at dst+16*k and
 *                   A_k-B_k at dst+16*(7-k), for k = 0..3.
 * rounder:          stringified and emitted in front of ", %%mm4" and
 *                   ", %%mm0".
 * shift:            immediate used by every psrad before packing.
 *
 * Coefficient quads are read from fixed offsets off asm operand %2.
 */
798 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
799 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
800 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
801 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
802 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
803 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
804 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
805 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
806 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
807 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
808 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
809 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
810 #rounder ", %%mm4 \n\t"\
811 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
812 #rounder ", %%mm0 \n\t"\
813 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
814 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
815 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
816 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
817 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
818 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
819 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
820 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
821 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
822 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
823 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
824 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
825 "psrad $" #shift ", %%mm1 \n\t"\
826 "psrad $" #shift ", %%mm4 \n\t"\
827 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
828 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
829 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
830 "psrad $" #shift ", %%mm0 \n\t"\
831 "psrad $" #shift ", %%mm2 \n\t"\
832 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
833 "movd %%mm1, " #dst " \n\t"\
834 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
835 "movd %%mm0, 16+" #dst " \n\t"\
836 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
837 "movd %%mm2, 96+" #dst " \n\t"\
838 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
839 "movd %%mm4, 112+" #dst " \n\t"\
840 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
841 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
842 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
843 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
844 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
845 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
846 "psrad $" #shift ", %%mm2 \n\t"\
847 "psrad $" #shift ", %%mm5 \n\t"\
848 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
849 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
850 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
851 "psrad $" #shift ", %%mm6 \n\t"\
852 "psrad $" #shift ", %%mm1 \n\t"\
853 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
854 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
855 "movd %%mm2, 32+" #dst " \n\t"\
856 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
857 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
858 "movd %%mm6, 48+" #dst " \n\t"\
859 "movd %%mm1, 64+" #dst " \n\t"\
860 "movd %%mm5, 80+" #dst " \n\t"
862 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
863 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
864 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
865 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
866 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
867 "jmp 9f \n\t"
869 "#.balign 16 \n\t"\
870 "6: \n\t"
871 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
873 #undef IDCT
/*
 * IDCT variant that loads src0 and src5 only; src4 and src1 are never
 * referenced.  The even part is therefore just the two C4 products
 * (mm4 serves as both A0 and A3, mm0 as both A1 and A2) and the odd part
 * comes from src5 alone.  Presumably selected when the src4 and src1 rows
 * are known to be zero -- confirm against the Z_COND_IDCT branch targets.
 *
 * src0, src5: memory operands for the packed 16-bit inputs.
 * src4, src1: accepted for signature compatibility, unused here.
 * dst:        memory operand; eight 32-bit movd stores go to
 *             dst+0, +16, ..., +112 (sum at dst+16*k, difference at
 *             dst+16*(7-k), for k = 0..3).
 * rounder:    stringified and emitted in front of ", %%mm4" and ", %%mm0".
 * shift:      immediate used by every psrad before packing.
 *
 * Coefficient quads are read from fixed offsets off asm operand %2.
 */
874 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
875 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
876 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
877 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
878 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
879 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
880 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
881 #rounder ", %%mm4 \n\t"\
882 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
883 #rounder ", %%mm0 \n\t"\
884 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
885 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
886 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
887 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
888 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
889 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
890 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
891 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
892 "psrad $" #shift ", %%mm1 \n\t"\
893 "psrad $" #shift ", %%mm4 \n\t"\
894 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
895 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
896 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
897 "psrad $" #shift ", %%mm0 \n\t"\
898 "psrad $" #shift ", %%mm2 \n\t"\
899 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
900 "movd %%mm1, " #dst " \n\t"\
901 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
902 "movd %%mm0, 16+" #dst " \n\t"\
903 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
904 "movd %%mm2, 96+" #dst " \n\t"\
905 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
906 "movd %%mm4, 112+" #dst " \n\t"\
907 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
908 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
909 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
910 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
911 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
912 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
913 "psrad $" #shift ", %%mm2 \n\t"\
914 "psrad $" #shift ", %%mm5 \n\t"\
915 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
916 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
917 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
918 "psrad $" #shift ", %%mm6 \n\t"\
919 "psrad $" #shift ", %%mm1 \n\t"\
920 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
921 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
922 "movd %%mm2, 32+" #dst " \n\t"\
923 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
924 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
925 "movd %%mm6, 48+" #dst " \n\t"\
926 "movd %%mm1, 64+" #dst " \n\t"\
927 "movd %%mm5, 80+" #dst " \n\t"
930 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
931 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
932 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
933 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
934 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
935 "jmp 9f \n\t"
937 "#.balign 16 \n\t"\
938 "2: \n\t"
939 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
941 #undef IDCT
/*
 * IDCT variant that loads src0, src1 and src5; src4 is never referenced,
 * so the even part is just the two C4 products (mm4 serves as both A0 and
 * A3, mm0 as both A1 and A2).  Presumably selected when the src4 rows are
 * known to be zero -- confirm against the Z_COND_IDCT branch targets.
 *
 * src0, src1, src5: memory operands for the packed 16-bit inputs.  src1 is
 *                   loaded twice (into mm2 first, then again into mm0 once
 *                   the first four results have been stored).
 * src4:             accepted for signature compatibility, unused here.
 * dst:              memory operand; eight 32-bit movd stores go to
 *                   dst+0, +16, ..., +112 (A_k+B_k at dst+16*k, A_k-B_k at
 *                   dst+16*(7-k), for k = 0..3).
 * rounder:          stringified and emitted in front of ", %%mm4" and
 *                   ", %%mm0".
 * shift:            immediate used by every psrad before packing.
 *
 * Coefficient quads are read from fixed offsets off asm operand %2.
 */
942 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
943 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
944 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
945 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
946 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
947 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
948 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
949 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
950 #rounder ", %%mm4 \n\t"\
951 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
952 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
953 #rounder ", %%mm0 \n\t"\
954 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
955 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
956 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
957 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
958 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
959 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
960 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
961 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
962 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
963 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
964 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
965 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
966 "psrad $" #shift ", %%mm7 \n\t"\
967 "psrad $" #shift ", %%mm4 \n\t"\
968 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
969 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
970 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
971 "psrad $" #shift ", %%mm0 \n\t"\
972 "psrad $" #shift ", %%mm2 \n\t"\
973 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
974 "movd %%mm7, " #dst " \n\t"\
975 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
976 "movd %%mm0, 16+" #dst " \n\t"\
977 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
978 "movd %%mm2, 96+" #dst " \n\t"\
979 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
980 "movd %%mm4, 112+" #dst " \n\t"\
981 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
982 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
983 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
984 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
985 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
986 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
987 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
988 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
989 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
990 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
991 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
992 "psrad $" #shift ", %%mm2 \n\t"\
993 "psrad $" #shift ", %%mm5 \n\t"\
994 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
995 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
996 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
997 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
998 "psrad $" #shift ", %%mm6 \n\t"\
999 "psrad $" #shift ", %%mm4 \n\t"\
1000 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
1001 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1002 "movd %%mm2, 32+" #dst " \n\t"\
1003 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1004 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1005 "movd %%mm6, 48+" #dst " \n\t"\
1006 "movd %%mm4, 64+" #dst " \n\t"\
1007 "movd %%mm5, 80+" #dst " \n\t"
1009 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
1010 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1011 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1012 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1013 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1014 "jmp 9f \n\t"
1016 "#.balign 16 \n\t"\
1017 "3: \n\t"
1018 #undef IDCT
/*
 * IDCT variant that loads src0 and src1 only; src4 and src5 are never
 * referenced.  The even part is just the two C4 products (mm4 serves as
 * both A0 and A3, mm0 as both A1 and A2) and the odd part comes from src1
 * alone.  Presumably selected when the src4 and src5 rows are known to be
 * zero -- confirm against the Z_COND_IDCT branch targets.
 *
 * src0, src1: memory operands for the packed 16-bit inputs.
 * src4, src5: accepted for signature compatibility, unused here.
 * dst:        memory operand; eight 32-bit movd stores go to
 *             dst+0, +16, ..., +112 (A_k+B_k at dst+16*k, A_k-B_k at
 *             dst+16*(7-k), for k = 0..3).
 * rounder:    stringified and emitted in front of ", %%mm4" and ", %%mm0".
 * shift:      immediate used by every psrad before packing.
 *
 * Coefficient quads are read from fixed offsets off asm operand %2.
 */
1019 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1020 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1021 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1022 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1023 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1024 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1025 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1026 #rounder ", %%mm4 \n\t"\
1027 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1028 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1029 #rounder ", %%mm0 \n\t"\
1030 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1031 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1032 "movq 64(%2), %%mm3 \n\t"\
1033 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1034 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1035 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1036 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1037 "psrad $" #shift ", %%mm7 \n\t"\
1038 "psrad $" #shift ", %%mm4 \n\t"\
1039 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
1040 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1041 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1042 "psrad $" #shift ", %%mm0 \n\t"\
1043 "psrad $" #shift ", %%mm1 \n\t"\
1044 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1045 "movd %%mm7, " #dst " \n\t"\
1046 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1047 "movd %%mm0, 16+" #dst " \n\t"\
1048 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1049 "movd %%mm1, 96+" #dst " \n\t"\
1050 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1051 "movd %%mm4, 112+" #dst " \n\t"\
1052 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1053 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1054 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1055 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
1056 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1057 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1058 "psrad $" #shift ", %%mm1 \n\t"\
1059 "psrad $" #shift ", %%mm5 \n\t"\
1060 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1061 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1062 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1063 "psrad $" #shift ", %%mm6 \n\t"\
1064 "psrad $" #shift ", %%mm4 \n\t"\
1065 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1066 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1067 "movd %%mm1, 32+" #dst " \n\t"\
1068 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1069 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1070 "movd %%mm6, 48+" #dst " \n\t"\
1071 "movd %%mm4, 64+" #dst " \n\t"\
1072 "movd %%mm5, 80+" #dst " \n\t"
1075 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
1076 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1077 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1078 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1079 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1080 "jmp 9f \n\t"
1082 "#.balign 16 \n\t"\
1083 "5: \n\t"
1084 #undef IDCT
/*
 * IDCT variant with no odd part: only src0/src4 and their neighbours at
 * 8+src0 / 8+src4 are loaded; src1 and src5 are never referenced.
 * Presumably selected when the src1 and src5 rows are known to be zero --
 * confirm against the Z_COND_IDCT branch targets.
 *
 * Each expansion processes two adjacent quadwords of input and writes
 * 64-bit movq results, i.e. twice the width of the movd-based variants.
 * With no B terms the outputs are mirrored: the same register is stored to
 * dst+0 and dst+112 (A0), dst+16 and dst+96 (A1), dst+32 and dst+80 (A2),
 * dst+48 and dst+64 (A3).
 *
 * src0, src4: memory operands for the packed 16-bit inputs; the macro also
 *             reads 8 bytes past each of them.
 * src1, src5: accepted for signature compatibility, unused here.
 * dst:        memory operand; eight 64-bit stores at dst+0, +16, ..., +112.
 * rounder:    stringified and emitted in front of ", %%mm4", ", %%mm0",
 *             ", %%mm1" and ", %%mm2".
 * shift:      immediate used by every psrad before packing.
 *
 * Coefficient quads are read from fixed offsets off asm operand %2.
 */
1085 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1086 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1087 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1088 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1089 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1090 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1091 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1092 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1093 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1094 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1095 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1096 #rounder ", %%mm4 \n\t"\
1097 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1098 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1099 #rounder ", %%mm0 \n\t"\
1100 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1101 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1102 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1103 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1104 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1105 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
1106 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1107 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1108 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1109 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1110 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1111 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1112 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1113 #rounder ", %%mm1 \n\t"\
1114 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
1115 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
1116 #rounder ", %%mm2 \n\t"\
1117 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
1118 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
1119 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
1120 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
1121 "psrad $" #shift ", %%mm4 \n\t"\
1122 "psrad $" #shift ", %%mm7 \n\t"\
1123 "psrad $" #shift ", %%mm3 \n\t"\
1124 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
1125 "movq %%mm4, " #dst " \n\t"\
1126 "psrad $" #shift ", %%mm0 \n\t"\
1127 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
1128 "movq %%mm0, 16+" #dst " \n\t"\
1129 "movq %%mm0, 96+" #dst " \n\t"\
1130 "movq %%mm4, 112+" #dst " \n\t"\
1131 "psrad $" #shift ", %%mm5 \n\t"\
1132 "psrad $" #shift ", %%mm6 \n\t"\
1133 "psrad $" #shift ", %%mm2 \n\t"\
1134 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 (no B terms in this variant) */\
1135 "movq %%mm5, 32+" #dst " \n\t"\
1136 "psrad $" #shift ", %%mm1 \n\t"\
1137 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 (no B terms in this variant) */\
1138 "movq %%mm6, 48+" #dst " \n\t"\
1139 "movq %%mm6, 64+" #dst " \n\t"\
1140 "movq %%mm5, 80+" #dst " \n\t"
1143 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
1144 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1145 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1146 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1147 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1148 "jmp 9f \n\t"
1151 "#.balign 16 \n\t"\
1152 "1: \n\t"
1153 #undef IDCT
/*
 * IDCT variant that loads src0, src4 and src1; src5 is never referenced,
 * so the "B" (odd) terms are built from src1 alone.  Presumably selected
 * when the src5 rows are known to be zero -- confirm against the
 * Z_COND_IDCT branch targets.
 *
 * src0, src4, src1: memory operands for the packed 16-bit inputs.
 * src5:             accepted for signature compatibility, unused here.
 * dst:              memory operand; eight 32-bit movd stores go to
 *                   dst+0, +16, ..., +112 (A_k+B_k at dst+16*k, A_k-B_k at
 *                   dst+16*(7-k), for k = 0..3).
 * rounder:          stringified and emitted in front of ", %%mm4" and
 *                   ", %%mm0".
 * shift:            immediate used by every psrad before packing.
 *
 * Coefficient quads are read from fixed offsets off asm operand %2.
 */
1154 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1155 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1156 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1157 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1158 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1159 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1160 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1161 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1162 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1163 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1164 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1165 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1166 #rounder ", %%mm4 \n\t"\
1167 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1168 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1169 #rounder ", %%mm0 \n\t"\
1170 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1171 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1172 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1173 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1174 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1175 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1176 "movq 64(%2), %%mm1 \n\t"\
1177 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1178 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1179 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1180 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1181 "psrad $" #shift ", %%mm7 \n\t"\
1182 "psrad $" #shift ", %%mm4 \n\t"\
1183 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1184 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1185 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1186 "psrad $" #shift ", %%mm0 \n\t"\
1187 "psrad $" #shift ", %%mm3 \n\t"\
1188 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1189 "movd %%mm7, " #dst " \n\t"\
1190 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1191 "movd %%mm0, 16+" #dst " \n\t"\
1192 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1193 "movd %%mm3, 96+" #dst " \n\t"\
1194 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1195 "movd %%mm4, 112+" #dst " \n\t"\
1196 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1197 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1198 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1199 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1200 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1201 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1202 "psrad $" #shift ", %%mm3 \n\t"\
1203 "psrad $" #shift ", %%mm5 \n\t"\
1204 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1205 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1206 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1207 "psrad $" #shift ", %%mm6 \n\t"\
1208 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1209 "movd %%mm3, 32+" #dst " \n\t"\
1210 "psrad $" #shift ", %%mm4 \n\t"\
1211 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1212 "movd %%mm6, 48+" #dst " \n\t"\
1213 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1214 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1215 "movd %%mm4, 64+" #dst " \n\t"\
1216 "movd %%mm5, 80+" #dst " \n\t"
1219 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
1220 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1221 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1222 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1223 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1224 "jmp 9f \n\t"
1227 "#.balign 16 \n\t"
1228 "7: \n\t"
1229 #undef IDCT
/*
 * Smallest IDCT variant: only src0 and its neighbour at 8+src0 are loaded;
 * src4, src1 and src5 are never referenced.  Presumably selected when all
 * of those rows are known to be zero -- confirm against the Z_COND_IDCT
 * branch targets.
 *
 * Only the two C4 products are computed per input quadword.  After shifting
 * and packing, mm4 (the C4R4+C4R0 sums) is stored to dst+0, +48, +64 and
 * +112, and mm0 (the -C4R4+C4R0 differences) to dst+16, +32, +80 and +96,
 * all as 64-bit movq stores.
 *
 * src0:             memory operand for the packed 16-bit inputs; the macro
 *                   also reads 8 bytes past it.
 * src4, src1, src5: accepted for signature compatibility, unused here.
 * dst:              memory operand; eight 64-bit stores at
 *                   dst+0, +16, ..., +112.
 * rounder:          stringified and emitted in front of ", %%mm4",
 *                   ", %%mm0", ", %%mm1" and ", %%mm2".
 * shift:            immediate used by every psrad before packing.
 *
 * NOTE(review): the "movq 32(%2), %%mm7" load below is not read again
 * anywhere in this macro; it looks like a leftover from the larger
 * variants -- confirm before removing.
 */
1230 #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1231 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1232 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1233 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1234 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1235 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1236 #rounder ", %%mm4 \n\t"\
1237 #rounder ", %%mm0 \n\t"\
1238 "psrad $" #shift ", %%mm4 \n\t"\
1239 "psrad $" #shift ", %%mm0 \n\t"\
1240 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1241 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1242 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1243 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1244 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1245 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1246 #rounder ", %%mm1 \n\t"\
1247 #rounder ", %%mm2 \n\t"\
1248 "psrad $" #shift ", %%mm1 \n\t"\
1249 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1250 "movq %%mm4, " #dst " \n\t"\
1251 "psrad $" #shift ", %%mm2 \n\t"\
1252 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1253 "movq %%mm0, 16+" #dst " \n\t"\
1254 "movq %%mm0, 96+" #dst " \n\t"\
1255 "movq %%mm4, 112+" #dst " \n\t"\
1256 "movq %%mm0, 32+" #dst " \n\t"\
1257 "movq %%mm4, 48+" #dst " \n\t"\
1258 "movq %%mm4, 64+" #dst " \n\t"\
1259 "movq %%mm0, 80+" #dst " \n\t"
1261 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
1262 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1263 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1264 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1265 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
1268 #endif
1271 Input
1272 00 40 04 44 20 60 24 64
1273 10 30 14 34 50 70 54 74
1274 01 41 03 43 21 61 23 63
1275 11 31 13 33 51 71 53 73
1276 02 42 06 46 22 62 26 66
1277 12 32 16 36 52 72 56 76
1278 05 45 07 47 25 65 27 67
1279 15 35 17 37 55 75 57 77
1281 Temp
1282 00 04 10 14 20 24 30 34
1283 40 44 50 54 60 64 70 74
1284 01 03 11 13 21 23 31 33
1285 41 43 51 53 61 63 71 73
1286 02 06 12 16 22 26 32 36
1287 42 46 52 56 62 66 72 76
1288 05 07 15 17 25 27 35 37
1289 45 47 55 57 65 67 75 77
1292 "9: \n\t"
1293 :: "r" (block), "r" (temp), "r" (coeffs)
1294 : "%eax"
/*
 * Public entry point that forwards block to idct().
 * idct()'s asm operand list names block as operand %0, which the final
 * IDCT() expansions use as their store destination, so the result
 * overwrites block.
 */
1298 void ff_simple_idct_mmx(int16_t *block)
1300 idct(block);
1303 //FIXME merge add/put into the idct
/*
 * Runs idct() on block, then hands the transformed block to
 * put_pixels_clamped_mmx() together with dest and line_size.
 * put_pixels_clamped_mmx() is defined elsewhere; presumably it writes the
 * clamped values to dest using line_size as the row stride -- confirm
 * against its definition.
 */
1305 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1307 idct(block);
1308 put_pixels_clamped_mmx(block, dest, line_size);
/*
 * Runs idct() on block, then hands the transformed block to
 * add_pixels_clamped_mmx() together with dest and line_size.
 * add_pixels_clamped_mmx() is defined elsewhere; presumably it adds the
 * values to the existing contents of dest with clamping, using line_size
 * as the row stride -- confirm against its definition.
 */
1310 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
1312 idct(block);
1313 add_pixels_clamped_mmx(block, dest, line_size);