Merge remote-tracking branch 'libav/master'
[FFMpeg-mirror/mplayer-patches.git] / libavcodec / x86 / simple_idct.c
blobf31f7f42f20b1b183eec882f1e1ecc79ae79c3d3
1 /*
2 * Simple IDCT MMX
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of Libav.
8 * Libav is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * Libav is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with Libav; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 #include "libavcodec/simple_idct.h"
23 #include "libavutil/internal.h"
24 #include "libavutil/mem.h"
25 #include "dsputil_mmx.h"
27 #if HAVE_INLINE_ASM
30 23170.475006
31 22725.260826
32 21406.727617
33 19265.545870
34 16384.000000
35 12872.826198
36 8866.956905
37 4520.335430
/* Fixed-point cosine factors Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), stored as
 * integers. All are rounded to nearest (the "+ 0.5" form) except C4, which is
 * 16383, one below the exact value 16384.000000 (the "- 0.5" form).
 * NOTE(review): the reason for lowering C4 is not stated in this file. */
39 #define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
40 #define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
41 #define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42 #define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
43 #define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
44 #define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45 #define C6 8867 //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define C7 4520 //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* Right-shift amounts of the row and column passes. ROW_SHIFT is used below to
 * build the rounding constants in coeffs[]; the asm invocations pass the
 * literals 11 and 20 directly, so they must be kept in sync with these. */
48 #define ROW_SHIFT 11
49 #define COL_SHIFT 20 // 6
/* Word mask keeping the upper 16-bit word of each 32-bit lane. The DC_COND_*
 * macros AND it with the register annotated "R4 R0 r4 r0", which drops the
 * R0/r0 words before the "are all remaining coefficients zero?" test. */
51 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
/* Constant added (paddd) in the DC-only shortcut of DC_COND_IDCT, after the
 * "pslld $16" and before the "psrad $13"; only the low 32-bit lane is non-zero. */
52 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
/* Constant table for the MMX IDCT: two rounding rows followed by the cosine
 * factors arranged in pairs for pmaddwd. Each row below is 4 x int16 = 8 bytes.
 * The asm addresses rows by raw byte offset (e.g. "16(%2)"), so the row order
 * must not change. NOTE(review): presumably operand %2 of the asm statement is
 * this table -- the operand list is not visible here; confirm. */
54 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
55 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // byte offset 0: row-pass rounder (used as "paddd (%2)")
56 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
57 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
58 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // byte offset 8: rounder for the first row group (used as "paddd 8(%2)")
59 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
60 // 0, 0, 0, 0,
61 // 0, 0, 0, 0,
63 C4, C4, C4, C4, // byte offset 16
64 C4, -C4, C4, -C4, // byte offset 24
66 C2, C6, C2, C6, // byte offset 32
67 C6, -C2, C6, -C2, // byte offset 40
69 C1, C3, C1, C3, // byte offset 48
70 C5, C7, C5, C7, // byte offset 56
72 C3, -C7, C3, -C7, // byte offset 64
73 -C1, -C5, -C1, -C5, // byte offset 72
75 C5, -C1, C5, -C1, // byte offset 80
76 C7, C3, C7, C3, // byte offset 88
78 C7, -C5, C7, -C5, // byte offset 96
79 C3, -C1, C3, -C1 // byte offset 104
82 static inline void idct(int16_t *block)
84 DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
85 int16_t * const temp= (int16_t*)align_tmp;
87 __asm__ volatile(
88 #if 0 //Alternative, simpler variant
90 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
91 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
92 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
93 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
94 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
95 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
96 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
97 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
98 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
99 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
100 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
101 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
102 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
103 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
104 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
105 #rounder ", %%mm4 \n\t"\
106 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
107 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
108 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
109 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
110 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
111 #rounder ", %%mm0 \n\t"\
112 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
113 "paddd %%mm0, %%mm0 \n\t" \
114 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
115 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
116 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
117 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
118 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
119 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
120 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
121 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
122 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
123 "psrad $" #shift ", %%mm7 \n\t"\
124 "psrad $" #shift ", %%mm4 \n\t"\
125 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
126 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
127 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
128 "psrad $" #shift ", %%mm1 \n\t"\
129 "psrad $" #shift ", %%mm2 \n\t"\
130 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
131 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
132 "movq %%mm7, " #dst " \n\t"\
133 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
134 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
135 "movq %%mm2, 24+" #dst " \n\t"\
136 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
137 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
138 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
139 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
140 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
141 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
142 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
143 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
144 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
145 "psrad $" #shift ", %%mm2 \n\t"\
146 "psrad $" #shift ", %%mm0 \n\t"\
147 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
148 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
149 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
150 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
151 "psrad $" #shift ", %%mm6 \n\t"\
152 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
153 "movq %%mm2, 8+" #dst " \n\t"\
154 "psrad $" #shift ", %%mm4 \n\t"\
155 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
156 "movq %%mm4, 16+" #dst " \n\t"\
158 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
159 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
160 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
161 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
162 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
163 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
164 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
165 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
166 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
167 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
168 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
169 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
170 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
171 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
172 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
173 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
174 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
175 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
176 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
177 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
178 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
179 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
180 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
181 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
182 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
183 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
184 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
185 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
186 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
187 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
188 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
189 "psrad $" #shift ", %%mm7 \n\t"\
190 "psrad $" #shift ", %%mm4 \n\t"\
191 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
192 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
193 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
194 "psrad $" #shift ", %%mm0 \n\t"\
195 "psrad $" #shift ", %%mm2 \n\t"\
196 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
197 "movd %%mm7, " #dst " \n\t"\
198 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
199 "movd %%mm0, 16+" #dst " \n\t"\
200 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
201 "movd %%mm2, 96+" #dst " \n\t"\
202 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
203 "movd %%mm4, 112+" #dst " \n\t"\
204 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
205 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
206 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
207 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
208 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
209 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
210 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
211 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
212 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
213 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
214 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
215 "psrad $" #shift ", %%mm2 \n\t"\
216 "psrad $" #shift ", %%mm5 \n\t"\
217 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
218 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
219 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
220 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
221 "psrad $" #shift ", %%mm6 \n\t"\
222 "psrad $" #shift ", %%mm4 \n\t"\
223 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
224 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
225 "movd %%mm2, 32+" #dst " \n\t"\
226 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
227 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
228 "movd %%mm6, 48+" #dst " \n\t"\
229 "movd %%mm4, 64+" #dst " \n\t"\
230 "movd %%mm5, 80+" #dst " \n\t"\
233 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
234 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
235 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
236 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
237 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
238 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
239 "pand %%mm0, %%mm4 \n\t"\
240 "por %%mm1, %%mm4 \n\t"\
241 "por %%mm2, %%mm4 \n\t"\
242 "por %%mm3, %%mm4 \n\t"\
243 "packssdw %%mm4,%%mm4 \n\t"\
244 "movd %%mm4, %%eax \n\t"\
245 "orl %%eax, %%eax \n\t"\
246 "jz 1f \n\t"\
247 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
248 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
249 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
250 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
251 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
252 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
253 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
254 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
255 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
256 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
257 #rounder ", %%mm4 \n\t"\
258 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
259 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
260 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
261 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
262 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
263 #rounder ", %%mm0 \n\t"\
264 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
265 "paddd %%mm0, %%mm0 \n\t" \
266 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
267 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
268 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
269 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
270 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
271 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
272 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
273 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
274 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
275 "psrad $" #shift ", %%mm7 \n\t"\
276 "psrad $" #shift ", %%mm4 \n\t"\
277 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
278 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
279 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
280 "psrad $" #shift ", %%mm1 \n\t"\
281 "psrad $" #shift ", %%mm2 \n\t"\
282 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
283 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
284 "movq %%mm7, " #dst " \n\t"\
285 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
286 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
287 "movq %%mm2, 24+" #dst " \n\t"\
288 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
289 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
290 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
291 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
292 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
293 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
294 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
295 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
296 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
297 "psrad $" #shift ", %%mm2 \n\t"\
298 "psrad $" #shift ", %%mm0 \n\t"\
299 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
300 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
301 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
302 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
303 "psrad $" #shift ", %%mm6 \n\t"\
304 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
305 "movq %%mm2, 8+" #dst " \n\t"\
306 "psrad $" #shift ", %%mm4 \n\t"\
307 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
308 "movq %%mm4, 16+" #dst " \n\t"\
309 "jmp 2f \n\t"\
310 "1: \n\t"\
311 "pslld $16, %%mm0 \n\t"\
312 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
313 "psrad $13, %%mm0 \n\t"\
314 "packssdw %%mm0, %%mm0 \n\t"\
315 "movq %%mm0, " #dst " \n\t"\
316 "movq %%mm0, 8+" #dst " \n\t"\
317 "movq %%mm0, 16+" #dst " \n\t"\
318 "movq %%mm0, 24+" #dst " \n\t"\
319 "2: \n\t"
322 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
323 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
324 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
325 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
326 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
328 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
329 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
330 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
333 //IDCT( src0, src4, src1, src5, dst, shift)
334 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
335 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
336 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
337 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
339 #else
341 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
342 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
343 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
344 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
345 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
346 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
347 "pand %%mm0, %%mm4 \n\t"\
348 "por %%mm1, %%mm4 \n\t"\
349 "por %%mm2, %%mm4 \n\t"\
350 "por %%mm3, %%mm4 \n\t"\
351 "packssdw %%mm4,%%mm4 \n\t"\
352 "movd %%mm4, %%eax \n\t"\
353 "orl %%eax, %%eax \n\t"\
354 "jz 1f \n\t"\
355 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
356 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
357 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
358 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
359 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
360 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
361 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
362 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
363 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
364 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
365 #rounder ", %%mm4 \n\t"\
366 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
367 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
368 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
369 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
370 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
371 #rounder ", %%mm0 \n\t"\
372 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
373 "paddd %%mm0, %%mm0 \n\t" \
374 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
375 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
376 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
377 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
378 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
379 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
380 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
381 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
382 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
383 "psrad $" #shift ", %%mm7 \n\t"\
384 "psrad $" #shift ", %%mm4 \n\t"\
385 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
386 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
387 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
388 "psrad $" #shift ", %%mm1 \n\t"\
389 "psrad $" #shift ", %%mm2 \n\t"\
390 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
391 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
392 "movq %%mm7, " #dst " \n\t"\
393 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
394 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
395 "movq %%mm2, 24+" #dst " \n\t"\
396 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
397 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
398 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
399 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
400 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
401 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
402 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
403 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
404 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
405 "psrad $" #shift ", %%mm2 \n\t"\
406 "psrad $" #shift ", %%mm0 \n\t"\
407 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
408 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
409 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
410 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
411 "psrad $" #shift ", %%mm6 \n\t"\
412 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
413 "movq %%mm2, 8+" #dst " \n\t"\
414 "psrad $" #shift ", %%mm4 \n\t"\
415 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
416 "movq %%mm4, 16+" #dst " \n\t"\
417 "jmp 2f \n\t"\
418 "1: \n\t"\
419 "pslld $16, %%mm0 \n\t"\
420 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
421 "psrad $13, %%mm0 \n\t"\
422 "packssdw %%mm0, %%mm0 \n\t"\
423 "movq %%mm0, " #dst " \n\t"\
424 "movq %%mm0, 8+" #dst " \n\t"\
425 "movq %%mm0, 16+" #dst " \n\t"\
426 "movq %%mm0, 24+" #dst " \n\t"\
427 "2: \n\t"
429 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
430 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
431 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
432 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
433 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
434 "movq %%mm0, %%mm4 \n\t"\
435 "por %%mm1, %%mm4 \n\t"\
436 "por %%mm2, %%mm4 \n\t"\
437 "por %%mm3, %%mm4 \n\t"\
438 "packssdw %%mm4,%%mm4 \n\t"\
439 "movd %%mm4, %%eax \n\t"\
440 "orl %%eax, %%eax \n\t"\
441 "jz " #bt " \n\t"\
442 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
443 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
444 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
445 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
446 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
447 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
448 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
449 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
450 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
451 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
452 #rounder ", %%mm4 \n\t"\
453 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
454 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
455 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
456 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
457 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
458 #rounder ", %%mm0 \n\t"\
459 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
460 "paddd %%mm0, %%mm0 \n\t" \
461 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
462 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
463 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
464 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
465 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
466 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
467 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
468 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
469 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
470 "psrad $" #shift ", %%mm7 \n\t"\
471 "psrad $" #shift ", %%mm4 \n\t"\
472 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
473 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
474 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
475 "psrad $" #shift ", %%mm1 \n\t"\
476 "psrad $" #shift ", %%mm2 \n\t"\
477 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
478 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
479 "movq %%mm7, " #dst " \n\t"\
480 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
481 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
482 "movq %%mm2, 24+" #dst " \n\t"\
483 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
484 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
485 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
486 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
487 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
488 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
489 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
490 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
491 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
492 "psrad $" #shift ", %%mm2 \n\t"\
493 "psrad $" #shift ", %%mm0 \n\t"\
494 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
495 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
496 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
497 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
498 "psrad $" #shift ", %%mm6 \n\t"\
499 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
500 "movq %%mm2, 8+" #dst " \n\t"\
501 "psrad $" #shift ", %%mm4 \n\t"\
502 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
503 "movq %%mm4, 16+" #dst " \n\t"\
505 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
506 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
507 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
508 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
509 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
510 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
511 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
512 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
513 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
514 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
515 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
516 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
517 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
518 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
519 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
520 #rounder ", %%mm4 \n\t"\
521 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
522 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
523 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
524 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
525 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
526 #rounder ", %%mm0 \n\t"\
527 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
528 "paddd %%mm0, %%mm0 \n\t" \
529 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
530 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
531 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
532 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
533 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
534 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
535 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
536 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
537 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
538 "psrad $" #shift ", %%mm7 \n\t"\
539 "psrad $" #shift ", %%mm4 \n\t"\
540 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
541 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
542 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
543 "psrad $" #shift ", %%mm1 \n\t"\
544 "psrad $" #shift ", %%mm2 \n\t"\
545 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
546 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
547 "movq %%mm7, " #dst " \n\t"\
548 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
549 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
550 "movq %%mm2, 24+" #dst " \n\t"\
551 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
552 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
553 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
554 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
555 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
556 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
557 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
558 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
559 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
560 "psrad $" #shift ", %%mm2 \n\t"\
561 "psrad $" #shift ", %%mm0 \n\t"\
562 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
563 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
564 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
565 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
566 "psrad $" #shift ", %%mm6 \n\t"\
567 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
568 "movq %%mm2, 8+" #dst " \n\t"\
569 "psrad $" #shift ", %%mm4 \n\t"\
570 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
571 "movq %%mm4, 16+" #dst " \n\t"\
573 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
574 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
575 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
576 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
577 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
579 #undef IDCT
580 #define IDCT(src0, src4, src1, src5, dst, shift) \
581 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
582 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
583 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
584 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
585 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
586 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
587 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
588 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
589 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
590 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
591 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
592 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
593 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
594 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
595 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
596 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
597 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
598 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
599 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
600 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
601 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
602 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
603 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
604 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
605 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
606 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
607 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
608 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
609 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
610 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
611 "psrad $" #shift ", %%mm7 \n\t"\
612 "psrad $" #shift ", %%mm4 \n\t"\
613 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
614 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
615 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
616 "psrad $" #shift ", %%mm0 \n\t"\
617 "psrad $" #shift ", %%mm2 \n\t"\
618 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
619 "movd %%mm7, " #dst " \n\t"\
620 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
621 "movd %%mm0, 16+" #dst " \n\t"\
622 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
623 "movd %%mm2, 96+" #dst " \n\t"\
624 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
625 "movd %%mm4, 112+" #dst " \n\t"\
626 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
627 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
628 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
629 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
630 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
631 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
632 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
633 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
634 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
635 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
636 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
637 "psrad $" #shift ", %%mm2 \n\t"\
638 "psrad $" #shift ", %%mm5 \n\t"\
639 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
640 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
641 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
642 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
643 "psrad $" #shift ", %%mm6 \n\t"\
644 "psrad $" #shift ", %%mm4 \n\t"\
645 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
646 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
647 "movd %%mm2, 32+" #dst " \n\t"\
648 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
649 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
650 "movd %%mm6, 48+" #dst " \n\t"\
651 "movd %%mm4, 64+" #dst " \n\t"\
652 "movd %%mm5, 80+" #dst " \n\t"
655 //IDCT( src0, src4, src1, src5, dst, shift)
656 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
657 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
658 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
659 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
660 "jmp 9f \n\t"
662 "# .p2align 4 \n\t"\
663 "4: \n\t"
664 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
665 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
667 #undef IDCT
668 #define IDCT(src0, src4, src1, src5, dst, shift) \
669 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
670 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
671 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
672 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
673 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
674 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
675 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
676 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
677 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
678 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
679 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
680 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
681 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
682 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
683 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
684 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
685 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
686 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
687 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
688 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
689 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
690 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
691 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
692 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
693 "psrad $" #shift ", %%mm1 \n\t"\
694 "psrad $" #shift ", %%mm4 \n\t"\
695 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
696 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
697 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
698 "psrad $" #shift ", %%mm0 \n\t"\
699 "psrad $" #shift ", %%mm2 \n\t"\
700 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
701 "movd %%mm1, " #dst " \n\t"\
702 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
703 "movd %%mm0, 16+" #dst " \n\t"\
704 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
705 "movd %%mm2, 96+" #dst " \n\t"\
706 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
707 "movd %%mm4, 112+" #dst " \n\t"\
708 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
709 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
710 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
711 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
712 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
713 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
714 "psrad $" #shift ", %%mm2 \n\t"\
715 "psrad $" #shift ", %%mm5 \n\t"\
716 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
717 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
718 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
719 "psrad $" #shift ", %%mm6 \n\t"\
720 "psrad $" #shift ", %%mm1 \n\t"\
721 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
722 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
723 "movd %%mm2, 32+" #dst " \n\t"\
724 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
725 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
726 "movd %%mm6, 48+" #dst " \n\t"\
727 "movd %%mm1, 64+" #dst " \n\t"\
728 "movd %%mm5, 80+" #dst " \n\t"
730 //IDCT( src0, src4, src1, src5, dst, shift)
731 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
732 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
733 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
734 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
735 "jmp 9f \n\t"
737 "# .p2align 4 \n\t"\
738 "6: \n\t"
739 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
741 #undef IDCT
/*
 * IDCT variant for the code path that follows label 6.
 *
 * Only src0 and src5 are loaded; src4 and src1 are accepted for signature
 * compatibility with the other variants but never referenced. Presumably
 * those inputs are known to be zero on this path; the test lives in
 * Z_COND_IDCT, which is not defined in this part of the file.
 *
 * Even part: only the two C4 products of src0 are computed. With no src4
 * contribution, A3 is a plain copy of A0 and A2 a plain copy of A1.
 * Odd part: B0..B3 are each a single pmaddwd of src5 with the coefficient
 * words at 56/72/88/104(%2).
 *
 * Every 32-bit sum is shifted right arithmetically by `shift`, packed to
 * signed 16 bit with saturation and stored two values at a time (movd) at
 * dst + 16*k:
 *   k=0: A0+B0  k=1: A1+B1  k=2: A2+B2  k=3: A3+B3
 *   k=4: A3-B3  k=5: A2-B2  k=6: A1-B1  k=7: A0-B0
 * All eight MMX registers are used as scratch.
 */
742 #define IDCT(src0, src4, src1, src5, dst, shift) \
743 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
744 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
745 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
746 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
747 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
748 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
749 "movq %%mm4, %%mm6 \n\t" /* A3 = A0: C4R4+C4R0 C4r4+C4r0 */\
750 "movq %%mm0, %%mm5 \n\t" /* A2 = A1: -C4R4+C4R0 -C4r4+C4r0 */\
751 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
752 "pmaddwd %%mm3, %%mm1 \n\t" /* B0 = C7R7+C5R5 C7r7+C5r5 */\
753 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
754 "pmaddwd %%mm3, %%mm7 \n\t" /* B1 = -C5R7-C1R5 -C5r7-C1r5 */\
755 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
756 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
757 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
758 "psrad $" #shift ", %%mm1 \n\t"\
759 "psrad $" #shift ", %%mm4 \n\t"\
760 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
761 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
762 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
763 "psrad $" #shift ", %%mm0 \n\t"\
764 "psrad $" #shift ", %%mm2 \n\t"\
765 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
766 "movd %%mm1, " #dst " \n\t"\
767 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
768 "movd %%mm0, 16+" #dst " \n\t"\
769 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
770 "movd %%mm2, 96+" #dst " \n\t"\
771 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
772 "movd %%mm4, 112+" #dst " \n\t"\
773 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
774 "pmaddwd %%mm3, %%mm1 \n\t" /* B2 = C3R7+C7R5 C3r7+C7r5 */\
775 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
776 "pmaddwd 104(%2), %%mm3 \n\t" /* B3 = -C1R7+C3R5 -C1r7+C3r5 */\
777 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
778 "psubd %%mm1, %%mm5 \n\t" /* A2-B2 a2-b2 */\
779 "psrad $" #shift ", %%mm2 \n\t"\
780 "psrad $" #shift ", %%mm5 \n\t"\
781 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
782 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
783 "psubd %%mm3, %%mm1 \n\t" /* A3-B3 a3-b3 */\
784 "psrad $" #shift ", %%mm6 \n\t"\
785 "psrad $" #shift ", %%mm1 \n\t"\
786 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
787 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
788 "movd %%mm2, 32+" #dst " \n\t"\
789 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
790 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
791 "movd %%mm6, 48+" #dst " \n\t"\
792 "movd %%mm1, 64+" #dst " \n\t"\
793 "movd %%mm5, 80+" #dst " \n\t"
796 //IDCT( src0, src4, src1, src5, dst, shift)
797 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
798 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
799 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
800 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
801 "jmp 9f \n\t"
803 "# .p2align 4 \n\t"\
804 "2: \n\t"
805 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
807 #undef IDCT
/*
 * IDCT variant for the code path that follows label 2.
 *
 * Loads src0, src1 and src5; src4 is accepted but never referenced
 * (presumably known to be zero on this path; the test is in Z_COND_IDCT,
 * which is defined outside this part of the file).
 *
 * Even part: only the two C4 products of src0, so A3 is a copy of A0 and
 * A2 a copy of A1.
 * Odd part: each Bn is the sum of one pmaddwd on src1 and one on src5:
 *   B0 = 48(%2)*src1 + 56(%2)*src5     B1 = 64(%2)*src1 + 72(%2)*src5
 *   B2 = 80(%2)*src1 + 88(%2)*src5     B3 = 96(%2)*src1 + 104(%2)*src5
 * src1 is loaded a second time half-way through because the register that
 * held it (mm2) has been reused by then.
 *
 * Every 32-bit sum is shifted right arithmetically by `shift`, packed to
 * signed 16 bit with saturation and stored two values at a time (movd) at
 * dst + 16*k:
 *   k=0: A0+B0  k=1: A1+B1  k=2: A2+B2  k=3: A3+B3
 *   k=4: A3-B3  k=5: A2-B2  k=6: A1-B1  k=7: A0-B0
 * All eight MMX registers are used as scratch.
 */
808 #define IDCT(src0, src4, src1, src5, dst, shift) \
809 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
810 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
811 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
812 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
813 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
814 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
815 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
816 "movq %%mm4, %%mm6 \n\t" /* A3 = A0: C4R4+C4R0 C4r4+C4r0 */\
817 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
818 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
819 "movq %%mm0, %%mm5 \n\t" /* A2 = A1: -C4R4+C4R0 -C4r4+C4r0 */\
820 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
821 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
822 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
823 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
824 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
825 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
826 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
827 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
828 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
829 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
830 "psrad $" #shift ", %%mm7 \n\t"\
831 "psrad $" #shift ", %%mm4 \n\t"\
832 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
833 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
834 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
835 "psrad $" #shift ", %%mm0 \n\t"\
836 "psrad $" #shift ", %%mm2 \n\t"\
837 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
838 "movd %%mm7, " #dst " \n\t"\
839 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
840 "movd %%mm0, 16+" #dst " \n\t"\
841 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
842 "movd %%mm2, 96+" #dst " \n\t"\
843 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
844 "movd %%mm4, 112+" #dst " \n\t"\
845 "movq " #src1 ", %%mm0 \n\t" /* reload: R3 R1 r3 r1 */\
846 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
847 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
848 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
849 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
850 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
851 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
852 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
853 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
854 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
855 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
856 "psrad $" #shift ", %%mm2 \n\t"\
857 "psrad $" #shift ", %%mm5 \n\t"\
858 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
859 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
860 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
861 "psubd %%mm3, %%mm4 \n\t" /* A3-B3 a3-b3 */\
862 "psrad $" #shift ", %%mm6 \n\t"\
863 "psrad $" #shift ", %%mm4 \n\t"\
864 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
865 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
866 "movd %%mm2, 32+" #dst " \n\t"\
867 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
868 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
869 "movd %%mm6, 48+" #dst " \n\t"\
870 "movd %%mm4, 64+" #dst " \n\t"\
871 "movd %%mm5, 80+" #dst " \n\t"
873 //IDCT( src0, src4, src1, src5, dst, shift)
874 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
875 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
876 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
877 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
878 "jmp 9f \n\t"
880 "# .p2align 4 \n\t"\
881 "3: \n\t"
882 #undef IDCT
/*
 * IDCT variant for the code path that follows label 3.
 *
 * Loads src0 and src1 only; src4 and src5 are accepted but never
 * referenced (presumably known to be zero on this path; the deciding test
 * is in Z_COND_IDCT, which is defined outside this part of the file).
 *
 * Even part: only the two C4 products of src0, so A3 is a copy of A0 and
 * A2 a copy of A1.
 * Odd part: each Bn is a single pmaddwd of src1 with the coefficient words
 * at 48/64/80/96(%2).
 *
 * Every 32-bit sum is shifted right arithmetically by `shift`, packed to
 * signed 16 bit with saturation and stored two values at a time (movd) at
 * dst + 16*k:
 *   k=0: A0+B0  k=1: A1+B1  k=2: A2+B2  k=3: A3+B3
 *   k=4: A3-B3  k=5: A2-B2  k=6: A1-B1  k=7: A0-B0
 * All eight MMX registers are used as scratch.
 */
883 #define IDCT(src0, src4, src1, src5, dst, shift) \
884 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
885 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
886 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
887 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
888 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
889 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
890 "movq %%mm4, %%mm6 \n\t" /* A3 = A0: C4R4+C4R0 C4r4+C4r0 */\
891 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
892 "pmaddwd %%mm2, %%mm7 \n\t" /* B0 = C3R3+C1R1 C3r3+C1r1 */\
893 "movq %%mm0, %%mm5 \n\t" /* A2 = A1: -C4R4+C4R0 -C4r4+C4r0 */\
894 "movq 64(%2), %%mm3 \n\t"\
895 "pmaddwd %%mm2, %%mm3 \n\t" /* B1 = -C7R3+C3R1 -C7r3+C3r1 */\
896 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
897 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
898 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
899 "psrad $" #shift ", %%mm7 \n\t"\
900 "psrad $" #shift ", %%mm4 \n\t"\
901 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
902 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
903 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
904 "psrad $" #shift ", %%mm0 \n\t"\
905 "psrad $" #shift ", %%mm1 \n\t"\
906 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
907 "movd %%mm7, " #dst " \n\t"\
908 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
909 "movd %%mm0, 16+" #dst " \n\t"\
910 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
911 "movd %%mm1, 96+" #dst " \n\t"\
912 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
913 "movd %%mm4, 112+" #dst " \n\t"\
914 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
915 "pmaddwd %%mm2, %%mm4 \n\t" /* B2 = -C1R3+C5R1 -C1r3+C5r1 */\
916 "pmaddwd 96(%2), %%mm2 \n\t" /* B3 = -C5R3+C7R1 -C5r3+C7r1 */\
917 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
918 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
919 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
920 "psrad $" #shift ", %%mm1 \n\t"\
921 "psrad $" #shift ", %%mm5 \n\t"\
922 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
923 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
924 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
925 "psrad $" #shift ", %%mm6 \n\t"\
926 "psrad $" #shift ", %%mm4 \n\t"\
927 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
928 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
929 "movd %%mm1, 32+" #dst " \n\t"\
930 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
931 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
932 "movd %%mm6, 48+" #dst " \n\t"\
933 "movd %%mm4, 64+" #dst " \n\t"\
934 "movd %%mm5, 80+" #dst " \n\t"
937 //IDCT( src0, src4, src1, src5, dst, shift)
938 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
939 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
940 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
941 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
942 "jmp 9f \n\t"
944 "# .p2align 4 \n\t"\
945 "5: \n\t"
946 #undef IDCT
/*
 * IDCT variant for the code path that follows label 5.
 *
 * Loads src0 and src4, plus the 8 bytes that follow each of them
 * (8+src0, 8+src4); src1 and src5 are accepted but never referenced, so
 * there is no odd part at all (presumably those inputs are known to be
 * zero on this path).
 *
 * For each of the two 8-byte groups the full even part is computed:
 *   A0 = C4-sum + (C6/C2 product)     A3 = C4-sum  - (C6/C2 product)
 *   A1 = C4-diff + (-C2/C6 product)   A2 = C4-diff - (-C2/C6 product)
 * After the arithmetic right shift by `shift`, the results of the two
 * groups are packed together (signed 16-bit saturation), so each movq
 * stores four values and one invocation covers twice as many columns as
 * the movd-based variants. The call sites below therefore invoke this
 * macro only for offsets 0 and 8.
 *
 * Without B terms the output is mirrored:
 *   dst, 112+dst: A0    16+dst, 96+dst: A1
 *   32+dst, 80+dst: A2  48+dst, 64+dst: A3
 * All eight MMX registers are used as scratch.
 */
947 #define IDCT(src0, src4, src1, src5, dst, shift) \
948 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
949 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
950 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
951 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
952 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
953 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
954 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
955 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
956 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
957 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
958 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
959 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
960 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
961 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
962 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
963 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
964 "movq 8+" #src0 ", %%mm2 \n\t" /* second group: R4 R0 r4 r0 */\
965 "movq 8+" #src4 ", %%mm3 \n\t" /* second group: R6 R2 r6 r2 */\
966 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
967 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
968 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
969 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
970 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
971 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
972 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
973 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
974 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
975 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
976 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
977 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
978 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
979 "psrad $" #shift ", %%mm4 \n\t"\
980 "psrad $" #shift ", %%mm7 \n\t"\
981 "psrad $" #shift ", %%mm3 \n\t"\
982 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
983 "movq %%mm4, " #dst " \n\t"\
984 "psrad $" #shift ", %%mm0 \n\t"\
985 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
986 "movq %%mm0, 16+" #dst " \n\t"\
987 "movq %%mm0, 96+" #dst " \n\t"\
988 "movq %%mm4, 112+" #dst " \n\t"\
989 "psrad $" #shift ", %%mm5 \n\t"\
990 "psrad $" #shift ", %%mm6 \n\t"\
991 "psrad $" #shift ", %%mm2 \n\t"\
992 "packssdw %%mm2, %%mm5 \n\t" /* A2 a2 (no B term in this variant) */\
993 "movq %%mm5, 32+" #dst " \n\t"\
994 "psrad $" #shift ", %%mm1 \n\t"\
995 "packssdw %%mm1, %%mm6 \n\t" /* A3 a3 (no B term in this variant) */\
996 "movq %%mm6, 48+" #dst " \n\t"\
997 "movq %%mm6, 64+" #dst " \n\t"\
998 "movq %%mm5, 80+" #dst " \n\t"
1001 //IDCT( src0, src4, src1, src5, dst, shift)
1002 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1003 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1004 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1005 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1006 "jmp 9f \n\t"
1009 "# .p2align 4 \n\t"\
1010 "1: \n\t"
1011 #undef IDCT
/*
 * IDCT variant for the code path that follows label 1.
 *
 * Loads src0, src4 and src1; src5 is accepted but never referenced
 * (presumably known to be zero on this path; the deciding test is made
 * outside this part of the file).
 *
 * Even part (full): the C4 products of src0 are combined with the
 * products of src4 against 32(%2) and 40(%2) to give A0..A3.
 * Odd part: each Bn is a single pmaddwd of src1 with the coefficient words
 * at 48/64/80/96(%2).
 *
 * Every 32-bit sum is shifted right arithmetically by `shift`, packed to
 * signed 16 bit with saturation and stored two values at a time (movd) at
 * dst + 16*k:
 *   k=0: A0+B0  k=1: A1+B1  k=2: A2+B2  k=3: A3+B3
 *   k=4: A3-B3  k=5: A2-B2  k=6: A1-B1  k=7: A0-B0
 * All eight MMX registers are used as scratch.
 */
1012 #define IDCT(src0, src4, src1, src5, dst, shift) \
1013 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1014 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1015 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1016 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1017 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1018 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1019 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1020 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1021 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1022 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1023 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1024 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1025 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1026 "pmaddwd %%mm2, %%mm7 \n\t" /* B0 = C3R3+C1R1 C3r3+C1r1 */\
1027 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1028 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1029 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1030 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1031 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1032 "movq 64(%2), %%mm1 \n\t"\
1033 "pmaddwd %%mm2, %%mm1 \n\t" /* B1 = -C7R3+C3R1 -C7r3+C3r1 */\
1034 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1035 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1036 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1037 "psrad $" #shift ", %%mm7 \n\t"\
1038 "psrad $" #shift ", %%mm4 \n\t"\
1039 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1040 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1041 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1042 "psrad $" #shift ", %%mm0 \n\t"\
1043 "psrad $" #shift ", %%mm3 \n\t"\
1044 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1045 "movd %%mm7, " #dst " \n\t"\
1046 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1047 "movd %%mm0, 16+" #dst " \n\t"\
1048 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1049 "movd %%mm3, 96+" #dst " \n\t"\
1050 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1051 "movd %%mm4, 112+" #dst " \n\t"\
1052 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1053 "pmaddwd %%mm2, %%mm4 \n\t" /* B2 = -C1R3+C5R1 -C1r3+C5r1 */\
1054 "pmaddwd 96(%2), %%mm2 \n\t" /* B3 = -C5R3+C7R1 -C5r3+C7r1 */\
1055 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1056 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1057 "psubd %%mm4, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1058 "psrad $" #shift ", %%mm3 \n\t"\
1059 "psrad $" #shift ", %%mm5 \n\t"\
1060 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1061 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1062 "psubd %%mm2, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1063 "psrad $" #shift ", %%mm6 \n\t"\
1064 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1065 "movd %%mm3, 32+" #dst " \n\t"\
1066 "psrad $" #shift ", %%mm4 \n\t"\
1067 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1068 "movd %%mm6, 48+" #dst " \n\t"\
1069 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1070 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1071 "movd %%mm4, 64+" #dst " \n\t"\
1072 "movd %%mm5, 80+" #dst " \n\t"
1075 //IDCT( src0, src4, src1, src5, dst, shift)
1076 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1077 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1078 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1079 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1080 "jmp 9f \n\t"
1083 "# .p2align 4 \n\t"
1084 "7: \n\t"
1085 #undef IDCT
/*
 * IDCT variant for the code path that follows label 7.
 *
 * Loads only src0 and the 8 bytes after it (8+src0); src4, src1 and src5
 * are accepted but never referenced (presumably all known to be zero on
 * this path). Only the two C4 products are formed per group:
 *   P = src0 * 16(%2)   (comment notation: C4R4+C4R0)
 *   Q = src0 * 24(%2)   (comment notation: -C4R4+C4R0)
 * Both are shifted right arithmetically by `shift`; the results of the two
 * groups are packed together (signed 16-bit saturation) and each movq
 * stores four values:
 *   P -> dst, 48+dst, 64+dst, 112+dst
 *   Q -> 16+dst, 32+dst, 80+dst, 96+dst
 * The call sites below invoke this macro only for offsets 0 and 8.
 *
 * NOTE(review): the load of 32(%2) into mm7 is never read afterwards
 * within this macro; it looks like a leftover from the variant that also
 * handles src4. Left as is.
 */
1086 #define IDCT(src0, src4, src1, src5, dst, shift) \
1087 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1088 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1089 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1090 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1091 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1092 "psrad $" #shift ", %%mm4 \n\t"\
1093 "psrad $" #shift ", %%mm0 \n\t"\
1094 "movq 8+" #src0 ", %%mm2 \n\t" /* second group: R4 R0 r4 r0 */\
1095 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1096 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1097 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1098 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1099 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 (value not used below) */\
1100 "psrad $" #shift ", %%mm1 \n\t"\
1101 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1102 "movq %%mm4, " #dst " \n\t"\
1103 "psrad $" #shift ", %%mm2 \n\t"\
1104 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1105 "movq %%mm0, 16+" #dst " \n\t"\
1106 "movq %%mm0, 96+" #dst " \n\t"\
1107 "movq %%mm4, 112+" #dst " \n\t"\
1108 "movq %%mm0, 32+" #dst " \n\t"\
1109 "movq %%mm4, 48+" #dst " \n\t"\
1110 "movq %%mm4, 64+" #dst " \n\t"\
1111 "movq %%mm0, 80+" #dst " \n\t"
1113 //IDCT( src0, src4, src1, src5, dst, shift)
1114 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1115 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1116 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1117 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1120 #endif
1123 Input
1124 00 40 04 44 20 60 24 64
1125 10 30 14 34 50 70 54 74
1126 01 41 03 43 21 61 23 63
1127 11 31 13 33 51 71 53 73
1128 02 42 06 46 22 62 26 66
1129 12 32 16 36 52 72 56 76
1130 05 45 07 47 25 65 27 67
1131 15 35 17 37 55 75 57 77
1133 Temp
1134 00 04 10 14 20 24 30 34
1135 40 44 50 54 60 64 70 74
1136 01 03 11 13 21 23 31 33
1137 41 43 51 53 61 63 71 73
1138 02 06 12 16 22 26 32 36
1139 42 46 52 56 62 66 72 76
1140 05 07 15 17 25 27 35 37
1141 45 47 55 57 65 67 75 77
1144 "9: \n\t"
1145 :: "r" (block), "r" (temp), "r" (coeffs)
1146 : "%eax"
/**
 * Run the MMX simple IDCT on one block of 16-bit coefficients.
 *
 * Thin public wrapper around the file-local idct(). The asm in idct()
 * takes block as its %0 operand and stores its final results through it,
 * so the transform result replaces the contents of block.
 *
 * @param block coefficients in, transformed values out
 *              (presumably 8x8 = 64 int16_t; confirm against callers)
 */
1150 void ff_simple_idct_mmx(int16_t *block)
1152 idct(block);
1155 //FIXME merge add/put into the idct
/**
 * Transform block with idct(), then hand the result to
 * ff_put_pixels_clamped_mmx() together with dest and line_size.
 *
 * ff_put_pixels_clamped_mmx() is declared elsewhere; going by its name it
 * presumably clamps each value to the 8-bit pixel range and overwrites
 * dest with it (confirm in dsputil_mmx.h).
 *
 * @param dest      destination pixel buffer
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between rows of dest
 * @param block     coefficient block; overwritten by idct()
 */
1157 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
1159 idct(block);
1160 ff_put_pixels_clamped_mmx(block, dest, line_size);
/**
 * Transform block with idct(), then hand the result to
 * ff_add_pixels_clamped_mmx() together with dest and line_size.
 *
 * ff_add_pixels_clamped_mmx() is declared elsewhere; going by its name it
 * presumably adds each value to the existing pixel in dest and clamps the
 * sum to the 8-bit pixel range (confirm in dsputil_mmx.h).
 *
 * @param dest      pixel buffer that is read and updated
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between rows of dest
 * @param block     coefficient block; overwritten by idct()
 */
1162 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
1164 idct(block);
1165 ff_add_pixels_clamped_mmx(block, dest, line_size);
1168 #endif /* HAVE_INLINE_ASM */