fix dependency generation
[FFMpeg-mirror/DVCPRO-HD.git] / libavcodec / i386 / vp3dsp_mmx.c
blob00a6d272078ae9ff0b149b6f6e0d0ee719fb4f13
1 /*
2 * Copyright (C) 2004 the ffmpeg project
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 /**
22 * @file vp3dsp_mmx.c
23 * MMX-optimized functions cribbed from the original VP3 source code.
26 #include "libavcodec/dsputil.h"
27 #include "mmx.h"
29 #define IdctAdjustBeforeShift 8
31 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
32 * idct_constants[0..15] = Mask table (M(I))
33 * idct_constants[16..43] = Cosine table (C(I))
34 * idct_constants[44..47] = 8
36 static uint16_t idct_constants[(4 + 7 + 1) * 4];
37 static const uint16_t idct_cosine_table[7] = {
38 64277, 60547, 54491, 46341, 36410, 25080, 12785
41 #define r0 mm0
42 #define r1 mm1
43 #define r2 mm2
44 #define r3 mm3
45 #define r4 mm4
46 #define r5 mm5
47 #define r6 mm6
48 #define r7 mm7
50 /* from original comments: The Macro does IDct on 4 1-D Dcts */
51 #define BeginIDCT() { \
52 movq_m2r(*I(3), r2); \
53 movq_m2r(*C(3), r6); \
54 movq_r2r(r2, r4); \
55 movq_m2r(*J(5), r7); \
56 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
57 movq_m2r(*C(5), r1); \
58 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
59 movq_r2r(r1, r5); \
60 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
61 movq_m2r(*I(1), r3); \
62 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
63 movq_m2r(*C(1), r0); /* (all registers are in use) */ \
64 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
65 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
66 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
67 movq_m2r(*J(7), r1); \
68 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
69 movq_r2r(r0, r5); /* r5 = c1 */ \
70 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
71 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
72 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
73 movq_m2r(*C(7), r7); \
74 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
75 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
76 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
77 movq_m2r(*I(2), r2); \
78 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
79 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
80 movq_r2r(r2, r1); /* r1 = i2 */ \
81 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
82 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
83 movq_m2r(*J(6), r5); \
84 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
85 movq_r2r(r5, r7); /* r7 = i6 */ \
86 psubsw_r2r(r4, r0); /* r0 = A - C */ \
87 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
88 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
89 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
90 paddsw_r2r(r4, r4); /* r4 = C + C */ \
91 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
92 psubsw_r2r(r6, r3); /* r3 = B - D */ \
93 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
94 paddsw_r2r(r6, r6); /* r6 = D + D */ \
95 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
96 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
97 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
98 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
99 movq_m2r(*C(4), r4); \
100 movq_r2r(r3, r5); /* r5 = B - D */ \
101 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
102 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
103 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
104 movq_r2r(r0, r2); /* r2 = A - C */ \
105 movq_m2r(*I(0), r6); \
106 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
107 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
108 movq_m2r(*J(4), r3); \
109 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
110 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
111 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
112 movq_r2r(r6, r0); \
113 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
114 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
115 paddsw_r2r(r1, r1); /* r1 = H + H */ \
116 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
117 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
118 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
119 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
120 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
121 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
122 movq_m2r(*I(1), r0); /* r0 = C. */ \
123 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
124 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
125 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
128 /* RowIDCT gets ready to transpose */
129 #define RowIDCT() { \
131 BeginIDCT(); \
133 movq_m2r(*I(2), r3); /* r3 = D. */ \
134 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
135 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
136 paddsw_r2r(r7, r7); /* r7 = G + G */ \
137 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
138 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
139 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
140 paddsw_r2r(r3, r3); \
141 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
142 paddsw_r2r(r5, r5); \
143 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
144 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
145 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
146 paddsw_r2r(r0, r0); \
147 movq_r2m(r1, *I(1)); /* save R1 */ \
148 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
151 /* Column IDCT normalizes and stores final results */
152 #define ColumnIDCT() { \
154 BeginIDCT(); \
156 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
157 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
158 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
159 psraw_i2r(4, r2); /* r2 = NR2 */ \
160 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
161 psraw_i2r(4, r1); /* r1 = NR1 */ \
162 movq_m2r(*I(2), r3); /* r3 = D. */ \
163 paddsw_r2r(r7, r7); /* r7 = G + G */ \
164 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
165 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
166 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
167 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
168 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
169 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
170 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
171 psraw_i2r(4, r4); /* r4 = NR4 */ \
172 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
173 psraw_i2r(4, r3); /* r3 = NR3 */ \
174 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
175 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
176 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
177 psraw_i2r(4, r6); /* r6 = NR6 */ \
178 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
179 psraw_i2r(4, r5); /* r5 = NR5 */ \
180 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
181 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
182 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
183 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
184 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
185 psraw_i2r(4, r7); /* r7 = NR7 */ \
186 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
187 psraw_i2r(4, r0); /* r0 = NR0 */ \
188 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
189 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
190 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
193 /* Following macro does two 4x4 transposes in place.
195 At entry (we assume):
197 r0 = a3 a2 a1 a0
198 I(1) = b3 b2 b1 b0
199 r2 = c3 c2 c1 c0
200 r3 = d3 d2 d1 d0
202 r4 = e3 e2 e1 e0
203 r5 = f3 f2 f1 f0
204 r6 = g3 g2 g1 g0
205 r7 = h3 h2 h1 h0
207 At exit, we have:
209 I(0) = d0 c0 b0 a0
210 I(1) = d1 c1 b1 a1
211 I(2) = d2 c2 b2 a2
212 I(3) = d3 c3 b3 a3
214 J(4) = h0 g0 f0 e0
215 J(5) = h1 g1 f1 e1
216 J(6) = h2 g2 f2 e2
217 J(7) = h3 g3 f3 e3
219 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
220 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
222 Since r1 is free at entry, we calculate the Js first. */
224 #define Transpose() { \
225 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
226 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
227 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
228 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
229 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
230 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
231 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
232 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
233 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
234 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
235 movq_r2m(r4, *J(4)); \
236 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
237 movq_r2m(r5, *J(5)); \
238 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
239 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
240 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
241 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
242 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
243 movq_r2m(r6, *J(7)); \
244 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
245 movq_r2m(r1, *J(6)); \
246 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
247 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
248 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
249 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
250 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
251 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
252 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
253 movq_r2m(r0, *I(0)); \
254 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
255 movq_r2m(r1, *I(1)); \
256 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
257 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
258 movq_r2m(r4, *I(3)); \
259 movq_r2m(r2, *I(2)); \
262 void ff_vp3_dsp_init_mmx(void)
264 int j = 16;
265 uint16_t *p;
267 j = 1;
268 do {
269 p = idct_constants + ((j + 3) << 2);
270 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
271 } while (++j <= 7);
273 idct_constants[44] = idct_constants[45] =
274 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
277 void ff_vp3_idct_mmx(int16_t *output_data)
279 /* eax = quantized input
280 * ebx = dequantizer matrix
281 * ecx = IDCT constants
282 * M(I) = ecx + MaskOffset(0) + I * 8
283 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
284 * edx = output
285 * r0..r7 = mm0..mm7
288 #define C(x) (idct_constants + 16 + (x - 1) * 4)
289 #define Eight (idct_constants + 44)
291 /* at this point, function has completed dequantization + dezigzag +
292 * partial transposition; now do the idct itself */
293 #define I(K) (output_data + K * 8)
294 #define J(K) (output_data + ((K - 4) * 8) + 4)
296 RowIDCT();
297 Transpose();
299 #undef I
300 #undef J
301 #define I(K) (output_data + (K * 8) + 32)
302 #define J(K) (output_data + ((K - 4) * 8) + 36)
304 RowIDCT();
305 Transpose();
307 #undef I
308 #undef J
309 #define I(K) (output_data + K * 8)
310 #define J(K) (output_data + K * 8)
312 ColumnIDCT();
314 #undef I
315 #undef J
316 #define I(K) (output_data + (K * 8) + 4)
317 #define J(K) (output_data + (K * 8) + 4)
319 ColumnIDCT();
321 #undef I
322 #undef J
326 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
328 ff_vp3_idct_mmx(block);
329 put_signed_pixels_clamped_mmx(block, dest, line_size);
332 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
334 ff_vp3_idct_mmx(block);
335 add_pixels_clamped_mmx(block, dest, line_size);