2 * Copyright (C) 2004 the ffmpeg project
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * MMX-optimized functions cribbed from the original VP3 source code.
24 #include "../dsputil.h"
27 #define IdctAdjustBeforeShift 8
29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30 * idct_constants[0..15] = Mask table (M(I))
31 * idct_constants[16..43] = Cosine table (C(I))
32 * idct_constants[44..47] = 8
34 static uint16_t idct_constants
[(4 + 7 + 1) * 4];
35 static const uint16_t idct_cosine_table
[7] = {
36 64277, 60547, 54491, 46341, 36410, 25080, 12785
48 /* from original comments: The Macro does IDct on 4 1-D Dcts */
49 #define BeginIDCT() { \
50 movq_m2r(*I(3), r2); \
51 movq_m2r(*C(3), r6); \
53 movq_m2r(*J(5), r7); \
54 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
55 movq_m2r(*C(5), r1); \
56 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
58 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
59 movq_m2r(*I(1), r3); \
60 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
61 movq_m2r(*C(1), r0); /* (all registers are in use) */ \
62 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
63 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
64 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
65 movq_m2r(*J(7), r1); \
66 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
67 movq_r2r(r0, r5); /* r5 = c1 */ \
68 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
69 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
70 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
71 movq_m2r(*C(7), r7); \
72 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
73 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
74 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
75 movq_m2r(*I(2), r2); \
76 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
77 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
78 movq_r2r(r2, r1); /* r1 = i2 */ \
79 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
80 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
81 movq_m2r(*J(6), r5); \
82 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
83 movq_r2r(r5, r7); /* r7 = i6 */ \
84 psubsw_r2r(r4, r0); /* r0 = A - C */ \
85 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
86 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
87 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
88 paddsw_r2r(r4, r4); /* r4 = C + C */ \
89 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
90 psubsw_r2r(r6, r3); /* r3 = B - D */ \
91 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
92 paddsw_r2r(r6, r6); /* r6 = D + D */ \
93 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
94 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
95 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
96 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
97 movq_m2r(*C(4), r4); \
98 movq_r2r(r3, r5); /* r5 = B - D */ \
99 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
100 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
101 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
102 movq_r2r(r0, r2); /* r2 = A - C */ \
103 movq_m2r(*I(0), r6); \
104 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
105 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
106 movq_m2r(*J(4), r3); \
107 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
108 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
109 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
111 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
112 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
113 paddsw_r2r(r1, r1); /* r1 = H + H */ \
114 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
115 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
116 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
117 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
118 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
119 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
120 movq_m2r(*I(1), r0); /* r0 = C. */ \
121 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
122 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
123 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
126 /* RowIDCT gets ready to transpose */
127 #define RowIDCT() { \
131 movq_m2r(*I(2), r3); /* r3 = D. */ \
132 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
133 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
134 paddsw_r2r(r7, r7); /* r7 = G + G */ \
135 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
136 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
137 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
138 paddsw_r2r(r3, r3); \
139 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
140 paddsw_r2r(r5, r5); \
141 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
142 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
143 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
144 paddsw_r2r(r0, r0); \
145 movq_r2m(r1, *I(1)); /* save R1 */ \
146 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
149 /* Column IDCT normalizes and stores final results */
150 #define ColumnIDCT() { \
154 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
155 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
156 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
157 psraw_i2r(4, r2); /* r2 = NR2 */ \
158 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
159 psraw_i2r(4, r1); /* r1 = NR1 */ \
160 movq_m2r(*I(2), r3); /* r3 = D. */ \
161 paddsw_r2r(r7, r7); /* r7 = G + G */ \
162 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
163 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
164 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
165 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
166 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
167 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
168 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
169 psraw_i2r(4, r4); /* r4 = NR4 */ \
170 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
171 psraw_i2r(4, r3); /* r3 = NR3 */ \
172 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
173 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
174 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
175 psraw_i2r(4, r6); /* r6 = NR6 */ \
176 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
177 psraw_i2r(4, r5); /* r5 = NR5 */ \
178 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
179 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
180 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
181 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
182 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
183 psraw_i2r(4, r7); /* r7 = NR7 */ \
184 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
185 psraw_i2r(4, r0); /* r0 = NR0 */ \
186 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
187 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
188 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
191 /* Following macro does two 4x4 transposes in place.
193 At entry (we assume):
217 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
218 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
220 Since r1 is free at entry, we calculate the Js first. */
222 #define Transpose() { \
223 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
224 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
225 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
226 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
227 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
228 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
229 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
230 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
231 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
232 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
233 movq_r2m(r4, *J(4)); \
234 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
235 movq_r2m(r5, *J(5)); \
236 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
237 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
238 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
239 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
240 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
241 movq_r2m(r6, *J(7)); \
242 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
243 movq_r2m(r1, *J(6)); \
244 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
245 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
246 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
247 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
248 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
249 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
250 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
251 movq_r2m(r0, *I(0)); \
252 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
253 movq_r2m(r1, *I(1)); \
254 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
255 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
256 movq_r2m(r4, *I(3)); \
257 movq_r2m(r2, *I(2)); \
260 void ff_vp3_dsp_init_mmx(void)
267 p
= idct_constants
+ ((j
+ 3) << 2);
268 p
[0] = p
[1] = p
[2] = p
[3] = idct_cosine_table
[j
- 1];
271 idct_constants
[44] = idct_constants
[45] =
272 idct_constants
[46] = idct_constants
[47] = IdctAdjustBeforeShift
;
275 void ff_vp3_idct_mmx(int16_t *output_data
)
277 /* eax = quantized input
278 * ebx = dequantizer matrix
279 * ecx = IDCT constants
280 * M(I) = ecx + MaskOffset(0) + I * 8
281 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
286 #define C(x) (idct_constants + 16 + (x - 1) * 4)
287 #define Eight (idct_constants + 44)
289 /* at this point, function has completed dequantization + dezigzag +
290 * partial transposition; now do the idct itself */
291 #define I(K) (output_data + K * 8)
292 #define J(K) (output_data + ((K - 4) * 8) + 4)
299 #define I(K) (output_data + (K * 8) + 32)
300 #define J(K) (output_data + ((K - 4) * 8) + 36)
307 #define I(K) (output_data + K * 8)
308 #define J(K) (output_data + K * 8)
314 #define I(K) (output_data + (K * 8) + 4)
315 #define J(K) (output_data + (K * 8) + 4)