;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785
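
; These seven rows are the IDCT cosine constants C(1)..C(7): cos(k*pi/16) in
; 16.16 fixed point (64277 ~ 0.98079*65536, 46341 ~ sqrt(0.5)*65536,
; 12785 ~ 0.19509*65536).  A table like this could be regenerated with a small
; C program along these lines (illustration only, not part of the build):
;
;     #include <math.h>
;     #include <stdio.h>
;     int main(void) {
;         for (int k = 1; k <= 7; k++)   /* C(1)..C(7) */
;             printf("times 8 dw %d\n",
;                    (int)(cos(k * M_PI / 16.0) * 65536.0 + 0.5));
;         return 0;
;     }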

cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
cextern pb_80
cextern pb_81

cextern pw_8

SECTION .text

; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
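;
; Rough scalar model of what the macro computes (cf. the C loop filter in
; libavcodec/vp3dsp.c; clamp_to_flim/av_clip_uint8 below are illustrative
; names, and the pavgb-based code differs slightly in rounding, see the
; "off by one or two" note above):
;
;     d  = (p0 - p3 + 3 * (p2 - p1) + 4) >> 3;
;     d  = clamp_to_flim(d);              /* |d| limited by the filter limit */
;     p1 = av_clip_uint8(p1 + d);
;     p2 = av_clip_uint8(p2 - d);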
%macro VP3_LOOP_FILTER 0
    movq      m7, m6
    pand      m6, [pb_7]     ; p0&7
    psrlw     m7, 3
    pand      m7, [pb_1F]    ; p0>>3
    movq      m3, m2         ; p2
    pxor      m2, m4
    pand      m2, [pb_1]     ; (p2^p1)&1
    movq      m5, m2
    paddb     m2, m2
    paddb     m2, m5         ; 3*(p2^p1)&1
    paddb     m2, m6         ; extra bits lost in shifts
    pcmpeqb   m0, m0
    pxor      m1, m0         ; 255 - p3
    pavgb     m1, m2         ; (256 - p3 + extrabits) >> 1
    pxor      m0, m4         ; 255 - p1
    pavgb     m0, m3         ; (256 + p2-p1) >> 1
    paddb     m1, [pb_3]
    pavgb     m1, m0         ; 128+2+(  p2-p1 - p3) >> 2
    pavgb     m1, m0         ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb   m7, m1         ; d+128+1
    movq      m6, [pb_81]
    psubusb   m6, m7
    psubusb   m7, [pb_81]

    movq      m5, [r2+516]   ; flim
    pminub    m6, m5
    pminub    m7, m5
    movq      m0, m6
    movq      m1, m7
    paddb     m6, m6
    paddb     m7, m7
    pminub    m6, m5
    pminub    m7, m5
    psubb     m6, m0
    psubb     m7, m1
    paddusb   m4, m7
    psubusb   m4, m6
    psubusb   m3, m7
    paddusb   m3, m6
%endmacro

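; STORE_4_WORDS scatters the four words of %1 across four consecutive rows,
; two bytes per row starting at column -1, i.e. it writes back the filtered
; pixel pair straddling the vertical block edge (r1 = stride, r3 = 3*stride,
; r2 is clobbered as scratch).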
%macro STORE_4_WORDS 1
    movd      r2d, %1
    mov       [r0     -1], r2w
    psrlq     %1, 32
    shr       r2, 16
    mov       [r0+r1  -1], r2w
    movd      r2d, %1
    mov       [r0+r1*2-1], r2w
    shr       r2, 16
    mov       [r0+r3  -1], r2w
%endmacro
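
; Both loop-filter entry points below follow the vp3dsp loop-filter signature
; (roughly: uint8_t *src in r0, int stride in r1, int *bounding_values in r2);
; only the filter limit, read from [r2+516] inside VP3_LOOP_FILTER, is
; actually used from the bounding-values table here.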
INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd    r1, r1d
%endif
    mov       r3, r1
    neg       r1
    movq      m6, [r0+r1*2]
    movq      m4, [r0+r1  ]
    movq      m2, [r0     ]
    movq      m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq      [r0+r1], m4
    movq      [r0   ], m3
    RET

cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd    r1, r1d
%endif
    lea       r3, [r1*3]

    movd      m6, [r0     -2]
    movd      m4, [r0+r1  -2]
    movd      m2, [r0+r1*2-2]
    movd      m1, [r0+r3  -2]
    lea       r0, [r0+r1*4]
    punpcklbw m6, [r0     -2]
    punpcklbw m4, [r0+r1  -2]
    punpcklbw m2, [r0+r1*2-2]
    punpcklbw m1, [r0+r3  -2]
    sub       r0, r3
    sub       r0, r1

    TRANSPOSE4x4B 6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY bw, 4, 3, 5

    STORE_4_WORDS m4
    lea       r0, [r0+r1*4]
    STORE_4_WORDS m3
    RET

; from original comments: The Macro does IDct on 4 1-D Dcts
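;
; Note on the fixed-point multiplies below: pmulhw computes (a*b) >> 16 with
; both operands signed.  The constants C(1)..C(5) are >= 0.5 and do not fit in
; a signed word, so they are effectively read back as c - 65536; pmulhw then
; yields (c*x >> 16) - x and the following paddw adds x back in, which is why
; comments like "c3*i3 - i3" are followed by "c3*i3".  C(6) and C(7) are below
; 0.5 and are used directly.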
%macro BeginIDCT 0
    movq      m2, I(3)
    movq      m6, C(3)
    movq      m4, m2
    movq      m7, J(5)
    pmulhw    m4, m6         ; r4 = c3*i3 - i3
    movq      m1, C(5)
    pmulhw    m6, m7         ; r6 = c3*i5 - i5
    movq      m5, m1
    pmulhw    m1, m2         ; r1 = c5*i3 - i3
    movq      m3, I(1)
    pmulhw    m5, m7         ; r5 = c5*i5 - i5
    movq      m0, C(1)
    paddw     m4, m2         ; r4 = c3*i3
    paddw     m6, m7         ; r6 = c3*i5
    paddw     m2, m1         ; r2 = c5*i3
    movq      m1, J(7)
    paddw     m7, m5         ; r7 = c5*i5
    movq      m5, m0         ; r5 = c1
    pmulhw    m0, m3         ; r0 = c1*i1 - i1
    paddsw    m4, m7         ; r4 = C = c3*i3 + c5*i5
    pmulhw    m5, m1         ; r5 = c1*i7 - i7
    movq      m7, C(7)
    psubsw    m6, m2         ; r6 = D = c3*i5 - c5*i3
    paddw     m0, m3         ; r0 = c1*i1
    pmulhw    m3, m7         ; r3 = c7*i1
    movq      m2, I(2)
    pmulhw    m7, m1         ; r7 = c7*i7
    paddw     m5, m1         ; r5 = c1*i7
    movq      m1, m2         ; r1 = i2
    pmulhw    m2, C(2)       ; r2 = c2*i2 - i2
    psubsw    m3, m5         ; r3 = B = c7*i1 - c1*i7
    movq      m5, J(6)
    paddsw    m0, m7         ; r0 = A = c1*i1 + c7*i7
    movq      m7, m5         ; r7 = i6
    psubsw    m0, m4         ; r0 = A - C
    pmulhw    m5, C(2)       ; r5 = c2*i6 - i6
    paddw     m2, m1         ; r2 = c2*i2
    pmulhw    m1, C(6)       ; r1 = c6*i2
    paddsw    m4, m4         ; r4 = C + C
    paddsw    m4, m0         ; r4 = C. = A + C
    psubsw    m3, m6         ; r3 = B - D
    paddw     m5, m7         ; r5 = c2*i6
    paddsw    m6, m6         ; r6 = D + D
    pmulhw    m7, C(6)       ; r7 = c6*i6
    paddsw    m6, m3         ; r6 = D. = B + D
    movq      I(1), m4       ; save C. at I(1)
    psubsw    m1, m5         ; r1 = H = c6*i2 - c2*i6
    movq      m4, C(4)
    movq      m5, m3         ; r5 = B - D
    pmulhw    m3, m4         ; r3 = (c4 - 1) * (B - D)
    paddsw    m7, m2         ; r7 = G = c2*i2 + c6*i6
    movq      I(2), m6       ; save D. at I(2)
    movq      m2, m0         ; r2 = A - C
    movq      m6, I(0)
    pmulhw    m0, m4         ; r0 = (c4 - 1) * (A - C)
    paddw     m5, m3         ; r5 = B. = c4 * (B - D)
    movq      m3, J(4)
    psubsw    m5, m1         ; r5 = B.. = B. - H
    paddw     m2, m0         ; r2 = A. = c4 * (A - C)
    psubsw    m6, m3         ; r6 = i0 - i4
    movq      m0, m6
    pmulhw    m6, m4         ; r6 = (c4 - 1) * (i0 - i4)
    paddsw    m3, m3         ; r3 = i4 + i4
    paddsw    m1, m1         ; r1 = H + H
    paddsw    m3, m0         ; r3 = i0 + i4
    paddsw    m1, m5         ; r1 = H. = B. + H
    pmulhw    m4, m3         ; r4 = (c4 - 1) * (i0 + i4)
    paddsw    m6, m0         ; r6 = F = c4 * (i0 - i4)
    psubsw    m6, m2         ; r6 = F. = F - A.
    paddsw    m2, m2         ; r2 = A. + A.
    movq      m0, I(1)       ; r0 = C.
    paddsw    m2, m6         ; r2 = A.. = F + A.
    paddw     m4, m3         ; r4 = E = c4 * (i0 + i4)
    psubsw    m2, m1         ; r2 = R2 = A.. - H.
%endmacro

; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq      m3, I(2)       ; r3 = D.
    psubsw    m4, m7         ; r4 = E. = E - G
    paddsw    m1, m1         ; r1 = H. + H.
    paddsw    m7, m7         ; r7 = G + G
    paddsw    m1, m2         ; r1 = R1 = A.. + H.
    paddsw    m7, m4         ; r7 = G. = E + G
    psubsw    m4, m3         ; r4 = R4 = E. - D.
    paddsw    m3, m3
    psubsw    m6, m5         ; r6 = R6 = F. - B..
    paddsw    m5, m5
    paddsw    m3, m4         ; r3 = R3 = E. + D.
    paddsw    m5, m6         ; r5 = R5 = F. + B..
    psubsw    m7, m0         ; r7 = R7 = G. - C.
    paddsw    m0, m0
    movq      I(1), m1       ; save R1
    paddsw    m0, m7         ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
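; ("normalizes" = the final descaling of the 2-D transform: each result is
; rounded by adding OC_8 = 8 and arithmetically shifted right by 4, matching
; the ADD()/SHIFT() hooks used by the SSE2 path below.)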
%macro ColumnIDCT 0
    BeginIDCT
    paddsw    m2, OC_8       ; adjust R2 (and R1) for shift
    paddsw    m1, m1         ; r1 = H. + H.
    paddsw    m1, m2         ; r1 = R1 = A.. + H.
    psraw     m2, 4          ; r2 = NR2
    psubsw    m4, m7         ; r4 = E. = E - G
    psraw     m1, 4          ; r1 = NR1
    movq      m3, I(2)       ; r3 = D.
    paddsw    m7, m7         ; r7 = G + G
    movq      I(2), m2       ; store NR2 at I2
    paddsw    m7, m4         ; r7 = G. = E + G
    movq      I(1), m1       ; store NR1 at I1
    psubsw    m4, m3         ; r4 = R4 = E. - D.
    paddsw    m4, OC_8       ; adjust R4 (and R3) for shift
    paddsw    m3, m3         ; r3 = D. + D.
    paddsw    m3, m4         ; r3 = R3 = E. + D.
    psraw     m4, 4          ; r4 = NR4
    psubsw    m6, m5         ; r6 = R6 = F. - B..
    psraw     m3, 4          ; r3 = NR3
    paddsw    m6, OC_8       ; adjust R6 (and R5) for shift
    paddsw    m5, m5         ; r5 = B.. + B..
    paddsw    m5, m6         ; r5 = R5 = F. + B..
    psraw     m6, 4          ; r6 = NR6
    movq      J(4), m4       ; store NR4 at J4
    psraw     m5, 4          ; r5 = NR5
    movq      I(3), m3       ; store NR3 at I3
    psubsw    m7, m0         ; r7 = R7 = G. - C.
    paddsw    m7, OC_8       ; adjust R7 (and R0) for shift
    paddsw    m0, m0         ; r0 = C. + C.
    paddsw    m0, m7         ; r0 = R0 = G. + C.
    psraw     m7, 4          ; r7 = NR7
    movq      J(6), m6       ; store NR6 at J6
    psraw     m0, 4          ; r0 = NR0
    movq      J(5), m5       ; store NR5 at J5
    movq      J(7), m7       ; store NR7 at J7
    movq      I(0), m0       ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
;   I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
;   I(0) = d0 c0 b0 a0
;   I(1) = d1 c1 b1 a1
;   I(2) = d2 c2 b2 a2
;   I(3) = d3 c3 b3 a3
;
;   J(4) = h0 g0 f0 e0
;   J(5) = h1 g1 f1 e1
;   J(6) = h2 g2 f2 e2
;   J(7) = h3 g3 f3 e3
;
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
;
; Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq      m1, m4         ; r1 = e3 e2 e1 e0
    punpcklwd m4, m5         ; r4 = f1 e1 f0 e0
    movq      I(0), m0       ; save a3 a2 a1 a0
    punpckhwd m1, m5         ; r1 = f3 e3 f2 e2
    movq      m0, m6         ; r0 = g3 g2 g1 g0
    punpcklwd m6, m7         ; r6 = h1 g1 h0 g0
    movq      m5, m4         ; r5 = f1 e1 f0 e0
    punpckldq m4, m6         ; r4 = h0 g0 f0 e0 = R4
    punpckhdq m5, m6         ; r5 = h1 g1 f1 e1 = R5
    movq      m6, m1         ; r6 = f3 e3 f2 e2
    movq      J(4), m4
    punpckhwd m0, m7         ; r0 = h3 g3 h2 g2
    movq      J(5), m5
    punpckhdq m6, m0         ; r6 = h3 g3 f3 e3 = R7
    movq      m4, I(0)       ; r4 = a3 a2 a1 a0
    punpckldq m1, m0         ; r1 = h2 g2 f2 e2 = R6
    movq      m5, I(1)       ; r5 = b3 b2 b1 b0
    movq      m0, m4         ; r0 = a3 a2 a1 a0
    movq      J(7), m6
    punpcklwd m0, m5         ; r0 = b1 a1 b0 a0
    movq      J(6), m1
    punpckhwd m4, m5         ; r4 = b3 a3 b2 a2
    movq      m5, m2         ; r5 = c3 c2 c1 c0
    punpcklwd m2, m3         ; r2 = d1 c1 d0 c0
    movq      m1, m0         ; r1 = b1 a1 b0 a0
    punpckldq m0, m2         ; r0 = d0 c0 b0 a0 = R0
    punpckhdq m1, m2         ; r1 = d1 c1 b1 a1 = R1
    movq      m2, m4         ; r2 = b3 a3 b2 a2
    movq      I(0), m0
    punpckhwd m5, m3         ; r5 = d3 c3 d2 c2
    movq      I(1), m1
    punpckhdq m4, m5         ; r4 = d3 c3 b3 a3 = R3
    punpckldq m2, m5         ; r2 = d2 c2 b2 a2 = R2
    movq      I(3), m4
    movq      I(2), m2
%endmacro

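; VP3_1D_IDCT_SSE2 is the 8-wide counterpart of BeginIDCT/RowIDCT/ColumnIDCT
; above: one invocation transforms all eight columns of the block at once.
; ADD() and SHIFT() are left empty for the first (row) pass and are defined by
; VP3_IDCT below as "paddsw x, [pw_8]" / "psraw x, 4" for the second (column)
; pass, which applies the final rounding and >>4 descaling.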
%macro VP3_1D_IDCT_SSE2 0
    movdqa    m2, I(3)       ; xmm2 = i3
    movdqa    m6, C(3)       ; xmm6 = c3
    movdqa    m4, m2         ; xmm4 = i3
    movdqa    m7, I(5)       ; xmm7 = i5
    pmulhw    m4, m6         ; xmm4 = c3 * i3 - i3
    movdqa    m1, C(5)       ; xmm1 = c5
    pmulhw    m6, m7         ; xmm6 = c3 * i5 - i5
    movdqa    m5, m1         ; xmm5 = c5
    pmulhw    m1, m2         ; xmm1 = c5 * i3 - i3
    movdqa    m3, I(1)       ; xmm3 = i1
    pmulhw    m5, m7         ; xmm5 = c5 * i5 - i5
    movdqa    m0, C(1)       ; xmm0 = c1
    paddw     m4, m2         ; xmm4 = c3 * i3
    paddw     m6, m7         ; xmm6 = c3 * i5
    paddw     m2, m1         ; xmm2 = c5 * i3
    movdqa    m1, I(7)       ; xmm1 = i7
    paddw     m7, m5         ; xmm7 = c5 * i5
    movdqa    m5, m0         ; xmm5 = c1
    pmulhw    m0, m3         ; xmm0 = c1 * i1 - i1
    paddsw    m4, m7         ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw    m5, m1         ; xmm5 = c1 * i7 - i7
    movdqa    m7, C(7)       ; xmm7 = c7
    psubsw    m6, m2         ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw     m0, m3         ; xmm0 = c1 * i1
    pmulhw    m3, m7         ; xmm3 = c7 * i1
    movdqa    m2, I(2)       ; xmm2 = i2
    pmulhw    m7, m1         ; xmm7 = c7 * i7
    paddw     m5, m1         ; xmm5 = c1 * i7
    movdqa    m1, m2         ; xmm1 = i2
    pmulhw    m2, C(2)       ; xmm2 = i2 * c2 - i2
    psubsw    m3, m5         ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa    m5, I(6)       ; xmm5 = i6
    paddsw    m0, m7         ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa    m7, m5         ; xmm7 = i6
    psubsw    m0, m4         ; xmm0 = A - C
    pmulhw    m5, C(2)       ; xmm5 = c2 * i6 - i6
    paddw     m2, m1         ; xmm2 = i2 * c2
    pmulhw    m1, C(6)       ; xmm1 = c6 * i2
    paddsw    m4, m4         ; xmm4 = C + C
    paddsw    m4, m0         ; xmm4 = A + C = C.
    psubsw    m3, m6         ; xmm3 = B - D
    paddw     m5, m7         ; xmm5 = c2 * i6
    paddsw    m6, m6         ; xmm6 = D + D
    pmulhw    m7, C(6)       ; xmm7 = c6 * i6
    paddsw    m6, m3         ; xmm6 = B + D = D.
    movdqa    I(1), m4       ; Save C. at I(1)
    psubsw    m1, m5         ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa    m4, C(4)       ; xmm4 = C4
    movdqa    m5, m3         ; xmm5 = B - D
    pmulhw    m3, m4         ; xmm3 = ( c4 -1 ) * ( B - D )
    paddsw    m7, m2         ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa    I(2), m6       ; save D. at I(2)
    movdqa    m2, m0         ; xmm2 = A - C
    movdqa    m6, I(0)       ; xmm6 = i0
    pmulhw    m0, m4         ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
    paddw     m5, m3         ; xmm5 = c4 * ( B - D ) = B.
    movdqa    m3, I(4)       ; xmm3 = i4
    psubsw    m5, m1         ; xmm5 = B. - H = B..
    paddw     m2, m0         ; xmm2 = c4 * ( A - C) = A.
    psubsw    m6, m3         ; xmm6 = i0 - i4
    movdqa    m0, m6         ; xmm0 = i0 - i4
    pmulhw    m6, m4         ; xmm6 = (c4 - 1) * (i0 - i4) = F
    paddsw    m3, m3         ; xmm3 = i4 + i4
    paddsw    m1, m1         ; xmm1 = H + H
    paddsw    m3, m0         ; xmm3 = i0 + i4
    paddsw    m1, m5         ; xmm1 = B. + H = H.
    pmulhw    m4, m3         ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
    paddw     m6, m0         ; xmm6 = c4 * ( i0 - i4 )
    psubsw    m6, m2         ; xmm6 = F - A. = F.
    paddsw    m2, m2         ; xmm2 = A. + A.
    movdqa    m0, I(1)       ; Load C. from I(1)
    paddsw    m2, m6         ; xmm2 = F + A. = A..
    paddw     m4, m3         ; xmm4 = c4 * ( i0 + i4 ) = E
    psubsw    m2, m1         ; xmm2 = A.. - H. = R2
    ADD(m2)                  ; Adjust R2 and R1 before shifting
    paddsw    m1, m1         ; xmm1 = H. + H.
    paddsw    m1, m2         ; xmm1 = A.. + H. = R1
    SHIFT(m2)                ; xmm2 = op2
    psubsw    m4, m7         ; xmm4 = E - G = E.
    SHIFT(m1)                ; xmm1 = op1
    movdqa    m3, I(2)       ; Load D. from I(2)
    paddsw    m7, m7         ; xmm7 = G + G
    paddsw    m7, m4         ; xmm7 = E + G = G.
    psubsw    m4, m3         ; xmm4 = E. - D. = R4
    ADD(m4)                  ; Adjust R4 and R3 before shifting
    paddsw    m3, m3         ; xmm3 = D. + D.
    paddsw    m3, m4         ; xmm3 = E. + D. = R3
    SHIFT(m4)                ; xmm4 = op4
    psubsw    m6, m5         ; xmm6 = F. - B.. = R6
    SHIFT(m3)                ; xmm3 = op3
    ADD(m6)                  ; Adjust R6 and R5 before shifting
    paddsw    m5, m5         ; xmm5 = B.. + B..
    paddsw    m5, m6         ; xmm5 = F. + B.. = R5
    SHIFT(m6)                ; xmm6 = op6
    SHIFT(m5)                ; xmm5 = op5
    psubsw    m7, m0         ; xmm7 = G. - C. = R7
    ADD(m7)                  ; Adjust R7 and R0 before shifting
    paddsw    m0, m0         ; xmm0 = C. + C.
    paddsw    m0, m7         ; xmm0 = G. + C. = R0
    SHIFT(m7)                ; xmm7 = op7
    SHIFT(m0)                ; xmm0 = op0
%endmacro

%macro PUT_BLOCK 8
    movdqa    O(0), m%1
    movdqa    O(1), m%2
    movdqa    O(2), m%3
    movdqa    O(3), m%4
    movdqa    O(4), m%5
    movdqa    O(5), m%6
    movdqa    O(6), m%7
    movdqa    O(7), m%8
%endmacro

%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
    VP3_1D_IDCT_SSE2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw  x, 4
%define ADD(x)   paddsw x, [pw_8]
    VP3_1D_IDCT_SSE2
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
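    ; The block is an 8x8 array of 16-bit coefficients, 16 bytes per row.
    ; For the two row passes I(x) addresses the left four coefficients of a
    ; row and J(x) the right four (rows 0-3 first, then rows 4-7 via the
    ; +64/+72 variants); after the in-place transposes, I()/J() are redefined
    ; so that ColumnIDCT works on the left and then the right half of the
    ; columns.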
%define I(x) [%1+16* x   ]
%define J(x) [%1+16*(x-4)+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x   +64]
%define J(x) [%1+16*(x-4)+72]
    RowIDCT
    Transpose

%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    ColumnIDCT

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro

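; vp3_idct_funcs instantiates two entry points around VP3_IDCT:
;   vp3_idct_put - full IDCT, then pack the result to bytes and add a +128
;                  bias (pb_80) so the signed output is stored as unsigned
;                  pixels
;   vp3_idct_add - full IDCT, then add the residual to the existing pixels
;                  with unsigned saturation
; Both clear the coefficient block to zero afterwards for the next use.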
%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT r2

    movsxdifnidn r1, r1d
    mova      m4, [pb_80]
    lea       r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova      m0, [r2+mmsize*0+%%i]
    mova      m1, [r2+mmsize*2+%%i]
    mova      m2, [r2+mmsize*4+%%i]
    mova      m3, [r2+mmsize*6+%%i]
    packsswb  m0, [r2+mmsize*1+%%i]
    packsswb  m1, [r2+mmsize*3+%%i]
    packsswb  m2, [r2+mmsize*5+%%i]
    packsswb  m3, [r2+mmsize*7+%%i]
    paddb     m0, m4
    paddb     m1, m4
    paddb     m2, m4
    paddb     m3, m4
    movq      [r0     ], m0
%if mmsize == 8
    movq      [r0+r1  ], m1
    movq      [r0+r1*2], m2
    movq      [r0+r3  ], m3
%else
    movhps    [r0+r1  ], m0
    movq      [r0+r1*2], m1
    movhps    [r0+r3  ], m1
%endif
%if %%i == 0
    lea       r0, [r0+r1*4]
%endif
%if mmsize == 16
    movq      [r0     ], m2
    movhps    [r0+r1  ], m2
    movq      [r0+r1*2], m3
    movhps    [r0+r3  ], m3
%endif
%assign %%i %%i+64
%endrep

    pxor      m0, m0
%assign %%offset 0
%rep 128/mmsize
    mova      [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
    RET

cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT r2

    mov       r3, 4
    pxor      m4, m4
    movsxdifnidn r1, r1d
.loop:
    movq      m0, [r0]
    movq      m1, [r0+r1]
%if mmsize == 8
    mova      m2, m0
    mova      m3, m1
%endif
    punpcklbw m0, m4
    punpcklbw m1, m4
%if mmsize == 8
    punpckhbw m2, m4
    punpckhbw m3, m4
%endif
    paddsw    m0, [r2+ 0]
    paddsw    m1, [r2+16]
%if mmsize == 8
    paddsw    m2, [r2+ 8]
    paddsw    m3, [r2+24]
    packuswb  m0, m2
    packuswb  m1, m3
%else ; mmsize == 16
    packuswb  m0, m1
%endif
    movq      [r0   ], m0
%if mmsize == 8
    movq      [r0+r1], m1
%else ; mmsize == 16
    movhps    [r0+r1], m0
%endif
    lea       r0, [r0+r1*2]
%assign %%offset 0
%rep 32/mmsize
    mova      [r2+%%offset], m4
%assign %%offset %%offset+mmsize
%endrep
    add       r2, 32
    dec       r3
    jg .loop
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs

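; vp3_idct_dc_add handles blocks where only the DC coefficient is non-zero,
; so the whole IDCT collapses to adding one constant to all 64 pixels.  A
; rough scalar model (av_clip_uint8 is illustrative, not referenced here):
;
;     dc = (block[0] + 15) >> 5;      /* the "add r3, 15 / sar r3, 5" below */
;     for each of the 8x8 pixels p:  p = av_clip_uint8(p + dc);
;
; There is no signed saturating byte add, so DC_ADD splits dc into a
; non-negative part (broadcast in m0) and the magnitude of a negative part
; (in m1) via packuswb, then applies paddusb m0 / psubusb m1; one of the two
; is always zero.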
%macro DC_ADD 0
    movq      m2, [r0     ]
    movq      m3, [r0+r1  ]
    paddusb   m2, m0
    movq      m4, [r0+r1*2]
    paddusb   m3, m0
    movq      m5, [r0+r2  ]
    paddusb   m4, m0
    paddusb   m5, m0
    psubusb   m2, m1
    psubusb   m3, m1
    movq      [r0     ], m2
    psubusb   m4, m1
    movq      [r0+r1  ], m3
    psubusb   m5, m1
    movq      [r0+r1*2], m4
    movq      [r0+r2  ], m5
%endmacro

INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
    movsxd    r1, r1d
%endif
    movsx     r3, word [r2]
    mov       word [r2], 0
    lea       r2, [r1*3]
    add       r3, 15
    sar       r3, 5
    movd      m0, r3d
    pshufw    m0, m0, 0x0
    pxor      m1, m1
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    DC_ADD
    lea       r0, [r0+r1*4]
    DC_ADD
    RET