Add Russian translation provided by Валерий Крувялис <valkru@mail.ru>
[xiph-mirror.git] / theora-old / lib / x86_32 / idct_mmx.c
blobdbc33d092c9dd37861e0ff2d25d4372a7dfe3a09
1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************
13 function:
14 last mod: $Id: dsp_mmx.c 12440 2007-02-06 16:36:26Z j $
16 ********************************************************************/
18 #include "codec_internal.h"
20 #if defined(USE_ASM)
22 #define ASM asm
24 /****************************************************************************
26 * Description : IDCT with multiple versions based on # of non 0 coeffs
28 *****************************************************************************
31 // Dequantization + inverse discrete cosine transform.
33 // Constants used in MMX implementation of dequantization and idct.
34 // All the MMX stuff works with 4 16-bit quantities at a time and
35 // we create 12 constants of size 4 x 16 bits.
36 // The first 4 are used to mask the individual 16-bit words within a group
37 // and are used in the address-shuffling part of the dequantization.
38 // The next 7 are fixed-point approximations to the cosines of angles
39 // occurring in the DCT, and the last is the rounding bias (8) added before the final right shift; each of these 8 contains 4 copies of the same value.
41 // There is only one (statically initialized) instance of this object
42 // wrapped in an allocator object that forces its starting address
43 // to be evenly divisible by 32. Hence the actual object (96 bytes) occupies 3
44 // cache lines on a Pentium processor.
46 // Offsets in bytes used by the assembler code below
47 // must of course agree with the idctConstants constructor.
49 #define MaskOffset 0 // 4 masks come in order low word to high
50 #define CosineOffset 32 // 7 cosines come in order pi/16 * (1 ... 7)
51 #define EightOffset 88
52 #define IdctAdjustBeforeShift 8
55 UINT16 idctcosTbl[ 7] =
57 64277, 60547, 54491, 46341, 36410, 25080, 12785
60 void fillidctconstants(void)
62 int j = 16;
63 UINT16 * p;
66 idctconstants[ --j] = 0;
68 while( j);
70 idctconstants[0] = idctconstants[5] = idctconstants[10] = idctconstants[15] = 65535;
72 j = 1;
75 p = idctconstants + ( (j+3) << 2);
76 p[0] = p[1] = p[2] = p[3] = idctcosTbl[ j - 1];
78 while( ++j <= 7);
80 idctconstants[44] = idctconstants[45] = idctconstants[46] = idctconstants[47] = IdctAdjustBeforeShift;
84 ogg_uint16_t idctconstants[(4+7+1) * 4] = {
85 65535, 0, 0, 0, 0, 65535, 0, 0,
86 0, 0, 65535, 0, 0, 0, 0, 65535,
87 64277, 64277, 64277, 64277, 60547, 60547, 60547, 60547,
88 54491, 54491, 54491, 54491, 46341, 46341, 46341, 46341,
89 36410, 36410, 36410, 36410, 25080, 25080, 25080, 25080,
90 12785, 12785, 12785, 12785, 8, 8, 8, 8,
93 /* Dequantization + inverse DCT.
95 Dequantization multiplies user's 16-bit signed indices (range -512 to +511)
96 by unsigned 16-bit quantization table entries.
97 These table entries are upscaled by 4, max is 30 * 128 * 4 < 2^14.
98 Result is scaled signed DCT coefficients (abs value < 2^15).
100 In the data stream, the coefficients are sent in order of increasing
101 total (horizontal + vertical) frequency. The exact picture is as follows:
103 00 01 05 06 16 17 33 34
104 02 04 07 15 20 32 35 52
105 03 10 14 21 31 36 51 53
106 11 13 22 30 37 50 54 65
108 12 23 27 40 47 55 64 66
109 24 26 41 46 56 63 67 74
110 25 42 45 57 62 70 73 75
111 43 44 60 61 71 72 76 77
113 Here the position in the matrix corresponds to the (horiz,vert)
114 frequency indices and the octal entry in the matrix is the position
115 of the coefficient in the data stream. Thus the coefficients are sent
116 in sort of a diagonal "snake".
118 The dequantization stage "uncurls the snake" and stores the expanded
119 coefficients in more convenient positions. These are not exactly the
120 natural positions given above but take into account our implementation
121 of the idct, which basically requires two one-dimensional idcts and
122 two transposes.
124 We fold the first transpose into the storage of the expanded coefficients.
125 We don't actually do a full transpose because this would require doubling
126 the size of the idct buffer; rather, we just transpose each of the 4x4
127 subblocks. Using slightly varying addressing schemes in each of the
128 four 4x8 idcts then allows these transforms to be done in place.
130 Transposing the 4x4 subblocks in the matrix above gives
132 00 02 03 11 16 20 31 37
133 01 04 10 13 17 32 36 50
134 05 07 14 22 33 35 51 54
135 06 15 21 30 34 52 53 65
137 12 24 25 43 47 56 62 71
138 23 26 42 44 55 63 70 72
139 27 41 45 60 64 67 73 76
140 40 46 57 61 66 74 75 77
142 Finally, we reverse the words in each 4 word group to clarify
143 direction of shifts.
145 11 03 02 00 37 31 20 16
146 13 10 04 01 50 36 32 17
147 22 14 07 05 54 51 35 33
148 30 21 15 06 65 53 52 34
150 43 25 24 12 71 62 56 47
151 44 42 26 23 72 70 63 55
152 60 45 41 27 76 73 67 64
153 61 57 46 40 77 75 74 66
155 This matrix then shows the 16 4x16 destination words in terms of
156 the 16 4x16 input words.
158 We implement this algorithm by manipulation of mmx registers,
159 which seems to be the fastest way to proceed. It is completely
160 hand-written; there does not seem to be enough recurrence to
161 reasonably compartmentalize any of it. Hence the resulting
162 program is ugly and bloated. Furthermore, due to the absence of
163 register pressure, it is boring and artless. I hate it.
165 The idct itself is more interesting. Since the two-dimensional dct
166 basis functions are products of the one-dimensional dct basis functions,
167 we can compute an inverse (or forward) dct via two 1-D transforms,
168 on rows then on columns. To exploit MMX parallelism, we actually do
169 both operations on columns, interposing a (partial) transpose between
170 the two 1-D transforms, the first transpose being done by the expansion
171 described above.
173 The 8-sample one-dimensional DCT is a standard orthogonal expansion using
174 the (unnormalized) basis functions
176 b[k]( i) = cos( pi * k * (2i + 1) / 16);
178 here k = 0 ... 7 is the frequency and i = 0 ... 7 is the spatial coordinate.
179 To normalize, b[0] should be multiplied by 1/sqrt( 8) and the other b[k]
180 should be multiplied by 1/2.
182 The 8x8 two-dimensional DCT is just the product of one-dimensional DCTs
183 in each direction. The (unnormalized) basis functions are
185 B[k,l]( i, j) = b[k]( i) * b[l]( j);
187 this time k and l are the horizontal and vertical frequencies,
188 i and j are the horizontal and vertical spatial coordinates;
189 all indices vary from 0 ... 7 (as above)
190 and there are now 4 cases of normalization.
192 Our 1-D idct expansion uses constants C1 ... C7 given by
194 (*) Ck = C(-k) = cos( pi * k/16) = S(8-k) = -S(k-8) = sin( pi * (8-k)/16)
196 and the following 1-D algorithm transforming I0 ... I7 to R0 ... R7 :
198 A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
199 C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
200 A. = C4 * (A - C) B. = C4 * (B - D)
201 C. = A + C D. = B + D
203 E = C4 * (I0 + I4) F = C4 * (I0 - I4)
204 G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
205 E. = E - G
206 G. = E + G
208 A.. = F + A. B.. = B. - H
209 F. = F - A. H. = B. + H
211 R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
212 R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
214 It is due to Vetterli and Ligtenberg and may be found in the JPEG
215 reference book by Pennebaker and Mitchell.
217 Correctness of the algorithm follows from (*) together with the
218 addition formulas for sine and cosine:
220 cos( A + B) = cos( A) * cos( B) - sin( A) * sin( B)
221 sin( A + B) = sin( A) * cos( B) + cos( A) * sin( B)
223 Note that this implementation absorbs the difference in normalization
224 between the 0th and higher frequencies, although the results produced
225 are actually twice as big as they should be. Since we do this for each
226 dimension, the 2-D idct results are 4x the desired results. Finally,
227 taking into account that the dequantization multiplies by 4 as well,
228 our actual results are 16x too big. We fix this by shifting the final
229 results right by 4 bits.
231 High precision version approximates C1 ... C7 to 16 bits.
232 Since MMX only provides a signed multiply, C1 ... C5 appear to be
233 negative and multiplies involving them must be adjusted to compensate
234 for this. C6 and C7 do not require this adjustment since
235 they are < 1/2 and are correctly treated as positive numbers.
237 Following macro does four 8-sample one-dimensional idcts in parallel.
238 This is actually not such a difficult program to write once you
239 make a couple of observations (I of course was unable to make these
240 observations until I'd half-written a couple of other versions).
242 1. Everything is easy once you are done with the multiplies.
243 This is because, given X and Y in registers, one may easily
244 calculate X+Y and X-Y using just those 2 registers.
246 2. You always need at least 2 extra registers to calculate products,
247 so storing 2 temporaries is inevitable. C. and D. seem to be
248 the best candidates.
250 3. The products should be calculated in decreasing order of complexity
251 (which translates into register pressure). Since C1 ... C5 require
252 adjustment (and C6, C7 do not), we begin by calculating C and D.
255 /**************************************************************************************
257 * Routine: BeginIDCT
259 * Description: Common first part of four 8-sample 1-D IDCTs done in parallel
261 * Input: None
263 * Output: None
265 * Return: None
267 * Special Note: None
269 * Error: None
271 ***************************************************************************************
274 #define MtoSTR(s) #s
276 #define Dump "call MMX_dump\n"
278 #define BeginIDCT "#BeginIDCT\n" \
280 " movq " I(3)","r2"\n" \
282 " movq " C(3)","r6"\n" \
283 " movq " r2","r4"\n" \
284 " movq " J(5)","r7"\n" \
285 " pmulhw " r6","r4"\n" \
286 " movq " C(5)","r1"\n" \
287 " pmulhw " r7","r6"\n" \
288 " movq " r1","r5"\n" \
289 " pmulhw " r2","r1"\n" \
290 " movq " I(1)","r3"\n" \
291 " pmulhw " r7","r5"\n" \
292 " movq " C(1)","r0"\n" \
293 " paddw " r2","r4"\n" \
294 " paddw " r7","r6"\n" \
295 " paddw " r1","r2"\n" \
296 " movq " J(7)","r1"\n" \
297 " paddw " r5","r7"\n" \
298 " movq " r0","r5"\n" \
299 " pmulhw " r3","r0"\n" \
300 " paddsw " r7","r4"\n" \
301 " pmulhw " r1","r5"\n" \
302 " movq " C(7)","r7"\n" \
303 " psubsw " r2","r6"\n" \
304 " paddw " r3","r0"\n" \
305 " pmulhw " r7","r3"\n" \
306 " movq " I(2)","r2"\n" \
307 " pmulhw " r1","r7"\n" \
308 " paddw " r1","r5"\n" \
309 " movq " r2","r1"\n" \
310 " pmulhw " C(2)","r2"\n" \
311 " psubsw " r5","r3"\n" \
312 " movq " J(6)","r5"\n" \
313 " paddsw " r7","r0"\n" \
314 " movq " r5","r7"\n" \
315 " psubsw " r4","r0"\n" \
316 " pmulhw " C(2)","r5"\n" \
317 " paddw " r1","r2"\n" \
318 " pmulhw " C(6)","r1"\n" \
319 " paddsw " r4","r4"\n" \
320 " paddsw " r0","r4"\n" \
321 " psubsw " r6","r3"\n" \
322 " paddw " r7","r5"\n" \
323 " paddsw " r6","r6"\n" \
324 " pmulhw " C(6)","r7"\n" \
325 " paddsw " r3","r6"\n" \
326 " movq " r4","I(1)"\n" \
327 " psubsw " r5","r1"\n" \
328 " movq " C(4)","r4"\n" \
329 " movq " r3","r5"\n" \
330 " pmulhw " r4","r3"\n" \
331 " paddsw " r2","r7"\n" \
332 " movq " r6","I(2)"\n" \
333 " movq " r0","r2"\n" \
334 " movq " I(0)","r6"\n" \
335 " pmulhw " r4","r0"\n" \
336 " paddw " r3","r5"\n" \
337 "\n" \
338 " movq " J(4)","r3"\n" \
339 " psubsw " r1","r5"\n" \
340 " paddw " r0","r2"\n" \
341 " psubsw " r3","r6"\n" \
342 " movq " r6","r0"\n" \
343 " pmulhw " r4","r6"\n" \
344 " paddsw " r3","r3"\n" \
345 " paddsw " r1","r1"\n" \
346 " paddsw " r0","r3"\n" \
347 " paddsw " r5","r1"\n" \
348 " pmulhw " r3","r4"\n" \
349 " paddsw " r0","r6"\n" \
350 " psubsw " r2","r6"\n" \
351 " paddsw " r2","r2"\n" \
352 " movq " I(1)","r0"\n" \
353 " paddsw " r6","r2"\n" \
354 " paddw " r3","r4"\n" \
355 " psubsw " r1","r2"\n" \
356 "#end BeginIDCT\n"
357 // end BeginIDCT macro (38 cycles).
360 // Two versions of the end of the idct depending on whether we're feeding
361 // into a transpose or dividing the final results by 16 and storing them.
363 /**************************************************************************************
365 * Routine: RowIDCT
367 * Description: The Macro does 1-D IDct on 4 Rows
369 * Input: None
371 * Output: None
373 * Return: None
375 * Special Note: None
377 * Error: None
379 ***************************************************************************************
382 // RowIDCT gets ready to transpose.
384 #define RowIDCT ASM("\n"\
385 "#RowIDCT\n" \
386 BeginIDCT \
387 "\n" \
388 " movq "I(2)","r3"\n" /* r3 = D. */ \
389 " psubsw "r7","r4"\n" /* r4 = E. = E - G */ \
390 " paddsw "r1","r1"\n" /* r1 = H. + H. */ \
391 " paddsw "r7","r7"\n" /* r7 = G + G */ \
392 " paddsw "r2","r1"\n" /* r1 = R1 = A.. + H. */\
393 " paddsw "r4","r7"\n" /* r7 = G. = E + G */ \
394 " psubsw "r3","r4"\n" /* r4 = R4 = E. - D. */ \
395 " paddsw "r3","r3"\n" \
396 " psubsw "r5","r6"\n" /* r6 = R6 = F. - B.. */\
397 " paddsw "r5","r5"\n" \
398 " paddsw "r4","r3"\n" /* r3 = R3 = E. + D. */ \
399 " paddsw "r6","r5"\n" /* r5 = R5 = F. + B.. */\
400 " psubsw "r0","r7"\n" /* r7 = R7 = G. - C. */ \
401 " paddsw "r0","r0"\n" \
402 " movq "r1","I(1)"\n" /* save R1 */ \
403 " paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
404 "#end RowIDCT" \
406 // end RowIDCT macro (8 + 38 = 46 cycles)
409 /**************************************************************************************
411 * Routine: ColumnIDCT
413 * Description: The Macro does 1-D IDct on 4 columns
415 * Input: None
417 * Output: None
419 * Return: None
421 * Special Note: None
423 * Error: None
425 ***************************************************************************************
427 // Column IDCT normalizes and stores final results.
429 #define ColumnIDCT ASM("\n" \
430 "#ColumnIDCT\n" \
431 BeginIDCT \
432 "\n" \
433 " paddsw "Eight","r2"\n" \
434 " paddsw "r1","r1"\n" /* r1 = H. + H. */ \
435 " paddsw "r2","r1"\n" /* r1 = R1 = A.. + H. */\
436 " psraw ""$4"","r2"\n" /* r2 = NR2 */ \
437 " psubsw "r7","r4"\n" /* r4 = E. = E - G */ \
438 " psraw ""$4"","r1"\n" /* r1 = NR1 */ \
439 " movq "I(2)","r3"\n" /* r3 = D. */ \
440 " paddsw "r7","r7"\n" /* r7 = G + G */ \
441 " movq "r2","I(2)"\n" /* store NR2 at I2 */ \
442 " paddsw "r4","r7"\n" /* r7 = G. = E + G */ \
443 " movq "r1","I(1)"\n" /* store NR1 at I1 */ \
444 " psubsw "r3","r4"\n" /* r4 = R4 = E. - D. */ \
445 " paddsw "Eight","r4"\n" \
446 " paddsw "r3","r3"\n" /* r3 = D. + D. */ \
447 " paddsw "r4","r3"\n" /* r3 = R3 = E. + D. */ \
448 " psraw ""$4"","r4"\n" /* r4 = NR4 */ \
449 " psubsw "r5","r6"\n" /* r6 = R6 = F. - B.. */\
450 " psraw ""$4"","r3"\n" /* r3 = NR3 */ \
451 " paddsw "Eight","r6"\n" \
452 " paddsw "r5","r5"\n" /* r5 = B.. + B.. */ \
453 " paddsw "r6","r5"\n" /* r5 = R5 = F. + B.. */\
454 " psraw ""$4"","r6"\n" /* r6 = NR6 */ \
455 " movq "r4","J(4)"\n" /* store NR4 at J4 */ \
456 " psraw ""$4"","r5"\n" /* r5 = NR5 */ \
457 " movq "r3","I(3)"\n" /* store NR3 at I3 */ \
458 " psubsw "r0","r7"\n" /* r7 = R7 = G. - C. */ \
459 " paddsw "Eight","r7"\n" \
460 " paddsw "r0","r0"\n" /* r0 = C. + C. */ \
461 " paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
462 " psraw ""$4"","r7"\n" /* r7 = NR7 */ \
463 " movq "r6","J(6)"\n" /* store NR6 at J6 */ \
464 " psraw ""$4"","r0"\n" /* r0 = NR0 */ \
465 " movq "r5","J(5)"\n" /* store NR5 at J5 */ \
466 " movq "r7","J(7)"\n" /* store NR7 at J7 */ \
467 " movq "r0","I(0)"\n" /* store NR0 at I0 */ \
468 "#end ColumnIDCT\n" \
470 // end ColumnIDCT macro (38 + 19 = 57 cycles)
472 /**************************************************************************************
474 * Routine: Transpose
476 * Description: The Macro does two 4x4 transposes in place.
478 * Input: None
480 * Output: None
482 * Return: None
484 * Special Note: None
486 * Error: None
488 ***************************************************************************************
491 /* Following macro does two 4x4 transposes in place.
493 At entry (we assume):
495 r0 = a3 a2 a1 a0
496 I(1) = b3 b2 b1 b0
497 r2 = c3 c2 c1 c0
498 r3 = d3 d2 d1 d0
500 r4 = e3 e2 e1 e0
501 r5 = f3 f2 f1 f0
502 r6 = g3 g2 g1 g0
503 r7 = h3 h2 h1 h0
505 At exit, we have:
507 I(0) = d0 c0 b0 a0
508 I(1) = d1 c1 b1 a1
509 I(2) = d2 c2 b2 a2
510 I(3) = d3 c3 b3 a3
512 J(4) = h0 g0 f0 e0
513 J(5) = h1 g1 f1 e1
514 J(6) = h2 g2 f2 e2
515 J(7) = h3 g3 f3 e3
517 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
518 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
520 Since r1 is free at entry, we calculate the Js first. */
523 #define Transpose ASM("\n#Transpose\n" \
525 " movq "r4","r1"\n" \
526 " punpcklwd "r5","r4"\n" \
527 " movq "r0","I(0)"\n" \
528 " punpckhwd "r5","r1"\n" \
529 " movq "r6","r0"\n" \
530 " punpcklwd "r7","r6"\n" \
531 " movq "r4","r5"\n" \
532 " punpckldq "r6","r4"\n" \
533 " punpckhdq "r6","r5"\n" \
534 " movq "r1","r6"\n" \
535 " movq "r4","J(4)"\n" \
536 " punpckhwd "r7","r0"\n" \
537 " movq "r5","J(5)"\n" \
538 " punpckhdq "r0","r6"\n" \
539 " movq "I(0)","r4"\n" \
540 " punpckldq "r0","r1"\n" \
541 " movq "I(1)","r5"\n" \
542 " movq "r4","r0"\n" \
543 " movq "r6","J(7)"\n" \
544 " punpcklwd "r5","r0"\n" \
545 " movq "r1","J(6)"\n" \
546 " punpckhwd "r5","r4"\n" \
547 " movq "r2","r5"\n" \
548 " punpcklwd "r3","r2"\n" \
549 " movq "r0","r1"\n" \
550 " punpckldq "r2","r0"\n" \
551 " punpckhdq "r2","r1"\n" \
552 " movq "r4","r2"\n" \
553 " movq "r0","I(0)"\n" \
554 " punpckhwd "r3","r5"\n" \
555 " movq "r1","I(1)"\n" \
556 " punpckhdq "r5","r4"\n" \
557 " punpckldq "r5","r2"\n" \
559 " movq "r4","I(3)"\n" \
561 " movq "r2","I(2)"\n" \
562 "#end Transpose\n" \
564 // end Transpose macro (19 cycles).
567 static void MMX_dump()
571 movq %mm0,(%edi)\n\
572 movq %mm1,8(%edi)\n\
573 movq %mm2,16(%edi)\n\
574 movq %mm3,24(%edi)\n\
575 movq %mm4,32(%edi)\n\
576 movq %mm5,40(%edi)\n\
577 movq %mm6,48(%edi)\n\
578 movq %mm7,56(%edi)\n\
579 ret"
584 /**************************************************************************************
586 * Routine: MMX_idct
588 * Description: Perform IDCT on an 8x8 block
590 * Input: Pointer to input and output buffer
592 * Output: None
594 * Return: None
596 * Special Note: The input coefficients are in ZigZag order
598 * Error: None
600 ***************************************************************************************
602 void IDctSlow__mmx( Q_LIST_ENTRY * InputData,
603 ogg_int16_t *QuantMatrix,
604 ogg_int16_t * OutputData ) {
606 # define MIDM(M,I) MtoSTR(M+I*8(%ecx))
607 # define M(I) MIDM( MaskOffset , I )
608 # define MIDC(M,I) MtoSTR(M+(I-1)*8(%ecx))
609 # define C(I) MIDC( CosineOffset , I )
610 # define MIDEight(M) MtoSTR(M(%ecx))
611 # define Eight MIDEight(EightOffset)
613 # define r0 "%mm0"
614 # define r1 "%mm1"
615 # define r2 "%mm2"
616 # define r3 "%mm3"
617 # define r4 "%mm4"
618 # define r5 "%mm5"
619 # define r6 "%mm6"
620 # define r7 "%mm7"
622 __asm__ __volatile__ (
623 /* eax = quantized input */
624 /* esi = quantization table */
625 /* edx = destination (= idct buffer) */
626 /* ecx = idctconstants */
629 :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
632 ASM(
633 "movq (%eax), "r0"\n"
634 "pmullw (%esi), "r0"\n" /* r0 = 03 02 01 00 */
635 "movq 16(%eax), "r1"\n"
636 "pmullw 16(%esi), "r1"\n" /* r1 = 13 12 11 10 */
637 "movq "M(0)", "r2"\n" /* r2 = __ __ __ FF */
638 "movq "r0", "r3"\n" /* r3 = 03 02 01 00 */
639 "movq 8(%eax), "r4"\n"
640 "psrlq $16, "r0"\n" /* r0 = __ 03 02 01 */
641 "pmullw 8(%esi), "r4"\n" /* r4 = 07 06 05 04 */
642 "pand "r2", "r3"\n" /* r3 = __ __ __ 00 */
643 "movq "r0", "r5"\n" /* r5 = __ 03 02 01 */
644 "movq "r1", "r6"\n" /* r6 = 13 12 11 10 */
645 "pand "r2", "r5"\n" /* r5 = __ __ __ 01 */
646 "psllq $32, "r6"\n" /* r6 = 11 10 __ __ */
647 "movq "M(3)", "r7"\n" /* r7 = FF __ __ __ */
648 "pxor "r5", "r0"\n" /* r0 = __ 03 02 __ */
649 "pand "r6", "r7"\n" /* r7 = 11 __ __ __ */
650 "por "r3", "r0"\n" /* r0 = __ 03 02 00 */
651 "pxor "r7", "r6"\n" /* r6 = __ 10 __ __ */
652 "por "r7", "r0"\n" /* r0 = 11 03 02 00 = R0 */
653 "movq "M(3)", "r7"\n" /* r7 = FF __ __ __ */
654 "movq "r4", "r3"\n" /* r3 = 07 06 05 04 */
655 "movq "r0", (%edx)\n" /* write R0 = r0 */
656 "pand "r2", "r3"\n" /* r3 = __ __ __ 04 */
657 "movq 32(%eax), "r0"\n"
658 "psllq $16, "r3"\n" /* r3 = __ __ 04 __ */
659 "pmullw 32(%esi), "r0"\n" /* r0 = 23 22 21 20 */
660 "pand "r1", "r7"\n" /* r7 = 13 __ __ __ */
661 "por "r3", "r5"\n" /* r5 = __ __ 04 01 */
662 "por "r6", "r7"\n" /* r7 = 13 10 __ __ */
663 "movq 24(%eax), "r3"\n"
664 "por "r5", "r7"\n" /* r7 = 13 10 04 01 = R1 */
665 "pmullw 24(%esi), "r3"\n" /* r3 = 17 16 15 14 */
666 "psrlq $16, "r4"\n" /* r4 = __ 07 06 05 */
667 "movq "r7", 16(%edx)\n" /* write R1 = r7 */
668 "movq "r4", "r5"\n" /* r5 = __ 07 06 05 */
669 "movq "r0", "r7"\n" /* r7 = 23 22 21 20 */
670 "psrlq $16, "r4"\n" /* r4 = __ __ 07 06 */
671 "psrlq $48, "r7"\n" /* r7 = __ __ __ 23 */
672 "movq "r2", "r6"\n" /* r6 = __ __ __ FF */
673 "pand "r2", "r5"\n" /* r5 = __ __ __ 05 */
674 "pand "r4", "r6"\n" /* r6 = __ __ __ 06 */
675 "movq "r7", 80(%edx)\n" /* partial R9 = __ __ __ 23 */
676 "pxor "r6", "r4"\n" /* r4 = __ __ 07 __ */
677 "psrlq $32, "r1"\n" /* r1 = __ __ 13 12 */
678 "por "r5", "r4"\n" /* r4 = __ __ 07 05 */
679 "movq "M(3)", "r7"\n" /* r7 = FF __ __ __ */
680 "pand "r2", "r1"\n" /* r1 = __ __ __ 12 */
681 "movq 48(%eax), "r5"\n"
682 "psllq $16, "r0"\n" /* r0 = 22 21 20 __ */
683 "pmullw 48(%esi), "r5"\n" /* r5 = 33 32 31 30 */
684 "pand "r0", "r7"\n" /* r7 = 22 __ __ __ */
685 "movq "r1", 64(%edx)\n" /* partial R8 = __ __ __ 12 */
686 "por "r4", "r7"\n" /* r7 = 22 __ 07 05 */
687 "movq "r3", "r4"\n" /* r4 = 17 16 15 14 */
688 "pand "r2", "r3"\n" /* r3 = __ __ __ 14 */
689 "movq "M(2)", "r1"\n" /* r1 = __ FF __ __ */
690 "psllq $32, "r3"\n" /* r3 = __ 14 __ __ */
691 "por "r3", "r7"\n" /* r7 = 22 14 07 05 = R2 */
692 "movq "r5", "r3"\n" /* r3 = 33 32 31 30 */
693 "psllq $48, "r3"\n" /* r3 = 30 __ __ __ */
694 "pand "r0", "r1"\n" /* r1 = __ 21 __ __ */
695 "movq "r7", 32(%edx)\n" /* write R2 = r7 */
696 "por "r3", "r6"\n" /* r6 = 30 __ __ 06 */
697 "movq "M(1)", "r7"\n" /* r7 = __ __ FF __ */
698 "por "r1", "r6"\n" /* r6 = 30 21 __ 06 */
699 "movq 56(%eax), "r1"\n"
700 "pand "r4", "r7"\n" /* r7 = __ __ 15 __ */
701 "pmullw 56(%esi), "r1"\n" /* r1 = 37 36 35 34 */
702 "por "r6", "r7"\n" /* r7 = 30 21 15 06 = R3 */
703 "pand "M(1)", "r0"\n" /* r0 = __ __ 20 __ */
704 "psrlq $32, "r4"\n" /* r4 = __ __ 17 16 */
705 "movq "r7", 48(%edx)\n" /* write R3 = r7 */
706 "movq "r4", "r6"\n" /* r6 = __ __ 17 16 */
707 "movq "M(3)", "r7"\n" /* r7 = FF __ __ __ */
708 "pand "r2", "r4"\n" /* r4 = __ __ __ 16 */
709 "movq "M(1)", "r3"\n" /* r3 = __ __ FF __ */
710 "pand "r1", "r7"\n" /* r7 = 37 __ __ __ */
711 "pand "r5", "r3"\n" /* r3 = __ __ 31 __ */
712 "por "r4", "r0"\n" /* r0 = __ __ 20 16 */
713 "psllq $16, "r3"\n" /* r3 = __ 31 __ __ */
714 "por "r0", "r7"\n" /* r7 = 37 __ 20 16 */
715 "movq "M(2)", "r4"\n" /* r4 = __ FF __ __ */
716 "por "r3", "r7"\n" /* r7 = 37 31 20 16 = R4 */
717 "movq 80(%eax), "r0"\n"
718 "movq "r4", "r3"\n" /* r3 = __ FF __ __ */
719 "pmullw 80(%esi), "r0"\n" /* r0 = 53 52 51 50 */
720 "pand "r5", "r4"\n" /* r4 = __ 32 __ __ */
721 "movq "r7", 8(%edx)\n" /* write R4 = r7 */
722 "por "r4", "r6"\n" /* r6 = __ 32 17 16 */
723 "movq "r3", "r4"\n" /* r4 = __ FF __ __ */
724 "psrlq $16, "r6"\n" /* r6 = __ __ 32 17 */
725 "movq "r0", "r7"\n" /* r7 = 53 52 51 50 */
726 "pand "r1", "r4"\n" /* r4 = __ 36 __ __ */
727 "psllq $48, "r7"\n" /* r7 = 50 __ __ __ */
728 "por "r4", "r6"\n" /* r6 = __ 36 32 17 */
729 "movq 88(%eax), "r4"\n"
730 "por "r6", "r7"\n" /* r7 = 50 36 32 17 = R5 */
731 "pmullw 88(%esi), "r4"\n" /* r4 = 57 56 55 54 */
732 "psrlq $16, "r3"\n" /* r3 = __ __ FF __ */
733 "movq "r7", 24(%edx)\n" /* write R5 = r7 */
734 "pand "r1", "r3"\n" /* r3 = __ __ 35 __ */
735 "psrlq $48, "r5"\n" /* r5 = __ __ __ 33 */
736 "pand "r2", "r1"\n" /* r1 = __ __ __ 34 */
737 "movq 104(%eax), "r6"\n"
738 "por "r3", "r5"\n" /* r5 = __ __ 35 33 */
739 "pmullw 104(%esi), "r6"\n" /* r6 = 67 66 65 64 */
740 "psrlq $16, "r0"\n" /* r0 = __ 53 52 51 */
741 "movq "r4", "r7"\n" /* r7 = 57 56 55 54 */
742 "movq "r2", "r3"\n" /* r3 = __ __ __ FF */
743 "psllq $48, "r7"\n" /* r7 = 54 __ __ __ */
744 "pand "r0", "r3"\n" /* r3 = __ __ __ 51 */
745 "pxor "r3", "r0"\n" /* r0 = __ 53 52 __ */
746 "psllq $32, "r3"\n" /* r3 = __ 51 __ __ */
747 "por "r5", "r7"\n" /* r7 = 54 __ 35 33 */
748 "movq "r6", "r5"\n" /* r5 = 67 66 65 64 */
749 "pand "M(1)", "r6"\n" /* r6 = __ __ 65 __ */
750 "por "r3", "r7"\n" /* r7 = 54 51 35 33 = R6 */
751 "psllq $32, "r6"\n" /* r6 = 65 __ __ __ */
752 "por "r1", "r0"\n" /* r0 = __ 53 52 34 */
753 "movq "r7", 40(%edx)\n" /* write R6 = r7 */
754 "por "r6", "r0"\n" /* r0 = 65 53 52 34 = R7 */
755 "movq 120(%eax), "r7"\n"
756 "movq "r5", "r6"\n" /* r6 = 67 66 65 64 */
757 "pmullw 120(%esi), "r7"\n" /* r7 = 77 76 75 74 */
758 "psrlq $32, "r5"\n" /* r5 = __ __ 67 66 */
759 "pand "r2", "r6"\n" /* r6 = __ __ __ 64 */
760 "movq "r5", "r1"\n" /* r1 = __ __ 67 66 */
761 "movq "r0", 56(%edx)\n" /* write R7 = r0 */
762 "pand "r2", "r1"\n" /* r1 = __ __ __ 66 */
763 "movq 112(%eax), "r0"\n"
764 "movq "r7", "r3"\n" /* r3 = 77 76 75 74 */
765 "pmullw 112(%esi), "r0"\n" /* r0 = 73 72 71 70 */
766 "psllq $16, "r3"\n" /* r3 = 76 75 74 __ */
767 "pand "M(3)", "r7"\n" /* r7 = 77 __ __ __ */
768 "pxor "r1", "r5"\n" /* r5 = __ __ 67 __ */
769 "por "r5", "r6"\n" /* r6 = __ __ 67 64 */
770 "movq "r3", "r5"\n" /* r5 = 76 75 74 __ */
771 "pand "M(3)", "r5"\n" /* r5 = 76 __ __ __ */
772 "por "r1", "r7"\n" /* r7 = 77 __ __ 66 */
773 "movq 96(%eax), "r1"\n"
774 "pxor "r5", "r3"\n" /* r3 = __ 75 74 __ */
775 "pmullw 96(%esi), "r1"\n" /* r1 = 63 62 61 60 */
776 "por "r3", "r7"\n" /* r7 = 77 75 74 66 = R15 */
777 "por "r5", "r6"\n" /* r6 = 76 __ 67 64 */
778 "movq "r0", "r5"\n" /* r5 = 73 72 71 70 */
779 "movq "r7", 120(%edx)\n" /* store R15 = r7 */
780 "psrlq $16, "r5"\n" /* r5 = __ 73 72 71 */
781 "pand "M(2)", "r5"\n" /* r5 = __ 73 __ __ */
782 "movq "r0", "r7"\n" /* r7 = 73 72 71 70 */
783 "por "r5", "r6"\n" /* r6 = 76 73 67 64 = R14 */
784 "pand "r2", "r0"\n" /* r0 = __ __ __ 70 */
785 "pxor "r0", "r7"\n" /* r7 = 73 72 71 __ */
786 "psllq $32, "r0"\n" /* r0 = __ 70 __ __ */
787 "movq "r6", 104(%edx)\n" /* write R14 = r6 */
788 "psrlq $16, "r4"\n" /* r4 = __ 57 56 55 */
789 "movq 72(%eax), "r5"\n"
790 "psllq $16, "r7"\n" /* r7 = 72 71 __ __ */
791 "pmullw 72(%esi), "r5"\n" /* r5 = 47 46 45 44 */
792 "movq "r7", "r6"\n" /* r6 = 72 71 __ __ */
793 "movq "M(2)", "r3"\n" /* r3 = __ FF __ __ */
794 "psllq $16, "r6"\n" /* r6 = 71 __ __ __ */
795 "pand "M(3)", "r7"\n" /* r7 = 72 __ __ __ */
796 "pand "r1", "r3"\n" /* r3 = __ 62 __ __ */
797 "por "r0", "r7"\n" /* r7 = 72 70 __ __ */
798 "movq "r1", "r0"\n" /* r0 = 63 62 61 60 */
799 "pand "M(3)", "r1"\n" /* r1 = 63 __ __ __ */
800 "por "r3", "r6"\n" /* r6 = 71 62 __ __ */
801 "movq "r4", "r3"\n" /* r3 = __ 57 56 55 */
802 "psrlq $32, "r1"\n" /* r1 = __ __ 63 __ */
803 "pand "r2", "r3"\n" /* r3 = __ __ __ 55 */
804 "por "r1", "r7"\n" /* r7 = 72 70 63 __ */
805 "por "r3", "r7"\n" /* r7 = 72 70 63 55 = R13 */
806 "movq "r4", "r3"\n" /* r3 = __ 57 56 55 */
807 "pand "M(1)", "r3"\n" /* r3 = __ __ 56 __ */
808 "movq "r5", "r1"\n" /* r1 = 47 46 45 44 */
809 "movq "r7", 88(%edx)\n" /* write R13 = r7 */
810 "psrlq $48, "r5"\n" /* r5 = __ __ __ 47 */
811 "movq 64(%eax), "r7"\n"
812 "por "r3", "r6"\n" /* r6 = 71 62 56 __ */
813 "pmullw 64(%esi), "r7"\n" /* r7 = 43 42 41 40 */
814 "por "r5", "r6"\n" /* r6 = 71 62 56 47 = R12 */
815 "pand "M(2)", "r4"\n" /* r4 = __ 57 __ __ */
816 "psllq $32, "r0"\n" /* r0 = 61 60 __ __ */
817 "movq "r6", 72(%edx)\n" /* write R12 = r6 */
818 "movq "r0", "r6"\n" /* r6 = 61 60 __ __ */
819 "pand "M(3)", "r0"\n" /* r0 = 61 __ __ __ */
820 "psllq $16, "r6"\n" /* r6 = 60 __ __ __ */
821 "movq 40(%eax), "r5"\n"
822 "movq "r1", "r3"\n" /* r3 = 47 46 45 44 */
823 "pmullw 40(%esi), "r5"\n" /* r5 = 27 26 25 24 */
824 "psrlq $16, "r1"\n" /* r1 = __ 47 46 45 */
825 "pand "M(1)", "r1"\n" /* r1 = __ __ 46 __ */
826 "por "r4", "r0"\n" /* r0 = 61 57 __ __ */
827 "pand "r7", "r2"\n" /* r2 = __ __ __ 40 */
828 "por "r1", "r0"\n" /* r0 = 61 57 46 __ */
829 "por "r2", "r0"\n" /* r0 = 61 57 46 40 = R11 */
830 "psllq $16, "r3"\n" /* r3 = 46 45 44 __ */
831 "movq "r3", "r4"\n" /* r4 = 46 45 44 __ */
832 "movq "r5", "r2"\n" /* r2 = 27 26 25 24 */
833 "movq "r0", 112(%edx)\n" /* write R11 = r0 */
834 "psrlq $48, "r2"\n" /* r2 = __ __ __ 27 */
835 "pand "M(2)", "r4"\n" /* r4 = __ 45 __ __ */
836 "por "r2", "r6"\n" /* r6 = 60 __ __ 27 */
837 "movq "M(1)", "r2"\n" /* r2 = __ __ FF __ */
838 "por "r4", "r6"\n" /* r6 = 60 45 __ 27 */
839 "pand "r7", "r2"\n" /* r2 = __ __ 41 __ */
840 "psllq $32, "r3"\n" /* r3 = 44 __ __ __ */
841 "por 80(%edx), "r3"\n" /* r3 = 44 __ __ 23 */
842 "por "r2", "r6"\n" /* r6 = 60 45 41 27 = R10 */
843 "movq "M(3)", "r2"\n" /* r2 = FF __ __ __ */
844 "psllq $16, "r5"\n" /* r5 = 26 25 24 __ */
845 "movq "r6", 96(%edx)\n" /* store R10 = r6 */
846 "pand "r5", "r2"\n" /* r2 = 26 __ __ __ */
847 "movq "M(2)", "r6"\n" /* r6 = __ FF __ __ */
848 "pxor "r2", "r5"\n" /* r5 = __ 25 24 __ */
849 "pand "r7", "r6"\n" /* r6 = __ 42 __ __ */
850 "psrlq $32, "r2"\n" /* r2 = __ __ 26 __ */
851 "pand "M(3)", "r7"\n" /* r7 = 43 __ __ __ */
852 "por "r2", "r3"\n" /* r3 = 44 __ 26 23 */
853 "por 64(%edx), "r7"\n" /* r7 = 43 __ __ 12 */
854 "por "r3", "r6"\n" /* r6 = 44 42 26 23 = R9 */
855 "por "r5", "r7"\n" /* r7 = 43 25 24 12 = R8 */
856 "movq "r6", 80(%edx)\n" /* store R9 = r6 */
857 "movq "r7", 64(%edx)\n" /* store R8 = r7 */
859 /* 123c ( / 64 coeffs < 2c / coeff) */
860 # undef M
862 /* Done w/dequant + descramble + partial transpose; now do the idct itself. */
864 # define I( K) MtoSTR(K*16(%edx))
865 # define J( K) MtoSTR(((K - 4)*16)+8(%edx))
867 RowIDCT /* 46 c */
868 Transpose /* 19 c */
870 # undef I
871 # undef J
872 # define I( K) MtoSTR((K*16)+64(%edx))
873 # define J( K) MtoSTR(((K-4)*16)+72(%edx))
875 RowIDCT /* 46 c */
876 Transpose /* 19 c */
878 # undef I
879 # undef J
880 # define I( K) MtoSTR((K * 16)(%edx))
881 # define J( K) I( K)
883 ColumnIDCT /* 57 c */
885 # undef I
886 # undef J
887 # define I( K) MtoSTR((K*16)+8(%edx))
888 # define J( K) I( K)
890 ColumnIDCT /* 57 c */
892 # undef I
893 # undef J
894 /* 368 cycles ( / 64 coeff < 6 c / coeff) */
896 ASM("emms\n");
899 /**************************************************************************************
901 * Routine: MMX_idct10
903 * Description: Perform IDCT on an 8x8 block with at most 10 nonzero coefficients
905 * Input: Pointer to input and output buffer
907 * Output: None
909 * Return: None
911 * Special Note: The input coefficients are in transposed ZigZag order
913 * Error: None
915 ***************************************************************************************
917 /* --------------------------------------------------------------- */
918 // This macro does four 4-sample one-dimensional idcts in parallel. Inputs
919 // 4 thru 7 are assumed to be zero.
/*
 * BeginIDCT_10: common front half of the row and column passes used by the
 * 10-coefficient IDCT.  It is a bare string fragment (no ASM wrapper) that
 * RowIDCT_10 and ColumnIDCT_10 paste into their own instruction listing.
 *
 * Reads:   I(0)..I(3) and the cosine constants C(1)..C(7).
 * Scratch: I(1) and I(2) are overwritten with the intermediates C. and D.
 * Leaves:  r0 = C., r1 = H., r2 = R2, r4 = E, r5 = B.., r6 = F., r7 = G,
 *          with D. parked in I(2) for the caller to reload.
 *
 * The per-instruction notes use the A/B/C/... names of the full 1-D IDCT
 * written out in the IDCT 3 description later in this file, with inputs 4..7
 * taken as zero.  They also follow the convention shown in the RowIDCT_3
 * comments that pmulhw by C1..C5 yields C*x - x and the following paddw
 * restores C*x.  NOTE(review): these notes were derived by reading the
 * instruction sequence; re-check them against the constant table before
 * relying on them.
 */
920 #define BeginIDCT_10 "#BeginIDCT_10\n" \
921 " movq "I(3)","r2"\n" /* r2 = I3 */ \
923 " movq "C(3)","r6"\n" /* r6 = C3 */ \
924 " movq "r2","r4"\n" /* r4 = I3 (copy) */ \
926 " movq "C(5)","r1"\n" /* r1 = C5 */ \
927 " pmulhw "r6","r4"\n" /* r4 = C3 * I3 - I3 */ \
929 " movq "I(1)","r3"\n" /* r3 = I1 */ \
930 " pmulhw "r2","r1"\n" /* r1 = C5 * I3 - I3 */ \
932 " movq "C(1)","r0"\n" /* r0 = C1 */ \
933 " paddw "r2","r4"\n" /* r4 = C3 * I3 = C */ \
935 " pxor "r6","r6"\n" /* r6 = 0 */ \
936 " paddw "r1","r2"\n" /* r2 = C5 * I3 = -D */ \
938 " movq "I(2)","r5"\n" /* r5 = I2 */ \
939 " pmulhw "r3","r0"\n" /* r0 = C1 * I1 - I1 */ \
941 " movq "r5","r1"\n" /* r1 = I2 (copy) */ \
942 " paddw "r3","r0"\n" /* r0 = C1 * I1 = A */ \
944 " pmulhw "C(7)","r3"\n" /* r3 = C7 * I1 = B */ \
945 " psubsw "r2","r6"\n" /* r6 = 0 - C5 * I3 = D */ \
947 " pmulhw "C(2)","r5"\n" /* r5 = C2 * I2 - I2 */ \
948 " psubsw "r4","r0"\n" /* r0 = A - C */ \
950 " movq "I(2)","r7"\n" /* r7 = I2 */ \
951 " paddsw "r4","r4"\n" /* r4 = C + C */ \
953 " paddw "r5","r7"\n" /* r7 = C2 * I2 = G */ \
954 " paddsw "r0","r4"\n" /* r4 = C. = A + C */ \
956 " pmulhw "C(6)","r1"\n" /* r1 = C6 * I2 = H */ \
957 " psubsw "r6","r3"\n" /* r3 = B - D */ \
959 " movq "r4","I(1)"\n" /* park C. in I1 */ \
960 " paddsw "r6","r6"\n" /* r6 = D + D */ \
962 " movq "C(4)","r4"\n" /* r4 = C4 */ \
963 " paddsw "r3","r6"\n" /* r6 = D. = B + D */ \
965 " movq "r3","r5"\n" /* r5 = B - D (copy) */ \
966 " pmulhw "r4","r3"\n" /* r3 = C4 * (B - D) - (B - D) */ \
968 " movq "r6","I(2)"\n" /* park D. in I2 */ \
969 " movq "r0","r2"\n" /* r2 = A - C (copy) */ \
971 " movq "I(0)","r6"\n" /* r6 = I0 */ \
972 " pmulhw "r4","r0"\n" /* r0 = C4 * (A - C) - (A - C) */ \
974 " paddw "r3","r5"\n" /* r5 = B. = C4 * (B - D) */ \
975 " paddw "r0","r2"\n" /* r2 = A. = C4 * (A - C) */ \
977 " psubsw "r1","r5"\n" /* r5 = B.. = B. - H */ \
978 " pmulhw "r4","r6"\n" /* r6 = C4 * I0 - I0 */ \
980 " paddw "I(0)","r6"\n" /* r6 = C4 * I0 = E = F */ \
981 " paddsw "r1","r1"\n" /* r1 = H + H */ \
983 " movq "r6","r4"\n" /* r4 = E */ \
984 " paddsw "r5","r1"\n" /* r1 = H. = B. + H */ \
986 " psubsw "r2","r6"\n" /* r6 = F. = F - A. */ \
987 " paddsw "r2","r2"\n" /* r2 = A. + A. */ \
989 " movq "I(1)","r0"\n" /* r0 = C. (reloaded from I1) */ \
990 " paddsw "r6","r2"\n" /* r2 = A.. = F + A. */ \
992 " psubsw "r1","r2"\n" /* r2 = R2 = A.. - H. */ \
993 "#end BeginIDCT_10\n"
994 // end BeginIDCT_10 macro (25 cycles).
/*
 * RowIDCT_10: row pass of the 10-coefficient IDCT for four rows at once.
 *
 * Expands BeginIDCT_10 and then finishes the butterflies.  On exit r0..r7
 * hold R0..R7; R1 is additionally written back to I(1).  No rounding or
 * shifting is applied here.  At its call site the macro is followed
 * directly by Transpose, which is defined outside this part of the file.
 */
996 #define RowIDCT_10 ASM("\n" \
997 "#RowIDCT_10\n" \
998 BeginIDCT_10 \
999 "\n" \
1000 " movq "I(2)","r3"\n" /* r3 = D. */ \
1001 " psubsw "r7","r4"\n" /* r4 = E. = E - G */ \
1002 " paddsw "r1","r1"\n" /* r1 = H. + H. */ \
1003 " paddsw "r7","r7"\n" /* r7 = G + G */ \
1004 " paddsw "r2","r1"\n" /* r1 = R1 = A.. + H. */\
1005 " paddsw "r4","r7"\n" /* r7 = G. = E + G */ \
1006 " psubsw "r3","r4"\n" /* r4 = R4 = E. - D. */ \
1007 " paddsw "r3","r3"\n" /* r3 = D. + D. */ \
1008 " psubsw "r5","r6"\n" /* r6 = R6 = F. - B.. */\
1009 " paddsw "r5","r5"\n" /* r5 = B.. + B.. */ \
1010 " paddsw "r4","r3"\n" /* r3 = R3 = E. + D. */ \
1011 " paddsw "r6","r5"\n" /* r5 = R5 = F. + B.. */\
1012 " psubsw "r0","r7"\n" /* r7 = R7 = G. - C. */ \
1013 " paddsw "r0","r0"\n" /* r0 = C. + C. */ \
1014 " movq "r1","I(1)"\n" /* save R1 */ \
1015 " paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
1016 "#end RowIDCT_10\n" \
1018 // end RowIDCT_10 macro (25 + 8 = 33 cycles, per the BeginIDCT_10 and call-site notes)
1020 // Column IDCT normalizes and stores final results.
/*
 * ColumnIDCT_10: column pass of the 10-coefficient IDCT for four columns
 * at once.
 *
 * Expands BeginIDCT_10, finishes the butterflies, adds the Eight constant to
 * one member of each result pair (the bias carries into the other member
 * through the following add), shifts every result right arithmetically by 4
 * and stores NR0..NR3 to I(0)..I(3) and NR4..NR7 to J(4)..J(7).
 */
1022 #define ColumnIDCT_10 ASM("\n" \
1023 "#ColumnIDCT_10\n" \
1024 BeginIDCT_10 \
1025 "\n" \
1026 " paddsw "Eight","r2"\n" /* r2 = R2 + Eight (rounding bias for the shift) */ \
1027 " paddsw "r1","r1"\n" /* r1 = H. + H. */ \
1028 " paddsw "r2","r1"\n" /* r1 = R1 = A.. + H. */\
1029 " psraw ""$4"","r2"\n" /* r2 = NR2 */ \
1030 " psubsw "r7","r4"\n" /* r4 = E. = E - G */ \
1031 " psraw ""$4"","r1"\n" /* r1 = NR1 */ \
1032 " movq "I(2)","r3"\n" /* r3 = D. */ \
1033 " paddsw "r7","r7"\n" /* r7 = G + G */ \
1034 " movq "r2","I(2)"\n" /* store NR2 at I2 */ \
1035 " paddsw "r4","r7"\n" /* r7 = G. = E + G */ \
1036 " movq "r1","I(1)"\n" /* store NR1 at I1 */ \
1037 " psubsw "r3","r4"\n" /* r4 = R4 = E. - D. */ \
1038 " paddsw "Eight","r4"\n" /* r4 = R4 + Eight (rounding bias for the shift) */ \
1039 " paddsw "r3","r3"\n" /* r3 = D. + D. */ \
1040 " paddsw "r4","r3"\n" /* r3 = R3 = E. + D. */ \
1041 " psraw ""$4"","r4"\n" /* r4 = NR4 */ \
1042 " psubsw "r5","r6"\n" /* r6 = R6 = F. - B.. */\
1043 " psraw ""$4"","r3"\n" /* r3 = NR3 */ \
1044 " paddsw "Eight","r6"\n" /* r6 = R6 + Eight (rounding bias for the shift) */ \
1045 " paddsw "r5","r5"\n" /* r5 = B.. + B.. */ \
1046 " paddsw "r6","r5"\n" /* r5 = R5 = F. + B.. */\
1047 " psraw ""$4"","r6"\n" /* r6 = NR6 */ \
1048 " movq "r4","J(4)"\n" /* store NR4 at J4 */ \
1049 " psraw ""$4"","r5"\n" /* r5 = NR5 */ \
1050 " movq "r3","I(3)"\n" /* store NR3 at I3 */ \
1051 " psubsw "r0","r7"\n" /* r7 = R7 = G. - C. */ \
1052 " paddsw "Eight","r7"\n" /* r7 = R7 + Eight (rounding bias for the shift) */ \
1053 " paddsw "r0","r0"\n" /* r0 = C. + C. */ \
1054 " paddsw "r7","r0"\n" /* r0 = R0 = G. + C. */ \
1055 " psraw ""$4"","r7"\n" /* r7 = NR7 */ \
1056 " movq "r6","J(6)"\n" /* store NR6 at J6 */ \
1057 " psraw ""$4"","r0"\n" /* r0 = NR0 */ \
1058 " movq "r5","J(5)"\n" /* store NR5 at J5 */ \
1060 " movq "r7","J(7)"\n" /* store NR7 at J7 */ \
1062 " movq "r0","I(0)"\n" /* store NR0 at I0 */ \
1063 "#end ColumnIDCT_10\n" \
1065 // end ColumnIDCT_10 macro (25 + 19 = 44 cycles, per the BeginIDCT_10 and call-site notes)
1066 /* --------------------------------------------------------------- */
1069 /* --------------------------------------------------------------- */
1070 /* IDCT 10 */
/*
 * IDct10__mmx: dequantize and inverse-transform a block using only its
 * first ten coefficients.
 *
 * InputData   - quantized coefficients.  Three 8-byte groups are loaded
 *               (byte offsets 0, 8 and 16); of those, the words labelled
 *               00..07, 10 and 11 in the register diagrams are kept.
 * QuantMatrix - dequantization factors; each loaded group is multiplied
 *               word-by-word (pmullw) with the group at the same offset.
 * OutputData  - 16-byte-per-row work and result buffer.  Rows R0..R3 are
 *               seeded at offsets 0, 16, 32 and 48; the column passes
 *               write the final values to rows 0..7.
 *
 * Returns nothing; the result is left in OutputData.
 *
 * NOTE(review): the instruction blocks after the first asm statement name
 * %eax/%esi/%edx/%ecx directly, so they appear to rely on the registers
 * bound by that first statement still holding the same values.  Confirm
 * that this holds for the compiler and options in use.
 */
1071 void IDct10__mmx( Q_LIST_ENTRY * InputData,
1072 ogg_int16_t *QuantMatrix,
1073 ogg_int16_t * OutputData ) {
/* Operand strings for the constant table addressed through %ecx:
   M(i) is the i-th 8-byte mask, C(i) the i-th 8-byte cosine constant
   (1-based), Eight the rounding constant. */
1075 # define MIDM(M,I) MtoSTR(M+I*8(%ecx))
1076 # define M(I) MIDM( MaskOffset , I )
1077 # define MIDC(M,I) MtoSTR(M+(I-1)*8(%ecx))
1078 # define C(I) MIDC( CosineOffset , I )
1079 # define MIDEight(M) MtoSTR(M(%ecx))
1080 # define Eight MIDEight(EightOffset)
/* Short names for the eight MMX registers used by the IDCT macros. */
1082 # define r0 "%mm0"
1083 # define r1 "%mm1"
1084 # define r2 "%mm2"
1085 # define r3 "%mm3"
1086 # define r4 "%mm4"
1087 # define r5 "%mm5"
1088 # define r6 "%mm6"
1089 # define r7 "%mm7"
/* Bind the four pointers to fixed registers through input constraints. */
1091 __asm__ __volatile__ (
1092 /* eax = quantized input */
1093 /* esi = quantization table */
1094 /* edx = destination (= idct buffer) */
1095 /* ecx = idctconstants */
1098 :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
/* Dequantize the loaded words and scatter them into rows R0..R3 of the
   destination.  The two-digit labels in the diagrams name input words,
   highest word on the left. */
1101 ASM(
1102 "movq (%eax), "r0"\n"
1103 "pmullw (%esi), "r0"\n" /* r0 = 03 02 01 00 */
1104 "movq 16(%eax), "r1"\n"
1105 "pmullw 16(%esi), "r1"\n" /* r1 = 13 12 11 10 */
1106 "movq "M(0)", "r2"\n" /* r2 = __ __ __ FF */
1107 "movq "r0", "r3"\n" /* r3 = 03 02 01 00 */
1108 "movq 8(%eax), "r4"\n"
1109 "psrlq $16, "r0"\n" /* r0 = __ 03 02 01 */
1110 "pmullw 8(%esi), "r4"\n" /* r4 = 07 06 05 04 */
1111 "pand "r2", "r3"\n" /* r3 = __ __ __ 00 */
1112 "movq "r0", "r5"\n" /* r5 = __ 03 02 01 */
1113 "pand "r2", "r5"\n" /* r5 = __ __ __ 01 */
1114 "psllq $32, "r1"\n" /* r1 = 11 10 __ __ */
1115 "movq "M(3)", "r7"\n" /* r7 = FF __ __ __ */
1116 "pxor "r5", "r0"\n" /* r0 = __ 03 02 __ */
1117 "pand "r1", "r7"\n" /* r7 = 11 __ __ __ */
1118 "por "r3", "r0"\n" /* r0 = __ 03 02 00 */
1119 "pxor "r7", "r1"\n" /* r1 = __ 10 __ __ */
1120 "por "r7", "r0"\n" /* r0 = 11 03 02 00 = R0 */
1121 "movq "r4", "r3"\n" /* r3 = 07 06 05 04 */
1122 "movq "r0", (%edx)\n" /* write R0 = r0 */
1123 "pand "r2", "r3"\n" /* r3 = __ __ __ 04 */
1124 "psllq $16, "r3"\n" /* r3 = __ __ 04 __ */
1125 "por "r3", "r5"\n" /* r5 = __ __ 04 01 */
1126 "por "r5", "r1"\n" /* r1 = __ 10 04 01 = R1 */
1127 "psrlq $16, "r4"\n" /* r4 = __ 07 06 05 */
1128 "movq "r1", 16(%edx)\n" /* write R1 = r1 */
1129 "movq "r4", "r5"\n" /* r5 = __ 07 06 05 */
1130 "psrlq $16, "r4"\n" /* r4 = __ __ 07 06 */
1131 "movq "r2", "r6"\n" /* r6 = __ __ __ FF */
1132 "pand "r2", "r5"\n" /* r5 = __ __ __ 05 */
1133 "pand "r4", "r6"\n" /* r6 = __ __ __ 06 */
1134 "pxor "r6", "r4"\n" /* r4 = __ __ 07 __ */
1135 "por "r5", "r4"\n" /* r4 = __ __ 07 05 */
1136 "movq "r4", 32(%edx)\n" /* write R2 = r4 */
1137 "movq "r6", 48(%edx)\n" /* write R3 = r6 */
1139 # undef M
1141 /* Done w/dequant + descramble + partial transpose; now do the idct itself. */
/* Row pass: I(K) addresses the first 8 bytes of row K; J(K) addresses the
   second 8 bytes of row K-4. */
1143 # define I( K) MtoSTR((K*16)(%edx))
1144 # define J( K) MtoSTR(((K - 4) * 16)+8(%edx))
1146 RowIDCT_10 /* 33 c */
1147 Transpose /* 19 c */
1149 # undef I
1150 # undef J
/* The second row pass (offsets +64/+72) is disabled below, presumably
   because rows 4..7 carry no input in this variant; confirm before
   re-enabling. */
1151 //# define I( K) [edx + ( K * 16) + 64]
1152 //# define J( K) [edx + ( (K - 4) * 16) + 72]
1154 // RowIDCT ; 46 c
1155 // Transpose ; 19 c
1157 //# undef I
1158 //# undef J
/* Column pass over the first 8 bytes of every row (I and J address the
   same row layout here). */
1159 # define I( K) MtoSTR((K * 16)(%edx))
1160 # define J( K) I( K)
1162 ColumnIDCT_10 /* 44 c */
1164 # undef I
1165 # undef J
/* Column pass over the second 8 bytes of every row. */
1166 # define I( K) MtoSTR((K * 16)+8(%edx))
1167 # define J( K) I( K)
1169 ColumnIDCT_10 /* 44 c */
1171 # undef I
1172 # undef J
/* Leave MMX state so that x87 floating-point code can run again. */
1174 ASM("emms\n");
1177 /**************************************************************************************
1179 * Routine: MMX_idct3
1181 * Description: Perform IDCT on a 8x8 block with at most 3 nonzero coefficients
1183 * Input: Pointer to input and output buffer
1185 * Output: None
1187 * Return: None
1189 * Special Note: Only works for three nonzero coefficients.
1191 * Error: None
1193 ***************************************************************************************
1195 /***************************************************************************************
1196 In IDCT 3, we are dealing with only three non-zero coefficients in the 8x8 block.
1197 When we work in the order RowIDCT -> ColumnIDCT, we only have to
1198 do 1-D row IDCTs on the first two rows; the remaining six rows stay zero anyway.
1199 After the row IDCTs, since every column could have nonzero coefficients, we need to do
1200 eight 1-D column IDCTs. However, for each column, there are at most two nonzero
1201 coefficients, coefficient 0 and coefficient 1. The same holds for the coefficients of the
1202 two 1-D row IDCTs. For this reason, the process of a 1-D IDCT is simplified
1204 from a full version:
1206 A = (C1 * I1) + (C7 * I7) B = (C7 * I1) - (C1 * I7)
1207 C = (C3 * I3) + (C5 * I5) D = (C3 * I5) - (C5 * I3)
1208 A. = C4 * (A - C) B. = C4 * (B - D)
1209 C. = A + C D. = B + D
1211 E = C4 * (I0 + I4) F = C4 * (I0 - I4)
1212 G = (C2 * I2) + (C6 * I6) H = (C6 * I2) - (C2 * I6)
1213 E. = E - G
1214 G. = E + G
1216 A.. = F + A. B.. = B. - H
1217 F. = F - A. H. = B. + H
1219 R0 = G. + C. R1 = A.. + H. R3 = E. + D. R5 = F. + B..
1220 R7 = G. - C. R2 = A.. - H. R4 = E. - D. R6 = F. - B..
1225 A = (C1 * I1) B = (C7 * I1)
1226 C = 0 D = 0
1227 A. = C4 * A B. = C4 * B
1228 C. = A D. = B
1230 E = C4 * I0 F = E
1231 G = 0 H = 0
1232 E. = E
1233 G. = E
1235 A.. = E + A. B.. = B.
1236 F. = E - A. H. = B.
1238 R0 = E + A R1 = E + A. + B. R3 = E + B R5 = E - A. + B.
1239 R7 = E - A R2 = E + A. - B. R4 = E - B R6 = E - A. - B.
1241 ******************************************************************************************/
/*
 * RowIDCT_3: row pass of the 3-coefficient IDCT for four rows at once.
 *
 * Reads I(0), I(1) and the constants C(1), C(4) and C(7).  On exit r0..r7
 * hold R0..R7 of the simplified equations in the comment block above; R1
 * is additionally written back to I(1).  No rounding or shifting is done.
 * pmulhw by C1 or C4 yields C*x - x, so each of those products is completed
 * by a following paddw of x.
 */
1243 #define RowIDCT_3 ASM("\n"\
1244 "#RowIDCT_3\n"\
1245 " movq "I(1)","r7"\n" /* r7 = I1 */ \
1246 " movq "C(1)","r0"\n" /* r0 = C1 */ \
1247 " movq "C(7)","r3"\n" /* r3 = C7 */ \
1248 " pmulhw "r7","r0"\n" /* r0 = C1 * I1 - I1 */ \
1249 " pmulhw "r7","r3"\n" /* r3 = C7 * I1 = B, D. */ \
1250 " movq "I(0)","r6"\n" /* r6 = I0 */ \
1251 " movq "C(4)","r4"\n" /* r4 = C4 */ \
1252 " paddw "r7","r0"\n" /* r0 = C1 * I1 = A, C. */ \
1253 " movq "r6","r1"\n" /* make a copy of I0 */ \
1254 " pmulhw "r4","r6"\n" /* r6 = C4 * I0 - I0 */ \
1255 " movq "r0","r2"\n" /* make a copy of A */ \
1256 " movq "r3","r5"\n" /* make a copy of B */ \
1257 " pmulhw "r4","r2"\n" /* r2 = C4 * A - A */ \
1258 " pmulhw "r4","r5"\n" /* r5 = C4 * B - B */ \
1259 " paddw "r1","r6"\n" /* r6 = C4 * I0 = E, F */ \
1260 " movq "r6","r4"\n" /* r4 = E */ \
1261 " paddw "r0","r2"\n" /* r2 = A. */ \
1262 " paddw "r3","r5"\n" /* r5 = B. */ \
1263 " movq "r6","r7"\n" /* r7 = E */ \
1264 " movq "r5","r1"\n" /* r1 = B. */ \
1265 /* r0 = A */ \
1266 /* r3 = B */ \
1267 /* r2 = A. */ \
1268 /* r5 = B. */ \
1269 /* r6 = E */ \
1270 /* r4 = E */ \
1271 /* r7 = E */ \
1272 /* r1 = B. */ \
1273 " psubw "r2","r6"\n" /* r6 = E - A. */ \
1274 " psubw "r3","r4"\n" /* r4 = E - B ----R4 */ \
1275 " psubw "r0","r7"\n" /* r7 = E - A ----R7 */ \
1276 " paddw "r2","r2"\n" /* r2 = A. + A. */ \
1277 " paddw "r3","r3"\n" /* r3 = B + B */ \
1278 " paddw "r0","r0"\n" /* r0 = A + A */ \
1279 " paddw "r6","r2"\n" /* r2 = E + A. */ \
1280 " paddw "r4","r3"\n" /* r3 = E + B ----R3 */ \
1281 " psubw "r1","r2"\n" /* r2 = E + A. - B. ----R2 */ \
1282 " psubw "r5","r6"\n" /* r6 = E - A. - B. ----R6 */ \
1283 " paddw "r1","r1"\n" /* r1 = B. + B. */ \
1284 " paddw "r5","r5"\n" /* r5 = B. + B. */ \
1285 " paddw "r7","r0"\n" /* r0 = E + A ----R0 */ \
1286 " paddw "r2","r1"\n" /* r1 = E + A. + B. -----R1 */ \
1287 " movq "r1","I(1)"\n" /* save r1 */ \
1288 " paddw "r6","r5"\n" /* r5 = E - A. + B. -----R5 */ \
1289 "#end RowIDCT_3\n"\
1291 //End of RowIDCT_3
/*
 * ColumnIDCT_3: column pass of the 3-coefficient IDCT for four columns at
 * once, including final rounding and the stores.
 *
 * Reads:  I(0), I(1) and the constants C(1), C(4), C(7) and Eight.
 * Writes: I(0)..I(3) and J(4)..J(7); every result is shifted right
 *         arithmetically by 4 after Eight has been folded into E.
 * Uses all of r0..r7 as scratch.
 *
 * Names (A, B, A., B., E, R0..R7) follow the simplified IDCT 3 equations
 * in the comment block above RowIDCT_3.  pmulhw by C1 or C4 yields C*x - x,
 * so each of those products is completed by a following paddw of x.
 *
 * R1 is stored to I(1) exactly once: r1 is not modified after its shift,
 * so a second identical store would be redundant.
 *
 * The expansion ends in ");" so call sites use the macro as a complete
 * statement.
 */
#define ColumnIDCT_3 ASM("\n"\
"#ColumnIDCT_3\n"\
" movq "I(1)","r7"\n" /* r7 = I1 */ \
" movq "C(1)","r0"\n" /* r0 = C1 */ \
" movq "C(7)","r3"\n" /* r3 = C7 */ \
" pmulhw "r7","r0"\n" /* r0 = C1 * I1 - I1 */ \
" pmulhw "r7","r3"\n" /* r3 = C7 * I1 = B, D. */ \
" movq "I(0)","r6"\n" /* r6 = I0 */ \
" movq "C(4)","r4"\n" /* r4 = C4 */ \
" paddw "r7","r0"\n" /* r0 = C1 * I1 = A, C. */ \
" movq "r6","r1"\n" /* r1 = I0 (copy) */ \
" pmulhw "r4","r6"\n" /* r6 = C4 * I0 - I0 */ \
" movq "r0","r2"\n" /* r2 = A (copy) */ \
" movq "r3","r5"\n" /* r5 = B (copy) */ \
" pmulhw "r4","r2"\n" /* r2 = C4 * A - A */ \
" pmulhw "r4","r5"\n" /* r5 = C4 * B - B */ \
" paddw "r1","r6"\n" /* r6 = C4 * I0 = E, F */ \
" movq "r6","r4"\n" /* r4 = E */ \
" paddw "Eight","r6"\n" /* r6 = E + 8 (bias for the shift) */ \
" paddw "Eight","r4"\n" /* r4 = E + 8 (bias for the shift) */ \
" paddw "r0","r2"\n" /* r2 = A. */ \
" paddw "r3","r5"\n" /* r5 = B. */ \
" movq "r6","r7"\n" /* r7 = E + 8 */ \
" movq "r5","r1"\n" /* r1 = B. */ \
/* From here on "E" stands for the biased value E + 8:       */ \
/* r4 = r6 = r7 = E, r0 = A, r3 = B, r2 = A., r1 = r5 = B.   */ \
" psubw "r2","r6"\n" /* r6 = E - A. */ \
" psubw "r3","r4"\n" /* r4 = E - B ----R4 */ \
" psubw "r0","r7"\n" /* r7 = E - A ----R7 */ \
" paddw "r2","r2"\n" /* r2 = A. + A. */ \
" paddw "r3","r3"\n" /* r3 = B + B */ \
" paddw "r0","r0"\n" /* r0 = A + A */ \
" paddw "r6","r2"\n" /* r2 = E + A. */ \
" paddw "r4","r3"\n" /* r3 = E + B ----R3 */ \
" psraw $4,"r4"\n" /* r4 = R4 >> 4 */ \
" movq "r4","J(4)"\n" /* store R4 at J4 */ \
" psraw $4,"r3"\n" /* r3 = R3 >> 4 */ \
" movq "r3","I(3)"\n" /* store R3 at I3 */ \
" psubw "r1","r2"\n" /* r2 = E + A. - B. ----R2 */ \
" psubw "r5","r6"\n" /* r6 = E - A. - B. ----R6 */ \
" paddw "r1","r1"\n" /* r1 = B. + B. */ \
" paddw "r5","r5"\n" /* r5 = B. + B. */ \
" paddw "r7","r0"\n" /* r0 = E + A ----R0 */ \
" paddw "r2","r1"\n" /* r1 = E + A. + B. -----R1 */ \
" psraw $4,"r7"\n" /* r7 = R7 >> 4 */ \
" psraw $4,"r2"\n" /* r2 = R2 >> 4 */ \
" psraw $4,"r0"\n" /* r0 = R0 >> 4 */ \
" psraw $4,"r1"\n" /* r1 = R1 >> 4 */ \
" movq "r7","J(7)"\n" /* store R7 at J7 */ \
" movq "r0","I(0)"\n" /* store R0 at I0 */ \
" movq "r1","I(1)"\n" /* store R1 at I1 */ \
" movq "r2","I(2)"\n" /* store R2 at I2 */ \
" paddw "r6","r5"\n" /* r5 = E - A. + B. -----R5 */ \
" psraw $4,"r5"\n" /* r5 = R5 >> 4 */ \
" movq "r5","J(5)"\n" /* store R5 at J5 */ \
" psraw $4,"r6"\n" /* r6 = R6 >> 4 */ \
" movq "r6","J(6)"\n" /* store R6 at J6 */ \
"#end ColumnIDCT_3\n"\
);
1359 //End of ColumnIDCT_3
/*
 * IDct3__mmx: dequantize and inverse-transform a block using only the
 * first few coefficients.
 *
 * InputData   - quantized coefficients; only the first 8 bytes (words
 *               00..03 in the register diagrams) are loaded.
 * QuantMatrix - dequantization factors; the first 8 bytes are multiplied
 *               word-by-word (pmullw) with the loaded input.
 * OutputData  - 16-byte-per-row work and result buffer.  Rows R0 and R1
 *               are seeded at offsets 0 and 16; the column passes write
 *               the final values to rows 0..7.
 *
 * Returns nothing; the result is left in OutputData.
 *
 * NOTE(review): the instruction blocks after the first asm statement name
 * %eax/%esi/%edx/%ecx directly, so they appear to rely on the registers
 * bound by that first statement still holding the same values.  Confirm
 * that this holds for the compiler and options in use.
 */
1361 void IDct3__mmx( Q_LIST_ENTRY * InputData,
1362 ogg_int16_t *QuantMatrix,
1363 ogg_int16_t * OutputData ) {
/* Operand strings for the constant table addressed through %ecx:
   M(i) is the i-th 8-byte mask, C(i) the i-th 8-byte cosine constant
   (1-based), Eight the rounding constant. */
1365 # define MIDM(M,I) MtoSTR(M+I*8(%ecx))
1366 # define M(I) MIDM( MaskOffset , I )
1367 # define MIDC(M,I) MtoSTR(M+(I-1)*8(%ecx))
1368 # define C(I) MIDC( CosineOffset , I )
1369 # define MIDEight(M) MtoSTR(M(%ecx))
1370 # define Eight MIDEight(EightOffset)
/* Short names for the eight MMX registers used by the IDCT macros. */
1372 # define r0 "%mm0"
1373 # define r1 "%mm1"
1374 # define r2 "%mm2"
1375 # define r3 "%mm3"
1376 # define r4 "%mm4"
1377 # define r5 "%mm5"
1378 # define r6 "%mm6"
1379 # define r7 "%mm7"
/* Bind the four pointers to fixed registers through input constraints. */
1381 __asm__ __volatile__ (
1382 /* eax = quantized input */
1383 /* esi = quantization table */
1384 /* edx = destination (= idct buffer) */
1385 /* ecx = idctconstants */
1388 :"a"(InputData), "S"(QuantMatrix), "d"(OutputData), "c"(idctconstants)
/* Dequantize words 00..03 and split them across rows R0 and R1: word 01
   moves to R1, the others stay in R0.  Labels name input words, highest
   word on the left. */
1391 ASM(
1392 "movq (%eax), "r0"\n"
1393 "pmullw (%esi), "r0"\n" /* r0 = 03 02 01 00 */
1394 "movq "M(0)", "r2"\n" /* r2 = __ __ __ FF */
1395 "movq "r0", "r3"\n" /* r3 = 03 02 01 00 */
1396 "psrlq $16, "r0"\n" /* r0 = __ 03 02 01 */
1397 "pand "r2", "r3"\n" /* r3 = __ __ __ 00 */
1398 "movq "r0", "r5"\n" /* r5 = __ 03 02 01 */
1399 "pand "r2", "r5"\n" /* r5 = __ __ __ 01 */
1400 "pxor "r5", "r0"\n" /* r0 = __ 03 02 __ */
1401 "por "r3", "r0"\n" /* r0 = __ 03 02 00 */
1402 "movq "r0", (%edx)\n" /* write R0 = r0 */
1403 "movq "r5", 16(%edx)\n" /* write R1 = r5 */
1405 # undef M
1407 /* Done partial transpose; now do the idct itself. */
/* Row pass: I(K) addresses the first 8 bytes of row K; J(K) addresses the
   second 8 bytes of row K-4. */
1409 # define I( K) MtoSTR(K*16(%edx))
1410 # define J( K) MtoSTR(((K - 4)*16)+8(%edx))
1412 RowIDCT_3 /* 33 c */
1413 Transpose /* 19 c */
1415 # undef I
1416 # undef J
/* The second row pass (offsets +64/+72) is disabled below, presumably
   because rows 4..7 carry no input in this variant; confirm before
   re-enabling. */
1417 //# define I( K) [edx + ( K * 16) + 64]
1418 //# define J( K) [edx + ( (K - 4) * 16) + 72]
1420 // RowIDCT ; 46 c
1421 // Transpose ; 19 c
1423 //# undef I
1424 //# undef J
/* Column pass over the first 8 bytes of every row (I and J address the
   same row layout here). */
1425 # define I( K) MtoSTR((K * 16)(%edx))
1426 # define J( K) I( K)
1428 ColumnIDCT_3 /* 44 c */
1430 # undef I
1431 # undef J
/* Column pass over the second 8 bytes of every row. */
1432 # define I( K) MtoSTR((K*16)+8(%edx))
1433 # define J( K) I( K)
1435 ColumnIDCT_3 /* 44 c */
1437 # undef I
1438 # undef J
/* Leave MMX state so that x87 floating-point code can run again. */
1440 ASM("emms\n");
1444 /* Install the MMX IDCT implementations in the DSP function table.
   funcs - table whose IDctSlow, IDct10 and IDct3 entries are overwritten
           with the __mmx routines defined in this file.  The table is
           dereferenced without a NULL check.
   A debug message is emitted through TH_DEBUG before the entries are set. */
1445 void dsp_mmx_idct_init(DspFunctions *funcs)
1447 TH_DEBUG("enabling accelerated x86_32 mmx idct functions.\n");
1448 funcs->IDctSlow = IDctSlow__mmx;
1449 funcs->IDct10 = IDct10__mmx;
1450 funcs->IDct3 = IDct3__mmx;
1453 #endif /* USE_ASM */