; MMX32 iDCT algorithm (IEEE-1180 compliant) :: idct_mmx32()
;
; v0.16B33 initial release
; This was one of the harder pieces of work to code.
; Intel's app-note focuses on the numerical issues of the algorithm, but
; assumes the programmer is familiar with IDCT mathematics, leaving the
; form of the complete function up to the programmer's imagination.
;
; I played around with the code for quite a few hours. I came up
; with *A* working IDCT algorithm; however, I'm not sure whether my routine
; is "the correct one." But rest assured, my code passes all six IEEE
; accuracy tests with plenty of margin.
; My IDCT algorithm consists of 4 steps:
;
; 1) IDCT-row transformation (using the IDCT-row function) on all 8 rows.
;    This yields an intermediate 8x8 matrix.
;
; 2) intermediate matrix transpose (mandatory)
;
; 3) IDCT-row transformation (2nd time) on all 8 rows of the intermediate
;    matrix. The output is the final result, in transposed form.
;
; 4) post-transformation matrix transpose
;    (not necessary if the input-data is already transposed, this could
;    be done during the MPEG "zig-zag" scan, but since my algorithm
;    requires at least one transpose operation, why not re-use the
; Although the (1st) and (3rd) steps use the SAME row-transform operation,
; the (3rd) step uses different shift&round constants (explained later.)
;
; Also note that the intermediate transpose (2) would not be necessary
; if the subsequent operation were an iDCT-column transformation. Since
; we only have the iDCT-row transform, we transpose the intermediate
; matrix and use the iDCT-row transform a 2nd time.
; I had to change some constants/variables for my method to work:
;
; As given by Intel, the #defines for SHIFT_INV_COL and RND_INV_COL are
; wrong. Not surprising, since I'm not using a true column-transform
; operation, but the row-transform operation (as mentioned earlier.)
; round_inv_col[], which is given as "4 short" values, should have the
; same dimensions as round_inv_row[]. The corrected variables are
;
; Intel's code defines a different table for each row operation.
; The tables given are 0/4, 1/7, 2/6, and 5/3. My code only uses row#0.
; Using the other rows messes up the overall transform.
; IMPLEMENTATION DETAILS
; ----------------------
;
; I divided the algorithm's work into two subroutines:
;   1) idct_mmx32_rows() - transforms 8 rows, then transposes
;   2) idct_mmx32_cols() - transforms 8 rows, then transposes;
;      yields final result ("drop-in" direct replacement for INT32 IDCT)
;
; The 2nd function is a clone of the 1st, with changes made only to the
; shift&rounding instructions.
;
; In the 1st function (rows), the shift & round instructions use
;   SHIFT_INV_ROW & round_inv_row[] (renamed to r_inv_row[])
;
; In the 2nd function (cols), the shift & round instructions use
;   SHIFT_INV_COL & round_inv_col[] (renamed to r_inv_col[])
;
; Each function contains an integrated transpose-operator, which comes
; AFTER the primary transformation operation. In the future, I'll optimize
; the code to do more of the transpose-work "in-place". Right now, I've
; left the code as two subroutines and a main calling function, so other
; people can read the code more easily.
; liaor@umcc.ais.org  http://members.tripod.com/~liaor
;
;;; A.Stevens Jul 2000 easy-peasy quick port to nasm
;;; Isn't open source a sensible idea...
;=============================================================================
; AP-922 http://developer.intel.com/vtune/cbts/strmsimd
; These examples contain code fragments for first stage iDCT 8x8
; (for rows) and first stage DCT 8x8 (for columns)
;=============================================================================
; Register roles and fixed-point constants used by both transform passes.
%define INP             eax     ; pointer to (short *blk)
%define OUT             ecx     ; pointer to output (temporary store space qwTemp[])
%define TABLE           ebx     ; pointer to idct_tab_01234567[]
%define round_inv_row   edx     ; rounder pointer for the row pass
%define round_inv_col   edx     ; rounder pointer for the column pass (same register)

%define ROW_STRIDE      16      ; for 8x8 matrix transposer: bytes per row of 8 shorts
%define BITS_INV_ACC    4       ; 4 or 5 for IEEE
%define SHIFT_INV_ROW   (16 - BITS_INV_ACC)         ; = 12 with BITS_INV_ACC = 4
%define SHIFT_INV_COL   (1 + BITS_INV_ACC + 14)     ; = 19; changed from Intel's value
;; Variables and tables defined in C for convenience
extern idct_r_inv_row           ; 2 DWORDs
extern idct_r_inv_col           ; 2 DWORDs
extern idct_r_inv_corr          ; 2 DWORDs
extern idct_tab_01234567        ; catenated table of coefficients
;; private variables and functions
; qwTemp: resw 64               ; temporary storage space, 8x8 of shorts
127 ;; static void idct_mmx
( short
*blk
131 push ebp ; save frame pointer
;; Pass 1: transform all 8 rows of the 8x8 iDCT block
;
; This pass performs two operations:
;   1) iDCT row transform
;        for ( i = 0; i < 8; ++i )
;            DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
;   2) transpose the matrix (which was stored in qwTemp[])
;        qwTemp[] -> [8x8 matrix transpose] -> blk[]
;
; Register roles: INP = current input row, OUT = current scratch row,
; TABLE = coefficient table (row 0 only), edi = row counter x.
; Results use SHIFT_INV_ROW and the rounder at [round_inv_row].
;
; NOTE(review): OUT must already point at the start of the 8 x ROW_STRIDE
; byte scratch matrix when the loop is entered; the instruction that sets
; it is not visible in this part of the file -- confirm.

        mov     INP, [ebp+8]                    ; INP = blk
        mov     edi, 0x00                       ; x = 0
        lea     TABLE, [idct_tab_01234567]      ; row 0
        lea     round_inv_row, [idct_r_inv_row]

        ; for ( x = 0; x < 8; ++x )  -- transform one row per iteration
lpa:
        movq      mm0, [INP]            ; x3 x2 x1 x0
        movq      mm1, [INP+8]          ; x7 x6 x5 x4
        movq      mm2, mm0              ; x3 x2 x1 x0

        movq      mm3, [TABLE]          ; w06 w04 w02 w00
        punpcklwd mm0, mm1              ; x5 x1 x4 x0

        movq      mm5, mm0              ; x5 x1 x4 x0
        punpckldq mm0, mm0              ; x4 x0 x4 x0

        movq      mm4, [TABLE+8]        ; w07 w05 w03 w01
        punpckhwd mm2, mm1              ; x7 x3 x6 x2

        pmaddwd   mm3, mm0              ; x4*w06+x0*w04 x4*w02+x0*w00
        movq      mm6, mm2              ; x7 x3 x6 x2

        movq      mm1, [TABLE+32]       ; w22 w20 w18 w16
        punpckldq mm2, mm2              ; x6 x2 x6 x2

        pmaddwd   mm4, mm2              ; x6*w07+x2*w05 x6*w03+x2*w01
        punpckhdq mm5, mm5              ; x5 x1 x5 x1

        pmaddwd   mm0, [TABLE+16]       ; x4*w14+x0*w12 x4*w10+x0*w08
        punpckhdq mm6, mm6              ; x7 x3 x7 x3

        movq      mm7, [TABLE+40]       ; w23 w21 w19 w17
        pmaddwd   mm1, mm5              ; x5*w22+x1*w20 x5*w18+x1*w16

        paddd     mm3, [round_inv_row]  ; +rounder
        pmaddwd   mm7, mm6              ; x7*w23+x3*w21 x7*w19+x3*w17

        pmaddwd   mm2, [TABLE+24]       ; x6*w15+x2*w13 x6*w11+x2*w09
        paddd     mm3, mm4              ; a1=sum(even1) a0=sum(even0)

        pmaddwd   mm5, [TABLE+48]       ; x5*w30+x1*w28 x5*w26+x1*w24
        movq      mm4, mm3              ; a1 a0

        pmaddwd   mm6, [TABLE+56]       ; x7*w31+x3*w29 x7*w27+x3*w25
        paddd     mm1, mm7              ; b1=sum(odd1) b0=sum(odd0)

        paddd     mm0, [round_inv_row]  ; +rounder
        psubd     mm3, mm1              ; a1-b1 a0-b0

        psrad     mm3, SHIFT_INV_ROW    ; y6=a1-b1 y7=a0-b0
        paddd     mm1, mm4              ; a1+b1 a0+b0

        paddd     mm0, mm2              ; a3=sum(even3) a2=sum(even2)
        psrad     mm1, SHIFT_INV_ROW    ; y1=a1+b1 y0=a0+b0

        paddd     mm5, mm6              ; b3=sum(odd3) b2=sum(odd2)
        movq      mm4, mm0              ; a3 a2

        paddd     mm0, mm5              ; a3+b3 a2+b2
        psubd     mm4, mm5              ; a3-b3 a2-b2

        add       INP, 16               ; increment INPUT pointer -> next row
        psrad     mm4, SHIFT_INV_ROW    ; y4=a3-b3 y5=a2-b2

        ; TABLE is deliberately not advanced: every row uses the row-0 table.
        psrad     mm0, SHIFT_INV_ROW    ; y3=a3+b3 y2=a2+b2

        packssdw  mm4, mm3              ; y6 y7 y4 y5
        packssdw  mm1, mm0              ; y3 y2 y1 y0
        movq      mm7, mm4              ; y6 y7 y4 y5

        psrld     mm4, 16               ; 0 y6 0 y4
        movq      [OUT], mm1            ; save y3 y2 y1 y0
        pslld     mm7, 16               ; y7 0 y5 0

        por       mm7, mm4              ; y7 y6 y5 y4
        movq      mm3, [TABLE]          ; w06 w04 w02 w00 (reloaded again at loop top)

        movq      [OUT+8], mm7          ; save y7 y6 y5 y4

        add       OUT, 16               ; increment OUTPUT pointer -> next row
        add       edi, 0x01             ; ++x
        cmp       edi, 0x08             ; compare x <> 8
        jl        near lpa              ; end for ( x = 0; x < 8; ++x )

        ; done with the iDCT row-transformation

        ; now we have to transpose the output 8x8 matrix
        ;   8x8 (OUT) -> 8x8't' (IN)
        ; the transposition is implemented as 4 sub-operations.
        ;   1) transpose upper-left quad
        ;   2) transpose lower-right quad
        ;   3) transpose lower-left quad
        ;   4) transpose upper-right quad
        ;
        ; mm0 = 1st row [ A B C D ] row1
        ; mm1 = 2nd row [ E F G H ] 2
        ; mm2 = 3rd row [ I J K L ] 3
        ; mm3 = 4th row [ M N O P ] 4

        ; The loop advanced OUT by 8 rows; step back to the first scratch row.
        sub       OUT, ROW_STRIDE * 8

        ; 1) transpose upper-left quad
        movq      mm0, [OUT + ROW_STRIDE * 0]

        movq      mm1, [OUT + ROW_STRIDE * 1]
        movq      mm4, mm0                      ; mm4 = copy of row1 [A B C D]

        movq      mm2, [OUT + ROW_STRIDE * 2]
        punpcklwd mm0, mm1                      ; mm0 = [ 0 4 1 5]

        movq      mm3, [OUT + ROW_STRIDE * 3]
        punpckhwd mm4, mm1                      ; mm4 = [ 2 6 3 7]

        movq      mm6, mm2                      ; mm6 = copy of row3, needed for the high half
        punpcklwd mm2, mm3                      ; mm2 = [ 8 12 9 13]

        punpckhwd mm6, mm3                      ; mm6 = [10 14 11 15]
        movq      mm1, mm0                      ; mm1 = [ 0 4 1 5]

        mov       INP, [ebp+8]                  ; load input address
        punpckldq mm0, mm2                      ; final result mm0 = row1 [0 4 8 12]

        movq      mm3, mm4                      ; mm3 = [ 2 6 3 7]
        punpckhdq mm1, mm2                      ; final result mm1 = row2 [1 5 9 13]

        movq      [INP + ROW_STRIDE * 0], mm0   ; store row 1
        punpckldq mm4, mm6                      ; final result mm4 = row3 [2 6 10 14]

        ; begin reading next quadrant (lower-right)
        movq      mm0, [OUT + ROW_STRIDE * 4 + 8]
        punpckhdq mm3, mm6                      ; final result mm3 = row4 [3 7 11 15]

        movq      [INP + ROW_STRIDE * 2], mm4   ; store row 3
        movq      mm4, mm0                      ; mm4 = copy of row1 [A B C D]

        movq      [INP + ROW_STRIDE * 1], mm1   ; store row 2

        movq      mm1, [OUT + ROW_STRIDE * 5 + 8]

        movq      [INP + ROW_STRIDE * 3], mm3   ; store row 4
        punpcklwd mm0, mm1                      ; mm0 = [ 0 4 1 5]

        ; 2) transpose lower-right quadrant
        ; (first two rows were already loaded and interleaved above)
        movq      mm2, [OUT + ROW_STRIDE * 6 + 8]
        punpckhwd mm4, mm1                      ; mm4 = [ 2 6 3 7]

        movq      mm3, [OUT + ROW_STRIDE * 7 + 8]
        movq      mm6, mm2                      ; mm6 = copy of row3

        punpcklwd mm2, mm3                      ; mm2 = [ 8 12 9 13]
        movq      mm1, mm0                      ; mm1 = [ 0 4 1 5]

        punpckhwd mm6, mm3                      ; mm6 = [10 14 11 15]
        movq      mm3, mm4                      ; mm3 = [ 2 6 3 7]

        punpckldq mm0, mm2                      ; final result mm0 = row1 [0 4 8 12]

        punpckhdq mm1, mm2                      ; final result mm1 = row2 [1 5 9 13]

        movq      [INP + ROW_STRIDE * 4 + 8], mm0   ; store row 1
        punpckldq mm4, mm6                      ; final result mm4 = row3 [2 6 10 14]

        ; begin reading next quadrant (lower-left)
        movq      mm0, [OUT + ROW_STRIDE * 4]
        punpckhdq mm3, mm6                      ; final result mm3 = row4 [3 7 11 15]
        movq      [INP + ROW_STRIDE * 6 + 8], mm4   ; store row 3
        movq      mm4, mm0                      ; mm4 = copy of row1 [A B C D]
        movq      [INP + ROW_STRIDE * 5 + 8], mm1   ; store row 2

        movq      mm1, [OUT + ROW_STRIDE * 5]

        movq      [INP + ROW_STRIDE * 7 + 8], mm3   ; store row 4
        punpcklwd mm0, mm1                      ; mm0 = [ 0 4 1 5]

        ; 3) transpose lower-left (stored into the upper-right of blk)
        movq      mm2, [OUT + ROW_STRIDE * 6]
        punpckhwd mm4, mm1                      ; mm4 = [ 2 6 3 7]

        movq      mm3, [OUT + ROW_STRIDE * 7]
        movq      mm6, mm2                      ; mm6 = copy of row3

        punpcklwd mm2, mm3                      ; mm2 = [ 8 12 9 13]
        movq      mm1, mm0                      ; mm1 = [ 0 4 1 5]

        punpckhwd mm6, mm3                      ; mm6 = [10 14 11 15]
        movq      mm3, mm4                      ; mm3 = [ 2 6 3 7]

        punpckldq mm0, mm2                      ; final result mm0 = row1 [0 4 8 12]

        punpckhdq mm1, mm2                      ; final result mm1 = row2 [1 5 9 13]

        movq      [INP + ROW_STRIDE * 0 + 8], mm0   ; store row 1
        punpckldq mm4, mm6                      ; final result mm4 = row3 [2 6 10 14]

        ; begin reading next quadrant (upper-right)
        movq      mm0, [OUT + ROW_STRIDE * 0 + 8]
        punpckhdq mm3, mm6                      ; final result mm3 = row4 [3 7 11 15]

        movq      [INP + ROW_STRIDE * 2 + 8], mm4   ; store row 3
        movq      mm4, mm0                      ; mm4 = copy of row1 [A B C D]

        movq      [INP + ROW_STRIDE * 1 + 8], mm1   ; store row 2
        movq      mm1, [OUT + ROW_STRIDE * 1 + 8]

        movq      [INP + ROW_STRIDE * 3 + 8], mm3   ; store row 4
        punpcklwd mm0, mm1                      ; mm0 = [ 0 4 1 5]

        ; 4) transpose upper-right quadrant (stored into the lower-left of blk)
        movq      mm2, [OUT + ROW_STRIDE * 2 + 8]
        punpckhwd mm4, mm1                      ; mm4 = [ 2 6 3 7]

        movq      mm3, [OUT + ROW_STRIDE * 3 + 8]
        movq      mm6, mm2                      ; mm6 = copy of row3

        punpcklwd mm2, mm3                      ; mm2 = [ 8 12 9 13]
        movq      mm1, mm0                      ; mm1 = [ 0 4 1 5]

        punpckhwd mm6, mm3                      ; mm6 = [10 14 11 15]
        movq      mm3, mm4                      ; mm3 = [ 2 6 3 7]

        punpckldq mm0, mm2                      ; final result mm0 = row1 [0 4 8 12]

        punpckhdq mm1, mm2                      ; final result mm1 = row2 [1 5 9 13]

        movq      [INP + ROW_STRIDE * 4], mm0   ; store row 1
        punpckldq mm4, mm6                      ; final result mm4 = row3 [2 6 10 14]

        movq      [INP + ROW_STRIDE * 5], mm1   ; store row 2
        punpckhdq mm3, mm6                      ; final result mm3 = row4 [3 7 11 15]

        movq      [INP + ROW_STRIDE * 6], mm4   ; store row 3

        movq      [INP + ROW_STRIDE * 7], mm3   ; store row 4
; Pass 2: conceptually this is the column transform.
; Actually, the matrix is transformed row by row. This pass is identical to
; the row pass above, except for the SHIFT amount (SHIFT_INV_COL) and the
; ROUND_INV amount ([round_inv_col]).
;
; This pass performs two operations:
;   1) iDCT row transform
;        for ( i = 0; i < 8; ++i )
;            DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
;   2) transpose the matrix (which was stored in qwTemp[])
;        qwTemp[] -> [8x8 matrix transpose] -> blk[]
;
; NOTE(review): OUT must point at the start of the 8 x ROW_STRIDE byte
; scratch matrix when the loop is entered; the instruction that sets it is
; not visible in this part of the file -- confirm. (An in-place variant,
; "mov OUT, INP", was disabled by the author.)

        mov     INP, [ebp+8]                    ; row 0
        mov     edi, 0x00                       ; x = 0
        lea     TABLE, [idct_tab_01234567]      ; row 0
        lea     round_inv_col, [idct_r_inv_col]

        ; for ( x = 0; x < 8; ++x )  -- transform one row per iteration
acc_idct_colloop1:
        movq      mm0, [INP]            ; x3 x2 x1 x0
        movq      mm1, [INP+8]          ; x7 x6 x5 x4
        movq      mm2, mm0              ; x3 x2 x1 x0

        movq      mm3, [TABLE]          ; w06 w04 w02 w00
        punpcklwd mm0, mm1              ; x5 x1 x4 x0

        movq      mm5, mm0              ; x5 x1 x4 x0
        punpckldq mm0, mm0              ; x4 x0 x4 x0

        movq      mm4, [TABLE+8]        ; w07 w05 w03 w01
        punpckhwd mm2, mm1              ; x7 x3 x6 x2

        pmaddwd   mm3, mm0              ; x4*w06+x0*w04 x4*w02+x0*w00
        movq      mm6, mm2              ; x7 x3 x6 x2

        movq      mm1, [TABLE+32]       ; w22 w20 w18 w16
        punpckldq mm2, mm2              ; x6 x2 x6 x2

        pmaddwd   mm4, mm2              ; x6*w07+x2*w05 x6*w03+x2*w01
        punpckhdq mm5, mm5              ; x5 x1 x5 x1

        pmaddwd   mm0, [TABLE+16]       ; x4*w14+x0*w12 x4*w10+x0*w08
        punpckhdq mm6, mm6              ; x7 x3 x7 x3

        movq      mm7, [TABLE+40]       ; w23 w21 w19 w17
        pmaddwd   mm1, mm5              ; x5*w22+x1*w20 x5*w18+x1*w16

        paddd     mm3, [round_inv_col]  ; +rounder
        pmaddwd   mm7, mm6              ; x7*w23+x3*w21 x7*w19+x3*w17

        pmaddwd   mm2, [TABLE+24]       ; x6*w15+x2*w13 x6*w11+x2*w09
        paddd     mm3, mm4              ; a1=sum(even1) a0=sum(even0)

        pmaddwd   mm5, [TABLE+48]       ; x5*w30+x1*w28 x5*w26+x1*w24
        movq      mm4, mm3              ; a1 a0

        pmaddwd   mm6, [TABLE+56]       ; x7*w31+x3*w29 x7*w27+x3*w25
        paddd     mm1, mm7              ; b1=sum(odd1) b0=sum(odd0)

        paddd     mm0, [round_inv_col]  ; +rounder
        psubd     mm3, mm1              ; a1-b1 a0-b0

        psrad     mm3, SHIFT_INV_COL    ; y6=a1-b1 y7=a0-b0
        paddd     mm1, mm4              ; a1+b1 a0+b0

        paddd     mm0, mm2              ; a3=sum(even3) a2=sum(even2)
        psrad     mm1, SHIFT_INV_COL    ; y1=a1+b1 y0=a0+b0

        paddd     mm5, mm6              ; b3=sum(odd3) b2=sum(odd2)
        movq      mm4, mm0              ; a3 a2

        paddd     mm0, mm5              ; a3+b3 a2+b2
        psubd     mm4, mm5              ; a3-b3 a2-b2

        add       INP, 16               ; increment INPUT pointer -> next row
        psrad     mm4, SHIFT_INV_COL    ; y4=a3-b3 y5=a2-b2

        add       TABLE, 0              ; no-op: every row uses the row-0 table
        psrad     mm0, SHIFT_INV_COL    ; y3=a3+b3 y2=a2+b2

        packssdw  mm4, mm3              ; y6 y7 y4 y5
        packssdw  mm1, mm0              ; y3 y2 y1 y0
        movq      mm7, mm4              ; y6 y7 y4 y5

        psrld     mm4, 16               ; 0 y6 0 y4
        movq      [OUT], mm1            ; save y3 y2 y1 y0
        pslld     mm7, 16               ; y7 0 y5 0

        por       mm7, mm4              ; y7 y6 y5 y4

        movq      [OUT+8], mm7          ; save y7 y6 y5 y4

        add       OUT, 16               ; increment OUTPUT pointer -> next row
        add       edi, 0x01             ; ++x
        cmp       edi, 0x08             ; compare x <> 8
        jl        near acc_idct_colloop1 ; end for ( x = 0; x < 8; ++x )

        ; done with the iDCT column-transformation

        ; now we have to transpose the output 8x8 matrix
        ;   8x8 (OUT) -> 8x8't' (IN)
        ; the transposition is implemented as 4 sub-operations.
        ;   1) transpose upper-left quad
        ;   2) transpose lower-right quad
        ;   3) transpose lower-left quad
        ;   4) transpose upper-right quad
        ;
        ; mm0 = 1st row [ A B C D ] row1
        ; mm1 = 2nd row [ E F G H ] 2
        ; mm2 = 3rd row [ I J K L ] 3
        ; mm3 = 4th row [ M N O P ] 4

        ; The loop advanced OUT by 8 rows; step back to the first scratch row.
        sub       OUT, ROW_STRIDE * 8

        ; 1) transpose upper-left quad
        movq      mm0, [OUT + ROW_STRIDE * 0]

        movq      mm1, [OUT + ROW_STRIDE * 1]
        movq      mm4, mm0                      ; mm4 = copy of row1 [A B C D]

        movq      mm2, [OUT + ROW_STRIDE * 2]
        punpcklwd mm0, mm1                      ; mm0 = [ 0 4 1 5]

        movq      mm3, [OUT + ROW_STRIDE * 3]
        punpckhwd mm4, mm1                      ; mm4 = [ 2 6 3 7]

        movq      mm6, mm2                      ; mm6 = copy of row3, needed for the high half
        punpcklwd mm2, mm3                      ; mm2 = [ 8 12 9 13]

        punpckhwd mm6, mm3                      ; mm6 = [10 14 11 15]
        movq      mm1, mm0                      ; mm1 = [ 0 4 1 5]

        mov       INP, [ebp+8]                  ; load input address
        punpckldq mm0, mm2                      ; final result mm0 = row1 [0 4 8 12]

        movq      mm3, mm4                      ; mm3 = [ 2 6 3 7]
        punpckhdq mm1, mm2                      ; final result mm1 = row2 [1 5 9 13]

        movq      [INP + ROW_STRIDE * 0], mm0   ; store row 1
        punpckldq mm4, mm6                      ; final result mm4 = row3 [2 6 10 14]

        ; begin reading next quadrant (lower-right)
        movq      mm0, [OUT + ROW_STRIDE * 4 + 8]
        punpckhdq mm3, mm6                      ; final result mm3 = row4 [3 7 11 15]

        movq      [INP + ROW_STRIDE * 2], mm4   ; store row 3
        movq      mm4, mm0                      ; mm4 = copy of row1 [A B C D]

        movq      [INP + ROW_STRIDE * 1], mm1   ; store row 2

        movq      mm1, [OUT + ROW_STRIDE * 5 + 8]

        movq      [INP + ROW_STRIDE * 3], mm3   ; store row 4
        punpcklwd mm0, mm1                      ; mm0 = [ 0 4 1 5]

        ; 2) transpose lower-right quadrant
        ; (first two rows were already loaded and interleaved above)
        movq      mm2, [OUT + ROW_STRIDE * 6 + 8]
        punpckhwd mm4, mm1                      ; mm4 = [ 2 6 3 7]

        movq      mm3, [OUT + ROW_STRIDE * 7 + 8]
        movq      mm6, mm2                      ; mm6 = copy of row3

        punpcklwd mm2, mm3                      ; mm2 = [ 8 12 9 13]
        movq      mm1, mm0                      ; mm1 = [ 0 4 1 5]

        punpckhwd mm6, mm3                      ; mm6 = [10 14 11 15]
        movq      mm3, mm4                      ; mm3 = [ 2 6 3 7]

        punpckldq mm0, mm2                      ; final result mm0 = row1 [0 4 8 12]

        punpckhdq mm1, mm2                      ; final result mm1 = row2 [1 5 9 13]

        movq      [INP + ROW_STRIDE * 4 + 8], mm0   ; store row 1
        punpckldq mm4, mm6                      ; final result mm4 = row3 [2 6 10 14]

        ; begin reading next quadrant (lower-left)
        movq      mm0, [OUT + ROW_STRIDE * 4]
        punpckhdq mm3, mm6                      ; final result mm3 = row4 [3 7 11 15]
        movq      [INP + ROW_STRIDE * 6 + 8], mm4   ; store row 3
        movq      mm4, mm0                      ; mm4 = copy of row1 [A B C D]

        movq      [INP + ROW_STRIDE * 5 + 8], mm1   ; store row 2

        movq      mm1, [OUT + ROW_STRIDE * 5]

        movq      [INP + ROW_STRIDE * 7 + 8], mm3   ; store row 4
        punpcklwd mm0, mm1                      ; mm0 = [ 0 4 1 5]

        ; 3) transpose lower-left (stored into the upper-right of blk)
        movq      mm2, [OUT + ROW_STRIDE * 6]
        punpckhwd mm4, mm1                      ; mm4 = [ 2 6 3 7]

        movq      mm3, [OUT + ROW_STRIDE * 7]
        movq      mm6, mm2                      ; mm6 = copy of row3

        punpcklwd mm2, mm3                      ; mm2 = [ 8 12 9 13]
        movq      mm1, mm0                      ; mm1 = [ 0 4 1 5]

        punpckhwd mm6, mm3                      ; mm6 = [10 14 11 15]
        movq      mm3, mm4                      ; mm3 = [ 2 6 3 7]

        punpckldq mm0, mm2                      ; final result mm0 = row1 [0 4 8 12]

        punpckhdq mm1, mm2                      ; final result mm1 = row2 [1 5 9 13]

        movq      [INP + ROW_STRIDE * 0 + 8], mm0   ; store row 1
        punpckldq mm4, mm6                      ; final result mm4 = row3 [2 6 10 14]

        ; begin reading next quadrant (upper-right)
        movq      mm0, [OUT + ROW_STRIDE * 0 + 8]
        punpckhdq mm3, mm6                      ; final result mm3 = row4 [3 7 11 15]

        movq      [INP + ROW_STRIDE * 2 + 8], mm4   ; store row 3
        movq      mm4, mm0                      ; mm4 = copy of row1 [A B C D]

        movq      [INP + ROW_STRIDE * 1 + 8], mm1   ; store row 2
        movq      mm1, [OUT + ROW_STRIDE * 1 + 8]

        movq      [INP + ROW_STRIDE * 3 + 8], mm3   ; store row 4
        punpcklwd mm0, mm1                      ; mm0 = [ 0 4 1 5]

        ; 4) transpose upper-right quadrant (stored into the lower-left of blk)
        movq      mm2, [OUT + ROW_STRIDE * 2 + 8]
        punpckhwd mm4, mm1                      ; mm4 = [ 2 6 3 7]

        movq      mm3, [OUT + ROW_STRIDE * 3 + 8]
        movq      mm6, mm2                      ; mm6 = copy of row3

        punpcklwd mm2, mm3                      ; mm2 = [ 8 12 9 13]
        movq      mm1, mm0                      ; mm1 = [ 0 4 1 5]

        punpckhwd mm6, mm3                      ; mm6 = [10 14 11 15]
        movq      mm3, mm4                      ; mm3 = [ 2 6 3 7]

        punpckldq mm0, mm2                      ; final result mm0 = row1 [0 4 8 12]

        punpckhdq mm1, mm2                      ; final result mm1 = row2 [1 5 9 13]

        movq      [INP + ROW_STRIDE * 4], mm0   ; store row 1
        punpckldq mm4, mm6                      ; final result mm4 = row3 [2 6 10 14]

        movq      [INP + ROW_STRIDE * 5], mm1   ; store row 2
        punpckhdq mm3, mm6                      ; final result mm3 = row4 [3 7 11 15]

        movq      [INP + ROW_STRIDE * 6], mm4   ; store row 3

        movq      [INP + ROW_STRIDE * 7], mm3   ; store row 4
736 pop ebp ; restore frame pointer