4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 based upon some commented-out C code from mpeg2dec (idct_mmx.c
28 written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
32 #include "simple_idct.h"
/* Fixed-point IDCT weights: per the comment on each line, Wk is
 * sqrt(2)*cos(k*pi/16) scaled by 2048.
 * NOTE(review): a second W1..W7 set with the same names follows; the
 * conditional directives that must keep the two sets apart are not visible
 * in this listing. */
35 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
36 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
37 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
38 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
39 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
40 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
41 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* The same weights at a scale of 1<<14.
 * NOTE(review): the stated formula evaluates to 16384 for W4, but 16383 is
 * used here - presumably on purpose; confirm before "correcting" it. */
45 #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48 #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
50 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* Right shift applied to every column-pass result (see the `>> COL_SHIFT`
 * uses in the idctSparseCol* functions). */
53 #define COL_SHIFT 20 // 6
/* 16x16 -> 32 multiply helpers used by the row and column passes.
 * The first pair is PowerPC 405 inline assembly (maclhw / mullhw); the
 * second pair is the plain C form.
 * NOTE(review): the #else / #endif separating the two pairs is not visible
 * in this listing. */
56 #if defined(ARCH_POWERPC_405)
/* The asm variants already end in ';', so `MAC16(...);` at a call site leaves
 * an extra empty statement. */
58 /* signed 16x16 -> 32 multiply add accumulate */
59 #define MAC16(rt, ra, rb) \
60 asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
62 /* signed 16x16 -> 32 multiply */
63 #define MUL16(rt, ra, rb) \
64 asm ("mullhw %0, %1, %2" : "=r" (rt) : "r" (ra), "r" (rb));
/* The C variants expand to a bare assignment expression with no enclosing
 * parentheses, so they are only safe when used as a full statement. */
68 /* signed 16x16 -> 32 multiply add accumulate */
69 #define MAC16(rt, ra, rb) rt += (ra) * (rb)
71 /* signed 16x16 -> 32 multiply */
72 #define MUL16(rt, ra, rb) rt = (ra) * (rb)
/*
 * In-place 8-point row pass over row[0..7].
 *
 * Visible behaviour:
 *  - DC-only shortcut: when row[1..7] are all zero, every element is set to
 *    row[0] << 3.
 *  - Otherwise the even-part accumulators a0..a3 and odd-part accumulators
 *    b0..b3 are combined as (a +/- b) >> ROW_SHIFT and written back.
 *
 * NOTE(review): this listing has lost lines (braces, preprocessor branches,
 * the declaration of temp, the a1..a3 initialisation, the row[2] terms, the
 * definition of ROW_SHIFT). The comments below describe only what is visible.
 */
76 static inline void idctRowCondDC (DCTELEM
* row
)
78 int a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
/* Mask selecting row[0] inside the first 64-bit word of the row; the value
 * depends on WORDS_BIGENDIAN. */
86 #ifdef WORDS_BIGENDIAN
87 #define ROW0_MASK 0xffff000000000000LL
89 #define ROW0_MASK 0xffffLL
/* 64-bit variant of the DC-only test, taken only when DCTELEM is 2 bytes:
 * the row is read as two 64-bit words and everything except row[0] must be
 * zero. NOTE(review): the row is accessed through uint64_t casts; this
 * relies on suitable alignment and on the compiler tolerating the aliasing. */
91 if(sizeof(DCTELEM
)==2){
92 if ( ((((uint64_t *)row
)[0] & ~ROW0_MASK
) |
93 ((uint64_t *)row
)[1]) == 0) {
94 temp
= (row
[0] << 3) & 0xffff;
97 ((uint64_t *)row
)[0] = temp
;
98 ((uint64_t *)row
)[1] = temp
;
/* Element-wise form of the same DC-only test and fill. */
102 if (!(row
[1]|row
[2]|row
[3]|row
[4]|row
[5]|row
[6]|row
[7])) {
103 row
[0]=row
[1]=row
[2]=row
[3]=row
[4]=row
[5]=row
[6]=row
[7]= row
[0] << 3;
/* 32-bit variant of the DC-only test: words 1..3 of the row are OR-ed
 * together (the rest of the condition is not visible), then temp is stored
 * to all four 32-bit words. */
108 if(sizeof(DCTELEM
)==2){
109 if (!(((uint32_t*)row
)[1] |
110 ((uint32_t*)row
)[2] |
111 ((uint32_t*)row
)[3] |
113 temp
= (row
[0] << 3) & 0xffff;
115 ((uint32_t*)row
)[0]=((uint32_t*)row
)[1] =
116 ((uint32_t*)row
)[2]=((uint32_t*)row
)[3] = temp
;
/* Element-wise fallback for the 32-bit variant. */
120 if (!(row
[1]|row
[2]|row
[3]|row
[4]|row
[5]|row
[6]|row
[7])) {
121 row
[0]=row
[1]=row
[2]=row
[3]=row
[4]=row
[5]=row
[6]=row
[7]= row
[0] << 3;
/* Even part: start from the DC term plus the rounding constant for the
 * final >> ROW_SHIFT. */
127 a0
= (W4
* row
[0]) + (1 << (ROW_SHIFT
- 1));
132 /* no need to optimize : gcc does it */
/* Odd part, contributions of row[1] and row[3]. */
138 MUL16(b0
, W1
, row
[1]);
139 MAC16(b0
, W3
, row
[3]);
140 MUL16(b1
, W3
, row
[1]);
141 MAC16(b1
, -W7
, row
[3]);
142 MUL16(b2
, W5
, row
[1]);
143 MAC16(b2
, -W1
, row
[3]);
144 MUL16(b3
, W7
, row
[1]);
145 MAC16(b3
, -W5
, row
[3]);
/* temp gathers the raw bits of the upper half of the row (one 64-bit load,
 * or two 32-bit loads OR-ed together).
 * NOTE(review): the test that consumes temp is not visible - presumably it
 * guards the row[4..7] terms below; confirm against the full file. */
148 temp
= ((uint64_t*)row
)[1];
150 temp
= ((uint32_t*)row
)[2] | ((uint32_t*)row
)[3];
/* Even part, contributions of row[4] and row[6]. */
153 a0
+= W4
*row
[4] + W6
*row
[6];
154 a1
+= - W4
*row
[4] - W2
*row
[6];
155 a2
+= - W4
*row
[4] + W2
*row
[6];
156 a3
+= W4
*row
[4] - W6
*row
[6];
/* Odd part, contributions of row[5] and row[7]. */
158 MAC16(b0
, W5
, row
[5]);
159 MAC16(b0
, W7
, row
[7]);
161 MAC16(b1
, -W1
, row
[5]);
162 MAC16(b1
, -W5
, row
[7]);
164 MAC16(b2
, W7
, row
[5]);
165 MAC16(b2
, W3
, row
[7]);
167 MAC16(b3
, W3
, row
[5]);
168 MAC16(b3
, -W1
, row
[7]);
/* Final butterfly: outputs k and 7-k are (ak + bk) and (ak - bk), each
 * shifted right by ROW_SHIFT. */
171 row
[0] = (a0
+ b0
) >> ROW_SHIFT
;
172 row
[7] = (a0
- b0
) >> ROW_SHIFT
;
173 row
[1] = (a1
+ b1
) >> ROW_SHIFT
;
174 row
[6] = (a1
- b1
) >> ROW_SHIFT
;
175 row
[2] = (a2
+ b2
) >> ROW_SHIFT
;
176 row
[5] = (a2
- b2
) >> ROW_SHIFT
;
177 row
[3] = (a3
+ b3
) >> ROW_SHIFT
;
178 row
[4] = (a3
- b3
) >> ROW_SHIFT
;
/*
 * Column pass that stores pixels: coefficients are read at a stride of 8
 * (col[8*k]), combined as (a +/- b) >> COL_SHIFT, looked up through cm[] and
 * written to dest.
 *
 * NOTE(review): the listing has lost lines - the `col` parameter, the
 * a1..a3 setup, the even-part terms and whatever advances dest between the
 * eight stores (presumably by line_size; every visible store targets
 * dest[0]). Confirm against the full file.
 */
181 static inline void idctSparseColPut (uint8_t *dest
, int line_size
,
184 int a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
/* cm points MAX_NEG_CROP entries into cropTbl; presumably a clamping table
 * biased so that negative indices are valid - confirm at its definition. */
185 uint8_t *cm
= cropTbl
+ MAX_NEG_CROP
;
/* The rounding constant 1 << (COL_SHIFT-1) is divided by W4 and folded into
 * the DC coefficient before the multiply by W4. */
187 /* XXX: I did that only to give same values as previous code */
188 a0
= W4
* (col
[8*0] + ((1<<(COL_SHIFT
-1))/W4
));
/* Odd part, contribution of col[8*1]. */
198 MUL16(b0
, W1
, col
[8*1]);
199 MUL16(b1
, W3
, col
[8*1]);
200 MUL16(b2
, W5
, col
[8*1]);
201 MUL16(b3
, W7
, col
[8*1]);
/* Odd part, contribution of col[8*3]. */
203 MAC16(b0
, + W3
, col
[8*3]);
204 MAC16(b1
, - W7
, col
[8*3]);
205 MAC16(b2
, - W1
, col
[8*3]);
206 MAC16(b3
, - W5
, col
[8*3]);
/* Odd part, contribution of col[8*5]. */
216 MAC16(b0
, + W5
, col
[8*5]);
217 MAC16(b1
, - W1
, col
[8*5]);
218 MAC16(b2
, + W7
, col
[8*5]);
219 MAC16(b3
, + W3
, col
[8*5]);
/* Odd part, contribution of col[8*7]. */
230 MAC16(b0
, + W7
, col
[8*7]);
231 MAC16(b1
, - W5
, col
[8*7]);
232 MAC16(b2
, + W3
, col
[8*7]);
233 MAC16(b3
, - W1
, col
[8*7]);
/* Eight stores in the order a0+b0, a1+b1, a2+b2, a3+b3, a3-b3, a2-b2,
 * a1-b1, a0-b0, each shifted by COL_SHIFT and mapped through cm[]. */
236 dest
[0] = cm
[(a0
+ b0
) >> COL_SHIFT
];
238 dest
[0] = cm
[(a1
+ b1
) >> COL_SHIFT
];
240 dest
[0] = cm
[(a2
+ b2
) >> COL_SHIFT
];
242 dest
[0] = cm
[(a3
+ b3
) >> COL_SHIFT
];
244 dest
[0] = cm
[(a3
- b3
) >> COL_SHIFT
];
246 dest
[0] = cm
[(a2
- b2
) >> COL_SHIFT
];
248 dest
[0] = cm
[(a1
- b1
) >> COL_SHIFT
];
250 dest
[0] = cm
[(a0
- b0
) >> COL_SHIFT
];
/*
 * Column pass that adds to existing pixels: coefficients are read at a
 * stride of 8 (col[8*k]), combined as (a +/- b) >> COL_SHIFT, added to the
 * current dest value, looked up through cm[] and written back to dest.
 *
 * NOTE(review): the listing has lost lines - the `col` parameter, the
 * a1..a3 setup, the even-part terms and whatever advances dest between the
 * eight stores (presumably by line_size; every visible store targets
 * dest[0]). Confirm against the full file.
 */
253 static inline void idctSparseColAdd (uint8_t *dest
, int line_size
,
256 int a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
/* cm points MAX_NEG_CROP entries into cropTbl; presumably a clamping table
 * biased so that negative indices are valid - confirm at its definition. */
257 uint8_t *cm
= cropTbl
+ MAX_NEG_CROP
;
/* The rounding constant 1 << (COL_SHIFT-1) is divided by W4 and folded into
 * the DC coefficient before the multiply by W4. */
259 /* XXX: I did that only to give same values as previous code */
260 a0
= W4
* (col
[8*0] + ((1<<(COL_SHIFT
-1))/W4
));
/* Odd part, contribution of col[8*1]. */
270 MUL16(b0
, W1
, col
[8*1]);
271 MUL16(b1
, W3
, col
[8*1]);
272 MUL16(b2
, W5
, col
[8*1]);
273 MUL16(b3
, W7
, col
[8*1]);
/* Odd part, contribution of col[8*3]. */
275 MAC16(b0
, + W3
, col
[8*3]);
276 MAC16(b1
, - W7
, col
[8*3]);
277 MAC16(b2
, - W1
, col
[8*3]);
278 MAC16(b3
, - W5
, col
[8*3]);
/* Odd part, contribution of col[8*5]. */
288 MAC16(b0
, + W5
, col
[8*5]);
289 MAC16(b1
, - W1
, col
[8*5]);
290 MAC16(b2
, + W7
, col
[8*5]);
291 MAC16(b3
, + W3
, col
[8*5]);
/* Odd part, contribution of col[8*7]. */
302 MAC16(b0
, + W7
, col
[8*7]);
303 MAC16(b1
, - W5
, col
[8*7]);
304 MAC16(b2
, + W3
, col
[8*7]);
305 MAC16(b3
, - W1
, col
[8*7]);
/* Eight read-modify-write stores in the order a0+b0, a1+b1, a2+b2, a3+b3,
 * a3-b3, a2-b2, a1-b1, a0-b0; the shifted residual is added to the existing
 * dest value before the cm[] lookup. */
308 dest
[0] = cm
[dest
[0] + ((a0
+ b0
) >> COL_SHIFT
)];
310 dest
[0] = cm
[dest
[0] + ((a1
+ b1
) >> COL_SHIFT
)];
312 dest
[0] = cm
[dest
[0] + ((a2
+ b2
) >> COL_SHIFT
)];
314 dest
[0] = cm
[dest
[0] + ((a3
+ b3
) >> COL_SHIFT
)];
316 dest
[0] = cm
[dest
[0] + ((a3
- b3
) >> COL_SHIFT
)];
318 dest
[0] = cm
[dest
[0] + ((a2
- b2
) >> COL_SHIFT
)];
320 dest
[0] = cm
[dest
[0] + ((a1
- b1
) >> COL_SHIFT
)];
322 dest
[0] = cm
[dest
[0] + ((a0
- b0
) >> COL_SHIFT
)];
/*
 * In-place column pass: coefficients are read at a stride of 8 (col[8*k]),
 * combined as (a +/- b) >> COL_SHIFT and written back to col[0], col[8], ...,
 * col[56] with no clamping.
 *
 * NOTE(review): the listing has lost lines (the a1..a3 setup and the
 * even-part terms from the remaining coefficients). Confirm against the
 * full file.
 */
325 static inline void idctSparseCol (DCTELEM
* col
)
327 int a0
, a1
, a2
, a3
, b0
, b1
, b2
, b3
;
/* The rounding constant 1 << (COL_SHIFT-1) is divided by W4 and folded into
 * the DC coefficient before the multiply by W4. */
329 /* XXX: I did that only to give same values as previous code */
330 a0
= W4
* (col
[8*0] + ((1<<(COL_SHIFT
-1))/W4
));
/* Odd part, contribution of col[8*1]. */
340 MUL16(b0
, W1
, col
[8*1]);
341 MUL16(b1
, W3
, col
[8*1]);
342 MUL16(b2
, W5
, col
[8*1]);
343 MUL16(b3
, W7
, col
[8*1]);
/* Odd part, contribution of col[8*3]. */
345 MAC16(b0
, + W3
, col
[8*3]);
346 MAC16(b1
, - W7
, col
[8*3]);
347 MAC16(b2
, - W1
, col
[8*3]);
348 MAC16(b3
, - W5
, col
[8*3]);
/* Odd part, contribution of col[8*5]. */
358 MAC16(b0
, + W5
, col
[8*5]);
359 MAC16(b1
, - W1
, col
[8*5]);
360 MAC16(b2
, + W7
, col
[8*5]);
361 MAC16(b3
, + W3
, col
[8*5]);
/* Odd part, contribution of col[8*7]. */
372 MAC16(b0
, + W7
, col
[8*7]);
373 MAC16(b1
, - W5
, col
[8*7]);
374 MAC16(b2
, + W3
, col
[8*7]);
375 MAC16(b3
, - W1
, col
[8*7]);
/* Write-back: rows 0..3 take a + b, rows 4..7 take the mirrored a - b. */
378 col
[0 ] = ((a0
+ b0
) >> COL_SHIFT
);
379 col
[8 ] = ((a1
+ b1
) >> COL_SHIFT
);
380 col
[16] = ((a2
+ b2
) >> COL_SHIFT
);
381 col
[24] = ((a3
+ b3
) >> COL_SHIFT
);
382 col
[32] = ((a3
- b3
) >> COL_SHIFT
);
383 col
[40] = ((a2
- b2
) >> COL_SHIFT
);
384 col
[48] = ((a1
- b1
) >> COL_SHIFT
);
385 col
[56] = ((a0
- b0
) >> COL_SHIFT
);
/*
 * IDCT of `block` with the result stored to `dest`.
 * Visible calls: idctRowCondDC on block + i*8 (row i), then idctSparseColPut
 * on block + i (column i) writing at dest + i with stride line_size.
 * NOTE(review): the loop headers and the declaration of i are not visible in
 * this listing.
 */
388 void simple_idct_put(uint8_t *dest
, int line_size
, DCTELEM
*block
)
392 idctRowCondDC(block
+ i
*8);
395 idctSparseColPut(dest
+ i
, line_size
, block
+ i
);
/*
 * IDCT of `block` with the result added to the pixels already in `dest`.
 * Visible calls: idctRowCondDC on block + i*8 (row i), then idctSparseColAdd
 * on block + i (column i) updating dest + i with stride line_size.
 * NOTE(review): the loop headers and the declaration of i are not visible in
 * this listing.
 */
398 void simple_idct_add(uint8_t *dest
, int line_size
, DCTELEM
*block
)
402 idctRowCondDC(block
+ i
*8);
405 idctSparseColAdd(dest
+ i
, line_size
, block
+ i
);
/*
 * In-place IDCT of `block`.
 * Visible calls: idctRowCondDC on block + i*8 (row i), then idctSparseCol on
 * block + i (column i).
 * NOTE(review): the loop headers and the declaration of i are not visible in
 * this listing.
 */
408 void simple_idct(DCTELEM
*block
)
412 idctRowCondDC(block
+ i
*8);
415 idctSparseCol(block
+ i
);
/* Converts a real constant to fixed point with CN_SHIFT fractional bits,
 * rounding to nearest via the + 0.5. CN_SHIFT is defined outside the lines
 * visible here. */
421 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5))
/* 0.6532814824 = cos(pi/8)/sqrt(2) and 0.2705980501 = sin(pi/8)/sqrt(2). */
422 #define C1 C_FIX(0.6532814824)
423 #define C2 C_FIX(0.2705980501)
425 /* row idct is multiplied by 16 * sqrt(2.0), col idct4 is normalized,
426 and the butterfly must be multiplied by 0.5 * sqrt(2.0) */
/* Total right shift applied to the idct4col results: 4 + 1 + 12 = 17 bits. */
427 #define C_SHIFT (4+1+12)
/*
 * 4-point column pass that stores pixels: a0..a3 are combined into c0..c3,
 * and the four results (c0+c1, c2+c3, c2-c3, c0-c1) are shifted by C_SHIFT,
 * mapped through cm[] and written to dest.
 *
 * NOTE(review): the lines that load a0..a3 from col and that advance dest
 * between the stores are not visible in this listing (every visible store
 * targets dest[0]); presumably dest steps by line_size. Confirm against the
 * full file.
 */
429 static inline void idct4col(uint8_t *dest
, int line_size
, const DCTELEM
*col
)
431 int c0
, c1
, c2
, c3
, a0
, a1
, a2
, a3
;
/* cm points MAX_NEG_CROP entries into cropTbl; presumably a clamping table
 * biased so that negative indices are valid - confirm at its definition. */
432 const uint8_t *cm
= cropTbl
+ MAX_NEG_CROP
;
/* Even terms: sum and difference of a0 and a2, scaled by 2^(CN_SHIFT-1),
 * plus the rounding constant for the final >> C_SHIFT. */
438 c0
= ((a0
+ a2
) << (CN_SHIFT
- 1)) + (1 << (C_SHIFT
- 1));
439 c2
= ((a0
- a2
) << (CN_SHIFT
- 1)) + (1 << (C_SHIFT
- 1));
/* Odd terms: rotation of (a1, a3) by the fixed-point constants C1 and C2. */
440 c1
= a1
* C1
+ a3
* C2
;
441 c3
= a1
* C2
- a3
* C1
;
442 dest
[0] = cm
[(c0
+ c1
) >> C_SHIFT
];
444 dest
[0] = cm
[(c2
+ c3
) >> C_SHIFT
];
446 dest
[0] = cm
[(c2
- c3
) >> C_SHIFT
];
448 dest
[0] = cm
[(c0
- c1
) >> C_SHIFT
];
457 ptr[8 + k] = a0 - a1;\
460 /* Only used by the DV codec. The input must be interlaced. 128 is added
461 to the pixels before clamping to avoid a systematic error
462 (an offset of 1024*sqrt(2) would be needed otherwise). */
463 /* XXX: I think a 1.0/sqrt(2) normalization should be needed to
464 compensate for the extra butterfly stage - I don't have the full DV
/*
 * IDCT for the DV codec's 2-4-8 blocks, storing the result to `dest`.
 * Visible steps: an 8-point row pass on each row (block + i*8), then for
 * each column i two 4-point column passes: one from block + i written at
 * dest + i, one from block + 8 + i written at dest + line_size + i. Both use
 * an output stride of 2 * line_size, so the two passes fill alternating
 * output lines.
 * NOTE(review): the loop headers and any processing before the row pass are
 * not visible in this listing.
 */
466 void simple_idct248_put(uint8_t *dest
, int line_size
, DCTELEM
*block
)
485 /* IDCT8 on each line */
487 idctRowCondDC(block
+ i
*8);
490 /* IDCT4 and store */
492 idct4col(dest
+ i
, 2 * line_size
, block
+ i
);
493 idct4col(dest
+ line_size
+ i
, 2 * line_size
, block
+ 8 + i
);
497 /* 8x4 & 4x8 WMV2 IDCT */
/* Second definition of the column constants, used by idct4col_add: the same
 * fixed-point conversion as before with an additional factor of
 * 1.414213562 (sqrt(2)) folded into every constant.
 * NOTE(review): these names are already defined earlier in the file; the
 * #undef lines that must precede this set are not visible in this listing. */
504 #define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
505 #define C1 C_FIX(0.6532814824)
506 #define C2 C_FIX(0.2705980501)
507 #define C3 C_FIX(0.5)
/* Total right shift applied to the idct4col_add results: 17 bits. */
508 #define C_SHIFT (4+1+12)
/*
 * 4-point column pass that adds to existing pixels: a0..a3 are combined into
 * c0..c3, and each result (c0+c1, c2+c3, c2-c3, c0-c1) is shifted by
 * C_SHIFT, added to the current dest value, mapped through cm[] and written
 * back.
 *
 * NOTE(review): the lines that load a0..a3 from col and that advance dest
 * between the stores are not visible in this listing (every visible store
 * targets dest[0]); presumably dest steps by line_size. Confirm against the
 * full file.
 */
509 static inline void idct4col_add(uint8_t *dest
, int line_size
, const DCTELEM
*col
)
511 int c0
, c1
, c2
, c3
, a0
, a1
, a2
, a3
;
/* cm points MAX_NEG_CROP entries into cropTbl; presumably a clamping table
 * biased so that negative indices are valid - confirm at its definition. */
512 const uint8_t *cm
= cropTbl
+ MAX_NEG_CROP
;
/* Even terms: sum and difference of a0 and a2 scaled by C3, plus the
 * rounding constant for the final >> C_SHIFT. */
518 c0
= (a0
+ a2
)*C3
+ (1 << (C_SHIFT
- 1));
519 c2
= (a0
- a2
)*C3
+ (1 << (C_SHIFT
- 1));
/* Odd terms: rotation of (a1, a3) by the fixed-point constants C1 and C2. */
520 c1
= a1
* C1
+ a3
* C2
;
521 c3
= a1
* C2
- a3
* C1
;
522 dest
[0] = cm
[dest
[0] + ((c0
+ c1
) >> C_SHIFT
)];
524 dest
[0] = cm
[dest
[0] + ((c2
+ c3
) >> C_SHIFT
)];
526 dest
[0] = cm
[dest
[0] + ((c2
- c3
) >> C_SHIFT
)];
528 dest
[0] = cm
[dest
[0] + ((c0
- c1
) >> C_SHIFT
)];
/* Row-pass counterparts of the C1..C3 constants: real values converted to
 * fixed point with RN_SHIFT fractional bits, including a factor of
 * 1.414213562 (sqrt(2)), rounded to nearest via the + 0.5.
 * NOTE(review): RN_SHIFT and the R_SHIFT used by idct4row are defined on
 * lines not visible in this listing. */
532 #define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
533 #define R1 R_FIX(0.6532814824)
534 #define R2 R_FIX(0.2705980501)
535 #define R3 R_FIX(0.5)
/*
 * In-place 4-point row pass: a0..a3 are combined into c0..c3 and the results
 * (c0+c1, c2+c3, c2-c3, c0-c1), each shifted right by R_SHIFT, are written
 * to row[0..3]. No clamping is applied.
 *
 * NOTE(review): the lines that load a0..a3 from row are not visible in this
 * listing.
 */
537 static inline void idct4row(DCTELEM
*row
)
539 int c0
, c1
, c2
, c3
, a0
, a1
, a2
, a3
;
540 //const uint8_t *cm = cropTbl + MAX_NEG_CROP;
/* Even terms: sum and difference of a0 and a2 scaled by R3, plus the
 * rounding constant for the final >> R_SHIFT. */
546 c0
= (a0
+ a2
)*R3
+ (1 << (R_SHIFT
- 1));
547 c2
= (a0
- a2
)*R3
+ (1 << (R_SHIFT
- 1));
/* Odd terms: rotation of (a1, a3) by the fixed-point constants R1 and R2. */
548 c1
= a1
* R1
+ a3
* R2
;
549 c3
= a1
* R2
- a3
* R1
;
550 row
[0]= (c0
+ c1
) >> R_SHIFT
;
551 row
[1]= (c2
+ c3
) >> R_SHIFT
;
552 row
[2]= (c2
- c3
) >> R_SHIFT
;
553 row
[3]= (c0
- c1
) >> R_SHIFT
;
/*
 * IDCT using an 8-point row pass and a 4-point column pass, with the result
 * added to the pixels already in `dest`.
 * Visible calls: idctRowCondDC on block + i*8 (row i), then idct4col_add on
 * block + i (column i) updating dest + i with stride line_size.
 * NOTE(review): the loop headers and the declaration of i are not visible in
 * this listing.
 */
556 void simple_idct84_add(uint8_t *dest
, int line_size
, DCTELEM
*block
)
560 /* IDCT8 on each line */
562 idctRowCondDC(block
+ i
*8);
565 /* IDCT4 and store */
567 idct4col_add(dest
+ i
, line_size
, block
+ i
);
/*
 * IDCT using a 4-point row pass and an 8-point column pass, with the result
 * added to the pixels already in `dest`.
 * Visible calls: idct4row on block + i*8 (row i), then idctSparseColAdd on
 * block + i (column i) updating dest + i with stride line_size.
 * NOTE(review): the loop headers and the declaration of i are not visible in
 * this listing.
 */
571 void simple_idct48_add(uint8_t *dest
, int line_size
, DCTELEM
*block
)
575 /* IDCT4 on each line */
577 idct4row(block
+ i
*8);
580 /* IDCT8 and store */
582 idctSparseColAdd(dest
+ i
, line_size
, block
+ i
);