1 /* quantize_x86.c Quantization / inverse quantization
2 In compiler (gcc) embedded assembly language...
5 /* Copyright (C) 2000 Andrew Stevens */
7 /* This program is free software; you can redistribute it
8 * and/or modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2 of
10 * the License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
28 * Quantisation for non-intra blocks using Test Model 5 quantization
30 * this quantizer has a bias of 1/8 stepsize towards zero
31 * (except for the DC coefficient)
33 * PRECONDITION: src dst point to *distinct* memory buffers...
34 * of block_count *adjacent* int16_t[64] arrays...
36 * RETURN: 1 if non-zero coefficients left after quantisation, 0 otherwise
44 #include "cpu_accel.h"
46 #include "attributes.h"
50 * Quantisation for non-intra blocks
52 * Various versions for various SIMD instruction sets. Not all of them
53 * bother to implement the test model 5 quantisation of the reference source
54 * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient).
56 * Actually, as far as I can tell even the reference source doesn't quite do it
57 * for non-intra (though it *does* for intra).
59 * Careful analysis of the code also suggests what it actually does is truncate
60 * with a modest bias towards 1 (the d>>2 factor)
62 * PRECONDITION: src dst point to *distinct* memory buffers...
63 * of block_count *adjacent* int16_t[64] arrays...
65 *RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
70 * 3D-Now version: simply truncates to zero, however, the tables have a 2% bias
71 * upwards which partly compensates.
/*
 * quant_non_intra_3dnow -- 3DNow! quantiser for non-intra blocks.
 * Coefficients are sign-extended to 32 bits, converted to float
 * (pi2fd), multiplied by the reciprocal quantisation table selected
 * by mquant (i_inter_q_tblf[mquant]) and truncated back to integers
 * (pf2id) -- i.e. "division" by multiplication with the reciprocal.
 * Results exceeding satlim in magnitude are accumulated into
 * 'saturated'; on saturation the routine retries with the next
 * larger quantiser or falls back to the C reference quant_non_intra().
 *
 * NOTE(review): this extract is missing many original source lines
 * (the embedded line numbers jump), so several referenced names
 * (i, psrc, pdst, piqf, saturated, tmp, block_count, picture) and
 * the function's closing lines are not visible here -- confirm
 * against the full file before editing.
 */
74 int quant_non_intra_3dnow(
76 int16_t *src
, int16_t *dst
,
81 int satlim
= dctsatlim
;
83 int coeff_count
= 64*block_count
;
84 uint32_t nzflag
, flags
;
90 /* Initialise zero block flags */
92 __asm__ ( "movl %0, %%eax\n"
95 /* Load satlim into mm1 */
96 movd_m2r( satlim
, mm1
);
97 punpcklwd_r2r( mm1
, mm1
);
/* satlim is now replicated into all four 16-bit lanes of mm1 */
98 punpckldq_r2r( mm1
, mm1
);
/* Select the per-mquant table of reciprocal quantisation factors (floats). */
100 i_quant_matf
= i_inter_q_tblf
[mquant
];
/* Process coefficients four at a time across all the adjacent blocks. */
107 for (i
=0; i
< coeff_count
; i
+=4)
110 /* TODO: For maximum efficiency this should be unrolled to allow
111 f.p. and int MMX to be interleaved...
114 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
117 movq_m2r( *(mmx_t
*)&psrc
[0], mm2
);
118 movq_r2r( mm2
, mm7
);
119 psraw_i2r( 16, mm7
); /* Replicate sign bits mm2 in mm7 */
120 movq_r2r( mm2
, mm3
);
121 punpcklwd_r2r( mm7
, mm2
); /* Unpack with sign extensions */
122 punpckhwd_r2r( mm7
, mm3
);
124 /* Multiply by sixteen... */
129 Load the inverse quantisation factors from the
130 table in to mm4 and mm5
131 Interleaved with converting mm2 and mm3 to float's
132 to (hopefully) maximise parallelism.
134 movq_m2r( *(mmx_t
*)&piqf
[0], mm4
);
/* pi2fd: 3DNow! packed 32-bit int -> single-precision float */
135 pi2fd_r2r( mm2
, mm2
);
136 movq_m2r( *(mmx_t
*)&piqf
[2], mm5
);
137 pi2fd_r2r( mm3
, mm3
);
139 /* "Divide" by multiplying by inverse quantisation
140 and convert back to integers*/
141 pfmul_r2r( mm4
, mm2
);
142 pf2id_r2r( mm2
, mm2
);
143 pfmul_r2r( mm5
, mm3
);
144 pf2id_r2r( mm3
, mm3
);
147 /* Convert the two pairs of double words into four words */
148 packssdw_r2r( mm3
, mm2
);
151 /* Accumulate saturation... */
152 movq_r2r( mm2
, mm4
);
154 pxor_r2r( mm5
, mm5
); // mm5 = -mm2
155 pcmpgtw_r2r( mm1
, mm4
); // mm4 = (mm2 > satlim)
156 psubw_r2r( mm2
, mm5
);
157 pcmpgtw_r2r( mm1
, mm5
); // mm5 = -mm2 > satlim
158 por_r2r( mm5
, mm4
); // mm4 = abs(mm2) > satlim
159 movq_r2r( mm4
, mm5
);
163 movd_m2r( saturated
, mm5
); // saturated |= mm4
165 movd_r2m( mm5
, saturated
);
167 /* Store and accumulate zero-ness */
168 movq_r2r( mm2
, mm3
);
169 movq_r2m( mm2
, *(mmx_t
*)pdst
);
170 psrlq_i2r( 32, mm3
);
172 movd_r2m( mm2
, tmp
);
/* NOTE(review): presumably reached on the last 4-coeff group of each
   64-coefficient block -- on saturation, retry with a coarser
   quantiser if one exists, else fall back to the C reference. */
179 if( (i
& 63) == (63/4)*4 )
184 int new_mquant
= next_larger_quant( picture
, mquant
);
185 if( new_mquant
!= mquant
)
192 return quant_non_intra(picture
, src
, dst
, mquant
,
/* Shift this block's non-zero flag into the per-block result mask. */
197 nzflag
= (nzflag
<<1) | !!flags
;
205 //nzflag = (nzflag<<1) | (!!flags);
210 * SSE version: simply truncates to zero, however, the tables have a 2% bias
211 * upwards which partly compensates.
/* MXCSR image loaded via ldmxcsr before SSE quantisation:
   rounding control = round-toward-zero (truncate), all SSE
   floating-point exceptions masked. */
213 static int trunc_mxcsr
= 0x7f80;
/*
 * quant_non_intra_sse -- SSE quantiser for non-intra blocks.
 * Installs an MXCSR with round-toward-zero so cvtps2pi truncates,
 * converts coefficient pairs to single-precision floats, multiplies
 * by the reciprocal quantisation table i_inter_q_tblf[mquant]
 * ("division" by multiplication) and converts back to integers,
 * accumulating saturation against satlim.  On saturation it retries
 * with the next larger quantiser or falls back to the C reference
 * quant_non_intra().
 *
 * NOTE(review): this extract is missing many original source lines
 * (the embedded line numbers jump), so several referenced names
 * (i, piqf, saturated, tmp, block_count, mquant, nonsat_mquant) and
 * the loop/branch bodies plus the function tail are not visible
 * here -- confirm against the full file before editing.
 */
215 int quant_non_intra_sse(
216 pict_data_s
*picture
,
217 int16_t *src
, int16_t *dst
,
222 int satlim
= dctsatlim
;
224 int coeff_count
= 64*block_count
;
225 uint32_t nzflag
, flags
;
226 int16_t *psrc
, *pdst
;
231 /* Initialise zero block flags */
232 /* Load 1 into mm6 */
233 __asm__ ( "movl %0, %%eax\n"
234 "movd %%eax, %%mm6\n"
235 : :"g" (1) : "eax" );
236 /* Set up SSE rounding mode */
237 __asm__ ( "ldmxcsr %0\n" : : "X" (trunc_mxcsr
) );
239 /* Load satlim into mm1 */
240 movd_m2r( satlim
, mm1
);
241 punpcklwd_r2r( mm1
, mm1
);
/* satlim is now replicated into all four 16-bit lanes of mm1 */
242 punpckldq_r2r( mm1
, mm1
);
/* Select the per-mquant table of reciprocal quantisation factors (floats). */
244 i_quant_matf
= i_inter_q_tblf
[mquant
];
251 for (i
=0; i
< coeff_count
; i
+=4)
254 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
257 movq_m2r( *(mmx_t
*)&psrc
[0], mm2
);
258 movq_r2r( mm2
, mm7
);
259 psraw_i2r( 16, mm7
); /* Replicate sign bits mm2 in mm7 */
260 movq_r2r( mm2
, mm3
);
261 punpcklwd_r2r( mm7
, mm2
); /* Unpack with sign extensions */
262 punpckhwd_r2r( mm7
, mm3
);
264 /* Multiply by sixteen... */
269 Convert mm2 and mm3 to float's in xmm2 and xmm3
271 cvtpi2ps_r2r( mm2
, xmm2
);
272 cvtpi2ps_r2r( mm3
, xmm3
);
/* shuffle immediate 0x44: gather both converted float pairs into
   the four lanes of xmm2 (low pair from xmm2, high pair from xmm3) */
273 shufps_r2ri( xmm3
, xmm2
, 0*1 + 1*4 + 0 * 16 + 1 * 64 );
275 /* "Divide" by multiplying by inverse quantisation
276 and convert back to integers*/
277 mulps_m2r( *(mmx_t
*)&piqf
[0], xmm2
);
278 cvtps2pi_r2r( xmm2
, mm2
);
/* shuffle immediate 0x4e: move the upper float pair down so the
   second cvtps2pi converts coefficients 2-3 into mm3 */
279 shufps_r2ri( xmm2
, xmm2
, 2*1 + 3*4 + 0 * 16 + 1 * 64 );
280 cvtps2pi_r2r( xmm2
, mm3
);
282 /* Convert the two pairs of double words into four words */
283 packssdw_r2r( mm3
, mm2
);
286 /* Accumulate saturation... */
287 movq_r2r( mm2
, mm4
);
289 pxor_r2r( mm5
, mm5
); // mm5 = -mm2
290 pcmpgtw_r2r( mm1
, mm4
); // mm4 = (mm2 > satlim)
291 psubw_r2r( mm2
, mm5
);
292 pcmpgtw_r2r( mm1
, mm5
); // mm5 = -mm2 > satlim
293 por_r2r( mm5
, mm4
); // mm4 = abs(mm2) > satlim
294 movq_r2r( mm4
, mm5
);
298 movd_m2r( saturated
, mm5
); // saturated |= mm4
300 movd_r2m( mm5
, saturated
);
302 /* Store and accumulate zero-ness */
303 movq_r2r( mm2
, mm3
);
304 movq_r2m( mm2
, *(mmx_t
*)pdst
);
305 psrlq_i2r( 32, mm3
);
307 movd_r2m( mm2
, tmp
);
/* NOTE(review): presumably reached on the last 4-coeff group of each
   64-coefficient block -- on saturation, retry with a coarser
   quantiser if one exists, else fall back to the C reference. */
314 if( (i
& 63) == (63/4)*4 )
319 int new_mquant
= next_larger_quant( picture
, mquant
);
320 if( new_mquant
!= mquant
)
327 return quant_non_intra(picture
, src
, dst
, mquant
,
/* Shift this block's non-zero flag into the per-block result mask. */
332 nzflag
= (nzflag
<<1) | !!flags
;
340 //nzflag = (nzflag<<1) | (!!flags);
345 * The ordinary MMX version. Due to the limited dynamic range afforded by working
346 * with 16-bit int's it (a) has to jump through some gory fudge-factor hoops
347 * (b) give up in tough cases and fall back on the reference code. Fortunately, the
348 * latter happens *very* rarely.
350 * TODO Replace the inefficient block-by-block call to the assembler by a sweep
351 * through the whole lot...
/*
 * quant_non_intra_mmx -- MMX quantiser for non-intra blocks.
 * Delegates the per-block work to the assembler routine
 * quantize_ni_mmx(), whose return flags signal non-zero quantised
 * output (top 16 bits), saturation (bits 8-15) and input out of the
 * MMX routine's limited dynamic range (bits 0-7).  Saturation is
 * retried with the next larger quantiser; out-of-range input or
 * unavoidable saturation falls back to the 32-bit C reference
 * quant_non_intra().  mquant == 2 is handed straight to the C
 * version because the MMX path has no unsigned multiply.
 *
 * NOTE(review): this extract is missing many original source lines
 * (the embedded line numbers jump): declarations of comp, i,
 * imquant, nzflag, flags, saturated and several control-flow lines
 * (including the mquant==2 test guarding the first early return)
 * are not visible here -- confirm against the full file.
 */
354 int quant_non_intra_mmx(
355 pict_data_s
*picture
,
356 int16_t *src
, int16_t *dst
,
362 int clipvalue
= dctsatlim
;
365 uint16_t *quant_mat
= inter_q
;
367 uint16_t *i_quant_mat
= i_inter_q
;
369 int16_t *psrc
, *pdst
;
371 /* MMX routine does not work right for MQ=2 ... (no unsigned mult) */
374 return quant_non_intra(picture
, src
, dst
, mquant
, nonsat_mquant
);
376 /* If available use the fast MMX quantiser. It returns
377 flags to signal if coefficients are outside its limited range or
378 saturation would occur with the specified quantisation
380 Top 16 bits - non zero quantised coefficient present
381 Bits 8-15 - Saturation occurred
382 Bits 0-7 - Coefficient out of range.
/* Fixed-point reciprocal of mquant, scaled by IQUANT_SCALE. */
391 imquant
= (IQUANT_SCALE
/mquant
);
392 flags
= quantize_ni_mmx( pdst
, psrc
, quant_mat
, i_quant_mat
,
393 imquant
, mquant
, clipvalue
);
/* Shift this block's non-zero flag into the per-block result mask. */
394 nzflag
= (nzflag
<< 1) |( !!(flags
& 0xffff0000));
396 /* If we're saturating simply bump up quantization and start
397 from scratch... if we can't avoid saturation by
398 quantising then we're hosed and we fall back to
399 saturation using the old C code. */
401 if( (flags
& 0xff00) != 0 )
403 int new_mquant
= next_larger_quant( picture
, mquant
);
404 if( new_mquant
!= mquant
)
425 /* Fall back to 32-bit(or better - if some hero(ine) made this work on
426 non 32-bit int machines ;-)) if out of dynamic range for MMX...
429 while( comp
< block_count
&& (flags
& 0xff) == 0 );
432 /* Coefficient out of range or can't avoid saturation:
433 fall back to the original 32-bit int version: this is rare */
434 if( (flags
& 0xff) != 0 || saturated
)
436 return quant_non_intra(picture
, src
, dst
, mquant
, nonsat_mquant
);
/* Record the (possibly raised) quantiser actually used for the caller. */
439 *nonsat_mquant
= mquant
;
/*
 * iquant1_intra -- reference inverse quantisation of one intra block.
 * The DC coefficient dst[0] is rescaled according to the DC
 * precision (shift left by 3-dc_prec); AC coefficients are
 * multiplied by the intra quantisation matrix and mquant, divided
 * by 16, subjected to MPEG mismatch control (even non-zero values
 * are nudged one step toward zero, making them odd) and clamped to
 * the legal coefficient range [-2048, 2047].
 *
 * NOTE(review): this extract is incomplete -- the loop header over
 * i (presumably i = 1..63), the declarations of i and val, and the
 * function's closing lines are missing from this view.
 */
444 void iquant1_intra(int16_t *src
, int16_t *dst
, int dc_prec
, int mquant
)
447 uint16_t *quant_mat
= intra_q
;
449 dst
[0] = src
[0] << (3-dc_prec
);
452 val
= (int)(src
[i
]*quant_mat
[i
]*mquant
)/16;
454 /* mismatch control */
455 if ((val
&1)==0 && val
!=0)
456 val
+= (val
>0) ? -1 : 1;
/* Clamp to the 12-bit signed coefficient range. */
459 dst
[i
] = (val
>2047) ? 2047 : ((val
<-2048) ? -2048 : val
);