r125: This commit was manufactured by cvs2svn to create tag 'r1_1_7-last'.
[cinelerra_cv/mob.git] / hvirtual / mpeg2enc / quantize_x86.c
blob2476082df489ec32971831fd1d0e47e5d49cab82
1 /* quantize_x86.c Quantization / inverse quantization
2 In compiler (gcc) embedded assembly language...
3 */
5 /* Copyright (C) 2000 Andrew Stevens */
7 /* This program is free software; you can redistribute it
8 * and/or modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2 of
10 * the License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
20 * 02111-1307, USA.
26 /*
27 * 3DNow version of
28 * Quantisation for non-intra blocks using Test Model 5 quantization
30 * this quantizer has a bias of 1/8 stepsize towards zero
31 * (except for the DC coefficient)
33 * PRECONDITION: src dst point to *distinct* memory buffers...
34 * of block_count *adjacent* int16_t[64] arrays...
36 * RETURN: 1 If non-zero coefficients left after quantisation 0 otherwise
39 #include "config.h"
40 #include <stdio.h>
41 #include <math.h>
42 #include <fenv.h>
43 #include "global.h"
44 #include "cpu_accel.h"
45 #include "simd.h"
46 #include "attributes.h"
47 #include "mmx.h"
49 /*
50 * Quantisation for non-intra blocks
52 * Various versions for various SIMD instruction sets. Not all of them
53 * bother to implement the test model 5 quantisation of the reference source
54 * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient).
56 * Actually, as far as I can tell even the reference source doesn't quite do it
57 * for non-intra (though it *does* for intra).
59 * Careful analysis of the code also suggests what it actually does is truncate
60 * with a modest bias towards 1 (the d>>2 factor)
62 * PRECONDITION: src dst point to *distinct* memory buffers...
63 * of block_count *adjacent* int16_t[64] arrays...
65 *RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
70 * 3D-Now version: simply truncates to zero, however, the tables have a 2% bias
71 * upwards which partly compensates.
74 int quant_non_intra_3dnow(
75 pict_data_s *picture,
76 int16_t *src, int16_t *dst,
77 int mquant,
78 int *nonsat_mquant)
80 int saturated;
81 int satlim = dctsatlim;
82 float *i_quant_matf;
83 int coeff_count = 64*block_count;
84 uint32_t nzflag, flags;
85 int16_t *psrc, *pdst;
86 float *piqf;
87 int i;
88 uint32_t tmp;
90 /* Initialise zero block flags */
91 /* Load 1 into mm6 */
92 __asm__ ( "movl %0, %%eax\n"
93 "movd %%eax, %%mm6\n"
94 : :"g" (1) : "eax" );
95 /* Load satlim into mm1 */
96 movd_m2r( satlim, mm1 );
97 punpcklwd_r2r( mm1, mm1 );
98 punpckldq_r2r( mm1, mm1 );
99 restart:
100 i_quant_matf = i_inter_q_tblf[mquant];
101 flags = 0;
102 piqf = i_quant_matf;
103 saturated = 0;
104 nzflag = 0;
105 psrc = src;
106 pdst = dst;
107 for (i=0; i < coeff_count ; i+=4)
110 /* TODO: For maximum efficiency this should be unrolled to allow
111 f.p. and int MMX to be interleaved...
114 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
117 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
118 movq_r2r( mm2, mm7 );
119 psraw_i2r( 16, mm7 ); /* Replicate sign bits mm2 in mm7 */
120 movq_r2r( mm2, mm3 );
121 punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
122 punpckhwd_r2r( mm7, mm3);
124 /* Multiply by sixteen... */
125 pslld_i2r( 4, mm2 );
126 pslld_i2r( 4, mm3 );
129 Load the inverse quantisation factors from the
130 table in to mm4 and mm5
131 Interleaved with converting mm2 and mm3 to float's
132 to (hopefully) maximise parallelism.
134 movq_m2r( *(mmx_t*)&piqf[0], mm4);
135 pi2fd_r2r( mm2, mm2);
136 movq_m2r( *(mmx_t*)&piqf[2], mm5);
137 pi2fd_r2r( mm3, mm3);
139 /* "Divide" by multiplying by inverse quantisation
140 and convert back to integers*/
141 pfmul_r2r( mm4, mm2 );
142 pf2id_r2r( mm2, mm2);
143 pfmul_r2r( mm5, mm3);
144 pf2id_r2r( mm3, mm3);
147 /* Convert the two pairs of double words into four words */
148 packssdw_r2r( mm3, mm2);
151 /* Accumulate saturation... */
152 movq_r2r( mm2, mm4 );
154 pxor_r2r( mm5, mm5 ); // mm5 = -mm2
155 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
156 psubw_r2r( mm2, mm5 );
157 pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
158 por_r2r( mm5, mm4 ); // mm4 = abs(mm2) > satlim
159 movq_r2r( mm4, mm5 );
160 psrlq_i2r( 32, mm5);
161 por_r2r( mm5, mm4 );
163 movd_m2r( saturated, mm5 ); // saturated |= mm4
164 por_r2r( mm4, mm5 );
165 movd_r2m( mm5, saturated );
167 /* Store and accumulate zero-ness */
168 movq_r2r( mm2, mm3 );
169 movq_r2m( mm2, *(mmx_t*)pdst );
170 psrlq_i2r( 32, mm3 );
171 por_r2r( mm3, mm2 );
172 movd_r2m( mm2, tmp );
173 flags |= tmp;
175 piqf += 4;
176 pdst += 4;
177 psrc += 4;
179 if( (i & 63) == (63/4)*4 )
182 if( saturated )
184 int new_mquant = next_larger_quant( picture, mquant );
185 if( new_mquant != mquant )
187 mquant = new_mquant;
188 goto restart;
190 else
192 return quant_non_intra(picture, src, dst, mquant,
193 nonsat_mquant);
197 nzflag = (nzflag<<1) | !!flags;
198 flags = 0;
199 piqf = i_quant_matf;
203 femms();
205 //nzflag = (nzflag<<1) | (!!flags);
206 return nzflag;
210 * SSE version: simply truncates to zero, however, the tables have a 2% bias
211 * upwards which partly compensates.
213 static int trunc_mxcsr = 0x7f80;
215 int quant_non_intra_sse(
216 pict_data_s *picture,
217 int16_t *src, int16_t *dst,
218 int mquant,
219 int *nonsat_mquant)
221 int saturated;
222 int satlim = dctsatlim;
223 float *i_quant_matf;
224 int coeff_count = 64*block_count;
225 uint32_t nzflag, flags;
226 int16_t *psrc, *pdst;
227 float *piqf;
228 int i;
229 uint32_t tmp;
231 /* Initialise zero block flags */
232 /* Load 1 into mm6 */
233 __asm__ ( "movl %0, %%eax\n"
234 "movd %%eax, %%mm6\n"
235 : :"g" (1) : "eax" );
236 /* Set up SSE rounding mode */
237 __asm__ ( "ldmxcsr %0\n" : : "X" (trunc_mxcsr) );
239 /* Load satlim into mm1 */
240 movd_m2r( satlim, mm1 );
241 punpcklwd_r2r( mm1, mm1 );
242 punpckldq_r2r( mm1, mm1 );
243 restart:
244 i_quant_matf = i_inter_q_tblf[mquant];
245 flags = 0;
246 piqf = i_quant_matf;
247 saturated = 0;
248 nzflag = 0;
249 psrc = src;
250 pdst = dst;
251 for (i=0; i < coeff_count ; i+=4)
254 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
257 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
258 movq_r2r( mm2, mm7 );
259 psraw_i2r( 16, mm7 ); /* Replicate sign bits mm2 in mm7 */
260 movq_r2r( mm2, mm3 );
261 punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
262 punpckhwd_r2r( mm7, mm3);
264 /* Multiply by sixteen... */
265 pslld_i2r( 4, mm2 );
266 pslld_i2r( 4, mm3 );
269 Convert mm2 and mm3 to float's in xmm2 and xmm3
271 cvtpi2ps_r2r( mm2, xmm2 );
272 cvtpi2ps_r2r( mm3, xmm3 );
273 shufps_r2ri( xmm3, xmm2, 0*1 + 1*4 + 0 * 16 + 1 * 64 );
275 /* "Divide" by multiplying by inverse quantisation
276 and convert back to integers*/
277 mulps_m2r( *(mmx_t*)&piqf[0], xmm2 );
278 cvtps2pi_r2r( xmm2, mm2 );
279 shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0 * 16 + 1 * 64 );
280 cvtps2pi_r2r( xmm2, mm3 );
282 /* Convert the two pairs of double words into four words */
283 packssdw_r2r( mm3, mm2);
286 /* Accumulate saturation... */
287 movq_r2r( mm2, mm4 );
289 pxor_r2r( mm5, mm5 ); // mm5 = -mm2
290 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
291 psubw_r2r( mm2, mm5 );
292 pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
293 por_r2r( mm5, mm4 ); // mm4 = abs(mm2) > satlim
294 movq_r2r( mm4, mm5 );
295 psrlq_i2r( 32, mm5);
296 por_r2r( mm5, mm4 );
298 movd_m2r( saturated, mm5 ); // saturated |= mm4
299 por_r2r( mm4, mm5 );
300 movd_r2m( mm5, saturated );
302 /* Store and accumulate zero-ness */
303 movq_r2r( mm2, mm3 );
304 movq_r2m( mm2, *(mmx_t*)pdst );
305 psrlq_i2r( 32, mm3 );
306 por_r2r( mm3, mm2 );
307 movd_r2m( mm2, tmp );
308 flags |= tmp;
310 piqf += 4;
311 pdst += 4;
312 psrc += 4;
314 if( (i & 63) == (63/4)*4 )
317 if( saturated )
319 int new_mquant = next_larger_quant( picture, mquant );
320 if( new_mquant != mquant )
322 mquant = new_mquant;
323 goto restart;
325 else
327 return quant_non_intra(picture, src, dst, mquant,
328 nonsat_mquant);
332 nzflag = (nzflag<<1) | !!flags;
333 flags = 0;
334 piqf = i_quant_matf;
338 emms();
340 //nzflag = (nzflag<<1) | (!!flags);
341 return nzflag;
345 * The ordinary MMX version. Due to the limited dynamic range afforded by working
346 * with 16-bit int's it (a) has to jump through some gory fudge-factor hoops
347 * (b) give up in tough cases and fall back on the reference code. Fortunately, the
348 * latter happens *very* rarely.
350 * TODO Replace the inefficient block-by-block call to the assembler by a sweep
351 * through the whole lot...
354 int quant_non_intra_mmx(
355 pict_data_s *picture,
356 int16_t *src, int16_t *dst,
357 int mquant,
358 int *nonsat_mquant)
361 int nzflag;
362 int clipvalue = dctsatlim;
363 int flags = 0;
364 int saturated = 0;
365 uint16_t *quant_mat = inter_q;
366 int comp;
367 uint16_t *i_quant_mat = i_inter_q;
368 int imquant;
369 int16_t *psrc, *pdst;
371 /* MMX routine does not work right for MQ=2 ... (no unsigned mult) */
372 if( mquant == 2 )
374 return quant_non_intra(picture, src, dst, mquant, nonsat_mquant);
376 /* If available use the fast MMX quantiser. It returns
377 flags to signal if coefficients are outside its limited range or
378 saturation would occur with the specified quantisation
379 factor
380 Top 16 bits - non zero quantised coefficient present
381 Bits 8-15 - Saturation occurred
382 Bits 0-7 - Coefficient out of range.
385 nzflag = 0;
386 pdst = dst;
387 psrc = src;
388 comp = 0;
391 imquant = (IQUANT_SCALE/mquant);
392 flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat,
393 imquant, mquant, clipvalue );
394 nzflag = (nzflag << 1) |( !!(flags & 0xffff0000));
396 /* If we're saturating simply bump up quantization and start
397 from scratch... if we can't avoid saturation by
398 quantising then we're hosed and we fall back to
399 saturation using the old C code. */
401 if( (flags & 0xff00) != 0 )
403 int new_mquant = next_larger_quant( picture, mquant );
404 if( new_mquant != mquant )
406 mquant = new_mquant;
408 else
410 saturated = 1;
411 break;
414 comp = 0;
415 nzflag = 0;
416 pdst = dst;
417 psrc = src;
419 else
421 ++comp;
422 pdst += 64;
423 psrc +=64;
425 /* Fall back to 32-bit(or better - if some hero(ine) made this work on
426 non 32-bit int machines ;-)) if out of dynamic range for MMX...
429 while( comp < block_count && (flags & 0xff) == 0 );
432 /* Coefficient out of range or can't avoid saturation:
433 fall back to the original 32-bit int version: this is rare */
434 if( (flags & 0xff) != 0 || saturated)
436 return quant_non_intra(picture, src, dst, mquant, nonsat_mquant);
439 *nonsat_mquant = mquant;
440 return nzflag;
444 void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
446 int i, val;
447 uint16_t *quant_mat = intra_q;
449 dst[0] = src[0] << (3-dc_prec);
450 for (i=1; i<64; i++)
452 val = (int)(src[i]*quant_mat[i]*mquant)/16;
454 /* mismatch control */
455 if ((val&1)==0 && val!=0)
456 val+= (val>0) ? -1 : 1;
458 /* saturation */
459 dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);