/* quantize_x86.c  Quantization / inverse quantization
   In compiler (gcc) embedded assembly language...
*/
/* Copyright (C) 2000 Andrew Stevens */
/* This program is free software; you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */
/*
 * 3DNow version of
 * Quantisation for non-intra blocks using Test Model 5 quantization
 *
 * this quantizer has a bias of 1/8 stepsize towards zero
 * (except for the DC coefficient)
 *
 * PRECONDITION: src and dst point to *distinct* memory buffers...
 *               of block_count *adjacent* int16_t[64] arrays...
 *
 * RETURN: 1 if non-zero coefficients are left after quantisation, 0 otherwise
 */
#include "config.h"
#include <stdio.h>
#include <math.h>
#include <fenv.h>
#include "global.h"
#include "cpu_accel.h"
#include "simd.h"
#include "attributes.h"
#include "mmx.h"
/*
 * Quantisation for non-intra blocks
 *
 * Various versions for various SIMD instruction sets.  Not all of them
 * bother to implement the test model 5 quantisation of the reference source
 * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient).
 *
 * Actually, as far as I can tell even the reference source doesn't quite do it
 * for non-intra (though it *does* for intra).
 *
 * Careful analysis of the code also suggests what it actually does is truncate
 * with a modest bias towards 1 (the d>>2 factor)
 *
 * PRECONDITION: src and dst point to *distinct* memory buffers...
 *               of block_count *adjacent* int16_t[64] arrays...
 *
 * RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
 *
 * 3D-Now version: simply truncates to zero; however, the tables have a 2% bias
 * upwards which partly compensates.
 */
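/*
 * For reference only: a minimal scalar sketch of the truncating non-intra
 * quantisation that the SIMD routines below approximate.  It reuses the
 * i_inter_q_tblf[] float inverse-quantisation table, dctsatlim and
 * block_count from this file, but the helper name quant_non_intra_sketch is
 * hypothetical, and it simply clamps instead of reproducing the
 * saturation/restart handling of the real routines.
 */
#if 0
static int quant_non_intra_sketch( int16_t *src, int16_t *dst, int mquant )
{
	float *piqf = i_inter_q_tblf[mquant];
	int nonzero = 0;
	int i;

	for( i = 0; i < 64*block_count; ++i )
	{
		/* Scale by 16, "divide" by multiplying with the inverse
		   quantisation factor, and truncate towards zero (as pf2id does). */
		int val = (int)( (float)(src[i] * 16) * piqf[i & 63] );
		if( val > dctsatlim )
			val = dctsatlim;
		else if( val < -dctsatlim )
			val = -dctsatlim;
		dst[i] = (int16_t)val;
		nonzero |= (val != 0);
	}
	return nonzero;
}
#endif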
int quant_non_intra_hv_3dnow(
	pict_data_s *picture,
	int16_t *src, int16_t *dst,
	int mquant,
	int *nonsat_mquant)
{
	int saturated;
	int satlim = dctsatlim;
	float *i_quant_matf;
	int coeff_count = 64*block_count;
	uint32_t nzflag, flags;
	int16_t *psrc, *pdst;
	float *piqf;
	int i;
	uint32_t tmp;

	/* Initialise zero block flags */
	/* Load 1 into mm6 */
	__asm__ ( "movl %0, %%eax\n"
			  "movd %%eax, %%mm6\n"
			  : :"g" (1) : "eax" );
	/* Load satlim into mm1 */
	movd_m2r( satlim, mm1 );
	punpcklwd_r2r( mm1, mm1 );
	punpckldq_r2r( mm1, mm1 );
restart:
	i_quant_matf = i_inter_q_tblf[mquant];
	flags = 0;
	piqf = i_quant_matf;
	saturated = 0;
	nzflag = 0;
	psrc = src;
	pdst = dst;
	for (i=0; i < coeff_count ; i+=4)
	{

		/* TODO: For maximum efficiency this should be unrolled to allow
		   f.p. and int MMX to be interleaved...
		*/

		/* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
		*/
		movq_m2r( *(mmx_t *)&psrc[0], mm2 );
		movq_r2r( mm2, mm7 );
		psraw_i2r( 16, mm7 );          /* Replicate sign bits mm2 in mm7 */
		movq_r2r( mm2, mm3 );
		punpcklwd_r2r( mm7, mm2 );     /* Unpack with sign extensions */
		punpckhwd_r2r( mm7, mm3 );

		/* Multiply by sixteen... */
		pslld_i2r( 4, mm2 );
		pslld_i2r( 4, mm3 );

		/*
		   Load the inverse quantisation factors from the
		   table into mm4 and mm5.
		   Interleaved with converting mm2 and mm3 to floats
		   to (hopefully) maximise parallelism.
		 */
		movq_m2r( *(mmx_t*)&piqf[0], mm4 );
		pi2fd_r2r( mm2, mm2 );
		movq_m2r( *(mmx_t*)&piqf[2], mm5 );
		pi2fd_r2r( mm3, mm3 );

		/* "Divide" by multiplying by inverse quantisation
		   and convert back to integers */
		pfmul_r2r( mm4, mm2 );
		pf2id_r2r( mm2, mm2 );
		pfmul_r2r( mm5, mm3 );
		pf2id_r2r( mm3, mm3 );

		/* Convert the two pairs of double words into four words */
		packssdw_r2r( mm3, mm2 );

		/* Accumulate saturation... */
		movq_r2r( mm2, mm4 );

		pxor_r2r( mm5, mm5 );          // mm5 = -mm2
		pcmpgtw_r2r( mm1, mm4 );       // mm4 = (mm2 > satlim)
		psubw_r2r( mm2, mm5 );
		pcmpgtw_r2r( mm1, mm5 );       // mm5 = -mm2 > satlim
		por_r2r( mm5, mm4 );           // mm4 = abs(mm2) > satlim
		movq_r2r( mm4, mm5 );
		psrlq_i2r( 32, mm5 );
		por_r2r( mm5, mm4 );

		movd_m2r( saturated, mm5 );    // saturated |= mm4
		por_r2r( mm4, mm5 );
		movd_r2m( mm5, saturated );

		/* Store and accumulate zero-ness */
		movq_r2r( mm2, mm3 );
		movq_r2m( mm2, *(mmx_t*)pdst );
		psrlq_i2r( 32, mm3 );
		por_r2r( mm3, mm2 );
		movd_r2m( mm2, tmp );
		flags |= tmp;

		piqf += 4;
		pdst += 4;
		psrc += 4;
		/* (i & 63) == 60: we've just finished a 64-coefficient block */
		if( (i & 63) == (63/4)*4 )
		{
			if( saturated )
			{
				int new_mquant = next_larger_quant_hv( picture, mquant );
				if( new_mquant != mquant )
				{
					mquant = new_mquant;
					goto restart;
				}
				else
				{
					return quant_non_intra_hv(picture, src, dst, mquant,
											  nonsat_mquant);
				}
			}

			nzflag = (nzflag<<1) | !!flags;
			flags = 0;
			piqf = i_quant_matf;
		}
	}
	femms();

	//nzflag = (nzflag<<1) | (!!flags);
	return nzflag;
}
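/*
 * A minimal sketch (illustration only, wrapped in #if 0) of how a caller
 * might decode the bit-mask returned above; report_nonzero_blocks is a
 * hypothetical helper, not used elsewhere in mpeg2enc.  Because nzflag is
 * shifted left once per completed block, the first block's flag ends up in
 * the most significant of the block_count bits.
 */
#if 0
static void report_nonzero_blocks( uint32_t nzflag, int nblocks )
{
	int blk;
	for( blk = 0; blk < nblocks; ++blk )
	{
		int bit = nblocks - 1 - blk;	/* earliest block = highest bit */
		printf( "block %d: %s\n", blk,
				((nzflag >> bit) & 1) ? "non-zero" : "all zero" );
	}
}
#endif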
/* Disabled in mjpegtools */

#if 0

/*
 * SSE version: simply truncates to zero; however, the tables have a 2% bias
 * upwards which partly compensates.
 */

static int trunc_mxcsr = 0x7f80;	/* MXCSR: all exceptions masked, round towards zero (truncate) */
int quant_non_intra_hv_sse(
	pict_data_s *picture,
	int16_t *src, int16_t *dst,
	int mquant,
	int *nonsat_mquant)
{
	int saturated;
	int satlim = dctsatlim;
	float *i_quant_matf;
	int coeff_count = 64*block_count;
	uint32_t nzflag, flags;
	int16_t *psrc, *pdst;
	float *piqf;
	int i;
	uint32_t tmp;

	/* Initialise zero block flags */
	/* Load 1 into mm6 */
	__asm__ ( "movl %0, %%eax\n"
			  "movd %%eax, %%mm6\n"
			  : :"g" (1) : "eax" );
	/* Set up SSE rounding mode */
	__asm__ ( "ldmxcsr (%0)\n" : : "X" (trunc_mxcsr) );

	/* Load satlim into mm1 */
	movd_m2r( satlim, mm1 );
	punpcklwd_r2r( mm1, mm1 );
	punpckldq_r2r( mm1, mm1 );
restart:
	i_quant_matf = i_inter_q_tblf[mquant];
	flags = 0;
	piqf = i_quant_matf;
	saturated = 0;
	nzflag = 0;
	psrc = src;
	pdst = dst;
	for (i=0; i < coeff_count ; i+=4)
	{

		/* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
		*/
		movq_m2r( *(mmx_t *)&psrc[0], mm2 );
		movq_r2r( mm2, mm7 );
		psraw_i2r( 16, mm7 );          /* Replicate sign bits mm2 in mm7 */
		movq_r2r( mm2, mm3 );
		punpcklwd_r2r( mm7, mm2 );     /* Unpack with sign extensions */
		punpckhwd_r2r( mm7, mm3 );

		/* Multiply by sixteen... */
		pslld_i2r( 4, mm2 );
		pslld_i2r( 4, mm3 );

		/*
		   Convert mm2 and mm3 to floats in xmm2 and xmm3
		 */
		cvtpi2ps_r2r( mm2, xmm2 );
		cvtpi2ps_r2r( mm3, xmm3 );
		shufps_r2ri( xmm3, xmm2, 0*1 + 1*4 + 0*16 + 1*64 );

		/* "Divide" by multiplying by inverse quantisation
		   and convert back to integers */
		mulps_m2r( *(mmx_t*)&piqf[0], xmm2 );
		cvtps2pi_r2r( xmm2, mm2 );
		shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0*16 + 1*64 );
		cvtps2pi_r2r( xmm2, mm3 );

		/* Convert the two pairs of double words into four words */
		packssdw_r2r( mm3, mm2 );

		/* Accumulate saturation... */
		movq_r2r( mm2, mm4 );

		pxor_r2r( mm5, mm5 );          // mm5 = -mm2
		pcmpgtw_r2r( mm1, mm4 );       // mm4 = (mm2 > satlim)
		psubw_r2r( mm2, mm5 );
		pcmpgtw_r2r( mm1, mm5 );       // mm5 = -mm2 > satlim
		por_r2r( mm5, mm4 );           // mm4 = abs(mm2) > satlim
		movq_r2r( mm4, mm5 );
		psrlq_i2r( 32, mm5 );
		por_r2r( mm5, mm4 );

		movd_m2r( saturated, mm5 );    // saturated |= mm4
		por_r2r( mm4, mm5 );
		movd_r2m( mm5, saturated );

		/* Store and accumulate zero-ness */
		movq_r2r( mm2, mm3 );
		movq_r2m( mm2, *(mmx_t*)pdst );
		psrlq_i2r( 32, mm3 );
		por_r2r( mm3, mm2 );
		movd_r2m( mm2, tmp );
		flags |= tmp;

		piqf += 4;
		pdst += 4;
		psrc += 4;
		/* (i & 63) == 60: we've just finished a 64-coefficient block */
		if( (i & 63) == (63/4)*4 )
		{
			if( saturated )
			{
				int new_mquant = next_larger_quant_hv( picture, mquant );
				if( new_mquant != mquant )
				{
					mquant = new_mquant;
					goto restart;
				}
				else
				{
					return quant_non_intra_hv(picture, src, dst, mquant,
											  nonsat_mquant);
				}
			}

			nzflag = (nzflag<<1) | !!flags;
			flags = 0;
			piqf = i_quant_matf;
		}
	}
	emms();

	//nzflag = (nzflag<<1) | (!!flags);
	return nzflag;
}

#endif
/*
 * The ordinary MMX version.  Due to the limited dynamic range afforded by working
 * with 16-bit ints it (a) has to jump through some gory fudge-factor hoops and
 * (b) gives up in tough cases and falls back on the reference code.  Fortunately, the
 * latter happens *very* rarely.
 *
 * TODO: Replace the inefficient block-by-block call to the assembler by a sweep
 * through the whole lot...
 */
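/*
 * For illustration only: the flag word returned by quantize_ni_mmx is tested
 * below with the masks 0xffff0000, 0xff00 and 0xff.  A sketch of that
 * decomposition, using hypothetical macro names that are not part of this
 * file:
 */
#if 0
#define QNI_FLAG_NONZERO(flags)      (((flags) & 0xffff0000) != 0)	/* non-zero quantised coefficient present */
#define QNI_FLAG_SATURATED(flags)    (((flags) & 0x0000ff00) != 0)	/* saturation occurred */
#define QNI_FLAG_OUT_OF_RANGE(flags) (((flags) & 0x000000ff) != 0)	/* coefficient out of 16-bit range */
#endif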
int quant_non_intra_hv_mmx(
	pict_data_s *picture,
	int16_t *src, int16_t *dst,
	int mquant,
	int *nonsat_mquant)
{
	int nzflag;
	int clipvalue = dctsatlim;
	int flags = 0;
	int saturated = 0;
	uint16_t *quant_mat = inter_q;
	int comp;
	uint16_t *i_quant_mat = i_inter_q;
	int imquant;
	int16_t *psrc, *pdst;
	/* MMX routine does not work right for MQ=2 ... (no unsigned mult) */
	if( mquant == 2 )
	{
		return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
	}
	/* If available use the fast MMX quantiser.  It returns
	   flags to signal if coefficients are outside its limited range or
	   saturation would occur with the specified quantisation
	   factor:
	   Top 16 bits - non-zero quantised coefficient present
	   Bits 8-15   - saturation occurred
	   Bits 0-7    - coefficient out of range.
	*/

	nzflag = 0;
	pdst = dst;
	psrc = src;
	comp = 0;
	do
	{
		imquant = (IQUANT_SCALE/mquant);
		flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat,
								 imquant, mquant, clipvalue );
		nzflag = (nzflag << 1) | ( !!(flags & 0xffff0000) );
		/* If we're saturating simply bump up quantization and start
		   from scratch... if we can't avoid saturation by
		   quantising then we're hosed and we fall back to
		   saturation using the old C code. */

		if( (flags & 0xff00) != 0 )
		{
			int new_mquant = next_larger_quant_hv( picture, mquant );
			if( new_mquant != mquant )
			{
				mquant = new_mquant;
			}
			else
			{
				saturated = 1;
				break;
			}
			comp = 0;
			nzflag = 0;
			pdst = dst;
			psrc = src;
		}
		else
		{
			++comp;
			pdst += 64;
			psrc += 64;
		}
		/* Fall back to 32-bit (or better - if some hero(ine) made this work on
		   non 32-bit int machines ;-)) if out of dynamic range for MMX...
		*/
	}
	while( comp < block_count && (flags & 0xff) == 0 );
	/* Coefficient out of range or can't avoid saturation:
	   fall back to the original 32-bit int version: this is rare */
	if( (flags & 0xff) != 0 || saturated )
	{
		return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
	}

	*nonsat_mquant = mquant;
	return nzflag;
}
void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
{
	int i, val;
	uint16_t *quant_mat = intra_q;

	dst[0] = src[0] << (3-dc_prec);
	for (i=1; i<64; i++)
	{
		val = (int)(src[i]*quant_mat[i]*mquant)/16;

		/* mismatch control */
		if ((val&1)==0 && val!=0)
			val += (val>0) ? -1 : 1;

		/* saturation */
		dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
	}
}
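/*
 * For illustration only (wrapped in #if 0): a worked example of the mismatch
 * control above.  The helper iquant1_intra_example is hypothetical, and the
 * values quant_mat[i] == 16 and mquant == 2 are assumed for the coefficient
 * shown.
 */
#if 0
static void iquant1_intra_example(void)
{
	/* val = (10*16*2)/16 = 20; even and non-zero, so it is nudged towards
	   zero to the odd value 19, which lies inside the [-2048, 2047] clamp. */
	int val = (10*16*2)/16;
	if ((val&1)==0 && val!=0)
		val += (val>0) ? -1 : 1;
	printf("quantised coefficient: %d\n", val);	/* prints 19 */
}
#endif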