hvirtual/mpeg2enc/quantize_x86.c

/* quantize_x86.c Quantization / inverse quantization
   In compiler (gcc) embedded assembly language...
*/
   4
   5 /* Copyright (C) 2000 Andrew Stevens */
   6
   7 /* This program is free software; you can redistribute it
   8  *  and/or modify it under the terms of the GNU General Public License
   9  *  as published by the Free Software Foundation; either version 2 of
  10  *  the License, or (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  20  * 02111-1307, USA.
  21  *
  22  */
  23
  24
  25
/*
 * 3DNow version of
 * Quantisation for non-intra blocks using Test Model 5 quantization
 *
 * this quantizer has a bias of 1/8 stepsize towards zero
 * (except for the DC coefficient)
 *
 *      PRECONDITION: src dst point to *distinct* memory buffers...
 *                    of block_count *adjacent* int16_t[64] arrays...
 *
 * RETURN: 1 if non-zero coefficients are left after quantisation, 0 otherwise
 */
  38
  39 #include "config.h"
  40 #include <stdio.h>
  41 #include <math.h>
  42 #include <fenv.h>
  43 #include "global.h"
  44 #include "cpu_accel.h"
  45 #include "simd.h"
  46 #include "attributes.h"
  47 #include "mmx.h"
  48
/*
 * Quantisation for non-intra blocks
 *
 * Various versions for various SIMD instruction sets.  Not all of them
 * bother to implement the test model 5 quantisation of the reference source
 * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient).
 *
 * Actually, as far as I can tell even the reference source doesn't quite do it
 * for non-intra (though it *does* for intra).
 *
 * Careful analysis of the code also suggests what it actually does is truncate
 * with a modest bias towards 1 (the d>>2 factor).
 *
 *      PRECONDITION: src and dst point to *distinct* memory buffers...
 *                    of block_count *adjacent* int16_t[64] arrays...
 *
 * RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
 */
  67
  68
  69 /*
  70  * 3D-Now version: simply truncates to zero, however, the tables have a 2% bias
  71  * upwards which partly compensates.
  72  */
  73
  74 int quant_non_intra_3dnow(
  75         pict_data_s *picture,
  76         int16_t *src, int16_t *dst,
  77         int mquant,
  78         int *nonsat_mquant)
  79 {
  80         int saturated;
  81         int satlim = dctsatlim;
  82         float *i_quant_matf;
  83         int   coeff_count = 64*block_count;
  84         uint32_t nzflag, flags;
  85         int16_t *psrc, *pdst;
  86         float *piqf;
  87         int i;
  88         uint32_t tmp;
  89
  90         /* Initialise zero block flags */
  91         /* Load 1 into mm6 */
  92         __asm__ ( "movl %0, %%eax\n"
  93                           "movd %%eax, %%mm6\n"
  94                           : :"g" (1) : "eax" );
  95         /* Load satlim into mm1 */
  96         movd_m2r( satlim, mm1 );
  97         punpcklwd_r2r( mm1, mm1 );
  98         punpckldq_r2r( mm1, mm1 );
  99 restart:
 100         i_quant_matf = i_inter_q_tblf[mquant];
 101         flags = 0;
 102         piqf = i_quant_matf;
 103         saturated = 0;
 104         nzflag = 0;
 105         psrc = src;
 106         pdst = dst;
 107         for (i=0; i < coeff_count ; i+=4)
 108         {
 109
 110                 /* TODO: For maximum efficiency this should be unrolled to allow
 111                    f.p. and int MMX to be interleaved...
 112                 */
 113
 114                 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
 115                  */
 116
 117                 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
 118                 movq_r2r( mm2, mm7 );
 119                 psraw_i2r( 16, mm7 );   /* Replicate sign bits mm2 in mm7 */
 120                 movq_r2r( mm2, mm3 );
 121                 punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
 122                 punpckhwd_r2r( mm7, mm3);
 123
 124                 /* Multiply by sixteen... */
 125                 pslld_i2r( 4, mm2 );
 126                 pslld_i2r( 4, mm3 );
 127
 128                 /*
 129                    Load the inverse quantisation factors from the
 130                    table in to mm4 and mm5
 131                    Interleaved with converting mm2 and mm3 to float's
 132                    to (hopefully) maximise parallelism.
 133                  */
 134                 movq_m2r( *(mmx_t*)&piqf[0], mm4);
 135                 pi2fd_r2r( mm2, mm2);
 136                 movq_m2r( *(mmx_t*)&piqf[2], mm5);
 137                 pi2fd_r2r( mm3, mm3);
 138
 139                 /* "Divide" by multiplying by inverse quantisation
 140                  and convert back to integers*/
 141                 pfmul_r2r( mm4, mm2 );
 142                 pf2id_r2r( mm2, mm2);
 143                 pfmul_r2r( mm5, mm3);
 144                 pf2id_r2r( mm3, mm3);
 145
 146
 147                 /* Convert the two pairs of double words into four words */
 148                 packssdw_r2r(  mm3, mm2);
 149
 150
 151                 /* Accumulate saturation... */
 152                 movq_r2r( mm2, mm4 );
 153
 154                 pxor_r2r( mm5, mm5 );   // mm5 = -mm2
 155                 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
 156                 psubw_r2r( mm2, mm5 );
 157                 pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
 158                 por_r2r( mm5, mm4 );  // mm4 = abs(mm2) > satlim
 159                 movq_r2r( mm4, mm5 );
 160                 psrlq_i2r( 32, mm5);
 161                 por_r2r( mm5, mm4 );
 162
 163                 movd_m2r( saturated, mm5 ); // saturated |= mm4
 164                 por_r2r( mm4, mm5 );
 165                 movd_r2m( mm5, saturated );
 166
 167                 /* Store and accumulate zero-ness */
 168                 movq_r2r( mm2, mm3 );
 169                 movq_r2m( mm2, *(mmx_t*)pdst );
 170                 psrlq_i2r( 32, mm3 );
 171                 por_r2r( mm3, mm2 );
 172                 movd_r2m( mm2, tmp );
 173                 flags |= tmp;
 174
 175                 piqf += 4;
 176                 pdst += 4;
 177                 psrc += 4;
 178
 179                 if( (i & 63) == (63/4)*4 )
 180                 {
 181
 182                         if( saturated )
 183                         {
 184                                 int new_mquant = next_larger_quant( picture, mquant );
 185                                 if( new_mquant != mquant )
 186                                 {
 187                                         mquant = new_mquant;
 188                                         goto restart;
 189                                 }
 190                                 else
 191                                 {
 192                                         return quant_non_intra(picture, src, dst, mquant,
 193                                                                                    nonsat_mquant);
 194                                 }
 195                         }
 196
 197                         nzflag = (nzflag<<1) | !!flags;
 198                         flags = 0;
 199                         piqf = i_quant_matf;
 200                 }
 201
 202         }
 203         femms();
 204
 205         //nzflag = (nzflag<<1) | (!!flags);
 206         return nzflag;
 207 }
 208
 209 /*
 210  * SSE version: simply truncates to zero, however, the tables have a 2% bias
 211  * upwards which partly compensates.
 212  */
 213 static int trunc_mxcsr = 0x7f80;
 214
 215 int quant_non_intra_sse(
 216         pict_data_s *picture,
 217         int16_t *src, int16_t *dst,
 218         int mquant,
 219         int *nonsat_mquant)
 220 {
 221         int saturated;
 222         int satlim = dctsatlim;
 223         float *i_quant_matf;
 224         int   coeff_count = 64*block_count;
 225         uint32_t nzflag, flags;
 226         int16_t *psrc, *pdst;
 227         float *piqf;
 228         int i;
 229         uint32_t tmp;
 230
 231         /* Initialise zero block flags */
 232         /* Load 1 into mm6 */
 233         __asm__ ( "movl %0, %%eax\n"
 234                           "movd %%eax, %%mm6\n"
 235                           : :"g" (1) : "eax" );
 236         /* Set up SSE rounding mode */
 237         __asm__ ( "ldmxcsr %0\n" : : "X" (trunc_mxcsr) );
 238
 239         /* Load satlim into mm1 */
 240         movd_m2r( satlim, mm1 );
 241         punpcklwd_r2r( mm1, mm1 );
 242         punpckldq_r2r( mm1, mm1 );
 243 restart:
 244         i_quant_matf = i_inter_q_tblf[mquant];
 245         flags = 0;
 246         piqf = i_quant_matf;
 247         saturated = 0;
 248         nzflag = 0;
 249         psrc = src;
 250         pdst = dst;
 251         for (i=0; i < coeff_count ; i+=4)
 252         {
 253
 254                 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
 255                  */
 256
 257                 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
 258                 movq_r2r( mm2, mm7 );
 259                 psraw_i2r( 16, mm7 );   /* Replicate sign bits mm2 in mm7 */
 260                 movq_r2r( mm2, mm3 );
 261                 punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
 262                 punpckhwd_r2r( mm7, mm3);
 263
 264                 /* Multiply by sixteen... */
 265                 pslld_i2r( 4, mm2 );
 266                 pslld_i2r( 4, mm3 );
 267
 268                 /*
 269                   Convert mm2 and mm3 to float's  in xmm2 and xmm3
 270                  */
 271                 cvtpi2ps_r2r( mm2, xmm2 );
 272                 cvtpi2ps_r2r( mm3, xmm3 );
 273                 shufps_r2ri(  xmm3, xmm2, 0*1 + 1*4 + 0 * 16 + 1 * 64 );
 274
 275                 /* "Divide" by multiplying by inverse quantisation
 276                  and convert back to integers*/
 277                 mulps_m2r( *(mmx_t*)&piqf[0], xmm2 );
 278                 cvtps2pi_r2r( xmm2, mm2 );
 279                 shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0 * 16 + 1 * 64 );
 280                 cvtps2pi_r2r( xmm2, mm3 );
 281
 282                 /* Convert the two pairs of double words into four words */
 283                 packssdw_r2r(  mm3, mm2);
 284
 285
 286                 /* Accumulate saturation... */
 287                 movq_r2r( mm2, mm4 );
 288
 289                 pxor_r2r( mm5, mm5 );   // mm5 = -mm2
 290                 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
 291                 psubw_r2r( mm2, mm5 );
 292                 pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
 293                 por_r2r( mm5, mm4 );  // mm4 = abs(mm2) > satlim
 294                 movq_r2r( mm4, mm5 );
 295                 psrlq_i2r( 32, mm5);
 296                 por_r2r( mm5, mm4 );
 297
 298                 movd_m2r( saturated, mm5 ); // saturated |= mm4
 299                 por_r2r( mm4, mm5 );
 300                 movd_r2m( mm5, saturated );
 301
 302                 /* Store and accumulate zero-ness */
 303                 movq_r2r( mm2, mm3 );
 304                 movq_r2m( mm2, *(mmx_t*)pdst );
 305                 psrlq_i2r( 32, mm3 );
 306                 por_r2r( mm3, mm2 );
 307                 movd_r2m( mm2, tmp );
 308                 flags |= tmp;
 309
 310                 piqf += 4;
 311                 pdst += 4;
 312                 psrc += 4;
 313
 314                 if( (i & 63) == (63/4)*4 )
 315                 {
 316
 317                         if( saturated )
 318                         {
 319                                 int new_mquant = next_larger_quant( picture, mquant );
 320                                 if( new_mquant != mquant )
 321                                 {
 322                                         mquant = new_mquant;
 323                                         goto restart;
 324                                 }
 325                                 else
 326                                 {
 327                                         return quant_non_intra(picture, src, dst, mquant,
 328                                                                                    nonsat_mquant);
 329                                 }
 330                         }
 331
 332                         nzflag = (nzflag<<1) | !!flags;
 333                         flags = 0;
 334                         piqf = i_quant_matf;
 335                 }
 336
 337         }
 338         emms();
 339
 340         //nzflag = (nzflag<<1) | (!!flags);
 341         return nzflag;
 342 }
 343
 344 /*
 345  * The ordinary MMX version.  Due to the limited dynamic range afforded by working
 346  * with 16-bit int's it (a) has to jump through some gory fudge-factor hoops
 347  * (b) give up in tough cases and fall back on the reference code. Fortunately, the
 348  * latter happens *very* rarely.
 349  *
 350  * TODO Replace the inefficient block-by-block call to the assembler by a sweep
 351  * through the whole lot...
 352  */
 353
 354 int quant_non_intra_mmx(
 355         pict_data_s *picture,
 356         int16_t *src, int16_t *dst,
 357         int mquant,
 358         int *nonsat_mquant)
 359 {
 360
 361         int nzflag;
 362         int clipvalue  = dctsatlim;
 363         int flags = 0;
 364         int saturated = 0;
 365         uint16_t *quant_mat = inter_q;
 366         int comp;
 367         uint16_t *i_quant_mat = i_inter_q;
 368         int imquant;
 369         int16_t *psrc, *pdst;
 370
 371         /* MMX routine does not work right for MQ=2 ... (no unsigned mult) */
 372         if( mquant == 2 )
 373         {
 374                 return quant_non_intra(picture, src, dst, mquant, nonsat_mquant);
 375         }
 376         /* If available use the fast MMX quantiser.  It returns
 377            flags to signal if coefficients are outside its limited range or
 378            saturation would occur with the specified quantisation
 379            factor
 380            Top 16 bits - non zero quantised coefficient present
 381            Bits 8-15   - Saturation occurred
 382            Bits 0-7    - Coefficient out of range.
 383         */
 384
 385         nzflag = 0;
 386         pdst = dst;
 387         psrc = src;
 388         comp = 0;
 389         do
 390         {
 391                 imquant = (IQUANT_SCALE/mquant);
 392                 flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat,
 393                                                                                 imquant, mquant, clipvalue );
 394                 nzflag = (nzflag << 1) |( !!(flags & 0xffff0000));
 395
 396                 /* If we're saturating simply bump up quantization and start
 397                         from scratch...  if we can't avoid saturation by
 398                         quantising then we're hosed and we fall back to
 399                         saturation using the old C code.  */
 400
 401                 if( (flags & 0xff00) != 0 )
 402                 {
 403                         int new_mquant = next_larger_quant( picture, mquant );
 404                         if( new_mquant != mquant )
 405                         {
 406                                 mquant = new_mquant;
 407                         }
 408                         else
 409                         {
 410                                 saturated = 1;
 411                                 break;
 412                         }
 413
 414                         comp = 0;
 415                         nzflag = 0;
 416                         pdst = dst;
 417                         psrc = src;
 418                 }
 419                 else
 420                 {
 421                         ++comp;
 422                         pdst += 64;
 423                         psrc +=64;
 424                 }
 425                 /* Fall back to 32-bit(or better - if some hero(ine) made this work on
 426                         non 32-bit int machines ;-)) if out of dynamic range for MMX...
 427                 */
 428         }
 429         while( comp < block_count  && (flags & 0xff) == 0  );
 430
 431
 432         /* Coefficient out of range or can't avoid saturation:
 433         fall back to the original 32-bit int version: this is rare */
 434         if(  (flags & 0xff) != 0 || saturated)
 435         {
 436                 return quant_non_intra(picture, src, dst, mquant, nonsat_mquant);
 437         }
 438
 439         *nonsat_mquant = mquant;
 440         return nzflag;
 441 }
 442
 443
 444 void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
 445 {
 446   int i, val;
 447   uint16_t *quant_mat = intra_q;
 448
 449   dst[0] = src[0] << (3-dc_prec);
 450   for (i=1; i<64; i++)
 451   {
 452     val = (int)(src[i]*quant_mat[i]*mquant)/16;
 453
 454     /* mismatch control */
 455     if ((val&1)==0 && val!=0)
 456       val+= (val>0) ? -1 : 1;
 457
 458     /* saturation */
 459     dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
 460   }
 461 }