core/libs/libpng-1.2.18/pnggccrd.c

   1
   2 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
   3  *
   4  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
   5  *
   6  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
   7  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
   8  *     for Intel's performance analysis of the MMX vs. non-MMX code.
   9  *
  10  * Last changed in libpng 1.2.15 January 5, 2007
  11  * For conditions of distribution and use, see copyright notice in png.h
  12  * Copyright (c) 1998-2007 Glenn Randers-Pehrson
  13  * Copyright (c) 1998, Intel Corporation
  14  *
  15  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
  16  * Interface to libpng contributed by Gilles Vollant, 1999.
  17  * GNU C port by Greg Roelofs, 1999-2001.
  18  *
  19  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
  20  *
  21  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
  22  *
  23  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
  24  *
  25  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
  26  *        is required to assemble the newer MMX instructions such as movq.
  27  *        For djgpp, see
  28  *
  29  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
  30  *
  31  *        (or a later version in the same directory).  For Linux, check your
  32  *        distribution's web site(s) or try these links:
  33  *
  34  *           http://rufus.w3.org/linux/RPM/binutils.html
  35  *           http://www.debian.org/Packages/stable/devel/binutils.html
  36  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
  37  *             binutils.tgz
  38  *
  39  *        For other platforms, see the main GNU site:
  40  *
  41  *           ftp://ftp.gnu.org/pub/gnu/binutils/
  42  *
  43  *        Version 2.5.2l.15 is definitely too old...
  44  */
  45
  46 /*
  47  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
  48  * =====================================
  49  *
  50  * 19991006:
  51  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
  52  *
  53  * 19991007:
  54  *  - additional optimizations (possible or definite):
  55  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
  56  *     - write MMX code for 48-bit case (pixel_bytes == 6)
  57  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
  58  *        why subtract 8 from width_mmx in the pass 4/5 case?
  59  *        (only width_mmx case) (near line 1606)
  60  *     x [DONE] replace pixel_bytes within each block with the true
  61  *        constant value (or are compilers smart enough to do that?)
  62  *     - rewrite all MMX interlacing code so it's aligned with
  63  *        the *beginning* of the row buffer, not the end.  This
  64  *        would not only allow one to eliminate half of the memory
  65  *        writes for odd passes (that is, pass == odd), it may also
  66  *        eliminate some unaligned-data-access exceptions (assuming
  67  *        there's a penalty for not aligning 64-bit accesses on
  68  *        64-bit boundaries).  The only catch is that the "leftover"
  69  *        pixel(s) at the end of the row would have to be saved,
  70  *        but there are enough unused MMX registers in every case,
  71  *        so this is not a problem.  A further benefit is that the
  72  *        post-MMX cleanup code (C code) in at least some of the
  73  *        cases could be done within the assembler block.
  74  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
  75  *     inconsistent, and don't match the MMX Programmer's Reference
  76  *     Manual conventions anyway.  They should be changed to
  77  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
  78  *     was lowest in memory (e.g., corresponding to a left pixel)
  79  *     and b7 is the byte that was highest (e.g., a right pixel).
  80  *
  81  * 19991016:
  82  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
  83  *     want globals prefixed by underscores when referencing them--
  84  *     i.e., if the variable is const4, then refer to it as const4,
  85  *     not _const4.  This seems to be a djgpp-specific requirement.
  86  *     Also, such variables apparently *must* be declared outside
  87  *     of functions; neither static nor automatic variables work if
  88  *     defined within the scope of a single function, but both
  89  *     static and truly global (multi-module) variables work fine.
  90  *
  91  * 19991023:
  92  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
  93  *  - switched from string-concatenation-with-macros to cleaner method of
  94  *     renaming global variables for djgpp--i.e., always use prefixes in
  95  *     inlined assembler code (== strings) and conditionally rename the
  96  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
  97  *
  98  * 19991024:
  99  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
 100  *     This one was severely weird:  even though mmxsupport() doesn't touch
 101  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
 102  *     the register (even in static/non-fPIC code--see below), which in turn
 103  *     caused png_do_read_interlace() to return prematurely on the first row of
 104  *     interlaced images (i.e., without expanding the interlaced pixels).
 105  *     Inspection of the generated assembly code didn't turn up any clues,
 106  *     although it did point at a minor optimization (i.e., get rid of
 107  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
 108  *     instruction is more destructive than it looks?  (Not yet checked.)
 109  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
 110  *     listings...  Apparently register spillage has to do with ebx, since
 111  *     it's used to index the global offset table.  Commenting it out of the
 112  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
 113  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
 114  *
 115  * 19991107:
 116  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
 117  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
 118  *
 119  * 19991120:
 120  *  - made "diff" variable (now "_dif") global to simplify conversion of
 121  *     filtering routines (running out of regs, sigh).  "diff" is still used
 122  *     in interlacing routines, however.
 123  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
 124  *     macro determines which is used); original not yet tested.
 125  *
 126  * 20000213:
 127  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
 128  *
 129  * 20000319:
 130  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
 131  *     pass == 4 or 5, that caused visible corruption of interlaced images
 132  *
 133  * 20000623:
 134  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
 135  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
 136  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
 137  *     Chuck Wilson supplied a patch involving dummy output registers.  See
 138  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
 139  *     for the original (anonymous) SourceForge bug report.
 140  *
 141  * 20000706:
 142  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
 143  *       pnggccrd.c: In function `png_combine_row':
 144  *       pnggccrd.c:525: more than 10 operands in `asm'
 145  *       pnggccrd.c:669: more than 10 operands in `asm'
 146  *       pnggccrd.c:828: more than 10 operands in `asm'
 147  *       pnggccrd.c:994: more than 10 operands in `asm'
 148  *       pnggccrd.c:1177: more than 10 operands in `asm'
 149  *     They are all the same problem and can be worked around by using the
 150  *     global _unmask variable unconditionally, not just in the -fPIC case.
 151  *     Reportedly earlier versions of gcc also have the problem with more than
 152  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
 153  *
 154  * 20000729:
 155  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
 156  *     MMX routine); began converting png_read_filter_row_mmx_sub()
 157  *  - to finish remaining sections:
 158  *     - clean up indentation and comments
 159  *     - preload local variables
 160  *     - add output and input regs (order of former determines numerical
 161  *        mapping of latter)
 162  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
 163  *     - remove "$" from addressing of Shift and Mask variables [20000823]
 164  *
 165  * 20000731:
 166  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
 167  *
 168  * 20000822:
 169  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
 170  *     shared-library (-fPIC) version!  Code works just fine as part of static
 171  *     library.  Damn damn damn damn damn, should have tested that sooner.
 172  *     ebx is getting clobbered again (explicitly this time); need to save it
 173  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
 174  *
 175  * 20000823:
 176  *  - first section was trickiest; all remaining sections have ebx -> edx now.
 177  *     (-fPIC works again.)  Also added missing underscores to various Shift*
 178  *     and *Mask* globals and got rid of leading "$" signs.
 179  *
 180  * 20000826:
 181  *  - added visual separators to help navigate microscopic printed copies
 182  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
 183  *     on png_read_filter_row_mmx_avg()
 184  *
 185  * 20000828:
 186  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
 187  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
 188  *     cleaned up/shortened in either routine, but functionality is complete
 189  *     and seems to be working fine.
 190  *
 191  * 20000829:
 192  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
 193  *     as an input reg (with dummy output variables, etc.), then it *cannot*
 194  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
 195  *     is simple enough...
 196  *
 197  * 20000914:
 198  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
 199  *     correctly (but 48-bit RGB just fine)
 200  *
 201  * 20000916:
 202  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
 203  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
 204  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
 205  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
 206  *
 207  * 20010101:
 208  *  - added new png_init_mmx_flags() function (here only because it needs to
 209  *     call mmxsupport(), which should probably become global png_mmxsupport());
 210  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
 211  *
 212  * 20010103:
 213  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
 214  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
 215  *
 216  * 20010104:
 217  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
 218  *     within MMX version of png_read_filter_row()) so no longer necessary to
 219  *     compile it into pngrutil.o
 220  *
 221  * 20010310:
 222  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
 223  *
 224  * 20020304:
 225  *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
 226  *
 227  * 20040724:
 228  *   - more tinkering with clobber list at lines 4529 and 5033, to get
 229  *     it to compile on gcc-3.4.
 230  *
 231  * STILL TO DO:
 232  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
 233  *     - write MMX code for 48-bit case (pixel_bytes == 6)
 234  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
 235  *        why subtract 8 from width_mmx in the pass 4/5 case?
 236  *        (only width_mmx case) (near line 1606)
 237  *     - rewrite all MMX interlacing code so it's aligned with beginning
 238  *        of the row buffer, not the end (see 19991007 for details)
 239  *     x pick one version of mmxsupport() and get rid of the other
 240  *     - add error messages to any remaining bogus default cases
 241  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
 242  *     x add support for runtime enable/disable/query of various MMX routines
 243  */
 244
 245 #define PNG_INTERNAL
 246 #include "png.h"
 247
 248 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
 249
 250 int PNGAPI png_mmx_support(void);
 251
 252 #ifdef PNG_USE_LOCAL_ARRAYS
     /* Tables indexed by png_ptr->pass, seven entries each.  The C fallback in
      * png_combine_row() reads them as: the first offset it handles in the row
      * (png_pass_start), the distance between successive copies (png_pass_inc),
      * and the length of each copy (png_pass_width).
      * "static" is written first because ISO C marks a storage-class specifier
      * placed after other declaration specifiers as an obsolescent form. */
 253 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
 254 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
 255 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
 256 #endif
 257
 258 #if defined(PNG_MMX_CODE_SUPPORTED)
 259 /* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
 260  * so define them without the leading underscore.
      *
      * The inlined assembler in this file always spells these globals with the
      * underscore inside its strings (e.g. "_unmask", "_mask8_0").  On the targets
      * listed here each underscored C identifier is therefore mapped to its bare
      * name, so that the underscore added by the toolchain yields the spelling
      * the assembler text uses.  On every other target the C identifiers keep
      * their underscore and no mapping is needed. */
 261 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
 262     defined(__OS2__)
 263 #  define _mmx_supported  mmx_supported
 264 #  define _const4         const4
 265 #  define _const6         const6
 266 #  define _mask8_0        mask8_0
 267 #  define _mask16_1       mask16_1
 268 #  define _mask16_0       mask16_0
 269 #  define _mask24_2       mask24_2
 270 #  define _mask24_1       mask24_1
 271 #  define _mask24_0       mask24_0
 272 #  define _mask32_3       mask32_3
 273 #  define _mask32_2       mask32_2
 274 #  define _mask32_1       mask32_1
 275 #  define _mask32_0       mask32_0
 276 #  define _mask48_5       mask48_5
 277 #  define _mask48_4       mask48_4
 278 #  define _mask48_3       mask48_3
 279 #  define _mask48_2       mask48_2
 280 #  define _mask48_1       mask48_1
 281 #  define _mask48_0       mask48_0
 282 #  define _LBCarryMask    LBCarryMask
 283 #  define _HBClearMask    HBClearMask
 284 #  define _ActiveMask     ActiveMask
 285 #  define _ActiveMask2    ActiveMask2
 286 #  define _ActiveMaskEnd  ActiveMaskEnd
 287 #  define _ShiftBpp       ShiftBpp
 288 #  define _ShiftRem       ShiftRem
     /* The remaining names are the mutable file-scope globals; their definitions
      * are themselves guarded by PNG_THREAD_UNSAFE_OK, so they are only mapped
      * under the same condition. */
 289 #ifdef PNG_THREAD_UNSAFE_OK
 290 #  define _unmask         unmask
 291 #  define _FullLength     FullLength
 292 #  define _MMXLength      MMXLength
 293 #  define _dif            dif
 294 #  define _patemp         patemp
 295 #  define _pbtemp         pbtemp
 296 #  define _pctemp         pctemp
 297 #endif /* PNG_THREAD_UNSAFE_OK */
 298 #endif /* __DJGPP__ || WIN32 || __CYGWIN__ || __OS2__ */
 299
 300
 301 /* These file-scope objects are referenced by symbol name from inside the
 302    inlined MMX assembly strings, so the compiler sees no C-level use of most
        of them.  That is the source of gcc's "At top level: defined but not used"
        warnings, which may be ignored. */
 303
 304 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
 305  *  since that case uses the %ebx register for indexing the Global Offset Table
 306  *  and there were no other registers available.  But gcc 2.95 and later emit
 307  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
 308  *  in the non-PIC case, so we'll just use the global unconditionally now.
 309  *
      *  png_combine_row() stores ~mask here immediately before each asm block,
      *  and the asm loads it with "movd _unmask, %%mm7".
      */
 310 #ifdef PNG_THREAD_UNSAFE_OK
 311 static int _unmask;
 312 #endif
 313
     /* Bit-selector quadwords.  In the 8-bit case of png_combine_row() the
      * inverted mask byte is replicated into all eight bytes of an MMX register,
      * ANDed with _mask8_0 and compared with zero, so that byte k of the result
      * (k = 0 is the lowest address) is 0xFF exactly when mask bit (0x80 >> k)
      * is set.  The _mask16, _mask24, _mask32 and _mask48 families hold the same
      * bit sequence with each bit repeated for the 2, 3, 4 or 6 bytes of one
      * pixel; the numeric suffix is the quadword index, 0 covering the first
      * eight bytes.
      *
      * "static" is written first because ISO C marks a storage-class specifier
      * placed after other declaration specifiers as an obsolescent form. */
 314 static const unsigned long long _mask8_0  = 0x0102040810204080LL;
 315
 316 static const unsigned long long _mask16_1 = 0x0101020204040808LL;
 317 static const unsigned long long _mask16_0 = 0x1010202040408080LL;
 318
 319 static const unsigned long long _mask24_2 = 0x0101010202020404LL;
 320 static const unsigned long long _mask24_1 = 0x0408080810101020LL;
 321 static const unsigned long long _mask24_0 = 0x2020404040808080LL;
 322
 323 static const unsigned long long _mask32_3 = 0x0101010102020202LL;
 324 static const unsigned long long _mask32_2 = 0x0404040408080808LL;
 325 static const unsigned long long _mask32_1 = 0x1010101020202020LL;
 326 static const unsigned long long _mask32_0 = 0x4040404080808080LL;
 327
 328 static const unsigned long long _mask48_5 = 0x0101010101010202LL;
 329 static const unsigned long long _mask48_4 = 0x0202020204040404LL;
 330 static const unsigned long long _mask48_3 = 0x0404080808080808LL;
 331 static const unsigned long long _mask48_2 = 0x1010101010102020LL;
 332 static const unsigned long long _mask48_1 = 0x2020202040404040LL;
 333 static const unsigned long long _mask48_0 = 0x4040808080808080LL;
 334
     /* Byte-lane masks: _const4 keeps the low three bytes of a quadword, _const6
      * keeps only the lowest byte.  NOTE(review): presumably consumed by the MMX
      * interlace code -- confirm against the asm before changing them. */
 335 static const unsigned long long _const4   = 0x0000000000FFFFFFLL;
 336 //const static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
 337 static const unsigned long long _const6   = 0x00000000000000FFLL;
 338
 339 // These are used in the row-filter routines and should/would be local
 340 //  variables if not for gcc addressing limitations.
 341 // WARNING: Their presence probably defeats the thread safety of libpng.
     //  They are shared by every caller in the process, which is why they are
     //  only defined when the builder opts in with PNG_THREAD_UNSAFE_OK.
 342
 343 #ifdef PNG_THREAD_UNSAFE_OK
 344 static png_uint_32  _FullLength;
 345 static png_uint_32  _MMXLength;
 346 static int          _dif;
 347 static int          _patemp; // temp variables for Paeth routine
 348 static int          _pbtemp;
 349 static int          _pctemp;
 350 #endif
 351
 352 void /* PRIVATE */
 353 png_squelch_warnings(void)
 354 {
        /* Takes no arguments, returns nothing and does no work at run time.
         * It exists only to give each file-scope global that is otherwise
         * touched solely from inside the inline-assembly strings a C-level
         * reference, so that gcc stops reporting it as "defined but not used".
         *
         * Every global is read and the value discarded.  A self-assignment
         * (x = x) cannot be used for this: the _const* and _mask* objects are
         * const-qualified, hence not modifiable lvalues, and assigning to them
         * is rejected by the compiler.
         *
         * NOTE(review): a discarded read silences the diagnostic but does not
         * by itself force the object to be emitted.  Confirm that optimized
         * builds still provide the symbols the asm refers to by name.
         */
 355 #ifdef PNG_THREAD_UNSAFE_OK
 356    (void)_dif;
 357    (void)_patemp;
 358    (void)_pbtemp;
 359    (void)_pctemp;
 360    (void)_MMXLength;
 361 #endif
 362    (void)_const4;
 363    (void)_const6;
 364    (void)_mask8_0;
 365    (void)_mask16_1;
 366    (void)_mask16_0;
 367    (void)_mask24_2;
 368    (void)_mask24_1;
 369    (void)_mask24_0;
 370    (void)_mask32_3;
 371    (void)_mask32_2;
 372    (void)_mask32_1;
 373    (void)_mask32_0;
 374    (void)_mask48_5;
 375    (void)_mask48_4;
 376    (void)_mask48_3;
 377    (void)_mask48_2;
 378    (void)_mask48_1;
 379    (void)_mask48_0;
 380 }
 381 #endif /* PNG_MMX_CODE_SUPPORTED */
 382
 383
 384 static int _mmx_supported = 2;  /* 2 == MMX availability not yet determined */
     /* png_combine_row() tests for the value 2 and, when it finds it, calls
      * png_mmx_support().  Presumably that call replaces the 2 with the real
      * answer (its definition is elsewhere) -- TODO confirm. */
 385
 386 /*===========================================================================*/
 387 /*                                                                           */
 388 /*                       P N G _ C O M B I N E _ R O W                       */
 389 /*                                                                           */
 390 /*===========================================================================*/
 391
 392 #if defined(PNG_HAVE_MMX_COMBINE_ROW)
 393
 394 #define BPP2  2 /* BPPn is simply the number n, spelled as a name */
 395 #define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
 396 #define BPP4  4
 397 #define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
 398 #define BPP8  8 /* NOTE(review): presumably one name per pixel size handled by the MMX code -- confirm */
 399
 400 /* Combines the row recently read in with the previous row.
 401    This routine takes care of alpha and transparency if requested.
 402    This routine also handles the two methods of progressive display
 403    of interlaced images, depending on the mask value.
 404    The mask value describes which pixels are to be combined with
 405    the row.  The pattern always repeats every 8 pixels, so just 8
 406    bits are needed.  A one indicates the pixel is to be combined; a
 407    zero indicates the pixel is to be skipped.  This is in addition
 408    to any alpha or transparency value associated with the pixel.
 409    If you want all pixels to be combined, pass 0xff (255) in mask. */
 410
 411 /* Use this routine for the x86 platform - it uses a faster MMX routine
 412    if the machine supports MMX. */
 413
 414 void /* PRIVATE */
 415 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
 416 {
 417    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
 418
 419 #if defined(PNG_MMX_CODE_SUPPORTED)
 420    if (_mmx_supported == 2) {
 421 #if !defined(PNG_1_0_X)
 422        /* this should have happened in png_init_mmx_flags() already */
 423        png_warning(png_ptr, "asm_flags may not have been initialized");
 424 #endif
 425        png_mmx_support();
 426    }
 427 #endif
 428
 429    if (mask == 0xff)
 430    {
 431       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
 432       png_memcpy(row, png_ptr->row_buf + 1,
 433        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
 434    }
 435    else   /* (png_combine_row() is never called with mask == 0) */
 436    {
 437       switch (png_ptr->row_info.pixel_depth)
 438       {
 439          case 1:        /* png_ptr->row_info.pixel_depth */
 440          {
 441             png_bytep sp;
 442             png_bytep dp;
 443             int s_inc, s_start, s_end;
 444             int m;
 445             int shift;
 446             png_uint_32 i;
 447
 448             sp = png_ptr->row_buf + 1;
 449             dp = row;
 450             m = 0x80;
 451 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
 452             if (png_ptr->transformations & PNG_PACKSWAP)
 453             {
 454                 s_start = 0;
 455                 s_end = 7;
 456                 s_inc = 1;
 457             }
 458             else
 459 #endif
 460             {
 461                 s_start = 7;
 462                 s_end = 0;
 463                 s_inc = -1;
 464             }
 465
 466             shift = s_start;
 467
 468             for (i = 0; i < png_ptr->width; i++)
 469             {
 470                if (m & mask)
 471                {
 472                   int value;
 473
 474                   value = (*sp >> shift) & 0x1;
 475                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
 476                   *dp |= (png_byte)(value << shift);
 477                }
 478
 479                if (shift == s_end)
 480                {
 481                   shift = s_start;
 482                   sp++;
 483                   dp++;
 484                }
 485                else
 486                   shift += s_inc;
 487
 488                if (m == 1)
 489                   m = 0x80;
 490                else
 491                   m >>= 1;
 492             }
 493             break;
 494          }
 495
 496          case 2:        /* png_ptr->row_info.pixel_depth */
 497          {
 498             png_bytep sp;
 499             png_bytep dp;
 500             int s_start, s_end, s_inc;
 501             int m;
 502             int shift;
 503             png_uint_32 i;
 504             int value;
 505
 506             sp = png_ptr->row_buf + 1;
 507             dp = row;
 508             m = 0x80;
 509 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
 510             if (png_ptr->transformations & PNG_PACKSWAP)
 511             {
 512                s_start = 0;
 513                s_end = 6;
 514                s_inc = 2;
 515             }
 516             else
 517 #endif
 518             {
 519                s_start = 6;
 520                s_end = 0;
 521                s_inc = -2;
 522             }
 523
 524             shift = s_start;
 525
 526             for (i = 0; i < png_ptr->width; i++)
 527             {
 528                if (m & mask)
 529                {
 530                   value = (*sp >> shift) & 0x3;
 531                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
 532                   *dp |= (png_byte)(value << shift);
 533                }
 534
 535                if (shift == s_end)
 536                {
 537                   shift = s_start;
 538                   sp++;
 539                   dp++;
 540                }
 541                else
 542                   shift += s_inc;
 543                if (m == 1)
 544                   m = 0x80;
 545                else
 546                   m >>= 1;
 547             }
 548             break;
 549          }
 550
 551          case 4:        /* png_ptr->row_info.pixel_depth */
 552          {
 553             png_bytep sp;
 554             png_bytep dp;
 555             int s_start, s_end, s_inc;
 556             int m;
 557             int shift;
 558             png_uint_32 i;
 559             int value;
 560
 561             sp = png_ptr->row_buf + 1;
 562             dp = row;
 563             m = 0x80;
 564 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
 565             if (png_ptr->transformations & PNG_PACKSWAP)
 566             {
 567                s_start = 0;
 568                s_end = 4;
 569                s_inc = 4;
 570             }
 571             else
 572 #endif
 573             {
 574                s_start = 4;
 575                s_end = 0;
 576                s_inc = -4;
 577             }
 578             shift = s_start;
 579
 580             for (i = 0; i < png_ptr->width; i++)
 581             {
 582                if (m & mask)
 583                {
 584                   value = (*sp >> shift) & 0xf;
 585                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
 586                   *dp |= (png_byte)(value << shift);
 587                }
 588
 589                if (shift == s_end)
 590                {
 591                   shift = s_start;
 592                   sp++;
 593                   dp++;
 594                }
 595                else
 596                   shift += s_inc;
 597                if (m == 1)
 598                   m = 0x80;
 599                else
 600                   m >>= 1;
 601             }
 602             break;
 603          }
 604
 605          case 8:        /* png_ptr->row_info.pixel_depth */
 606          {
 607             png_bytep srcptr;
 608             png_bytep dstptr;
 609
 610 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
 611 #if !defined(PNG_1_0_X)
 612             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
 613                 /* && _mmx_supported */ )
 614 #else
 615             if (_mmx_supported)
 616 #endif
 617             {
 618                png_uint_32 len;
 619                int diff;
 620                int dummy_value_a;   // fix 'forbidden register spilled' error
 621                int dummy_value_d;
 622                int dummy_value_c;
 623                int dummy_value_S;
 624                int dummy_value_D;
 625                _unmask = ~mask;            // global variable for -fPIC version
 626                srcptr = png_ptr->row_buf + 1;
 627                dstptr = row;
 628                len  = png_ptr->width &~7;  // reduce to multiple of 8
 629                diff = (int) (png_ptr->width & 7);  // amount lost
 630
 631                __asm__ __volatile__ (
 632                   "movd      _unmask, %%mm7  \n\t" // load bit pattern
 633                   "psubb     %%mm6, %%mm6    \n\t" // zero mm6
 634                   "punpcklbw %%mm7, %%mm7    \n\t"
 635                   "punpcklwd %%mm7, %%mm7    \n\t"
 636                   "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
 637
 638                   "movq      _mask8_0, %%mm0 \n\t"
 639                   "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
 640                   "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
 641
 642 // preload        "movl      len, %%ecx      \n\t" // load length of line
 643 // preload        "movl      srcptr, %%esi   \n\t" // load source
 644 // preload        "movl      dstptr, %%edi   \n\t" // load dest
 645
 646                   "cmpl      $0, %%ecx       \n\t" // len == 0 ?
 647                   "je        mainloop8end    \n\t"
 648
 649                 "mainloop8:                  \n\t"
 650                   "movq      (%%esi), %%mm4  \n\t" // *srcptr
 651                   "pand      %%mm0, %%mm4    \n\t"
 652                   "movq      %%mm0, %%mm6    \n\t"
 653                   "pandn     (%%edi), %%mm6  \n\t" // *dstptr
 654                   "por       %%mm6, %%mm4    \n\t"
 655                   "movq      %%mm4, (%%edi)  \n\t"
 656                   "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
 657                   "addl      $8, %%edi       \n\t"
 658                   "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
 659                   "ja        mainloop8       \n\t"
 660
 661                 "mainloop8end:               \n\t"
 662 // preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
 663                   "movl      %%eax, %%ecx    \n\t"
 664                   "cmpl      $0, %%ecx       \n\t"
 665                   "jz        end8            \n\t"
 666 // preload        "movl      mask, %%edx     \n\t"
 667                   "sall      $24, %%edx      \n\t" // make low byte, high byte
 668
 669                 "secondloop8:                \n\t"
 670                   "sall      %%edx           \n\t" // move high bit to CF
 671                   "jnc       skip8           \n\t" // if CF = 0
 672                   "movb      (%%esi), %%al   \n\t"
 673                   "movb      %%al, (%%edi)   \n\t"
 674
 675                 "skip8:                      \n\t"
 676                   "incl      %%esi           \n\t"
 677                   "incl      %%edi           \n\t"
 678                   "decl      %%ecx           \n\t"
 679                   "jnz       secondloop8     \n\t"
 680
 681                 "end8:                       \n\t"
 682                   "EMMS                      \n\t"  // DONE
 683
 684                   : "=a" (dummy_value_a),           // output regs (dummy)
 685                     "=d" (dummy_value_d),
 686                     "=c" (dummy_value_c),
 687                     "=S" (dummy_value_S),
 688                     "=D" (dummy_value_D)
 689
 690                   : "3" (srcptr),      // esi       // input regs
 691                     "4" (dstptr),      // edi
 692                     "0" (diff),        // eax
 693 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
 694                     "2" (len),         // ecx
 695                     "1" (mask)         // edx
 696
 697 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
 698                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
 699 #endif
 700                );
 701             }
 702             else /* mmx _not supported - Use modified C routine */
 703 #endif /* PNG_MMX_CODE_SUPPORTED */
 704             {
 705                register png_uint_32 i;
 706                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
 707                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
 708                register int stride = png_pass_inc[png_ptr->pass];
 709                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
 710                register int rep_bytes = png_pass_width[png_ptr->pass];
 711                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
 712                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
 713                int diff = (int) (png_ptr->width & 7); /* amount lost */
 714                register png_uint_32 final_val = len;  /* GRR bugfix */
 715
 716                srcptr = png_ptr->row_buf + 1 + initial_val;
 717                dstptr = row + initial_val;
 718
 719                for (i = initial_val; i < final_val; i += stride)
 720                {
 721                   png_memcpy(dstptr, srcptr, rep_bytes);
 722                   srcptr += stride;
 723                   dstptr += stride;
 724                }
 725                if (diff)  /* number of leftover pixels:  3 for pngtest */
 726                {
 727                   final_val+=diff /* *BPP1 */ ;
 728                   for (; i < final_val; i += stride)
 729                   {
 730                      if (rep_bytes > (int)(final_val-i))
 731                         rep_bytes = (int)(final_val-i);
 732                      png_memcpy(dstptr, srcptr, rep_bytes);
 733                      srcptr += stride;
 734                      dstptr += stride;
 735                   }
 736                }
 737
 738             } /* end of else (_mmx_supported) */
 739
 740             break;
 741          }       /* end 8 bpp */
 742
 743          case 16:       /* png_ptr->row_info.pixel_depth */
 744          {
 745             png_bytep srcptr;
 746             png_bytep dstptr;
 747
 748 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
 749 #if !defined(PNG_1_0_X)
 750             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
 751                 /* && _mmx_supported */ )
 752 #else
 753             if (_mmx_supported)
 754 #endif
 755             {
 756                png_uint_32 len;
 757                int diff;
 758                int dummy_value_a;   // fix 'forbidden register spilled' error
 759                int dummy_value_d;
 760                int dummy_value_c;
 761                int dummy_value_S;
 762                int dummy_value_D;
 763                _unmask = ~mask;            // global variable for -fPIC version
 764                srcptr = png_ptr->row_buf + 1;
 765                dstptr = row;
 766                len  = png_ptr->width &~7;  // reduce to multiple of 8
 767                diff = (int) (png_ptr->width & 7); // amount lost //
 768
 769                __asm__ __volatile__ (
 770                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
 771                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
 772                   "punpcklbw %%mm7, %%mm7     \n\t"
 773                   "punpcklwd %%mm7, %%mm7     \n\t"
 774                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
 775
 776                   "movq      _mask16_0, %%mm0 \n\t"
 777                   "movq      _mask16_1, %%mm1 \n\t"
 778
 779                   "pand      %%mm7, %%mm0     \n\t"
 780                   "pand      %%mm7, %%mm1     \n\t"
 781
 782                   "pcmpeqb   %%mm6, %%mm0     \n\t"
 783                   "pcmpeqb   %%mm6, %%mm1     \n\t"
 784
 785 // preload        "movl      len, %%ecx       \n\t" // load length of line
 786 // preload        "movl      srcptr, %%esi    \n\t" // load source
 787 // preload        "movl      dstptr, %%edi    \n\t" // load dest
 788
 789                   "cmpl      $0, %%ecx        \n\t"
 790                   "jz        mainloop16end    \n\t"
 791
 792                 "mainloop16:                  \n\t"
 793                   "movq      (%%esi), %%mm4   \n\t"
 794                   "pand      %%mm0, %%mm4     \n\t"
 795                   "movq      %%mm0, %%mm6     \n\t"
 796                   "movq      (%%edi), %%mm7   \n\t"
 797                   "pandn     %%mm7, %%mm6     \n\t"
 798                   "por       %%mm6, %%mm4     \n\t"
 799                   "movq      %%mm4, (%%edi)   \n\t"
 800
 801                   "movq      8(%%esi), %%mm5  \n\t"
 802                   "pand      %%mm1, %%mm5     \n\t"
 803                   "movq      %%mm1, %%mm7     \n\t"
 804                   "movq      8(%%edi), %%mm6  \n\t"
 805                   "pandn     %%mm6, %%mm7     \n\t"
 806                   "por       %%mm7, %%mm5     \n\t"
 807                   "movq      %%mm5, 8(%%edi)  \n\t"
 808
 809                   "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
 810                   "addl      $16, %%edi       \n\t"
 811                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
 812                   "ja        mainloop16       \n\t"
 813
 814                 "mainloop16end:               \n\t"
 815 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
 816                   "movl      %%eax, %%ecx     \n\t"
 817                   "cmpl      $0, %%ecx        \n\t"
 818                   "jz        end16            \n\t"
 819 // preload        "movl      mask, %%edx      \n\t"
 820                   "sall      $24, %%edx       \n\t" // make low byte, high byte
 821
 822                 "secondloop16:                \n\t"
 823                   "sall      %%edx            \n\t" // move high bit to CF
 824                   "jnc       skip16           \n\t" // if CF = 0
 825                   "movw      (%%esi), %%ax    \n\t"
 826                   "movw      %%ax, (%%edi)    \n\t"
 827
 828                 "skip16:                      \n\t"
 829                   "addl      $2, %%esi        \n\t"
 830                   "addl      $2, %%edi        \n\t"
 831                   "decl      %%ecx            \n\t"
 832                   "jnz       secondloop16     \n\t"
 833
 834                 "end16:                       \n\t"
 835                   "EMMS                       \n\t" // DONE
 836
 837                   : "=a" (dummy_value_a),           // output regs (dummy)
 838                     "=c" (dummy_value_c),
 839                     "=d" (dummy_value_d),
 840                     "=S" (dummy_value_S),
 841                     "=D" (dummy_value_D)
 842
 843                   : "0" (diff),        // eax       // input regs
 844 // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
 845                     "1" (len),         // ecx
 846                     "2" (mask),        // edx
 847                     "3" (srcptr),      // esi
 848                     "4" (dstptr)       // edi
 849
 850 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
 851                   : "%mm0", "%mm1", "%mm4"          // clobber list
 852                   , "%mm5", "%mm6", "%mm7"
 853 #endif
 854                );
 855             }
 856             else /* mmx _not supported - Use modified C routine */
 857 #endif /* PNG_MMX_CODE_SUPPORTED */
 858             {
 859                register png_uint_32 i;
 860                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
 861                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
 862                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
 863                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
 864                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
 865                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
 866                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
 867                int diff = (int) (png_ptr->width & 7); /* amount lost */
 868                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
 869
 870                srcptr = png_ptr->row_buf + 1 + initial_val;
 871                dstptr = row + initial_val;
 872
 873                for (i = initial_val; i < final_val; i += stride)
 874                {
 875                   png_memcpy(dstptr, srcptr, rep_bytes);
 876                   srcptr += stride;
 877                   dstptr += stride;
 878                }
 879                if (diff)  /* number of leftover pixels:  3 for pngtest */
 880                {
 881                   final_val+=diff*BPP2;
 882                   for (; i < final_val; i += stride)
 883                   {
 884                      if (rep_bytes > (int)(final_val-i))
 885                         rep_bytes = (int)(final_val-i);
 886                      png_memcpy(dstptr, srcptr, rep_bytes);
 887                      srcptr += stride;
 888                      dstptr += stride;
 889                   }
 890                }
 891             } /* end of else (_mmx_supported) */
 892
 893             break;
 894          }       /* end 16 bpp */
 895
 896          case 24:       /* png_ptr->row_info.pixel_depth */
 897          {
 898             png_bytep srcptr;
 899             png_bytep dstptr;
 900
 901 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
 902 #if !defined(PNG_1_0_X)
 903             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
 904                 /* && _mmx_supported */ )
 905 #else
 906             if (_mmx_supported)
 907 #endif
 908             {
 909                png_uint_32 len;
 910                int diff;
 911                int dummy_value_a;   // fix 'forbidden register spilled' error
 912                int dummy_value_d;
 913                int dummy_value_c;
 914                int dummy_value_S;
 915                int dummy_value_D;
 916                _unmask = ~mask;            // global variable for -fPIC version
 917                srcptr = png_ptr->row_buf + 1;
 918                dstptr = row;
 919                len  = png_ptr->width &~7;  // reduce to multiple of 8
 920                diff = (int) (png_ptr->width & 7); // amount lost //
 921
 922                __asm__ __volatile__ (
 923                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
 924                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
 925                   "punpcklbw %%mm7, %%mm7     \n\t"
 926                   "punpcklwd %%mm7, %%mm7     \n\t"
 927                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
 928
 929                   "movq      _mask24_0, %%mm0 \n\t"
 930                   "movq      _mask24_1, %%mm1 \n\t"
 931                   "movq      _mask24_2, %%mm2 \n\t"
 932
 933                   "pand      %%mm7, %%mm0     \n\t"
 934                   "pand      %%mm7, %%mm1     \n\t"
 935                   "pand      %%mm7, %%mm2     \n\t"
 936
 937                   "pcmpeqb   %%mm6, %%mm0     \n\t"
 938                   "pcmpeqb   %%mm6, %%mm1     \n\t"
 939                   "pcmpeqb   %%mm6, %%mm2     \n\t"
 940
 941 // preload        "movl      len, %%ecx       \n\t" // load length of line
 942 // preload        "movl      srcptr, %%esi    \n\t" // load source
 943 // preload        "movl      dstptr, %%edi    \n\t" // load dest
 944
 945                   "cmpl      $0, %%ecx        \n\t"
 946                   "jz        mainloop24end    \n\t"
 947
 948                 "mainloop24:                  \n\t"
 949                   "movq      (%%esi), %%mm4   \n\t"
 950                   "pand      %%mm0, %%mm4     \n\t"
 951                   "movq      %%mm0, %%mm6     \n\t"
 952                   "movq      (%%edi), %%mm7   \n\t"
 953                   "pandn     %%mm7, %%mm6     \n\t"
 954                   "por       %%mm6, %%mm4     \n\t"
 955                   "movq      %%mm4, (%%edi)   \n\t"
 956
 957                   "movq      8(%%esi), %%mm5  \n\t"
 958                   "pand      %%mm1, %%mm5     \n\t"
 959                   "movq      %%mm1, %%mm7     \n\t"
 960                   "movq      8(%%edi), %%mm6  \n\t"
 961                   "pandn     %%mm6, %%mm7     \n\t"
 962                   "por       %%mm7, %%mm5     \n\t"
 963                   "movq      %%mm5, 8(%%edi)  \n\t"
 964
 965                   "movq      16(%%esi), %%mm6 \n\t"
 966                   "pand      %%mm2, %%mm6     \n\t"
 967                   "movq      %%mm2, %%mm4     \n\t"
 968                   "movq      16(%%edi), %%mm7 \n\t"
 969                   "pandn     %%mm7, %%mm4     \n\t"
 970                   "por       %%mm4, %%mm6     \n\t"
 971                   "movq      %%mm6, 16(%%edi) \n\t"
 972
 973                   "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
 974                   "addl      $24, %%edi       \n\t"
 975                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
 976
 977                   "ja        mainloop24       \n\t"
 978
 979                 "mainloop24end:               \n\t"
 980 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
 981                   "movl      %%eax, %%ecx     \n\t"
 982                   "cmpl      $0, %%ecx        \n\t"
 983                   "jz        end24            \n\t"
 984 // preload        "movl      mask, %%edx      \n\t"
 985                   "sall      $24, %%edx       \n\t" // make low byte, high byte
 986
 987                 "secondloop24:                \n\t"
 988                   "sall      %%edx            \n\t" // move high bit to CF
 989                   "jnc       skip24           \n\t" // if CF = 0
 990                   "movw      (%%esi), %%ax    \n\t"
 991                   "movw      %%ax, (%%edi)    \n\t"
 992                   "xorl      %%eax, %%eax     \n\t"
 993                   "movb      2(%%esi), %%al   \n\t"
 994                   "movb      %%al, 2(%%edi)   \n\t"
 995
 996                 "skip24:                      \n\t"
 997                   "addl      $3, %%esi        \n\t"
 998                   "addl      $3, %%edi        \n\t"
 999                   "decl      %%ecx            \n\t"
1000                   "jnz       secondloop24     \n\t"
1001
1002                 "end24:                       \n\t"
1003                   "EMMS                       \n\t" // DONE
1004
1005                   : "=a" (dummy_value_a),           // output regs (dummy)
1006                     "=d" (dummy_value_d),
1007                     "=c" (dummy_value_c),
1008                     "=S" (dummy_value_S),
1009                     "=D" (dummy_value_D)
1010
1011                   : "3" (srcptr),      // esi       // input regs
1012                     "4" (dstptr),      // edi
1013                     "0" (diff),        // eax
1014 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1015                     "2" (len),         // ecx
1016                     "1" (mask)         // edx
1017
1018 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1019                   : "%mm0", "%mm1", "%mm2"          // clobber list
1020                   , "%mm4", "%mm5", "%mm6", "%mm7"
1021 #endif
1022                );
1023             }
1024             else /* mmx _not supported - Use modified C routine */
1025 #endif /* PNG_MMX_CODE_SUPPORTED */
1026             {
1027                register png_uint_32 i;
1028                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1029                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1030                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1031                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1032                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1033                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1034                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1035                int diff = (int) (png_ptr->width & 7); /* amount lost */
1036                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
1037
1038                srcptr = png_ptr->row_buf + 1 + initial_val;
1039                dstptr = row + initial_val;
1040
1041                for (i = initial_val; i < final_val; i += stride)
1042                {
1043                   png_memcpy(dstptr, srcptr, rep_bytes);
1044                   srcptr += stride;
1045                   dstptr += stride;
1046                }
1047                if (diff)  /* number of leftover pixels:  3 for pngtest */
1048                {
1049                   final_val+=diff*BPP3;
1050                   for (; i < final_val; i += stride)
1051                   {
1052                      if (rep_bytes > (int)(final_val-i))
1053                         rep_bytes = (int)(final_val-i);
1054                      png_memcpy(dstptr, srcptr, rep_bytes);
1055                      srcptr += stride;
1056                      dstptr += stride;
1057                   }
1058                }
1059             } /* end of else (_mmx_supported) */
1060
1061             break;
1062          }       /* end 24 bpp */
1063
1064          case 32:       /* png_ptr->row_info.pixel_depth */
1065          {
1066             png_bytep srcptr;
1067             png_bytep dstptr;
1068
1069 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1070 #if !defined(PNG_1_0_X)
1071             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1072                 /* && _mmx_supported */ )
1073 #else
1074             if (_mmx_supported)
1075 #endif
1076             {
1077                png_uint_32 len;
1078                int diff;
1079                int dummy_value_a;   // fix 'forbidden register spilled' error
1080                int dummy_value_d;
1081                int dummy_value_c;
1082                int dummy_value_S;
1083                int dummy_value_D;
1084                _unmask = ~mask;            // global variable for -fPIC version
1085                srcptr = png_ptr->row_buf + 1;
1086                dstptr = row;
1087                len  = png_ptr->width &~7;  // reduce to multiple of 8
1088                diff = (int) (png_ptr->width & 7); // amount lost //
1089
1090                __asm__ __volatile__ (
1091                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1092                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1093                   "punpcklbw %%mm7, %%mm7     \n\t"
1094                   "punpcklwd %%mm7, %%mm7     \n\t"
1095                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1096
1097                   "movq      _mask32_0, %%mm0 \n\t"
1098                   "movq      _mask32_1, %%mm1 \n\t"
1099                   "movq      _mask32_2, %%mm2 \n\t"
1100                   "movq      _mask32_3, %%mm3 \n\t"
1101
1102                   "pand      %%mm7, %%mm0     \n\t"
1103                   "pand      %%mm7, %%mm1     \n\t"
1104                   "pand      %%mm7, %%mm2     \n\t"
1105                   "pand      %%mm7, %%mm3     \n\t"
1106
1107                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1108                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1109                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1110                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1111
1112 // preload        "movl      len, %%ecx       \n\t" // load length of line
1113 // preload        "movl      srcptr, %%esi    \n\t" // load source
1114 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1115
1116                   "cmpl      $0, %%ecx        \n\t" // lcr
1117                   "jz        mainloop32end    \n\t"
1118
1119                 "mainloop32:                  \n\t"
1120                   "movq      (%%esi), %%mm4   \n\t"
1121                   "pand      %%mm0, %%mm4     \n\t"
1122                   "movq      %%mm0, %%mm6     \n\t"
1123                   "movq      (%%edi), %%mm7   \n\t"
1124                   "pandn     %%mm7, %%mm6     \n\t"
1125                   "por       %%mm6, %%mm4     \n\t"
1126                   "movq      %%mm4, (%%edi)   \n\t"
1127
1128                   "movq      8(%%esi), %%mm5  \n\t"
1129                   "pand      %%mm1, %%mm5     \n\t"
1130                   "movq      %%mm1, %%mm7     \n\t"
1131                   "movq      8(%%edi), %%mm6  \n\t"
1132                   "pandn     %%mm6, %%mm7     \n\t"
1133                   "por       %%mm7, %%mm5     \n\t"
1134                   "movq      %%mm5, 8(%%edi)  \n\t"
1135
1136                   "movq      16(%%esi), %%mm6 \n\t"
1137                   "pand      %%mm2, %%mm6     \n\t"
1138                   "movq      %%mm2, %%mm4     \n\t"
1139                   "movq      16(%%edi), %%mm7 \n\t"
1140                   "pandn     %%mm7, %%mm4     \n\t"
1141                   "por       %%mm4, %%mm6     \n\t"
1142                   "movq      %%mm6, 16(%%edi) \n\t"
1143
1144                   "movq      24(%%esi), %%mm7 \n\t"
1145                   "pand      %%mm3, %%mm7     \n\t"
1146                   "movq      %%mm3, %%mm5     \n\t"
1147                   "movq      24(%%edi), %%mm4 \n\t"
1148                   "pandn     %%mm4, %%mm5     \n\t"
1149                   "por       %%mm5, %%mm7     \n\t"
1150                   "movq      %%mm7, 24(%%edi) \n\t"
1151
1152                   "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
1153                   "addl      $32, %%edi       \n\t"
1154                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1155                   "ja        mainloop32       \n\t"
1156
1157                 "mainloop32end:               \n\t"
1158 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1159                   "movl      %%eax, %%ecx     \n\t"
1160                   "cmpl      $0, %%ecx        \n\t"
1161                   "jz        end32            \n\t"
1162 // preload        "movl      mask, %%edx      \n\t"
1163                   "sall      $24, %%edx       \n\t" // low byte => high byte
1164
1165                 "secondloop32:                \n\t"
1166                   "sall      %%edx            \n\t" // move high bit to CF
1167                   "jnc       skip32           \n\t" // if CF = 0
1168                   "movl      (%%esi), %%eax   \n\t"
1169                   "movl      %%eax, (%%edi)   \n\t"
1170
1171                 "skip32:                      \n\t"
1172                   "addl      $4, %%esi        \n\t"
1173                   "addl      $4, %%edi        \n\t"
1174                   "decl      %%ecx            \n\t"
1175                   "jnz       secondloop32     \n\t"
1176
1177                 "end32:                       \n\t"
1178                   "EMMS                       \n\t" // DONE
1179
1180                   : "=a" (dummy_value_a),           // output regs (dummy)
1181                     "=d" (dummy_value_d),
1182                     "=c" (dummy_value_c),
1183                     "=S" (dummy_value_S),
1184                     "=D" (dummy_value_D)
1185
1186                   : "3" (srcptr),      // esi       // input regs
1187                     "4" (dstptr),      // edi
1188                     "0" (diff),        // eax
1189 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1190                     "2" (len),         // ecx
1191                     "1" (mask)         // edx
1192
1193 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1194                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1195                   , "%mm4", "%mm5", "%mm6", "%mm7"
1196 #endif
1197                );
1198             }
1199             else /* mmx _not supported - Use modified C routine */
1200 #endif /* PNG_MMX_CODE_SUPPORTED */
1201             {
1202                register png_uint_32 i;
1203                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1204                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1205                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1206                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1207                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1208                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1209                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1210                int diff = (int) (png_ptr->width & 7); /* amount lost */
1211                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
1212
1213                srcptr = png_ptr->row_buf + 1 + initial_val;
1214                dstptr = row + initial_val;
1215
1216                for (i = initial_val; i < final_val; i += stride)
1217                {
1218                   png_memcpy(dstptr, srcptr, rep_bytes);
1219                   srcptr += stride;
1220                   dstptr += stride;
1221                }
1222                if (diff)  /* number of leftover pixels:  3 for pngtest */
1223                {
1224                   final_val+=diff*BPP4;
1225                   for (; i < final_val; i += stride)
1226                   {
1227                      if (rep_bytes > (int)(final_val-i))
1228                         rep_bytes = (int)(final_val-i);
1229                      png_memcpy(dstptr, srcptr, rep_bytes);
1230                      srcptr += stride;
1231                      dstptr += stride;
1232                   }
1233                }
1234             } /* end of else (_mmx_supported) */
1235
1236             break;
1237          }       /* end 32 bpp */
1238
1239          case 48:       /* png_ptr->row_info.pixel_depth */
1240          {
1241             png_bytep srcptr;
1242             png_bytep dstptr;
1243
1244 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1245 #if !defined(PNG_1_0_X)
1246             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1247                 /* && _mmx_supported */ )
1248 #else
1249             if (_mmx_supported)
1250 #endif
1251             {
1252                png_uint_32 len;
1253                int diff;
1254                int dummy_value_a;   // fix 'forbidden register spilled' error
1255                int dummy_value_d;
1256                int dummy_value_c;
1257                int dummy_value_S;
1258                int dummy_value_D;
1259                _unmask = ~mask;            // global variable for -fPIC version
1260                srcptr = png_ptr->row_buf + 1;
1261                dstptr = row;
1262                len  = png_ptr->width &~7;  // reduce to multiple of 8
1263                diff = (int) (png_ptr->width & 7); // amount lost //
1264
1265                __asm__ __volatile__ (
1266                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1267                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1268                   "punpcklbw %%mm7, %%mm7     \n\t"
1269                   "punpcklwd %%mm7, %%mm7     \n\t"
1270                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1271
1272                   "movq      _mask48_0, %%mm0 \n\t"
1273                   "movq      _mask48_1, %%mm1 \n\t"
1274                   "movq      _mask48_2, %%mm2 \n\t"
1275                   "movq      _mask48_3, %%mm3 \n\t"
1276                   "movq      _mask48_4, %%mm4 \n\t"
1277                   "movq      _mask48_5, %%mm5 \n\t"
1278
1279                   "pand      %%mm7, %%mm0     \n\t"
1280                   "pand      %%mm7, %%mm1     \n\t"
1281                   "pand      %%mm7, %%mm2     \n\t"
1282                   "pand      %%mm7, %%mm3     \n\t"
1283                   "pand      %%mm7, %%mm4     \n\t"
1284                   "pand      %%mm7, %%mm5     \n\t"
1285
1286                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1287                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1288                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1289                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1290                   "pcmpeqb   %%mm6, %%mm4     \n\t"
1291                   "pcmpeqb   %%mm6, %%mm5     \n\t"
1292
1293 // preload        "movl      len, %%ecx       \n\t" // load length of line
1294 // preload        "movl      srcptr, %%esi    \n\t" // load source
1295 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1296
1297                   "cmpl      $0, %%ecx        \n\t"
1298                   "jz        mainloop48end    \n\t"
1299
1300                 "mainloop48:                  \n\t"
1301                   "movq      (%%esi), %%mm7   \n\t"
1302                   "pand      %%mm0, %%mm7     \n\t"
1303                   "movq      %%mm0, %%mm6     \n\t"
1304                   "pandn     (%%edi), %%mm6   \n\t"
1305                   "por       %%mm6, %%mm7     \n\t"
1306                   "movq      %%mm7, (%%edi)   \n\t"
1307
1308                   "movq      8(%%esi), %%mm6  \n\t"
1309                   "pand      %%mm1, %%mm6     \n\t"
1310                   "movq      %%mm1, %%mm7     \n\t"
1311                   "pandn     8(%%edi), %%mm7  \n\t"
1312                   "por       %%mm7, %%mm6     \n\t"
1313                   "movq      %%mm6, 8(%%edi)  \n\t"
1314
1315                   "movq      16(%%esi), %%mm6 \n\t"
1316                   "pand      %%mm2, %%mm6     \n\t"
1317                   "movq      %%mm2, %%mm7     \n\t"
1318                   "pandn     16(%%edi), %%mm7 \n\t"
1319                   "por       %%mm7, %%mm6     \n\t"
1320                   "movq      %%mm6, 16(%%edi) \n\t"
1321
1322                   "movq      24(%%esi), %%mm7 \n\t"
1323                   "pand      %%mm3, %%mm7     \n\t"
1324                   "movq      %%mm3, %%mm6     \n\t"
1325                   "pandn     24(%%edi), %%mm6 \n\t"
1326                   "por       %%mm6, %%mm7     \n\t"
1327                   "movq      %%mm7, 24(%%edi) \n\t"
1328
1329                   "movq      32(%%esi), %%mm6 \n\t"
1330                   "pand      %%mm4, %%mm6     \n\t"
1331                   "movq      %%mm4, %%mm7     \n\t"
1332                   "pandn     32(%%edi), %%mm7 \n\t"
1333                   "por       %%mm7, %%mm6     \n\t"
1334                   "movq      %%mm6, 32(%%edi) \n\t"
1335
1336                   "movq      40(%%esi), %%mm7 \n\t"
1337                   "pand      %%mm5, %%mm7     \n\t"
1338                   "movq      %%mm5, %%mm6     \n\t"
1339                   "pandn     40(%%edi), %%mm6 \n\t"
1340                   "por       %%mm6, %%mm7     \n\t"
1341                   "movq      %%mm7, 40(%%edi) \n\t"
1342
1343                   "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
1344                   "addl      $48, %%edi       \n\t"
1345                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1346
1347                   "ja        mainloop48       \n\t"
1348
1349                 "mainloop48end:               \n\t"
1350 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1351                   "movl      %%eax, %%ecx     \n\t"
1352                   "cmpl      $0, %%ecx        \n\t"
1353                   "jz        end48            \n\t"
1354 // preload        "movl      mask, %%edx      \n\t"
1355                   "sall      $24, %%edx       \n\t" // make low byte, high byte
1356
1357                 "secondloop48:                \n\t"
1358                   "sall      %%edx            \n\t" // move high bit to CF
1359                   "jnc       skip48           \n\t" // if CF = 0
1360                   "movl      (%%esi), %%eax   \n\t"
1361                   "movl      %%eax, (%%edi)   \n\t"
1362
1363                 "skip48:                      \n\t"
1364                   "addl      $4, %%esi        \n\t"
1365                   "addl      $4, %%edi        \n\t"
1366                   "decl      %%ecx            \n\t"
1367                   "jnz       secondloop48     \n\t"
1368
1369                 "end48:                       \n\t"
1370                   "EMMS                       \n\t" // DONE
1371
1372                   : "=a" (dummy_value_a),           // output regs (dummy)
1373                     "=d" (dummy_value_d),
1374                     "=c" (dummy_value_c),
1375                     "=S" (dummy_value_S),
1376                     "=D" (dummy_value_D)
1377
1378                   : "3" (srcptr),      // esi       // input regs
1379                     "4" (dstptr),      // edi
1380                     "0" (diff),        // eax
1381 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1382                     "2" (len),         // ecx
1383                     "1" (mask)         // edx
1384
1385 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1386                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1387                   , "%mm4", "%mm5", "%mm6", "%mm7"
1388 #endif
1389                );
1390             }
1391             else /* mmx _not supported - Use modified C routine */
1392 #endif /* PNG_MMX_CODE_SUPPORTED */
1393             {
1394                register png_uint_32 i;
1395                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1396                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1397                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1398                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1399                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1400                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1401                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1402                int diff = (int) (png_ptr->width & 7); /* amount lost */
1403                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
1404
1405                srcptr = png_ptr->row_buf + 1 + initial_val;
1406                dstptr = row + initial_val;
1407
1408                for (i = initial_val; i < final_val; i += stride)
1409                {
1410                   png_memcpy(dstptr, srcptr, rep_bytes);
1411                   srcptr += stride;
1412                   dstptr += stride;
1413                }
1414                if (diff)  /* number of leftover pixels:  3 for pngtest */
1415                {
1416                   final_val+=diff*BPP6;
1417                   for (; i < final_val; i += stride)
1418                   {
1419                      if (rep_bytes > (int)(final_val-i))
1420                         rep_bytes = (int)(final_val-i);
1421                      png_memcpy(dstptr, srcptr, rep_bytes);
1422                      srcptr += stride;
1423                      dstptr += stride;
1424                   }
1425                }
1426             } /* end of else (_mmx_supported) */
1427
1428             break;
1429          }       /* end 48 bpp */
1430
1431          case 64:       /* png_ptr->row_info.pixel_depth */
1432          {
1433             png_bytep srcptr;
1434             png_bytep dstptr;
1435             register png_uint_32 i;
1436             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1437               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1438             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1439               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1440             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1441               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1442             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1443             int diff = (int) (png_ptr->width & 7); /* amount lost */
1444             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
1445
1446             srcptr = png_ptr->row_buf + 1 + initial_val;
1447             dstptr = row + initial_val;
1448
1449             for (i = initial_val; i < final_val; i += stride)
1450             {
1451                png_memcpy(dstptr, srcptr, rep_bytes);
1452                srcptr += stride;
1453                dstptr += stride;
1454             }
1455             if (diff)  /* number of leftover pixels:  3 for pngtest */
1456             {
1457                final_val+=diff*BPP8;
1458                for (; i < final_val; i += stride)
1459                {
1460                   if (rep_bytes > (int)(final_val-i))
1461                      rep_bytes = (int)(final_val-i);
1462                   png_memcpy(dstptr, srcptr, rep_bytes);
1463                   srcptr += stride;
1464                   dstptr += stride;
1465                }
1466             }
1467
1468             break;
1469          }       /* end 64 bpp */
1470
1471          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1472          {
1473             /* this should never happen */
1474             png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1475             break;
1476          }
1477       } /* end switch (png_ptr->row_info.pixel_depth) */
1478
1479    } /* end if (non-trivial mask) */
1480
1481 } /* end png_combine_row() */
1482
1483 #endif /* PNG_HAVE_MMX_COMBINE_ROW */
1484
1485
1486
1487
1488 /*===========================================================================*/
1489 /*                                                                           */
1490 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
1491 /*                                                                           */
1492 /*===========================================================================*/
1493
1494 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1495 #if defined(PNG_HAVE_MMX_READ_INTERLACE)
1496
1497 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1498  * has taken place.  [GRR: what other steps come before and/or after?]
1499  */
1500
1501 void /* PRIVATE */
1502 png_do_read_interlace(png_structp png_ptr)
1503 {
1504    png_row_infop row_info = &(png_ptr->row_info);
1505    png_bytep row = png_ptr->row_buf + 1;
1506    int pass = png_ptr->pass;
1507 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1508    png_uint_32 transformations = png_ptr->transformations;
1509 #endif
1510
1511    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1512
1513 #if defined(PNG_MMX_CODE_SUPPORTED)
1514    if (_mmx_supported == 2) {
1515 #if !defined(PNG_1_0_X)
1516        /* this should have happened in png_init_mmx_flags() already */
1517        png_warning(png_ptr, "asm_flags may not have been initialized");
1518 #endif
1519        png_mmx_support();
1520    }
1521 #endif
1522
1523    if (row != NULL && row_info != NULL)
1524    {
1525       png_uint_32 final_width;
1526
1527       final_width = row_info->width * png_pass_inc[pass];
1528
1529       switch (row_info->pixel_depth)
1530       {
1531          case 1:
1532          {
1533             png_bytep sp, dp;
1534             int sshift, dshift;
1535             int s_start, s_end, s_inc;
1536             png_byte v;
1537             png_uint_32 i;
1538             int j;
1539
1540             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1541             dp = row + (png_size_t)((final_width - 1) >> 3);
1542 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1543             if (transformations & PNG_PACKSWAP)
1544             {
1545                sshift = (int)((row_info->width + 7) & 7);
1546                dshift = (int)((final_width + 7) & 7);
1547                s_start = 7;
1548                s_end = 0;
1549                s_inc = -1;
1550             }
1551             else
1552 #endif
1553             {
1554                sshift = 7 - (int)((row_info->width + 7) & 7);
1555                dshift = 7 - (int)((final_width + 7) & 7);
1556                s_start = 0;
1557                s_end = 7;
1558                s_inc = 1;
1559             }
1560
1561             for (i = row_info->width; i; i--)
1562             {
1563                v = (png_byte)((*sp >> sshift) & 0x1);
1564                for (j = 0; j < png_pass_inc[pass]; j++)
1565                {
1566                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1567                   *dp |= (png_byte)(v << dshift);
1568                   if (dshift == s_end)
1569                   {
1570                      dshift = s_start;
1571                      dp--;
1572                   }
1573                   else
1574                      dshift += s_inc;
1575                }
1576                if (sshift == s_end)
1577                {
1578                   sshift = s_start;
1579                   sp--;
1580                }
1581                else
1582                   sshift += s_inc;
1583             }
1584             break;
1585          }
1586
1587          case 2:
1588          {
1589             png_bytep sp, dp;
1590             int sshift, dshift;
1591             int s_start, s_end, s_inc;
1592             png_uint_32 i;
1593
1594             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1595             dp = row + (png_size_t)((final_width - 1) >> 2);
1596 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1597             if (transformations & PNG_PACKSWAP)
1598             {
1599                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1600                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1601                s_start = 6;
1602                s_end = 0;
1603                s_inc = -2;
1604             }
1605             else
1606 #endif
1607             {
1608                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1609                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1610                s_start = 0;
1611                s_end = 6;
1612                s_inc = 2;
1613             }
1614
1615             for (i = row_info->width; i; i--)
1616             {
1617                png_byte v;
1618                int j;
1619
1620                v = (png_byte)((*sp >> sshift) & 0x3);
1621                for (j = 0; j < png_pass_inc[pass]; j++)
1622                {
1623                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1624                   *dp |= (png_byte)(v << dshift);
1625                   if (dshift == s_end)
1626                   {
1627                      dshift = s_start;
1628                      dp--;
1629                   }
1630                   else
1631                      dshift += s_inc;
1632                }
1633                if (sshift == s_end)
1634                {
1635                   sshift = s_start;
1636                   sp--;
1637                }
1638                else
1639                   sshift += s_inc;
1640             }
1641             break;
1642          }
1643
1644          case 4:
1645          {
1646             png_bytep sp, dp;
1647             int sshift, dshift;
1648             int s_start, s_end, s_inc;
1649             png_uint_32 i;
1650
1651             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1652             dp = row + (png_size_t)((final_width - 1) >> 1);
1653 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1654             if (transformations & PNG_PACKSWAP)
1655             {
1656                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1657                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1658                s_start = 4;
1659                s_end = 0;
1660                s_inc = -4;
1661             }
1662             else
1663 #endif
1664             {
1665                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1666                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1667                s_start = 0;
1668                s_end = 4;
1669                s_inc = 4;
1670             }
1671
1672             for (i = row_info->width; i; i--)
1673             {
1674                png_byte v;
1675                int j;
1676
1677                v = (png_byte)((*sp >> sshift) & 0xf);
1678                for (j = 0; j < png_pass_inc[pass]; j++)
1679                {
1680                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1681                   *dp |= (png_byte)(v << dshift);
1682                   if (dshift == s_end)
1683                   {
1684                      dshift = s_start;
1685                      dp--;
1686                   }
1687                   else
1688                      dshift += s_inc;
1689                }
1690                if (sshift == s_end)
1691                {
1692                   sshift = s_start;
1693                   sp--;
1694                }
1695                else
1696                   sshift += s_inc;
1697             }
1698             break;
1699          }
1700
1701        /*====================================================================*/
1702
1703          default: /* 8-bit or larger (this is where the routine is modified) */
1704          {
1705 #if 0
1706 //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
1707 //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
1708 //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
1709 //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
1710 #endif
1711             png_bytep sptr, dp;
1712             png_uint_32 i;
1713             png_size_t pixel_bytes;
1714             int width = (int)row_info->width;
1715
1716             pixel_bytes = (row_info->pixel_depth >> 3);
1717
1718             /* point sptr at the last pixel in the pre-expanded row: */
1719             sptr = row + (width - 1) * pixel_bytes;
1720
1721             /* point dp at the last pixel position in the expanded row: */
1722             dp = row + (final_width - 1) * pixel_bytes;
1723
1724             /* New code by Nirav Chhatrapati - Intel Corporation */
1725
1726 #if defined(PNG_MMX_CODE_SUPPORTED)
1727 #if !defined(PNG_1_0_X)
1728             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1729                 /* && _mmx_supported */ )
1730 #else
1731             if (_mmx_supported)
1732 #endif
1733             {
1734                //--------------------------------------------------------------
1735                if (pixel_bytes == 3)
1736                {
1737                   if (((pass == 0) || (pass == 1)) && width)
1738                   {
1739                      int dummy_value_c;   // fix 'forbidden register spilled'
1740                      int dummy_value_S;
1741                      int dummy_value_D;
1742                      int dummy_value_a;
1743
1744                      __asm__ __volatile__ (
1745                         "subl $21, %%edi         \n\t"
1746                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1747
1748                      ".loop3_pass0:              \n\t"
1749                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1750                         "pand (%3), %%mm0        \n\t" // z z z z z 2 1 0
1751                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1752                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1753                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1754                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1755                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1756                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1757                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1758                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
1759                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
1760                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
1761                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
1762                         "movq %%mm4, 16(%%edi)   \n\t"
1763                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
1764                         "movq %%mm3, 8(%%edi)    \n\t"
1765                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
1766                         "subl $3, %%esi          \n\t"
1767                         "movq %%mm0, (%%edi)     \n\t"
1768                         "subl $24, %%edi         \n\t"
1769                         "decl %%ecx              \n\t"
1770                         "jnz .loop3_pass0        \n\t"
1771                         "EMMS                    \n\t" // DONE
1772
1773                         : "=c" (dummy_value_c),        // output regs (dummy)
1774                           "=S" (dummy_value_S),
1775                           "=D" (dummy_value_D),
1776                           "=a" (dummy_value_a)
1777
1778
1779                         : "1" (sptr),      // esi      // input regs
1780                           "2" (dp),        // edi
1781                           "0" (width),     // ecx
1782                           "3" (&_const4)  // eax, used as (%3): 0x0000000000FFFFFFLL
1783
1784 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1785                         : "%mm0", "%mm1", "%mm2"       // clobber list
1786                         , "%mm3", "%mm4"
1787 #endif
1788                      );
1789                   }
1790                   else if (((pass == 2) || (pass == 3)) && width)
1791                   {
1792                      int dummy_value_c;   // fix 'forbidden register spilled'
1793                      int dummy_value_S;
1794                      int dummy_value_D;
1795                      int dummy_value_a;
1796
1797                      __asm__ __volatile__ (
1798                         "subl $9, %%edi          \n\t"
1799                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1800
1801                      ".loop3_pass2:              \n\t"
1802                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1803                         "pand (%3), %%mm0     \n\t" // z z z z z 2 1 0
1804                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1805                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1806                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1807                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1808                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1809                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1810                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1811                         "movq %%mm0, 4(%%edi)    \n\t"
1812                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
1813                         "subl $3, %%esi          \n\t"
1814                         "movd %%mm0, (%%edi)     \n\t"
1815                         "subl $12, %%edi         \n\t"
1816                         "decl %%ecx              \n\t"
1817                         "jnz .loop3_pass2        \n\t"
1818                         "EMMS                    \n\t" // DONE
1819
1820                         : "=c" (dummy_value_c),        // output regs (dummy)
1821                           "=S" (dummy_value_S),
1822                           "=D" (dummy_value_D),
1823                           "=a" (dummy_value_a)
1824
1825                         : "1" (sptr),      // esi      // input regs
1826                           "2" (dp),        // edi
1827                           "0" (width),     // ecx
1828                           "3" (&_const4)  // (0x0000000000FFFFFFLL)
1829
1830 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1831                         : "%mm0", "%mm1", "%mm2"       // clobber list
1832 #endif
1833                      );
1834                   }
1835                   else if (width) /* && ((pass == 4) || (pass == 5)) */
1836                   {
1837                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
1838                      if (width_mmx < 0)
1839                          width_mmx = 0;
1840                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
1841                      if (width_mmx)
1842                      {
1843                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1844                         // sptr points at last pixel in pre-expanded row
1845                         // dp points at last pixel position in expanded row
1846                         int dummy_value_c;  // fix 'forbidden register spilled'
1847                         int dummy_value_S;
1848                         int dummy_value_D;
1849                         int dummy_value_a;
1850                         int dummy_value_d;
1851
1852                         __asm__ __volatile__ (
1853                            "subl $3, %%esi          \n\t"
1854                            "subl $9, %%edi          \n\t"
1855                                         // (png_pass_inc[pass] + 1)*pixel_bytes
1856
1857                         ".loop3_pass4:              \n\t"
1858                            "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
1859                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
1860                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
1861                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
1862                            "pand (%3), %%mm1          \n\t" // z z z z z 2 1 0
1863                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
1864                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
1865                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
1866                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
1867                            "movq %%mm0, (%%edi)     \n\t"
1868                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
1869                            "pand (%4), %%mm3     \n\t" // z z z z z z z 5
1870                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
1871                            "subl $6, %%esi          \n\t"
1872                            "movd %%mm2, 8(%%edi)    \n\t"
1873                            "subl $12, %%edi         \n\t"
1874                            "subl $2, %%ecx          \n\t"
1875                            "jnz .loop3_pass4        \n\t"
1876                            "EMMS                    \n\t" // DONE
1877
1878                            : "=c" (dummy_value_c),        // output regs (dummy)
1879                              "=S" (dummy_value_S),
1880                              "=D" (dummy_value_D),
1881                              "=a" (dummy_value_a),
1882                              "=d" (dummy_value_d)
1883
1884                            : "1" (sptr),      // esi      // input regs
1885                              "2" (dp),        // edi
1886                              "0" (width_mmx), // ecx
1887                              "3" (&_const4), // 0x0000000000FFFFFFLL
1888                              "4" (&_const6)  // 0x00000000000000FFLL
1889
1890 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1891                            : "%mm0", "%mm1"               // clobber list
1892                            , "%mm2", "%mm3"
1893 #endif
1894                         );
1895                      }
1896
1897                      sptr -= width_mmx*3;
1898                      dp -= width_mmx*6;
1899                      for (i = width; i; i--)
1900                      {
1901                         png_byte v[8];
1902                         int j;
1903
1904                         png_memcpy(v, sptr, 3);
1905                         for (j = 0; j < png_pass_inc[pass]; j++)
1906                         {
1907                            png_memcpy(dp, v, 3);
1908                            dp -= 3;
1909                         }
1910                         sptr -= 3;
1911                      }
1912                   }
1913                } /* end of pixel_bytes == 3 */
1914
1915                //--------------------------------------------------------------
1916                else if (pixel_bytes == 1)
1917                {
1918                   if (((pass == 0) || (pass == 1)) && width)
1919                   {
1920                      int width_mmx = ((width >> 2) << 2);
1921                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1922                      if (width_mmx)
1923                      {
1924                         int dummy_value_c;  // fix 'forbidden register spilled'
1925                         int dummy_value_S;
1926                         int dummy_value_D;
1927
1928                         __asm__ __volatile__ (
1929                            "subl $3, %%esi          \n\t"
1930                            "subl $31, %%edi         \n\t"
1931
1932                         ".loop1_pass0:              \n\t"
1933                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1934                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
1935                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1936                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
1937                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1938                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
1939                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
1940                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
1941                            "movq %%mm0, (%%edi)     \n\t"
1942                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
1943                            "movq %%mm3, 8(%%edi)    \n\t"
1944                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
1945                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
1946                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
1947                            "movq %%mm2, 16(%%edi)   \n\t"
1948                            "subl $4, %%esi          \n\t"
1949                            "movq %%mm4, 24(%%edi)   \n\t"
1950                            "subl $32, %%edi         \n\t"
1951                            "subl $4, %%ecx          \n\t"
1952                            "jnz .loop1_pass0        \n\t"
1953                            "EMMS                    \n\t" // DONE
1954
1955                            : "=c" (dummy_value_c),        // output regs (dummy)
1956                              "=S" (dummy_value_S),
1957                              "=D" (dummy_value_D)
1958
1959                            : "1" (sptr),      // esi      // input regs
1960                              "2" (dp),        // edi
1961                              "0" (width_mmx)  // ecx
1962
1963 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1964                            : "%mm0", "%mm1", "%mm2"       // clobber list
1965                            , "%mm3", "%mm4"
1966 #endif
1967                         );
1968                      }
1969
1970                      sptr -= width_mmx;
1971                      dp -= width_mmx*8;
1972                      for (i = width; i; i--)
1973                      {
1974                         int j;
1975
1976                        /* I simplified this part in version 1.0.4e
1977                         * here and in several other instances where
1978                         * pixel_bytes == 1  -- GR-P
1979                         *
1980                         * Original code:
1981                         *
1982                         * png_byte v[8];
1983                         * png_memcpy(v, sptr, pixel_bytes);
1984                         * for (j = 0; j < png_pass_inc[pass]; j++)
1985                         * {
1986                         *    png_memcpy(dp, v, pixel_bytes);
1987                         *    dp -= pixel_bytes;
1988                         * }
1989                         * sptr -= pixel_bytes;
1990                         *
1991                         * Replacement code is in the next three lines:
1992                         */
1993
1994                         for (j = 0; j < png_pass_inc[pass]; j++)
1995                         {
1996                            *dp-- = *sptr;
1997                         }
1998                         --sptr;
1999                      }
2000                   }
2001                   else if (((pass == 2) || (pass == 3)) && width)
2002                   {
2003                      int width_mmx = ((width >> 2) << 2);
2004                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
2005                      if (width_mmx)
2006                      {
2007                         int dummy_value_c;  // fix 'forbidden register spilled'
2008                         int dummy_value_S;
2009                         int dummy_value_D;
2010
2011                         __asm__ __volatile__ (
2012                            "subl $3, %%esi          \n\t"
2013                            "subl $15, %%edi         \n\t"
2014
2015                         ".loop1_pass2:              \n\t"
2016                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2017                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2018                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
2019                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
2020                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
2021                            "movq %%mm0, (%%edi)     \n\t"
2022                            "subl $4, %%esi          \n\t"
2023                            "movq %%mm1, 8(%%edi)    \n\t"
2024                            "subl $16, %%edi         \n\t"
2025                            "subl $4, %%ecx          \n\t"
2026                            "jnz .loop1_pass2        \n\t"
2027                            "EMMS                    \n\t" // DONE
2028
2029                            : "=c" (dummy_value_c),        // output regs (dummy)
2030                              "=S" (dummy_value_S),
2031                              "=D" (dummy_value_D)
2032
2033                            : "1" (sptr),      // esi      // input regs
2034                              "2" (dp),        // edi
2035                              "0" (width_mmx)  // ecx
2036
2037 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2038                            : "%mm0", "%mm1"               // clobber list
2039 #endif
2040                         );
2041                      }
2042
2043                      sptr -= width_mmx;
2044                      dp -= width_mmx*4;
2045                      for (i = width; i; i--)
2046                      {
2047                         int j;
2048
2049                         for (j = 0; j < png_pass_inc[pass]; j++)
2050                         {
2051                            *dp-- = *sptr;
2052                         }
2053                         --sptr;
2054                      }
2055                   }
2056                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
2057                   {
2058                      int width_mmx = ((width >> 3) << 3);
2059                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
2060                      if (width_mmx)
2061                      {
2062                         int dummy_value_c;  // fix 'forbidden register spilled'
2063                         int dummy_value_S;
2064                         int dummy_value_D;
2065
2066                         __asm__ __volatile__ (
2067                            "subl $7, %%esi          \n\t"
2068                            "subl $15, %%edi         \n\t"
2069
2070                         ".loop1_pass4:              \n\t"
2071                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2072                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2073                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2074                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
2075                            "movq %%mm1, 8(%%edi)    \n\t"
2076                            "subl $8, %%esi          \n\t"
2077                            "movq %%mm0, (%%edi)     \n\t"
2078                            "subl $16, %%edi         \n\t"
2079                            "subl $8, %%ecx          \n\t"
2080                            "jnz .loop1_pass4        \n\t"
2081                            "EMMS                    \n\t" // DONE
2082
2083                            : "=c" (dummy_value_c),        // output regs (none)
2084                              "=S" (dummy_value_S),
2085                              "=D" (dummy_value_D)
2086
2087                            : "1" (sptr),      // esi      // input regs
2088                              "2" (dp),        // edi
2089                              "0" (width_mmx)  // ecx
2090
2091 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2092                            : "%mm0", "%mm1"               // clobber list
2093 #endif
2094                         );
2095                      }
2096
2097                      sptr -= width_mmx;
2098                      dp -= width_mmx*2;
2099                      for (i = width; i; i--)
2100                      {
2101                         int j;
2102
2103                         for (j = 0; j < png_pass_inc[pass]; j++)
2104                         {
2105                            *dp-- = *sptr;
2106                         }
2107                         --sptr;
2108                      }
2109                   }
2110                } /* end of pixel_bytes == 1 */
2111
2112                //--------------------------------------------------------------
2113                else if (pixel_bytes == 2)
2114                {
2115                   if (((pass == 0) || (pass == 1)) && width)
2116                   {
2117                      int width_mmx = ((width >> 1) << 1);
2118                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2119                      if (width_mmx)
2120                      {
2121                         int dummy_value_c;  // fix 'forbidden register spilled'
2122                         int dummy_value_S;
2123                         int dummy_value_D;
2124
2125                         __asm__ __volatile__ (
2126                            "subl $2, %%esi          \n\t"
2127                            "subl $30, %%edi         \n\t"
2128
2129                         ".loop2_pass0:              \n\t"
2130                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2131                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2132                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2133                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2134                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2135                            "movq %%mm0, (%%edi)     \n\t"
2136                            "movq %%mm0, 8(%%edi)    \n\t"
2137                            "movq %%mm1, 16(%%edi)   \n\t"
2138                            "subl $4, %%esi          \n\t"
2139                            "movq %%mm1, 24(%%edi)   \n\t"
2140                            "subl $32, %%edi         \n\t"
2141                            "subl $2, %%ecx          \n\t"
2142                            "jnz .loop2_pass0        \n\t"
2143                            "EMMS                    \n\t" // DONE
2144
2145                            : "=c" (dummy_value_c),        // output regs (dummy)
2146                              "=S" (dummy_value_S),
2147                              "=D" (dummy_value_D)
2148
2149                            : "1" (sptr),      // esi      // input regs
2150                              "2" (dp),        // edi
2151                              "0" (width_mmx)  // ecx
2152
2153 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2154                            : "%mm0", "%mm1"               // clobber list
2155 #endif
2156                         );
2157                      }
2158
2159                      sptr -= (width_mmx*2 - 2); // sign fixed
2160                      dp -= (width_mmx*16 - 2);  // sign fixed
2161                      for (i = width; i; i--)
2162                      {
2163                         png_byte v[8];
2164                         int j;
2165                         sptr -= 2;
2166                         png_memcpy(v, sptr, 2);
2167                         for (j = 0; j < png_pass_inc[pass]; j++)
2168                         {
2169                            dp -= 2;
2170                            png_memcpy(dp, v, 2);
2171                         }
2172                      }
2173                   }
2174                   else if (((pass == 2) || (pass == 3)) && width)
2175                   {
2176                      int width_mmx = ((width >> 1) << 1) ;
2177                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2178                      if (width_mmx)
2179                      {
2180                         int dummy_value_c;  // fix 'forbidden register spilled'
2181                         int dummy_value_S;
2182                         int dummy_value_D;
2183
2184                         __asm__ __volatile__ (
2185                            "subl $2, %%esi          \n\t"
2186                            "subl $14, %%edi         \n\t"
2187
2188                         ".loop2_pass2:              \n\t"
2189                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2190                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2191                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2192                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2193                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2194                            "movq %%mm0, (%%edi)     \n\t"
2195                            "subl $4, %%esi          \n\t"
2196                            "movq %%mm1, 8(%%edi)    \n\t"
2197                            "subl $16, %%edi         \n\t"
2198                            "subl $2, %%ecx          \n\t"
2199                            "jnz .loop2_pass2        \n\t"
2200                            "EMMS                    \n\t" // DONE
2201
2202                            : "=c" (dummy_value_c),        // output regs (dummy)
2203                              "=S" (dummy_value_S),
2204                              "=D" (dummy_value_D)
2205
2206                            : "1" (sptr),      // esi      // input regs
2207                              "2" (dp),        // edi
2208                              "0" (width_mmx)  // ecx
2209
2210 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2211                            : "%mm0", "%mm1"               // clobber list
2212 #endif
2213                         );
2214                      }
2215
2216                      sptr -= (width_mmx*2 - 2); // sign fixed
2217                      dp -= (width_mmx*8 - 2);   // sign fixed
2218                      for (i = width; i; i--)
2219                      {
2220                         png_byte v[8];
2221                         int j;
2222                         sptr -= 2;
2223                         png_memcpy(v, sptr, 2);
2224                         for (j = 0; j < png_pass_inc[pass]; j++)
2225                         {
2226                            dp -= 2;
2227                            png_memcpy(dp, v, 2);
2228                         }
2229                      }
2230                   }
2231                   else if (width)  // pass == 4 or 5
2232                   {
2233                      int width_mmx = ((width >> 1) << 1) ;
2234                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2235                      if (width_mmx)
2236                      {
2237                         int dummy_value_c;  // fix 'forbidden register spilled'
2238                         int dummy_value_S;
2239                         int dummy_value_D;
2240
2241                         __asm__ __volatile__ (
2242                            "subl $2, %%esi          \n\t"
2243                            "subl $6, %%edi          \n\t"
2244
2245                         ".loop2_pass4:              \n\t"
2246                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2247                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2248                            "subl $4, %%esi          \n\t"
2249                            "movq %%mm0, (%%edi)     \n\t"
2250                            "subl $8, %%edi          \n\t"
2251                            "subl $2, %%ecx          \n\t"
2252                            "jnz .loop2_pass4        \n\t"
2253                            "EMMS                    \n\t" // DONE
2254
2255                            : "=c" (dummy_value_c),        // output regs (dummy)
2256                              "=S" (dummy_value_S),
2257                              "=D" (dummy_value_D)
2258
2259                            : "1" (sptr),      // esi      // input regs
2260                              "2" (dp),        // edi
2261                              "0" (width_mmx)  // ecx
2262
2263 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2264                            : "%mm0"                       // clobber list
2265 #endif
2266                         );
2267                      }
2268
2269                      sptr -= (width_mmx*2 - 2); // sign fixed
2270                      dp -= (width_mmx*4 - 2);   // sign fixed
2271                      for (i = width; i; i--)
2272                      {
2273                         png_byte v[8];
2274                         int j;
2275                         sptr -= 2;
2276                         png_memcpy(v, sptr, 2);
2277                         for (j = 0; j < png_pass_inc[pass]; j++)
2278                         {
2279                            dp -= 2;
2280                            png_memcpy(dp, v, 2);
2281                         }
2282                      }
2283                   }
2284                } /* end of pixel_bytes == 2 */
2285
2286                //--------------------------------------------------------------
2287                else if (pixel_bytes == 4)
2288                {
2289                   if (((pass == 0) || (pass == 1)) && width)
2290                   {
2291                      int width_mmx = ((width >> 1) << 1);
2292                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2293                      if (width_mmx)
2294                      {
2295                         int dummy_value_c;  // fix 'forbidden register spilled'
2296                         int dummy_value_S;
2297                         int dummy_value_D;
2298
2299                         __asm__ __volatile__ (
2300                            "subl $4, %%esi          \n\t"
2301                            "subl $60, %%edi         \n\t"
2302
2303                         ".loop4_pass0:              \n\t"
2304                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2305                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2306                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2307                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2308                            "movq %%mm0, (%%edi)     \n\t"
2309                            "movq %%mm0, 8(%%edi)    \n\t"
2310                            "movq %%mm0, 16(%%edi)   \n\t"
2311                            "movq %%mm0, 24(%%edi)   \n\t"
2312                            "movq %%mm1, 32(%%edi)   \n\t"
2313                            "movq %%mm1, 40(%%edi)   \n\t"
2314                            "movq %%mm1, 48(%%edi)   \n\t"
2315                            "subl $8, %%esi          \n\t"
2316                            "movq %%mm1, 56(%%edi)   \n\t"
2317                            "subl $64, %%edi         \n\t"
2318                            "subl $2, %%ecx          \n\t"
2319                            "jnz .loop4_pass0        \n\t"
2320                            "EMMS                    \n\t" // DONE
2321
2322                            : "=c" (dummy_value_c),        // output regs (dummy)
2323                              "=S" (dummy_value_S),
2324                              "=D" (dummy_value_D)
2325
2326                            : "1" (sptr),      // esi      // input regs
2327                              "2" (dp),        // edi
2328                              "0" (width_mmx)  // ecx
2329
2330 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2331                            : "%mm0", "%mm1"               // clobber list
2332 #endif
2333                         );
2334                      }
2335
2336                      sptr -= (width_mmx*4 - 4); // sign fixed
2337                      dp -= (width_mmx*32 - 4);  // sign fixed
2338                      for (i = width; i; i--)
2339                      {
2340                         png_byte v[8];
2341                         int j;
2342                         sptr -= 4;
2343                         png_memcpy(v, sptr, 4);
2344                         for (j = 0; j < png_pass_inc[pass]; j++)
2345                         {
2346                            dp -= 4;
2347                            png_memcpy(dp, v, 4);
2348                         }
2349                      }
2350                   }
2351                   else if (((pass == 2) || (pass == 3)) && width)
2352                   {
2353                      int width_mmx = ((width >> 1) << 1);
2354                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2355                      if (width_mmx)
2356                      {
2357                         int dummy_value_c;  // fix 'forbidden register spilled'
2358                         int dummy_value_S;
2359                         int dummy_value_D;
2360
2361                         __asm__ __volatile__ (
2362                            "subl $4, %%esi          \n\t"
2363                            "subl $28, %%edi         \n\t"
2364
2365                         ".loop4_pass2:              \n\t"
2366                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2367                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2368                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2369                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2370                            "movq %%mm0, (%%edi)     \n\t"
2371                            "movq %%mm0, 8(%%edi)    \n\t"
2372                            "movq %%mm1, 16(%%edi)   \n\t"
2373                            "movq %%mm1, 24(%%edi)   \n\t"
2374                            "subl $8, %%esi          \n\t"
2375                            "subl $32, %%edi         \n\t"
2376                            "subl $2, %%ecx          \n\t"
2377                            "jnz .loop4_pass2        \n\t"
2378                            "EMMS                    \n\t" // DONE
2379
2380                            : "=c" (dummy_value_c),        // output regs (dummy)
2381                              "=S" (dummy_value_S),
2382                              "=D" (dummy_value_D)
2383
2384                            : "1" (sptr),      // esi      // input regs
2385                              "2" (dp),        // edi
2386                              "0" (width_mmx)  // ecx
2387
2388 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2389                            : "%mm0", "%mm1"               // clobber list
2390 #endif
2391                         );
2392                      }
2393
2394                      sptr -= (width_mmx*4 - 4); // sign fixed
2395                      dp -= (width_mmx*16 - 4);  // sign fixed
2396                      for (i = width; i; i--)
2397                      {
2398                         png_byte v[8];
2399                         int j;
2400                         sptr -= 4;
2401                         png_memcpy(v, sptr, 4);
2402                         for (j = 0; j < png_pass_inc[pass]; j++)
2403                         {
2404                            dp -= 4;
2405                            png_memcpy(dp, v, 4);
2406                         }
2407                      }
2408                   }
2409                   else if (width)  // pass == 4 or 5
2410                   {
2411                      int width_mmx = ((width >> 1) << 1) ;
2412                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2413                      if (width_mmx)
2414                      {
2415                         int dummy_value_c;  // fix 'forbidden register spilled'
2416                         int dummy_value_S;
2417                         int dummy_value_D;
2418
2419                         __asm__ __volatile__ (
2420                            "subl $4, %%esi          \n\t"
2421                            "subl $12, %%edi         \n\t"
2422
2423                         ".loop4_pass4:              \n\t"
2424                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2425                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2426                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2427                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2428                            "movq %%mm0, (%%edi)     \n\t"
2429                            "subl $8, %%esi          \n\t"
2430                            "movq %%mm1, 8(%%edi)    \n\t"
2431                            "subl $16, %%edi         \n\t"
2432                            "subl $2, %%ecx          \n\t"
2433                            "jnz .loop4_pass4        \n\t"
2434                            "EMMS                    \n\t" // DONE
2435
2436                            : "=c" (dummy_value_c),        // output regs (dummy)
2437                              "=S" (dummy_value_S),
2438                              "=D" (dummy_value_D)
2439
2440                            : "1" (sptr),      // esi      // input regs
2441                              "2" (dp),        // edi
2442                              "0" (width_mmx)  // ecx
2443
2444 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2445                            : "%mm0", "%mm1"               // clobber list
2446 #endif
2447                         );
2448                      }
2449
2450                      sptr -= (width_mmx*4 - 4); // sign fixed
2451                      dp -= (width_mmx*8 - 4);   // sign fixed
2452                      for (i = width; i; i--)
2453                      {
2454                         png_byte v[8];
2455                         int j;
2456                         sptr -= 4;
2457                         png_memcpy(v, sptr, 4);
2458                         for (j = 0; j < png_pass_inc[pass]; j++)
2459                         {
2460                            dp -= 4;
2461                            png_memcpy(dp, v, 4);
2462                         }
2463                      }
2464                   }
2465                } /* end of pixel_bytes == 4 */
2466
2467                //--------------------------------------------------------------
2468                else if (pixel_bytes == 8)
2469                {
2470 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
2471                   // GRR NOTE:  no need to combine passes here!
2472                   if (((pass == 0) || (pass == 1)) && width)
2473                   {
2474                      int dummy_value_c;  // fix 'forbidden register spilled'
2475                      int dummy_value_S;
2476                      int dummy_value_D;
2477
2478                      // source is 8-byte RRGGBBAA
2479                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2480                      __asm__ __volatile__ (
2481                         "subl $56, %%edi         \n\t" // start of last block
2482
2483                      ".loop8_pass0:              \n\t"
2484                         "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2485                         "movq %%mm0, (%%edi)     \n\t"
2486                         "movq %%mm0, 8(%%edi)    \n\t"
2487                         "movq %%mm0, 16(%%edi)   \n\t"
2488                         "movq %%mm0, 24(%%edi)   \n\t"
2489                         "movq %%mm0, 32(%%edi)   \n\t"
2490                         "movq %%mm0, 40(%%edi)   \n\t"
2491                         "movq %%mm0, 48(%%edi)   \n\t"
2492                         "subl $8, %%esi          \n\t"
2493                         "movq %%mm0, 56(%%edi)   \n\t"
2494                         "subl $64, %%edi         \n\t"
2495                         "decl %%ecx              \n\t"
2496                         "jnz .loop8_pass0        \n\t"
2497                         "EMMS                    \n\t" // DONE
2498
2499                         : "=c" (dummy_value_c),        // output regs (dummy)
2500                           "=S" (dummy_value_S),
2501                           "=D" (dummy_value_D)
2502
2503                         : "1" (sptr),      // esi      // input regs
2504                           "2" (dp),        // edi
2505                           "0" (width)      // ecx
2506
2507 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2508                         : "%mm0"                       // clobber list
2509 #endif
2510                      );
2511                   }
2512                   else if (((pass == 2) || (pass == 3)) && width)
2513                   {
2514                      // source is 8-byte RRGGBBAA
2515                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2516                      // (recall that expansion is _in place_:  sptr and dp
2517                      //  both point at locations within same row buffer)
2518                      {
2519                         int dummy_value_c;  // fix 'forbidden register spilled'
2520                         int dummy_value_S;
2521                         int dummy_value_D;
2522
2523                         __asm__ __volatile__ (
2524                            "subl $24, %%edi         \n\t" // start of last block
2525
2526                         ".loop8_pass2:              \n\t"
2527                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2528                            "movq %%mm0, (%%edi)     \n\t"
2529                            "movq %%mm0, 8(%%edi)    \n\t"
2530                            "movq %%mm0, 16(%%edi)   \n\t"
2531                            "subl $8, %%esi          \n\t"
2532                            "movq %%mm0, 24(%%edi)   \n\t"
2533                            "subl $32, %%edi         \n\t"
2534                            "decl %%ecx              \n\t"
2535                            "jnz .loop8_pass2        \n\t"
2536                            "EMMS                    \n\t" // DONE
2537
2538                            : "=c" (dummy_value_c),        // output regs (dummy)
2539                              "=S" (dummy_value_S),
2540                              "=D" (dummy_value_D)
2541
2542                            : "1" (sptr),      // esi      // input regs
2543                              "2" (dp),        // edi
2544                              "0" (width)      // ecx
2545
2546 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2547                            : "%mm0"                       // clobber list
2548 #endif
2549                         );
2550                      }
2551                   }
2552                   else if (width)  // pass == 4 or 5
2553                   {
2554                      // source is 8-byte RRGGBBAA
2555                      // dest is 16-byte RRGGBBAA RRGGBBAA
2556                      {
2557                         int dummy_value_c;  // fix 'forbidden register spilled'
2558                         int dummy_value_S;
2559                         int dummy_value_D;
2560
2561                         __asm__ __volatile__ (
2562                            "subl $8, %%edi          \n\t" // start of last block
2563
2564                         ".loop8_pass4:              \n\t"
2565                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2566                            "movq %%mm0, (%%edi)     \n\t"
2567                            "subl $8, %%esi          \n\t"
2568                            "movq %%mm0, 8(%%edi)    \n\t"
2569                            "subl $16, %%edi         \n\t"
2570                            "decl %%ecx              \n\t"
2571                            "jnz .loop8_pass4        \n\t"
2572                            "EMMS                    \n\t" // DONE
2573
2574                            : "=c" (dummy_value_c),        // output regs (dummy)
2575                              "=S" (dummy_value_S),
2576                              "=D" (dummy_value_D)
2577
2578                            : "1" (sptr),      // esi      // input regs
2579                              "2" (dp),        // edi
2580                              "0" (width)      // ecx
2581
2582 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2583                            : "%mm0"                       // clobber list
2584 #endif
2585                         );
2586                      }
2587                   }
2588
2589                } /* end of pixel_bytes == 8 */
2590
2591                //--------------------------------------------------------------
2592                else if (pixel_bytes == 6)
2593                {
2594                   for (i = width; i; i--)
2595                   {
2596                      png_byte v[8];
2597                      int j;
2598                      png_memcpy(v, sptr, 6);
2599                      for (j = 0; j < png_pass_inc[pass]; j++)
2600                      {
2601                         png_memcpy(dp, v, 6);
2602                         dp -= 6;
2603                      }
2604                      sptr -= 6;
2605                   }
2606                } /* end of pixel_bytes == 6 */
2607
2608                //--------------------------------------------------------------
2609                else
2610                {
2611                   for (i = width; i; i--)
2612                   {
2613                      png_byte v[8];
2614                      int j;
2615                      png_memcpy(v, sptr, pixel_bytes);
2616                      for (j = 0; j < png_pass_inc[pass]; j++)
2617                      {
2618                         png_memcpy(dp, v, pixel_bytes);
2619                         dp -= pixel_bytes;
2620                      }
2621                      sptr-= pixel_bytes;
2622                   }
2623                }
2624             } // end of _mmx_supported ========================================
2625
2626             else /* MMX not supported:  use modified C code - takes advantage
2627                   *   of inlining of png_memcpy for a constant */
2628                  /* GRR 19991007:  does it?  or should pixel_bytes in each
2629                   *   block be replaced with immediate value (e.g., 1)? */
2630                  /* GRR 19991017:  replaced with constants in each case */
2631 #endif /* PNG_MMX_CODE_SUPPORTED */
2632             {
2633                if (pixel_bytes == 1)
2634                {
2635                   for (i = width; i; i--)
2636                   {
2637                      int j;
2638                      for (j = 0; j < png_pass_inc[pass]; j++)
2639                      {
2640                         *dp-- = *sptr;
2641                      }
2642                      --sptr;
2643                   }
2644                }
2645                else if (pixel_bytes == 3)
2646                {
2647                   for (i = width; i; i--)
2648                   {
2649                      png_byte v[8];
2650                      int j;
2651                      png_memcpy(v, sptr, 3);
2652                      for (j = 0; j < png_pass_inc[pass]; j++)
2653                      {
2654                         png_memcpy(dp, v, 3);
2655                         dp -= 3;
2656                      }
2657                      sptr -= 3;
2658                   }
2659                }
2660                else if (pixel_bytes == 2)
2661                {
2662                   for (i = width; i; i--)
2663                   {
2664                      png_byte v[8];
2665                      int j;
2666                      png_memcpy(v, sptr, 2);
2667                      for (j = 0; j < png_pass_inc[pass]; j++)
2668                      {
2669                         png_memcpy(dp, v, 2);
2670                         dp -= 2;
2671                      }
2672                      sptr -= 2;
2673                   }
2674                }
2675                else if (pixel_bytes == 4)
2676                {
2677                   for (i = width; i; i--)
2678                   {
2679                      png_byte v[8];
2680                      int j;
2681                      png_memcpy(v, sptr, 4);
2682                      for (j = 0; j < png_pass_inc[pass]; j++)
2683                      {
2684 #ifdef PNG_DEBUG
2685                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2686                         {
2687                            printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2688                              row, dp, row+png_ptr->row_buf_size);
2689                            printf("row_buf=%d\n",png_ptr->row_buf_size);
2690                         }
2691 #endif
2692                         png_memcpy(dp, v, 4);
2693                         dp -= 4;
2694                      }
2695                      sptr -= 4;
2696                   }
2697                }
2698                else if (pixel_bytes == 6)
2699                {
2700                   for (i = width; i; i--)
2701                   {
2702                      png_byte v[8];
2703                      int j;
2704                      png_memcpy(v, sptr, 6);
2705                      for (j = 0; j < png_pass_inc[pass]; j++)
2706                      {
2707                         png_memcpy(dp, v, 6);
2708                         dp -= 6;
2709                      }
2710                      sptr -= 6;
2711                   }
2712                }
2713                else if (pixel_bytes == 8)
2714                {
2715                   for (i = width; i; i--)
2716                   {
2717                      png_byte v[8];
2718                      int j;
2719                      png_memcpy(v, sptr, 8);
2720                      for (j = 0; j < png_pass_inc[pass]; j++)
2721                      {
2722                         png_memcpy(dp, v, 8);
2723                         dp -= 8;
2724                      }
2725                      sptr -= 8;
2726                   }
2727                }
2728                else     /* GRR:  should never be reached */
2729                {
2730                   for (i = width; i; i--)
2731                   {
2732                      png_byte v[8];
2733                      int j;
2734                      png_memcpy(v, sptr, pixel_bytes);
2735                      for (j = 0; j < png_pass_inc[pass]; j++)
2736                      {
2737                         png_memcpy(dp, v, pixel_bytes);
2738                         dp -= pixel_bytes;
2739                      }
2740                      sptr -= pixel_bytes;
2741                   }
2742                }
2743
2744             } /* end if (MMX not supported) */
2745             break;
2746          }
2747       } /* end switch (row_info->pixel_depth) */
2748
2749       row_info->width = final_width;
2750
2751       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2752    }
2753
2754 } /* end png_do_read_interlace() */
2755
2756 #endif /* PNG_HAVE_MMX_READ_INTERLACE */
2757 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2758
2759
2760
2761 #if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
2762 #if defined(PNG_MMX_CODE_SUPPORTED)
2763
// 64-bit scratch objects shared by the MMX filter routines below.  The inline
// assembly in those routines refers to them directly by symbol name (for
// example "movq _LBCarryMask, %%mm5" and "psrlq _ShiftRem, %%mm2"), so they
// must stay file-scope objects with exactly these names.
//
// Every object is accessed as one quadword, so each should sit on an 8-byte
// boundary.  The `double' member by itself does not guarantee that:  on i386
// GCC by default aligns double/long long members of an aggregate to only
// 4 bytes (see -malign-double), which makes the alignment of the whole union
// 4 as well.  The explicit aligned(8) attribute on the union type makes the
// 8-byte alignment unconditional; this file is GNU C only, so the attribute
// is always available.
//
//   _LBCarryMask   0x01 in every byte:  selects the low bit of each byte
//   _HBClearMask   0x7f in every byte:  clears bit 7 of each byte after a
//                  64-bit right shift by one
//   _ActiveMask, _ShiftBpp, _ShiftRem
//                  set per bytes-per-pixel case by the filter routines before
//                  their asm blocks run
//   _ActiveMask2, _ActiveMaskEnd
//                  not referenced in this part of the file; presumably used
//                  by the other filter routines -- TODO confirm
//
// Objects without an initializer start out as zero (static storage).

union uAll {
   long long use;     // integer view:  the 64-bit mask or shift count
   double  align;     // never accessed; kept from the original alignment idiom
} __attribute__ ((aligned (8)))
  _LBCarryMask = {0x0101010101010101LL},
  _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
  _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2773
2774 #ifdef PNG_THREAD_UNSAFE_OK
2775 //===========================================================================//
2776 //                                                                           //
2777 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
2778 //                                                                           //
2779 //===========================================================================//
2780
2781 // Optimized code for PNG Average filter decoder
2782
2783 static void /* PRIVATE */
2784 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2785                             png_bytep prev_row)
2786 {
2787    int bpp;
2788    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
2789    int dummy_value_S;
2790    int dummy_value_D;
2791
2792    bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
2793    _FullLength  = row_info->rowbytes;       // # of bytes to filter
2794
2795    __asm__ __volatile__ (
2796       // initialize address pointers and offset
2797 #ifdef __PIC__
2798       "pushl %%ebx                 \n\t" // save index to Global Offset Table
2799 #endif
2800 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
2801       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
2802       "movl %%edi, %%edx           \n\t"
2803 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2804 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
2805       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2806
2807       "xorl %%eax,%%eax            \n\t"
2808
2809       // Compute the Raw value for the first bpp bytes
2810       //    Raw(x) = Avg(x) + (Prior(x)/2)
2811    "avg_rlp:                       \n\t"
2812       "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
2813       "incl %%ebx                  \n\t"
2814       "shrb %%al                   \n\t" // divide by 2
2815       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
2816 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
2817       "cmpl %%ecx, %%ebx           \n\t"
2818       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
2819       "jb avg_rlp                  \n\t" // mov does not affect flags
2820
2821       // get # of bytes to alignment
2822       "movl %%edi, _dif            \n\t" // take start of row
2823       "addl %%ebx, _dif            \n\t" // add bpp
2824       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
2825       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
2826       "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
2827       "jz avg_go                   \n\t" //  alignment
2828
2829       // fix alignment
2830       // Compute the Raw value for the bytes up to the alignment boundary
2831       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2832       "xorl %%ecx, %%ecx           \n\t"
2833
2834    "avg_lp1:                       \n\t"
2835       "xorl %%eax, %%eax           \n\t"
2836       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2837       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2838       "addw %%cx, %%ax             \n\t"
2839       "incl %%ebx                  \n\t"
2840       "shrw %%ax                   \n\t" // divide by 2
2841       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2842       "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
2843       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2844       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
2845
2846    "avg_go:                        \n\t"
2847       "movl _FullLength, %%eax     \n\t"
2848       "movl %%eax, %%ecx           \n\t"
2849       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
2850       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
2851       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
2852       "movl %%ecx, _MMXLength      \n\t"
2853 #ifdef __PIC__
2854       "popl %%ebx                  \n\t" // restore index to Global Offset Table
2855 #endif
2856
2857       : "=c" (dummy_value_c),            // output regs (dummy)
2858         "=S" (dummy_value_S),
2859         "=D" (dummy_value_D)
2860
2861       : "0" (bpp),       // ecx          // input regs
2862         "1" (prev_row),  // esi
2863         "2" (row)        // edi
2864
2865       : "%eax", "%edx"                   // clobber list
2866 #ifndef __PIC__
2867       , "%ebx"
2868 #endif
2869       // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2870       // (seems to work fine without...)
2871    );
2872
2873    // now do the math for the rest of the row
2874    switch (bpp)
2875    {
2876       case 3:
2877       {
2878          _ActiveMask.use  = 0x0000000000ffffffLL;
2879          _ShiftBpp.use = 24;    // == 3 * 8
2880          _ShiftRem.use = 40;    // == 64 - 24
2881
2882          __asm__ __volatile__ (
2883             // re-init address pointers and offset
2884             "movq _ActiveMask, %%mm7      \n\t"
2885             "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
2886             "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
2887 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
2888             "movq _HBClearMask, %%mm4     \n\t"
2889 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
2890
2891             // prime the pump:  load the first Raw(x-bpp) data set
2892             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2893                                                 // (correct pos. in loop below)
2894          "avg_3lp:                        \n\t"
2895             "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
2896             "movq %%mm5, %%mm3            \n\t"
2897             "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp)
2898                                                 // data
2899             "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
2900             "movq %%mm7, %%mm6            \n\t"
2901             "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
2902             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
2903             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
2904                                                 // byte
2905             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
2906                                                 // each byte
2907             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2908             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2909                                                 // LBCarrys
2910             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2911                                                 // where both
2912                                // lsb's were == 1 (only valid for active group)
2913             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2914             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2915                                                 // byte
2916             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2917                                                 // for each byte
2918             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1
2919                                                 // bytes to add to Avg
2920             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2921                                                 // Avg for each Active
2922                                //  byte
2923             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2924             "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover
2925                                                 // bytes 3-5
2926             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2927             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2928             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2929                                                 // LBCarrys
2930             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2931                                                 // where both
2932                                // lsb's were == 1 (only valid for active group)
2933             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2934             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2935                                                 // byte
2936             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2937                                                 // for each byte
2938             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2939                                                 // bytes to add to Avg
2940             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2941                                                 // Avg for each Active
2942                                //  byte
2943
2944             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2945             "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last
2946                                                 // two
2947                                  // bytes
2948             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2949             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2950                               // Data only needs to be shifted once here to
2951                               // get the correct x-bpp offset.
2952             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2953                                                 // LBCarrys
2954             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2955                                                 // where both
2956                               // lsb's were == 1 (only valid for active group)
2957             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2958             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2959                                                 // byte
2960             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2961                                                 // for each byte
2962             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2963                                                 // bytes to add to Avg
2964             "addl $8, %%ecx               \n\t"
2965             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2966                                                 // Avg for each Active
2967                                                 // byte
2968             // now ready to write back to memory
2969             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2970             // move updated Raw(x) to use as Raw(x-bpp) for next loop
2971             "cmpl _MMXLength, %%ecx       \n\t"
2972             "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
2973             "jb avg_3lp                   \n\t"
2974
2975             : "=S" (dummy_value_S),             // output regs (dummy)
2976               "=D" (dummy_value_D)
2977
2978             : "0" (prev_row),  // esi           // input regs
2979               "1" (row)        // edi
2980
2981             : "%ecx"                            // clobber list
2982 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2983             , "%mm0", "%mm1", "%mm2", "%mm3"
2984             , "%mm4", "%mm5", "%mm6", "%mm7"
2985 #endif
2986          );
2987       }
2988       break;  // end 3 bpp
2989
2990       case 6:
2991       case 4:
2992       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
2993       //case 5:   // GRR BOGUS
2994       {
2995          _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
2996                                                   // appropriate inactive bytes
2997          _ShiftBpp.use = bpp << 3;
2998          _ShiftRem.use = 64 - _ShiftBpp.use;
2999
3000          __asm__ __volatile__ (
3001             "movq _HBClearMask, %%mm4    \n\t"
3002
3003             // re-init address pointers and offset
3004             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to
3005                                                // alignment boundary
3006
3007             // load _ActiveMask and clear all bytes except for 1st active group
3008             "movq _ActiveMask, %%mm7     \n\t"
3009 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3010             "psrlq _ShiftRem, %%mm7      \n\t"
3011 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3012             "movq %%mm7, %%mm6           \n\t"
3013             "movq _LBCarryMask, %%mm5    \n\t"
3014             "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active
3015                                                // group
3016
3017             // prime the pump:  load the first Raw(x-bpp) data set
3018             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3019                                           // (we correct pos. in loop below)
3020          "avg_4lp:                       \n\t"
3021             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3022             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
3023             "movq (%%esi,%%ecx,), %%mm1  \n\t"
3024             // add (Prev_row/2) to average
3025             "movq %%mm5, %%mm3           \n\t"
3026             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3027             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3028             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3029                                                // byte
3030             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3031                                                // each byte
3032             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3033             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3034                                                // LBCarrys
3035             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3036                                                // where both
3037                               // lsb's were == 1 (only valid for active group)
3038             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3039             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3040                                                // byte
3041             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3042                                                // for each byte
3043             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
3044                                                // bytes to add to Avg
3045             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3046                                                // for each Active
3047                               // byte
3048             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3049             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3050             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3051             "addl $8, %%ecx              \n\t"
3052             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3053                                                // LBCarrys
3054             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3055                                                // where both
3056                               // lsb's were == 1 (only valid for active group)
3057             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3058             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3059                                                // byte
3060             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3061                                                // for each byte
3062             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3063                                                // bytes to add to Avg
3064             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3065                                                // Avg for each Active
3066                               // byte
3067             "cmpl _MMXLength, %%ecx      \n\t"
3068             // now ready to write back to memory
3069             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3070             // prep Raw(x-bpp) for next loop
3071             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3072             "jb avg_4lp                  \n\t"
3073
3074             : "=S" (dummy_value_S),            // output regs (dummy)
3075               "=D" (dummy_value_D)
3076
3077             : "0" (prev_row),  // esi          // input regs
3078               "1" (row)        // edi
3079
3080             : "%ecx"                           // clobber list
3081 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3082             , "%mm0", "%mm1", "%mm2", "%mm3"
3083             , "%mm4", "%mm5", "%mm6", "%mm7"
3084 #endif
3085          );
3086       }
3087       break;  // end 4,6 bpp
3088
3089       case 2:
3090       {
3091          _ActiveMask.use  = 0x000000000000ffffLL;
3092          _ShiftBpp.use = 16;   // == 2 * 8
3093          _ShiftRem.use = 48;   // == 64 - 16
3094
3095          __asm__ __volatile__ (
3096             // load _ActiveMask
3097             "movq _ActiveMask, %%mm7     \n\t"
3098             // re-init address pointers and offset
3099             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment
3100                                                // boundary
3101             "movq _LBCarryMask, %%mm5    \n\t"
3102 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3103             "movq _HBClearMask, %%mm4    \n\t"
3104 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3105
3106             // prime the pump:  load the first Raw(x-bpp) data set
3107             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3108                               // (we correct pos. in loop below)
3109          "avg_2lp:                       \n\t"
3110             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3111             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
3112             "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX:  was psllq)
3113             // add (Prev_row/2) to average
3114             "movq %%mm5, %%mm3           \n\t"
3115             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3116             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3117             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3118                                                // byte
3119             "movq %%mm7, %%mm6           \n\t"
3120             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3121                                                // each byte
3122
3123             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3124             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3125                                                // LBCarrys
3126             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3127                                                // where both
3128                                                // lsb's were == 1 (only valid
3129                                                // for active group)
3130             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3131             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3132                                                // byte
3133             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3134                                                // for each byte
3135             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
3136                                                // bytes to add to Avg
3137             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3138                                                // for each Active byte
3139
3140             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3141             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3142                                                // bytes 2 & 3
3143             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3144             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3145             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3146                                                // LBCarrys
3147             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3148                                                // where both
3149                                                // lsb's were == 1 (only valid
3150                                                // for active group)
3151             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3152             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3153                                                // byte
3154             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3155                                                // for each byte
3156             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3157                                                // bytes to add to Avg
3158             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3159                                                // Avg for each Active byte
3160
3161             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3162             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3163                                                // bytes 4 & 5
3164             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3165             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3166             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3167                                                // LBCarrys
3168             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3169                                                // where both lsb's were == 1
3170                                                // (only valid for active group)
3171             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3172             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3173                                                // byte
3174             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3175                                                // for each byte
3176             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3177                                                // bytes to add to Avg
3178             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3179                                                // Avg for each Active byte
3180
3181             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3182             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3183                                                // bytes 6 & 7
3184             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3185             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3186             "addl $8, %%ecx              \n\t"
3187             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3188                                                // LBCarrys
3189             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3190                                                // where both
3191                                                // lsb's were == 1 (only valid
3192                                                // for active group)
3193             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3194             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3195                                                // byte
3196             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3197                                                // for each byte
3198             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3199                                                // bytes to add to Avg
3200             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3201                                                // Avg for each Active byte
3202
3203             "cmpl _MMXLength, %%ecx      \n\t"
3204             // now ready to write back to memory
3205             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3206             // prep Raw(x-bpp) for next loop
3207             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3208             "jb avg_2lp                  \n\t"
3209
3210             : "=S" (dummy_value_S),            // output regs (dummy)
3211               "=D" (dummy_value_D)
3212
3213             : "0" (prev_row),  // esi          // input regs
3214               "1" (row)        // edi
3215
3216             : "%ecx"                           // clobber list
3217 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3218             , "%mm0", "%mm1", "%mm2", "%mm3"
3219             , "%mm4", "%mm5", "%mm6", "%mm7"
3220 #endif
3221          );
3222       }
3223       break;  // end 2 bpp
3224
3225       case 1:
3226       {
3227          __asm__ __volatile__ (
3228             // re-init address pointers and offset
3229 #ifdef __PIC__
3230             "pushl %%ebx                 \n\t" // save Global Offset Table index
3231 #endif
3232             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment
3233                                                // boundary
3234 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3235             "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3236             "jnb avg_1end                \n\t"
3237             // do Paeth decode for remaining bytes
3238 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3239             "movl %%edi, %%edx           \n\t"
3240 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3241             "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3242             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
3243                                                //  in loop below
3244          "avg_1lp:                       \n\t"
3245             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3246             "xorl %%eax, %%eax           \n\t"
3247             "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3248             "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3249             "addw %%cx, %%ax             \n\t"
3250             "incl %%ebx                  \n\t"
3251             "shrw %%ax                   \n\t" // divide by 2
3252             "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3253                                                // inc ebx
3254             "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3255             "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3256                          // mov does not affect flags; -1 to offset inc ebx
3257             "jb avg_1lp                  \n\t"
3258
3259          "avg_1end:                      \n\t"
3260 #ifdef __PIC__
3261             "popl %%ebx                  \n\t" // Global Offset Table index
3262 #endif
3263
3264             : "=c" (dummy_value_c),            // output regs (dummy)
3265               "=S" (dummy_value_S),
3266               "=D" (dummy_value_D)
3267
3268             : "0" (bpp),       // ecx          // input regs
3269               "1" (prev_row),  // esi
3270               "2" (row)        // edi
3271
3272             : "%eax", "%edx"                   // clobber list
3273 #ifndef __PIC__
3274             , "%ebx"
3275 #endif
3276          );
3277       }
3278       return;  // end 1 bpp
3279
3280       case 8:
3281       {
3282          __asm__ __volatile__ (
3283             // re-init address pointers and offset
3284             "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
3285             "movq _LBCarryMask, %%mm5    \n\t" //            boundary
3286 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3287             "movq _HBClearMask, %%mm4    \n\t"
3288 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3289
3290             // prime the pump:  load the first Raw(x-bpp) data set
3291             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3292                                       // (NO NEED to correct pos. in loop below)
3293
3294          "avg_8lp:                       \n\t"
3295             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3296             "movq %%mm5, %%mm3           \n\t"
3297             "movq (%%esi,%%ecx,), %%mm1  \n\t"
3298             "addl $8, %%ecx              \n\t"
3299             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3300             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3301             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3302                                                //  where both lsb's were == 1
3303             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3304             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
3305             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
3306             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
3307             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
3308             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3309             "cmpl _MMXLength, %%ecx      \n\t"
3310             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3311             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
3312             "jb avg_8lp                  \n\t"
3313
3314             : "=S" (dummy_value_S),            // output regs (dummy)
3315               "=D" (dummy_value_D)
3316
3317             : "0" (prev_row),  // esi          // input regs
3318               "1" (row)        // edi
3319
3320             : "%ecx"                           // clobber list
3321 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3322             , "%mm0", "%mm1", "%mm2"
3323             , "%mm3", "%mm4", "%mm5"
3324 #endif
3325          );
3326       }
3327       break;  // end 8 bpp
3328
3329       default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3330       {
3331
3332 #ifdef PNG_DEBUG
3333          // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
3334         png_debug(1,
3335         "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3336 #endif
3337
3338 #if 0
3339         __asm__ __volatile__ (
3340             "movq _LBCarryMask, %%mm5    \n\t"
3341             // re-init address pointers and offset
3342             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to
3343                                                // alignment boundary
3344             "movl row, %%edi             \n\t" // edi:  Avg(x)
3345             "movq _HBClearMask, %%mm4    \n\t"
3346             "movl %%edi, %%edx           \n\t"
3347             "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3348             "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
3349          "avg_Alp:                       \n\t"
3350             "movq (%%edi,%%ebx,), %%mm0  \n\t"
3351             "movq %%mm5, %%mm3           \n\t"
3352             "movq (%%esi,%%ebx,), %%mm1  \n\t"
3353             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3354             "movq (%%edx,%%ebx,), %%mm2  \n\t"
3355             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3356             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3357                                                // where both lsb's were == 1
3358             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3359             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3360                                                // byte
3361             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each
3362                                                // byte
3363             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3364                                                // byte
3365             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3366                                                // each byte
3367             "addl $8, %%ebx              \n\t"
3368             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3369                                                // byte
3370             "cmpl _MMXLength, %%ebx      \n\t"
3371             "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3372             "jb avg_Alp                  \n\t"
3373
3374             : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
3375
3376             : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
3377
3378             : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3379          );
3380 #endif /* 0 - NEVER REACHED */
3381       }
3382       break;
3383
3384    } // end switch (bpp)
3385
3386    __asm__ __volatile__ (
3387       // MMX acceleration complete; now do clean-up
3388       // check if any remaining bytes left to decode
3389 #ifdef __PIC__
3390       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3391 #endif
3392       "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
3393 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
3394       "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3395       "jnb avg_end                 \n\t"
3396
3397       // do Avg decode for remaining bytes
3398 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3399       "movl %%edi, %%edx           \n\t"
3400 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3401       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3402       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
3403
3404    "avg_lp2:                       \n\t"
3405       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3406       "xorl %%eax, %%eax           \n\t"
3407       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3408       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3409       "addw %%cx, %%ax             \n\t"
3410       "incl %%ebx                  \n\t"
3411       "shrw %%ax                   \n\t" // divide by 2
3412       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3413       "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3414       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3415       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
3416
3417    "avg_end:                       \n\t"
3418       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
3419 #ifdef __PIC__
3420       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3421 #endif
3422
3423       : "=c" (dummy_value_c),            // output regs (dummy)
3424         "=S" (dummy_value_S),
3425         "=D" (dummy_value_D)
3426
3427       : "0" (bpp),       // ecx          // input regs
3428         "1" (prev_row),  // esi
3429         "2" (row)        // edi
3430
3431       : "%eax", "%edx"                   // clobber list
3432 #ifndef __PIC__
3433       , "%ebx"
3434 #endif
3435    );
3436
3437 } /* end png_read_filter_row_mmx_avg() */
3438 #endif
3439
3440
3441
3442 #ifdef PNG_THREAD_UNSAFE_OK
3443 //===========================================================================//
3444 //                                                                           //
3445 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
3446 //                                                                           //
3447 //===========================================================================//
3448
3449 // Optimized code for PNG Paeth filter decoder
3450
3451 static void /* PRIVATE */
3452 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3453                               png_bytep prev_row)
3454 {
3455    int bpp;
3456    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
3457    int dummy_value_S;
3458    int dummy_value_D;
3459
3460    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3461    _FullLength  = row_info->rowbytes; // # of bytes to filter
3462
3463    __asm__ __volatile__ (
3464 #ifdef __PIC__
3465       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3466 #endif
3467       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
3468 //pre "movl row, %%edi             \n\t"
3469       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
3470 //pre "movl prev_row, %%esi        \n\t"
3471       "xorl %%eax, %%eax           \n\t"
3472
3473       // Compute the Raw value for the first bpp bytes
3474       // Note: the formula works out to be always
3475       //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
3476    "paeth_rlp:                     \n\t"
3477       "movb (%%edi,%%ebx,), %%al   \n\t"
3478       "addb (%%esi,%%ebx,), %%al   \n\t"
3479       "incl %%ebx                  \n\t"
3480 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
3481       "cmpl %%ecx, %%ebx           \n\t"
3482       "movb %%al, -1(%%edi,%%ebx,) \n\t"
3483       "jb paeth_rlp                \n\t"
3484       // get # of bytes to alignment
3485       "movl %%edi, _dif            \n\t" // take start of row
3486       "addl %%ebx, _dif            \n\t" // add bpp
3487       "xorl %%ecx, %%ecx           \n\t"
3488       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past alignment
3489                                          // boundary
3490       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
3491       "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx
3492                                          // at alignment
3493       "jz paeth_go                 \n\t"
3494       // fix alignment
3495
3496    "paeth_lp1:                     \n\t"
3497       "xorl %%eax, %%eax           \n\t"
3498       // pav = p - a = (a + b - c) - a = b - c
3499       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
3500       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3501       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3502       "movl %%eax, _patemp         \n\t" // Save pav for later use
3503       "xorl %%eax, %%eax           \n\t"
3504       // pbv = p - b = (a + b - c) - b = a - c
3505       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
3506       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3507       "movl %%eax, %%ecx           \n\t"
3508       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3509       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
3510       // pc = abs(pcv)
3511       "testl $0x80000000, %%eax    \n\t"
3512       "jz paeth_pca                \n\t"
3513       "negl %%eax                  \n\t" // reverse sign of neg values
3514
3515    "paeth_pca:                     \n\t"
3516       "movl %%eax, _pctemp         \n\t" // save pc for later use
3517       // pb = abs(pbv)
3518       "testl $0x80000000, %%ecx    \n\t"
3519       "jz paeth_pba                \n\t"
3520       "negl %%ecx                  \n\t" // reverse sign of neg values
3521
3522    "paeth_pba:                     \n\t"
3523       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
3524       // pa = abs(pav)
3525       "movl _patemp, %%eax         \n\t"
3526       "testl $0x80000000, %%eax    \n\t"
3527       "jz paeth_paa                \n\t"
3528       "negl %%eax                  \n\t" // reverse sign of neg values
3529
3530    "paeth_paa:                     \n\t"
3531       "movl %%eax, _patemp         \n\t" // save pa for later use
3532       // test if pa <= pb
3533       "cmpl %%ecx, %%eax           \n\t"
3534       "jna paeth_abb               \n\t"
3535       // pa > pb; now test if pb <= pc
3536       "cmpl _pctemp, %%ecx         \n\t"
3537       "jna paeth_bbc               \n\t"
3538       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3539       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3540       "jmp paeth_paeth             \n\t"
3541
3542    "paeth_bbc:                     \n\t"
3543       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3544       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
3545       "jmp paeth_paeth             \n\t"
3546
3547    "paeth_abb:                     \n\t"
3548       // pa <= pb; now test if pa <= pc
3549       "cmpl _pctemp, %%eax         \n\t"
3550       "jna paeth_abc               \n\t"
3551       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3552       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3553       "jmp paeth_paeth             \n\t"
3554
3555    "paeth_abc:                     \n\t"
3556       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3557       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
3558
3559    "paeth_paeth:                   \n\t"
3560       "incl %%ebx                  \n\t"
3561       "incl %%edx                  \n\t"
3562       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3563       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3564       "cmpl _dif, %%ebx            \n\t"
3565       "jb paeth_lp1                \n\t"
3566
3567    "paeth_go:                      \n\t"
3568       "movl _FullLength, %%ecx     \n\t"
3569       "movl %%ecx, %%eax           \n\t"
3570       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
3571       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
3572       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
3573       "movl %%ecx, _MMXLength      \n\t"
3574 #ifdef __PIC__
3575       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3576 #endif
3577
3578       : "=c" (dummy_value_c),            // output regs (dummy)
3579         "=S" (dummy_value_S),
3580         "=D" (dummy_value_D)
3581
3582       : "0" (bpp),       // ecx          // input regs
3583         "1" (prev_row),  // esi
3584         "2" (row)        // edi
3585
3586       : "%eax", "%edx"                   // clobber list
3587 #ifndef __PIC__
3588       , "%ebx"
3589 #endif
3590    );
3591
3592    // now do the math for the rest of the row
3593    switch (bpp)
3594    {
3595       case 3:
3596       {
3597          _ActiveMask.use = 0x0000000000ffffffLL;
3598          _ActiveMaskEnd.use = 0xffff000000000000LL;
3599          _ShiftBpp.use = 24;    // == bpp(3) * 8
3600          _ShiftRem.use = 40;    // == 64 - 24
3601
3602          __asm__ __volatile__ (
3603             "movl _dif, %%ecx            \n\t"
3604 // preload  "movl row, %%edi             \n\t"
3605 // preload  "movl prev_row, %%esi        \n\t"
3606             "pxor %%mm0, %%mm0           \n\t"
3607             // prime the pump:  load the first Raw(x-bpp) data set
3608             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3609          "paeth_3lp:                     \n\t"
3610             "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st
3611                                                // 3 bytes
3612             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3613             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3614             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3615             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3616             "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st
3617                                                // 3 bytes
3618             // pav = p - a = (a + b - c) - a = b - c
3619             "movq %%mm2, %%mm4           \n\t"
3620             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3621             // pbv = p - b = (a + b - c) - b = a - c
3622             "movq %%mm1, %%mm5           \n\t"
3623             "psubw %%mm3, %%mm4          \n\t"
3624             "pxor %%mm7, %%mm7           \n\t"
3625             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3626             "movq %%mm4, %%mm6           \n\t"
3627             "psubw %%mm3, %%mm5          \n\t"
3628
3629             // pa = abs(p-a) = abs(pav)
3630             // pb = abs(p-b) = abs(pbv)
3631             // pc = abs(p-c) = abs(pcv)
3632             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3633             "paddw %%mm5, %%mm6          \n\t"
3634             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3635             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3636             "psubw %%mm0, %%mm4          \n\t"
3637             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3638             "psubw %%mm0, %%mm4          \n\t"
3639             "psubw %%mm7, %%mm5          \n\t"
3640             "pxor %%mm0, %%mm0           \n\t"
3641             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3642             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3643             "psubw %%mm7, %%mm5          \n\t"
3644             "psubw %%mm0, %%mm6          \n\t"
3645             //  test pa <= pb
3646             "movq %%mm4, %%mm7           \n\t"
3647             "psubw %%mm0, %%mm6          \n\t"
3648             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3649             "movq %%mm7, %%mm0           \n\t"
3650             // use mm7 mask to merge pa & pb
3651             "pand %%mm7, %%mm5           \n\t"
3652             // use mm0 mask copy to merge a & b
3653             "pand %%mm0, %%mm2           \n\t"
3654             "pandn %%mm4, %%mm7          \n\t"
3655             "pandn %%mm1, %%mm0          \n\t"
3656             "paddw %%mm5, %%mm7          \n\t"
3657             "paddw %%mm2, %%mm0          \n\t"
3658             //  test  ((pa <= pb)? pa:pb) <= pc
3659             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3660             "pxor %%mm1, %%mm1           \n\t"
3661             "pand %%mm7, %%mm3           \n\t"
3662             "pandn %%mm0, %%mm7          \n\t"
3663             "paddw %%mm3, %%mm7          \n\t"
3664             "pxor %%mm0, %%mm0           \n\t"
3665             "packuswb %%mm1, %%mm7       \n\t"
3666             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3667             "pand _ActiveMask, %%mm7     \n\t"
3668             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
3669             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3670             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3671             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3672             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
3673                                                // Raw(x-bpp)
3674             // now do Paeth for 2nd set of bytes (3-5)
3675             "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
3676             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3677             "pxor %%mm7, %%mm7           \n\t"
3678             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3679             // pbv = p - b = (a + b - c) - b = a - c
3680             "movq %%mm1, %%mm5           \n\t"
3681             // pav = p - a = (a + b - c) - a = b - c
3682             "movq %%mm2, %%mm4           \n\t"
3683             "psubw %%mm3, %%mm5          \n\t"
3684             "psubw %%mm3, %%mm4          \n\t"
3685             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3686             //       pav + pbv = pbv + pav
3687             "movq %%mm5, %%mm6           \n\t"
3688             "paddw %%mm4, %%mm6          \n\t"
3689
3690             // pa = abs(p-a) = abs(pav)
3691             // pb = abs(p-b) = abs(pbv)
3692             // pc = abs(p-c) = abs(pcv)
3693             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
3694             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
3695             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
3696             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
3697             "psubw %%mm0, %%mm5          \n\t"
3698             "psubw %%mm7, %%mm4          \n\t"
3699             "psubw %%mm0, %%mm5          \n\t"
3700             "psubw %%mm7, %%mm4          \n\t"
3701             "pxor %%mm0, %%mm0           \n\t"
3702             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3703             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3704             "psubw %%mm0, %%mm6          \n\t"
3705             //  test pa <= pb
3706             "movq %%mm4, %%mm7           \n\t"
3707             "psubw %%mm0, %%mm6          \n\t"
3708             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3709             "movq %%mm7, %%mm0           \n\t"
3710             // use mm7 mask to merge pa & pb
3711             "pand %%mm7, %%mm5           \n\t"
3712             // use mm0 mask copy to merge a & b
3713             "pand %%mm0, %%mm2           \n\t"
3714             "pandn %%mm4, %%mm7          \n\t"
3715             "pandn %%mm1, %%mm0          \n\t"
3716             "paddw %%mm5, %%mm7          \n\t"
3717             "paddw %%mm2, %%mm0          \n\t"
3718             //  test  ((pa <= pb)? pa:pb) <= pc
3719             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3720             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3721             "pand %%mm7, %%mm3           \n\t"
3722             "pandn %%mm0, %%mm7          \n\t"
3723             "pxor %%mm1, %%mm1           \n\t"
3724             "paddw %%mm3, %%mm7          \n\t"
3725             "pxor %%mm0, %%mm0           \n\t"
3726             "packuswb %%mm1, %%mm7       \n\t"
3727             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
3728             "pand _ActiveMask, %%mm7     \n\t"
3729             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3730             "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of
3731                                                // 3 bytes
3732              // pav = p - a = (a + b - c) - a = b - c
3733             "movq %%mm2, %%mm4           \n\t"
3734             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3735             "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
3736             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3737             "movq %%mm7, %%mm1           \n\t"
3738             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3739             "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
3740                                     // now mm1 will be used as Raw(x-bpp)
3741             // now do Paeth for 3rd, and final, set of bytes (6-7)
3742             "pxor %%mm7, %%mm7           \n\t"
3743             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3744             "psubw %%mm3, %%mm4          \n\t"
3745             // pbv = p - b = (a + b - c) - b = a - c
3746             "movq %%mm1, %%mm5           \n\t"
3747             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3748             "movq %%mm4, %%mm6           \n\t"
3749             "psubw %%mm3, %%mm5          \n\t"
3750             "pxor %%mm0, %%mm0           \n\t"
3751             "paddw %%mm5, %%mm6          \n\t"
3752
3753             // pa = abs(p-a) = abs(pav)
3754             // pb = abs(p-b) = abs(pbv)
3755             // pc = abs(p-c) = abs(pcv)
3756             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3757             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3758             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3759             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3760             "psubw %%mm0, %%mm4          \n\t"
3761             "psubw %%mm7, %%mm5          \n\t"
3762             "psubw %%mm0, %%mm4          \n\t"
3763             "psubw %%mm7, %%mm5          \n\t"
3764             "pxor %%mm0, %%mm0           \n\t"
3765             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3766             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3767             "psubw %%mm0, %%mm6          \n\t"
3768             //  test pa <= pb
3769             "movq %%mm4, %%mm7           \n\t"
3770             "psubw %%mm0, %%mm6          \n\t"
3771             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3772             "movq %%mm7, %%mm0           \n\t"
3773             // use mm0 mask copy to merge a & b
3774             "pand %%mm0, %%mm2           \n\t"
3775             // use mm7 mask to merge pa & pb
3776             "pand %%mm7, %%mm5           \n\t"
3777             "pandn %%mm1, %%mm0          \n\t"
3778             "pandn %%mm4, %%mm7          \n\t"
3779             "paddw %%mm2, %%mm0          \n\t"
3780             "paddw %%mm5, %%mm7          \n\t"
3781             //  test  ((pa <= pb)? pa:pb) <= pc
3782             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3783             "pand %%mm7, %%mm3           \n\t"
3784             "pandn %%mm0, %%mm7          \n\t"
3785             "paddw %%mm3, %%mm7          \n\t"
3786             "pxor %%mm1, %%mm1           \n\t"
3787             "packuswb %%mm7, %%mm1       \n\t"
3788             // step ecx to next set of 8 bytes and repeat loop til done
3789             "addl $8, %%ecx              \n\t"
3790             "pand _ActiveMaskEnd, %%mm1  \n\t"
3791             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3792                                                  // Raw(x)
3793
3794             "cmpl _MMXLength, %%ecx      \n\t"
3795             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
3796             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3797                                  // mm1 will be used as Raw(x-bpp) next loop
3798                            // mm3 ready to be used as Prior(x-bpp) next loop
3799             "jb paeth_3lp                \n\t"
3800
3801             : "=S" (dummy_value_S),             // output regs (dummy)
3802               "=D" (dummy_value_D)
3803
3804             : "0" (prev_row),  // esi           // input regs
3805               "1" (row)        // edi
3806
3807             : "%ecx"                            // clobber list
3808 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3809             , "%mm0", "%mm1", "%mm2", "%mm3"
3810             , "%mm4", "%mm5", "%mm6", "%mm7"
3811 #endif
3812          );
3813       }
3814       break;  // end 3 bpp
3815
3816       case 6:
3817       //case 7:   // GRR BOGUS
3818       //case 5:   // GRR BOGUS
3819       {
3820          _ActiveMask.use  = 0x00000000ffffffffLL;
3821          _ActiveMask2.use = 0xffffffff00000000LL;
3822          _ShiftBpp.use = bpp << 3;    // == bpp * 8
3823          _ShiftRem.use = 64 - _ShiftBpp.use;
3824
3825          __asm__ __volatile__ (
3826             "movl _dif, %%ecx            \n\t"
3827 // preload  "movl row, %%edi             \n\t"
3828 // preload  "movl prev_row, %%esi        \n\t"
3829             // prime the pump:  load the first Raw(x-bpp) data set
3830             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3831             "pxor %%mm0, %%mm0           \n\t"
3832
3833          "paeth_6lp:                     \n\t"
3834             // must shift to position Raw(x-bpp) data
3835             "psrlq _ShiftRem, %%mm1      \n\t"
3836             // do first set of 4 bytes
3837             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3838             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3839             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3840             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3841             // must shift to position Prior(x-bpp) data
3842             "psrlq _ShiftRem, %%mm3      \n\t"
3843             // pav = p - a = (a + b - c) - a = b - c
3844             "movq %%mm2, %%mm4           \n\t"
3845             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
3846             // pbv = p - b = (a + b - c) - b = a - c
3847             "movq %%mm1, %%mm5           \n\t"
3848             "psubw %%mm3, %%mm4          \n\t"
3849             "pxor %%mm7, %%mm7           \n\t"
3850             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3851             "movq %%mm4, %%mm6           \n\t"
3852             "psubw %%mm3, %%mm5          \n\t"
3853             // pa = abs(p-a) = abs(pav)
3854             // pb = abs(p-b) = abs(pbv)
3855             // pc = abs(p-c) = abs(pcv)
3856             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3857             "paddw %%mm5, %%mm6          \n\t"
3858             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3859             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3860             "psubw %%mm0, %%mm4          \n\t"
3861             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3862             "psubw %%mm0, %%mm4          \n\t"
3863             "psubw %%mm7, %%mm5          \n\t"
3864             "pxor %%mm0, %%mm0           \n\t"
3865             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3866             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3867             "psubw %%mm7, %%mm5          \n\t"
3868             "psubw %%mm0, %%mm6          \n\t"
3869             //  test pa <= pb
3870             "movq %%mm4, %%mm7           \n\t"
3871             "psubw %%mm0, %%mm6          \n\t"
3872             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3873             "movq %%mm7, %%mm0           \n\t"
3874             // use mm7 mask to merge pa & pb
3875             "pand %%mm7, %%mm5           \n\t"
3876             // use mm0 mask copy to merge a & b
3877             "pand %%mm0, %%mm2           \n\t"
3878             "pandn %%mm4, %%mm7          \n\t"
3879             "pandn %%mm1, %%mm0          \n\t"
3880             "paddw %%mm5, %%mm7          \n\t"
3881             "paddw %%mm2, %%mm0          \n\t"
3882             //  test  ((pa <= pb)? pa:pb) <= pc
3883             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3884             "pxor %%mm1, %%mm1           \n\t"
3885             "pand %%mm7, %%mm3           \n\t"
3886             "pandn %%mm0, %%mm7          \n\t"
3887             "paddw %%mm3, %%mm7          \n\t"
3888             "pxor %%mm0, %%mm0           \n\t"
3889             "packuswb %%mm1, %%mm7       \n\t"
3890             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3891             "pand _ActiveMask, %%mm7     \n\t"
3892             "psrlq _ShiftRem, %%mm3      \n\t"
3893             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
3894             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3895             "movq %%mm2, %%mm6           \n\t"
3896             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3897             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3898             "psllq _ShiftBpp, %%mm6      \n\t"
3899             "movq %%mm7, %%mm5           \n\t"
3900             "psrlq _ShiftRem, %%mm1      \n\t"
3901             "por %%mm6, %%mm3            \n\t"
3902             "psllq _ShiftBpp, %%mm5      \n\t"
3903             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3904             "por %%mm5, %%mm1            \n\t"
3905             // do second set of 4 bytes
3906             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3907             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3908             // pav = p - a = (a + b - c) - a = b - c
3909             "movq %%mm2, %%mm4           \n\t"
3910             // pbv = p - b = (a + b - c) - b = a - c
3911             "movq %%mm1, %%mm5           \n\t"
3912             "psubw %%mm3, %%mm4          \n\t"
3913             "pxor %%mm7, %%mm7           \n\t"
3914             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3915             "movq %%mm4, %%mm6           \n\t"
3916             "psubw %%mm3, %%mm5          \n\t"
3917             // pa = abs(p-a) = abs(pav)
3918             // pb = abs(p-b) = abs(pbv)
3919             // pc = abs(p-c) = abs(pcv)
3920             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3921             "paddw %%mm5, %%mm6          \n\t"
3922             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3923             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3924             "psubw %%mm0, %%mm4          \n\t"
3925             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3926             "psubw %%mm0, %%mm4          \n\t"
3927             "psubw %%mm7, %%mm5          \n\t"
3928             "pxor %%mm0, %%mm0           \n\t"
3929             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3930             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3931             "psubw %%mm7, %%mm5          \n\t"
3932             "psubw %%mm0, %%mm6          \n\t"
3933             //  test pa <= pb
3934             "movq %%mm4, %%mm7           \n\t"
3935             "psubw %%mm0, %%mm6          \n\t"
3936             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3937             "movq %%mm7, %%mm0           \n\t"
3938             // use mm7 mask to merge pa & pb
3939             "pand %%mm7, %%mm5           \n\t"
3940             // use mm0 mask copy to merge a & b
3941             "pand %%mm0, %%mm2           \n\t"
3942             "pandn %%mm4, %%mm7          \n\t"
3943             "pandn %%mm1, %%mm0          \n\t"
3944             "paddw %%mm5, %%mm7          \n\t"
3945             "paddw %%mm2, %%mm0          \n\t"
3946             //  test  ((pa <= pb)? pa:pb) <= pc
3947             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3948             "pxor %%mm1, %%mm1           \n\t"
3949             "pand %%mm7, %%mm3           \n\t"
3950             "pandn %%mm0, %%mm7          \n\t"
3951             "pxor %%mm1, %%mm1           \n\t"
3952             "paddw %%mm3, %%mm7          \n\t"
3953             "pxor %%mm0, %%mm0           \n\t"
3954             // step ecx to next set of 8 bytes and repeat loop til done
3955             "addl $8, %%ecx              \n\t"
3956             "packuswb %%mm7, %%mm1       \n\t"
3957             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3958             "cmpl _MMXLength, %%ecx      \n\t"
3959             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3960                                 // mm1 will be used as Raw(x-bpp) next loop
3961             "jb paeth_6lp                \n\t"
3962
3963             : "=S" (dummy_value_S),             // output regs (dummy)
3964               "=D" (dummy_value_D)
3965
3966             : "0" (prev_row),  // esi           // input regs
3967               "1" (row)        // edi
3968
3969             : "%ecx"                            // clobber list
3970 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3971             , "%mm0", "%mm1", "%mm2", "%mm3"
3972             , "%mm4", "%mm5", "%mm6", "%mm7"
3973 #endif
3974          );
3975       }
3976       break;  // end 6 bpp
3977
3978       case 4:
3979       {
3980          _ActiveMask.use  = 0x00000000ffffffffLL;
3981
3982          __asm__ __volatile__ (
3983             "movl _dif, %%ecx            \n\t"
3984 // preload  "movl row, %%edi             \n\t"
3985 // preload  "movl prev_row, %%esi        \n\t"
3986             "pxor %%mm0, %%mm0           \n\t"
3987             // prime the pump:  load the first Raw(x-bpp) data set
3988             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3989                                      //  a=Raw(x-bpp) bytes
3990          "paeth_4lp:                     \n\t"
3991             // do first set of 4 bytes
3992             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3993             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3994             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3995             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3996             // pav = p - a = (a + b - c) - a = b - c
3997             "movq %%mm2, %%mm4           \n\t"
3998             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3999             // pbv = p - b = (a + b - c) - b = a - c
4000             "movq %%mm1, %%mm5           \n\t"
4001             "psubw %%mm3, %%mm4          \n\t"
4002             "pxor %%mm7, %%mm7           \n\t"
4003             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4004             "movq %%mm4, %%mm6           \n\t"
4005             "psubw %%mm3, %%mm5          \n\t"
4006             // pa = abs(p-a) = abs(pav)
4007             // pb = abs(p-b) = abs(pbv)
4008             // pc = abs(p-c) = abs(pcv)
4009             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4010             "paddw %%mm5, %%mm6          \n\t"
4011             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4012             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4013             "psubw %%mm0, %%mm4          \n\t"
4014             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4015             "psubw %%mm0, %%mm4          \n\t"
4016             "psubw %%mm7, %%mm5          \n\t"
4017             "pxor %%mm0, %%mm0           \n\t"
4018             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4019             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4020             "psubw %%mm7, %%mm5          \n\t"
4021             "psubw %%mm0, %%mm6          \n\t"
4022             //  test pa <= pb
4023             "movq %%mm4, %%mm7           \n\t"
4024             "psubw %%mm0, %%mm6          \n\t"
4025             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4026             "movq %%mm7, %%mm0           \n\t"
4027             // use mm7 mask to merge pa & pb
4028             "pand %%mm7, %%mm5           \n\t"
4029             // use mm0 mask copy to merge a & b
4030             "pand %%mm0, %%mm2           \n\t"
4031             "pandn %%mm4, %%mm7          \n\t"
4032             "pandn %%mm1, %%mm0          \n\t"
4033             "paddw %%mm5, %%mm7          \n\t"
4034             "paddw %%mm2, %%mm0          \n\t"
4035             //  test  ((pa <= pb)? pa:pb) <= pc
4036             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4037             "pxor %%mm1, %%mm1           \n\t"
4038             "pand %%mm7, %%mm3           \n\t"
4039             "pandn %%mm0, %%mm7          \n\t"
4040             "paddw %%mm3, %%mm7          \n\t"
4041             "pxor %%mm0, %%mm0           \n\t"
4042             "packuswb %%mm1, %%mm7       \n\t"
4043             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
4044             "pand _ActiveMask, %%mm7     \n\t"
4045             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
4046             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4047             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4048             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
4049             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
4050             // do second set of 4 bytes
4051             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4052             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4053             // pav = p - a = (a + b - c) - a = b - c
4054             "movq %%mm2, %%mm4           \n\t"
4055             // pbv = p - b = (a + b - c) - b = a - c
4056             "movq %%mm1, %%mm5           \n\t"
4057             "psubw %%mm3, %%mm4          \n\t"
4058             "pxor %%mm7, %%mm7           \n\t"
4059             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4060             "movq %%mm4, %%mm6           \n\t"
4061             "psubw %%mm3, %%mm5          \n\t"
4062             // pa = abs(p-a) = abs(pav)
4063             // pb = abs(p-b) = abs(pbv)
4064             // pc = abs(p-c) = abs(pcv)
4065             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4066             "paddw %%mm5, %%mm6          \n\t"
4067             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4068             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4069             "psubw %%mm0, %%mm4          \n\t"
4070             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4071             "psubw %%mm0, %%mm4          \n\t"
4072             "psubw %%mm7, %%mm5          \n\t"
4073             "pxor %%mm0, %%mm0           \n\t"
4074             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4075             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4076             "psubw %%mm7, %%mm5          \n\t"
4077             "psubw %%mm0, %%mm6          \n\t"
4078             //  test pa <= pb
4079             "movq %%mm4, %%mm7           \n\t"
4080             "psubw %%mm0, %%mm6          \n\t"
4081             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4082             "movq %%mm7, %%mm0           \n\t"
4083             // use mm7 mask to merge pa & pb
4084             "pand %%mm7, %%mm5           \n\t"
4085             // use mm0 mask copy to merge a & b
4086             "pand %%mm0, %%mm2           \n\t"
4087             "pandn %%mm4, %%mm7          \n\t"
4088             "pandn %%mm1, %%mm0          \n\t"
4089             "paddw %%mm5, %%mm7          \n\t"
4090             "paddw %%mm2, %%mm0          \n\t"
4091             //  test  ((pa <= pb)? pa:pb) <= pc
4092             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4093             "pxor %%mm1, %%mm1           \n\t"
4094             "pand %%mm7, %%mm3           \n\t"
4095             "pandn %%mm0, %%mm7          \n\t"
4096             "pxor %%mm1, %%mm1           \n\t"
4097             "paddw %%mm3, %%mm7          \n\t"
4098             "pxor %%mm0, %%mm0           \n\t"
4099             // step ecx to next set of 8 bytes and repeat loop til done
4100             "addl $8, %%ecx              \n\t"
4101             "packuswb %%mm7, %%mm1       \n\t"
4102             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4103             "cmpl _MMXLength, %%ecx      \n\t"
4104             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4105                                 // mm1 will be used as Raw(x-bpp) next loop
4106             "jb paeth_4lp                \n\t"
4107
4108             : "=S" (dummy_value_S),             // output regs (dummy)
4109               "=D" (dummy_value_D)
4110
4111             : "0" (prev_row),  // esi           // input regs
4112               "1" (row)        // edi
4113
4114             : "%ecx"                            // clobber list
4115 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4116             , "%mm0", "%mm1", "%mm2", "%mm3"
4117             , "%mm4", "%mm5", "%mm6", "%mm7"
4118 #endif
4119          );
4120       }
4121       break;  // end 4 bpp
4122
4123       case 8:                          // bpp == 8
4124       {
4125          _ActiveMask.use  = 0x00000000ffffffffLL;
4126
4127          __asm__ __volatile__ (
4128             "movl _dif, %%ecx            \n\t"
4129 // preload  "movl row, %%edi             \n\t"
4130 // preload  "movl prev_row, %%esi        \n\t"
4131             "pxor %%mm0, %%mm0           \n\t"
4132             // prime the pump:  load the first Raw(x-bpp) data set
4133             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4134                                        //  a=Raw(x-bpp) bytes
4135          "paeth_8lp:                     \n\t"
4136             // do first set of 4 bytes
4137             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4138             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4139             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
4140             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4141             // pav = p - a = (a + b - c) - a = b - c
4142             "movq %%mm2, %%mm4           \n\t"
4143             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4144             // pbv = p - b = (a + b - c) - b = a - c
4145             "movq %%mm1, %%mm5           \n\t"
4146             "psubw %%mm3, %%mm4          \n\t"
4147             "pxor %%mm7, %%mm7           \n\t"
4148             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4149             "movq %%mm4, %%mm6           \n\t"
4150             "psubw %%mm3, %%mm5          \n\t"
4151             // pa = abs(p-a) = abs(pav)
4152             // pb = abs(p-b) = abs(pbv)
4153             // pc = abs(p-c) = abs(pcv)
4154             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4155             "paddw %%mm5, %%mm6          \n\t"
4156             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4157             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4158             "psubw %%mm0, %%mm4          \n\t"
4159             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4160             "psubw %%mm0, %%mm4          \n\t"
4161             "psubw %%mm7, %%mm5          \n\t"
4162             "pxor %%mm0, %%mm0           \n\t"
4163             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4164             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4165             "psubw %%mm7, %%mm5          \n\t"
4166             "psubw %%mm0, %%mm6          \n\t"
4167             //  test pa <= pb
4168             "movq %%mm4, %%mm7           \n\t"
4169             "psubw %%mm0, %%mm6          \n\t"
4170             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4171             "movq %%mm7, %%mm0           \n\t"
4172             // use mm7 mask to merge pa & pb
4173             "pand %%mm7, %%mm5           \n\t"
4174             // use mm0 mask copy to merge a & b
4175             "pand %%mm0, %%mm2           \n\t"
4176             "pandn %%mm4, %%mm7          \n\t"
4177             "pandn %%mm1, %%mm0          \n\t"
4178             "paddw %%mm5, %%mm7          \n\t"
4179             "paddw %%mm2, %%mm0          \n\t"
4180             //  test  ((pa <= pb)? pa:pb) <= pc
4181             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4182             "pxor %%mm1, %%mm1           \n\t"
4183             "pand %%mm7, %%mm3           \n\t"
4184             "pandn %%mm0, %%mm7          \n\t"
4185             "paddw %%mm3, %%mm7          \n\t"
4186             "pxor %%mm0, %%mm0           \n\t"
4187             "packuswb %%mm1, %%mm7       \n\t"
4188             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4189             "pand _ActiveMask, %%mm7     \n\t"
4190             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
4191             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4192             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4193             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
4194             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4195
4196             // do second set of 4 bytes
4197             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4198             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4199             // pav = p - a = (a + b - c) - a = b - c
4200             "movq %%mm2, %%mm4           \n\t"
4201             // pbv = p - b = (a + b - c) - b = a - c
4202             "movq %%mm1, %%mm5           \n\t"
4203             "psubw %%mm3, %%mm4          \n\t"
4204             "pxor %%mm7, %%mm7           \n\t"
4205             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4206             "movq %%mm4, %%mm6           \n\t"
4207             "psubw %%mm3, %%mm5          \n\t"
4208             // pa = abs(p-a) = abs(pav)
4209             // pb = abs(p-b) = abs(pbv)
4210             // pc = abs(p-c) = abs(pcv)
4211             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4212             "paddw %%mm5, %%mm6          \n\t"
4213             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4214             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4215             "psubw %%mm0, %%mm4          \n\t"
4216             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4217             "psubw %%mm0, %%mm4          \n\t"
4218             "psubw %%mm7, %%mm5          \n\t"
4219             "pxor %%mm0, %%mm0           \n\t"
4220             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4221             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4222             "psubw %%mm7, %%mm5          \n\t"
4223             "psubw %%mm0, %%mm6          \n\t"
4224             //  test pa <= pb
4225             "movq %%mm4, %%mm7           \n\t"
4226             "psubw %%mm0, %%mm6          \n\t"
4227             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4228             "movq %%mm7, %%mm0           \n\t"
4229             // use mm7 mask to merge pa & pb
4230             "pand %%mm7, %%mm5           \n\t"
4231             // use mm0 mask copy to merge a & b
4232             "pand %%mm0, %%mm2           \n\t"
4233             "pandn %%mm4, %%mm7          \n\t"
4234             "pandn %%mm1, %%mm0          \n\t"
4235             "paddw %%mm5, %%mm7          \n\t"
4236             "paddw %%mm2, %%mm0          \n\t"
4237             //  test  ((pa <= pb)? pa:pb) <= pc
4238             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4239             "pxor %%mm1, %%mm1           \n\t"
4240             "pand %%mm7, %%mm3           \n\t"
4241             "pandn %%mm0, %%mm7          \n\t"
4242             "pxor %%mm1, %%mm1           \n\t"
4243             "paddw %%mm3, %%mm7          \n\t"
4244             "pxor %%mm0, %%mm0           \n\t"
4245             // step ecx to next set of 8 bytes and repeat loop til done
4246             "addl $8, %%ecx              \n\t"
4247             "packuswb %%mm7, %%mm1       \n\t"
4248             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4249             "cmpl _MMXLength, %%ecx      \n\t"
4250             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4251                             // mm1 will be used as Raw(x-bpp) next loop
4252             "jb paeth_8lp                \n\t"
4253
4254             : "=S" (dummy_value_S),             // output regs (dummy)
4255               "=D" (dummy_value_D)
4256
4257             : "0" (prev_row),  // esi           // input regs
4258               "1" (row)        // edi
4259
4260             : "%ecx"                            // clobber list
4261 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4262             , "%mm0", "%mm1", "%mm2", "%mm3"
4263             , "%mm4", "%mm5", "%mm6", "%mm7"
4264 #endif
4265          );
4266       }
4267       break;  // end 8 bpp
4268
4269       case 1:                // bpp = 1
4270       case 2:                // bpp = 2
4271       default:               // bpp > 8
4272       {
4273          __asm__ __volatile__ (
4274 #ifdef __PIC__
4275             "pushl %%ebx                 \n\t" // save Global Offset Table index
4276 #endif
4277             "movl _dif, %%ebx            \n\t"
4278             "cmpl _FullLength, %%ebx     \n\t"
4279             "jnb paeth_dend              \n\t"
4280
4281 // preload  "movl row, %%edi             \n\t"
4282 // preload  "movl prev_row, %%esi        \n\t"
4283             // do Paeth decode for remaining bytes
4284             "movl %%ebx, %%edx           \n\t"
4285 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
4286             "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
4287             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
4288
4289          "paeth_dlp:                     \n\t"
4290             "xorl %%eax, %%eax           \n\t"
4291             // pav = p - a = (a + b - c) - a = b - c
4292             "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
4293             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4294             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4295             "movl %%eax, _patemp         \n\t" // Save pav for later use
4296             "xorl %%eax, %%eax           \n\t"
4297             // pbv = p - b = (a + b - c) - b = a - c
4298             "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
4299             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4300             "movl %%eax, %%ecx           \n\t"
4301             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4302             "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
4303             // pc = abs(pcv)
4304             "testl $0x80000000, %%eax    \n\t"
4305             "jz paeth_dpca               \n\t"
4306             "negl %%eax                  \n\t" // reverse sign of neg values
4307
4308          "paeth_dpca:                    \n\t"
4309             "movl %%eax, _pctemp         \n\t" // save pc for later use
4310             // pb = abs(pbv)
4311             "testl $0x80000000, %%ecx    \n\t"
4312             "jz paeth_dpba               \n\t"
4313             "negl %%ecx                  \n\t" // reverse sign of neg values
4314
4315          "paeth_dpba:                    \n\t"
4316             "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4317             // pa = abs(pav)
4318             "movl _patemp, %%eax         \n\t"
4319             "testl $0x80000000, %%eax    \n\t"
4320             "jz paeth_dpaa               \n\t"
4321             "negl %%eax                  \n\t" // reverse sign of neg values
4322
4323          "paeth_dpaa:                    \n\t"
4324             "movl %%eax, _patemp         \n\t" // save pa for later use
4325             // test if pa <= pb
4326             "cmpl %%ecx, %%eax           \n\t"
4327             "jna paeth_dabb              \n\t"
4328             // pa > pb; now test if pb <= pc
4329             "cmpl _pctemp, %%ecx         \n\t"
4330             "jna paeth_dbbc              \n\t"
4331             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4332             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4333             "jmp paeth_dpaeth            \n\t"
4334
4335          "paeth_dbbc:                    \n\t"
4336             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4337             "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4338             "jmp paeth_dpaeth            \n\t"
4339
4340          "paeth_dabb:                    \n\t"
4341             // pa <= pb; now test if pa <= pc
4342             "cmpl _pctemp, %%eax         \n\t"
4343             "jna paeth_dabc              \n\t"
4344             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4345             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4346             "jmp paeth_dpaeth            \n\t"
4347
4348          "paeth_dabc:                    \n\t"
4349             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4350             "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4351
4352          "paeth_dpaeth:                  \n\t"
4353             "incl %%ebx                  \n\t"
4354             "incl %%edx                  \n\t"
4355             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4356             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4357             "cmpl _FullLength, %%ebx     \n\t"
4358             "jb paeth_dlp                \n\t"
4359
4360          "paeth_dend:                    \n\t"
4361 #ifdef __PIC__
4362             "popl %%ebx                  \n\t" // index to Global Offset Table
4363 #endif
4364
4365             : "=c" (dummy_value_c),            // output regs (dummy)
4366               "=S" (dummy_value_S),
4367               "=D" (dummy_value_D)
4368
4369             : "0" (bpp),       // ecx          // input regs
4370               "1" (prev_row),  // esi
4371               "2" (row)        // edi
4372
4373             : "%eax", "%edx"                   // clobber list
4374 #ifndef __PIC__
4375             , "%ebx"
4376 #endif
4377          );
4378       }
4379       return;                   // No need to go further with this one
4380
4381    } // end switch (bpp)
4382
4383    __asm__ __volatile__ (
4384       // MMX acceleration complete; now do clean-up
4385       // check if any remaining bytes left to decode
4386 #ifdef __PIC__
4387       "pushl %%ebx                 \n\t" // save index to Global Offset Table
4388 #endif
4389       "movl _MMXLength, %%ebx      \n\t"
4390       "cmpl _FullLength, %%ebx     \n\t"
4391       "jnb paeth_end               \n\t"
4392 //pre "movl row, %%edi             \n\t"
4393 //pre "movl prev_row, %%esi        \n\t"
4394       // do Paeth decode for remaining bytes
4395       "movl %%ebx, %%edx           \n\t"
4396 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
4397       "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
4398       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
4399
4400    "paeth_lp2:                     \n\t"
4401       "xorl %%eax, %%eax           \n\t"
4402       // pav = p - a = (a + b - c) - a = b - c
4403       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
4404       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4405       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4406       "movl %%eax, _patemp         \n\t" // Save pav for later use
4407       "xorl %%eax, %%eax           \n\t"
4408       // pbv = p - b = (a + b - c) - b = a - c
4409       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
4410       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4411       "movl %%eax, %%ecx           \n\t"
4412       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4413       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
4414       // pc = abs(pcv)
4415       "testl $0x80000000, %%eax    \n\t"
4416       "jz paeth_pca2               \n\t"
4417       "negl %%eax                  \n\t" // reverse sign of neg values
4418
4419    "paeth_pca2:                    \n\t"
4420       "movl %%eax, _pctemp         \n\t" // save pc for later use
4421       // pb = abs(pbv)
4422       "testl $0x80000000, %%ecx    \n\t"
4423       "jz paeth_pba2               \n\t"
4424       "negl %%ecx                  \n\t" // reverse sign of neg values
4425
4426    "paeth_pba2:                    \n\t"
4427       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4428       // pa = abs(pav)
4429       "movl _patemp, %%eax         \n\t"
4430       "testl $0x80000000, %%eax    \n\t"
4431       "jz paeth_paa2               \n\t"
4432       "negl %%eax                  \n\t" // reverse sign of neg values
4433
4434    "paeth_paa2:                    \n\t"
4435       "movl %%eax, _patemp         \n\t" // save pa for later use
4436       // test if pa <= pb
4437       "cmpl %%ecx, %%eax           \n\t"
4438       "jna paeth_abb2              \n\t"
4439       // pa > pb; now test if pb <= pc
4440       "cmpl _pctemp, %%ecx         \n\t"
4441       "jna paeth_bbc2              \n\t"
4442       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4443       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4444       "jmp paeth_paeth2            \n\t"
4445
4446    "paeth_bbc2:                    \n\t"
4447       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4448       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4449       "jmp paeth_paeth2            \n\t"
4450
4451    "paeth_abb2:                    \n\t"
4452       // pa <= pb; now test if pa <= pc
4453       "cmpl _pctemp, %%eax         \n\t"
4454       "jna paeth_abc2              \n\t"
4455       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4456       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4457       "jmp paeth_paeth2            \n\t"
4458
4459    "paeth_abc2:                    \n\t"
4460       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4461       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4462
4463    "paeth_paeth2:                  \n\t"
4464       "incl %%ebx                  \n\t"
4465       "incl %%edx                  \n\t"
4466       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4467       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4468       "cmpl _FullLength, %%ebx     \n\t"
4469       "jb paeth_lp2                \n\t"
4470
4471    "paeth_end:                     \n\t"
4472       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
4473 #ifdef __PIC__
4474       "popl %%ebx                  \n\t" // restore index to Global Offset Table
4475 #endif
4476
4477       : "=c" (dummy_value_c),            // output regs (dummy)
4478         "=S" (dummy_value_S),
4479         "=D" (dummy_value_D)
4480
4481       : "0" (bpp),       // ecx          // input regs
4482         "1" (prev_row),  // esi
4483         "2" (row)        // edi
4484
4485       : "%eax", "%edx"                   // clobber list (no input regs!)
4486 #ifndef __PIC__
4487       , "%ebx"
4488 #endif
4489    );
4490
4491 } /* end png_read_filter_row_mmx_paeth() */
4492 #endif
4493
4494
4495
4496
4497 #ifdef PNG_THREAD_UNSAFE_OK
4498 //===========================================================================//
4499 //                                                                           //
4500 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
4501 //                                                                           //
4502 //===========================================================================//
4503
4504 // Optimized code for PNG Sub filter decoder
4505
4506 static void /* PRIVATE */
4507 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4508 {
4509    int bpp;
4510    int dummy_value_a;
4511    int dummy_value_D;
4512
4513    bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
4514    _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
4515
4516    __asm__ __volatile__ (
4517 //pre "movl row, %%edi             \n\t"
4518       "movl %%edi, %%esi           \n\t" // lp = row
4519 //pre "movl bpp, %%eax             \n\t"
4520       "addl %%eax, %%edi           \n\t" // rp = row + bpp
4521 //irr "xorl %%eax, %%eax           \n\t"
4522       // get # of bytes to alignment
4523       "movl %%edi, _dif            \n\t" // take start of row
4524       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
4525                                          //  alignment boundary
4526       "xorl %%ecx, %%ecx           \n\t"
4527       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
4528       "subl %%edi, _dif            \n\t" // subtract from start ==> value
4529       "jz sub_go                   \n\t" //  ecx at alignment
4530
4531    "sub_lp1:                       \n\t" // fix alignment
4532       "movb (%%esi,%%ecx,), %%al   \n\t"
4533       "addb %%al, (%%edi,%%ecx,)   \n\t"
4534       "incl %%ecx                  \n\t"
4535       "cmpl _dif, %%ecx            \n\t"
4536       "jb sub_lp1                  \n\t"
4537
4538    "sub_go:                        \n\t"
4539       "movl _FullLength, %%eax     \n\t"
4540       "movl %%eax, %%edx           \n\t"
4541       "subl %%ecx, %%edx           \n\t" // subtract alignment fix
4542       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
4543       "subl %%edx, %%eax           \n\t" // drop over bytes from length
4544       "movl %%eax, _MMXLength      \n\t"
4545
4546       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4547         "=D" (dummy_value_D)    // 1
4548
4549       : "0" (bpp),              // eax    // input regs
4550         "1" (row)               // edi
4551
4552       : "%esi", "%ecx", "%edx"            // clobber list
4553
4554 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4555       , "%mm0", "%mm1", "%mm2", "%mm3"
4556       , "%mm4", "%mm5", "%mm6", "%mm7"
4557 #endif
4558    );
4559
4560    // now do the math for the rest of the row
4561    switch (bpp)
4562    {
4563       case 3:
4564       {
4565          _ActiveMask.use  = 0x0000ffffff000000LL;
4566          _ShiftBpp.use = 24;       // == 3 * 8
4567          _ShiftRem.use  = 40;      // == 64 - 24
4568
4569          __asm__ __volatile__ (
4570 // preload  "movl row, %%edi              \n\t"
4571             "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
4572                                                 //  active byte group
4573             "movl %%edi, %%esi            \n\t" // lp = row
4574 // preload  "movl bpp, %%eax              \n\t"
4575             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4576             "movq %%mm7, %%mm6            \n\t"
4577             "movl _dif, %%edx             \n\t"
4578             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4579                                                 //  3rd active byte group
4580             // prime the pump:  load the first Raw(x-bpp) data set
4581             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4582
4583          "sub_3lp:                        \n\t" // shift data for adding first
4584             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4585                                                 //  shift clears inactive bytes)
4586             // add 1st active group
4587             "movq (%%edi,%%edx,), %%mm0   \n\t"
4588             "paddb %%mm1, %%mm0           \n\t"
4589
4590             // add 2nd active group
4591             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4592             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4593             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4594             "paddb %%mm1, %%mm0           \n\t"
4595
4596             // add 3rd active group
4597             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4598             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4599             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4600             "addl $8, %%edx               \n\t"
4601             "paddb %%mm1, %%mm0           \n\t"
4602
4603             "cmpl _MMXLength, %%edx       \n\t"
4604             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4605             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4606             "jb sub_3lp                   \n\t"
4607
4608             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4609               "=D" (dummy_value_D)    // 1
4610
4611             : "0" (bpp),              // eax    // input regs
4612               "1" (row)               // edi
4613
4614             : "%edx", "%esi"                    // clobber list
4615 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4616             , "%mm0", "%mm1", "%mm6", "%mm7"
4617 #endif
4618          );
4619       }
4620       break;
4621
4622       case 1:
4623       {
4624          __asm__ __volatile__ (
4625             "movl _dif, %%edx            \n\t"
4626 // preload  "movl row, %%edi             \n\t"
4627             "cmpl _FullLength, %%edx     \n\t"
4628             "jnb sub_1end                \n\t"
4629             "movl %%edi, %%esi           \n\t" // lp = row
4630             "xorl %%eax, %%eax           \n\t"
4631 // preload  "movl bpp, %%eax             \n\t"
4632             "addl %%eax, %%edi           \n\t" // rp = row + bpp
4633
4634          "sub_1lp:                       \n\t"
4635             "movb (%%esi,%%edx,), %%al   \n\t"
4636             "addb %%al, (%%edi,%%edx,)   \n\t"
4637             "incl %%edx                  \n\t"
4638             "cmpl _FullLength, %%edx     \n\t"
4639             "jb sub_1lp                  \n\t"
4640
4641          "sub_1end:                      \n\t"
4642
4643             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4644               "=D" (dummy_value_D)    // 1
4645
4646             : "0" (bpp),              // eax    // input regs
4647               "1" (row)               // edi
4648
4649             : "%edx", "%esi"                    // clobber list
4650          );
4651       }
4652       return;
4653
4654       case 6:
4655       case 4:
4656       //case 7:   // GRR BOGUS
4657       //case 5:   // GRR BOGUS
4658       {
4659          _ShiftBpp.use = bpp << 3;
4660          _ShiftRem.use = 64 - _ShiftBpp.use;
4661
4662          __asm__ __volatile__ (
4663 // preload  "movl row, %%edi              \n\t"
4664             "movl _dif, %%edx             \n\t"
4665             "movl %%edi, %%esi            \n\t" // lp = row
4666 // preload  "movl bpp, %%eax              \n\t"
4667             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4668
4669             // prime the pump:  load the first Raw(x-bpp) data set
4670             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4671
4672          "sub_4lp:                        \n\t" // shift data for adding first
4673             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4674                                                 //  shift clears inactive bytes)
4675             "movq (%%edi,%%edx,), %%mm0   \n\t"
4676             "paddb %%mm1, %%mm0           \n\t"
4677
4678             // add 2nd active group
4679             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4680             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4681             "addl $8, %%edx               \n\t"
4682             "paddb %%mm1, %%mm0           \n\t"
4683
4684             "cmpl _MMXLength, %%edx       \n\t"
4685             "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4686             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4687             "jb sub_4lp                   \n\t"
4688
4689             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4690               "=D" (dummy_value_D)    // 1
4691
4692             : "0" (bpp),              // eax    // input regs
4693               "1" (row)               // edi
4694
4695             : "%edx", "%esi"                    // clobber list
4696 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4697             , "%mm0", "%mm1"
4698 #endif
4699          );
4700       }
4701       break;
4702
4703       case 2:
4704       {
4705          _ActiveMask.use = 0x00000000ffff0000LL;
4706          _ShiftBpp.use = 16;       // == 2 * 8
4707          _ShiftRem.use = 48;       // == 64 - 16
4708
4709          __asm__ __volatile__ (
4710             "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
4711                                                 //  active byte group
4712             "movl _dif, %%edx             \n\t"
4713             "movq %%mm7, %%mm6            \n\t"
4714 // preload  "movl row, %%edi              \n\t"
4715             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4716                                                 //  3rd active byte group
4717             "movl %%edi, %%esi            \n\t" // lp = row
4718             "movq %%mm6, %%mm5            \n\t"
4719 // preload  "movl bpp, %%eax              \n\t"
4720             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4721             "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
4722                                                 //  4th active byte group
4723             // prime the pump:  load the first Raw(x-bpp) data set
4724             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4725
4726          "sub_2lp:                        \n\t" // shift data for adding first
4727             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4728                                                 //  shift clears inactive bytes)
4729             // add 1st active group
4730             "movq (%%edi,%%edx,), %%mm0   \n\t"
4731             "paddb %%mm1, %%mm0           \n\t"
4732
4733             // add 2nd active group
4734             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4735             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4736             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4737             "paddb %%mm1, %%mm0           \n\t"
4738
4739             // add 3rd active group
4740             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4741             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4742             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4743             "paddb %%mm1, %%mm0           \n\t"
4744
4745             // add 4th active group
4746             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4747             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4748             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
4749             "addl $8, %%edx               \n\t"
4750             "paddb %%mm1, %%mm0           \n\t"
4751             "cmpl _MMXLength, %%edx       \n\t"
4752             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4753             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4754             "jb sub_2lp                   \n\t"
4755
4756             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4757               "=D" (dummy_value_D)    // 1
4758
4759             : "0" (bpp),              // eax    // input regs
4760               "1" (row)               // edi
4761
4762             : "%edx", "%esi"                    // clobber list
4763 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4764             , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4765 #endif
4766          );
4767       }
4768       break;
4769
4770       case 8:
4771       {
4772          __asm__ __volatile__ (
4773 // preload  "movl row, %%edi              \n\t"
4774             "movl _dif, %%edx             \n\t"
4775             "movl %%edi, %%esi            \n\t" // lp = row
4776 // preload  "movl bpp, %%eax              \n\t"
4777             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4778             "movl _MMXLength, %%ecx       \n\t"
4779
4780             // prime the pump:  load the first Raw(x-bpp) data set
4781             "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4782             "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
4783
4784          "sub_8lp:                        \n\t"
4785             "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
4786             "paddb %%mm7, %%mm0           \n\t"
4787             "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
4788             "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
4789
4790             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4791             // This will be repeated for each group of 8 bytes with the 8th
4792             // group being used as the Raw(x-bpp) for the 1st group of the
4793             // next loop.
4794
4795             "paddb %%mm0, %%mm1           \n\t"
4796             "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4797             "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
4798             "paddb %%mm1, %%mm2           \n\t"
4799             "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4800             "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4801             "paddb %%mm2, %%mm3           \n\t"
4802             "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4803             "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4804             "paddb %%mm3, %%mm4           \n\t"
4805             "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4806             "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4807             "paddb %%mm4, %%mm5           \n\t"
4808             "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4809             "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4810             "paddb %%mm5, %%mm6           \n\t"
4811             "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4812             "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4813             "addl $64, %%edx              \n\t"
4814             "paddb %%mm6, %%mm7           \n\t"
4815             "cmpl %%ecx, %%edx            \n\t"
4816             "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4817             "jb sub_8lp                   \n\t"
4818
4819             "cmpl _MMXLength, %%edx       \n\t"
4820             "jnb sub_8lt8                 \n\t"
4821
4822          "sub_8lpA:                       \n\t"
4823             "movq (%%edi,%%edx,), %%mm0   \n\t"
4824             "addl $8, %%edx               \n\t"
4825             "paddb %%mm7, %%mm0           \n\t"
4826             "cmpl _MMXLength, %%edx       \n\t"
4827             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4828             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
4829                                                 //  to mm1 to be new Raw(x-bpp)
4830                                                 //  for next loop
4831             "jb sub_8lpA                  \n\t"
4832
4833          "sub_8lt8:                       \n\t"
4834
4835             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4836               "=D" (dummy_value_D)    // 1
4837
4838             : "0" (bpp),              // eax    // input regs
4839               "1" (row)               // edi
4840
4841             : "%ecx", "%edx", "%esi"            // clobber list
4842 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4843             , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4844 #endif
4845          );
4846       }
4847       break;
4848
4849       default:                // bpp greater than 8 bytes   GRR BOGUS
4850       {
4851          __asm__ __volatile__ (
4852             "movl _dif, %%edx             \n\t"
4853 // preload  "movl row, %%edi              \n\t"
4854             "movl %%edi, %%esi            \n\t" // lp = row
4855 // preload  "movl bpp, %%eax              \n\t"
4856             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4857
4858          "sub_Alp:                        \n\t"
4859             "movq (%%edi,%%edx,), %%mm0   \n\t"
4860             "movq (%%esi,%%edx,), %%mm1   \n\t"
4861             "addl $8, %%edx               \n\t"
4862             "paddb %%mm1, %%mm0           \n\t"
4863             "cmpl _MMXLength, %%edx       \n\t"
4864             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4865                                                 //  -8 to offset addl edx
4866             "jb sub_Alp                   \n\t"
4867
4868             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4869               "=D" (dummy_value_D)    // 1
4870
4871             : "0" (bpp),              // eax    // input regs
4872               "1" (row)               // edi
4873
4874             : "%edx", "%esi"                    // clobber list
4875 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4876             , "%mm0", "%mm1"
4877 #endif
4878          );
4879       }
4880       break;
4881
4882    } // end switch (bpp)
4883
4884    __asm__ __volatile__ (
4885       "movl _MMXLength, %%edx       \n\t"
4886 //pre "movl row, %%edi              \n\t"
4887       "cmpl _FullLength, %%edx      \n\t"
4888       "jnb sub_end                  \n\t"
4889
4890       "movl %%edi, %%esi            \n\t" // lp = row
4891 //pre "movl bpp, %%eax              \n\t"
4892       "addl %%eax, %%edi            \n\t" // rp = row + bpp
4893       "xorl %%eax, %%eax            \n\t"
4894
4895    "sub_lp2:                        \n\t"
4896       "movb (%%esi,%%edx,), %%al    \n\t"
4897       "addb %%al, (%%edi,%%edx,)    \n\t"
4898       "incl %%edx                   \n\t"
4899       "cmpl _FullLength, %%edx      \n\t"
4900       "jb sub_lp2                   \n\t"
4901
4902    "sub_end:                        \n\t"
4903       "EMMS                         \n\t" // end MMX instructions
4904
4905       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4906         "=D" (dummy_value_D)    // 1
4907
4908       : "0" (bpp),              // eax    // input regs
4909         "1" (row)               // edi
4910
4911       : "%edx", "%esi"                    // clobber list
4912    );
4913
4914 } // end of png_read_filter_row_mmx_sub()
4915 #endif
4916
4917
4918
4919
4920 //===========================================================================//
4921 //                                                                           //
4922 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
4923 //                                                                           //
4924 //===========================================================================//
4925
4926 // Optimized code for PNG Up filter decoder
4927
4928 static void /* PRIVATE */
4929 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4930                            png_bytep prev_row)
4931 {
        // Undoes the PNG "Up" filter in place: every byte of row[] has the byte
        // at the same offset in prev_row[] added to it (8-bit wraparound), for
        // row_info->rowbytes bytes.
        //
        //   row_info  only ->rowbytes is read here
        //   row       filtered bytes on entry, reconstructed bytes on return
        //   prev_row  row above; only read
        //
        // Register use inside the asm:
        //   edi = row, esi = prev_row, edx = len (preloaded via the constraints)
        //   ebx = running byte offset, ecx = end offset of the current phase
        //   al  = scratch for the byte-at-a-time loops
        //
        // Phases: (1) byte loop until row+ebx is 8-byte aligned, (2) unrolled
        // MMX loop, 64 bytes per pass, (3) MMX loop, 8 bytes per pass, (4) byte
        // loop for the remaining (fewer than 8) bytes.
        //
        // NOTE(review): phase 1 takes its limit from the address of row only and
        // never compares it with len, and up_loop is entered without a test, so
        // it always runs at least one 64-byte pass.  Rows shorter than
        // (alignment bytes + 64) therefore look like they would be read and
        // written past their end; presumably callers only pass long rows (the
        // dispatcher in this file tests mmx_rowbytes_threshold in non-1.0.x
        // builds, but not in the PNG_1_0_X branch) -- confirm.
4932    png_uint_32 len;
4933    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
4934    int dummy_value_S;
4935    int dummy_value_D;
4936
4937    len = row_info->rowbytes;              // number of bytes to filter
4938
4939    __asm__ __volatile__ (
4940 //pre "movl row, %%edi              \n\t"
4941       // get # of bytes to alignment
           // ebx is the index register for every loop below.  Under PIC it is
           // saved/restored by hand here instead of being listed as a clobber
           // (presumably because gcc reserves it for the GOT pointer -- confirm).
4942 #ifdef __PIC__
4943       "pushl %%ebx                  \n\t"
4944 #endif
4945       "movl %%edi, %%ecx            \n\t"
4946       "xorl %%ebx, %%ebx            \n\t" // ebx = byte offset, starts at 0
4947       "addl $0x7, %%ecx             \n\t"
4948       "xorl %%eax, %%eax            \n\t"
4949       "andl $0xfffffff8, %%ecx      \n\t" // round row address up to mult of 8
4950 //pre "movl prev_row, %%esi         \n\t"
4951       "subl %%edi, %%ecx            \n\t" // ecx = 0..7 bytes to the boundary
4952       "jz up_go                     \n\t" // row is already aligned
4953
4954    "up_lp1:                         \n\t" // fix alignment
4955       "movb (%%edi,%%ebx,), %%al    \n\t"
4956       "addb (%%esi,%%ebx,), %%al    \n\t"
4957       "incl %%ebx                   \n\t"
4958       "cmpl %%ecx, %%ebx            \n\t"
4959       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4960       "jb up_lp1                    \n\t" //  offset incl ebx
4961
4962    "up_go:                          \n\t"
4963 //pre "movl len, %%edx              \n\t"
4964       "movl %%edx, %%ecx            \n\t"
4965       "subl %%ebx, %%edx            \n\t" // subtract alignment fix
4966       "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
4967       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
           // here: ecx = len - edx is the end offset of the 64-byte phase and
           // edx (0..63) is what will be left over after it
4968
4969       // unrolled loop - use all MMX registers and interleave to reduce
4970       // number of branch instructions (loops) and reduce partial stalls
           // (prev_row quadwords are loaded into mm1/mm3/mm5/mm7, row quadwords
           // into mm0/mm2/mm4/mm6; each even register is stored back to row)
4971    "up_loop:                        \n\t"
4972       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4973       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4974       "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
4975       "paddb %%mm1, %%mm0           \n\t"
4976       "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
4977       "movq %%mm0, (%%edi,%%ebx,)   \n\t"
4978       "paddb %%mm3, %%mm2           \n\t"
4979       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4980       "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
4981       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4982       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4983       "paddb %%mm5, %%mm4           \n\t"
4984       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4985       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4986       "paddb %%mm7, %%mm6           \n\t"
4987       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4988       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4989       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4990       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4991       "paddb %%mm1, %%mm0           \n\t"
4992       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4993       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4994       "paddb %%mm3, %%mm2           \n\t"
4995       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4996       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4997       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4998       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4999       "paddb %%mm5, %%mm4           \n\t"
5000       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
5001       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
5002       "addl $64, %%ebx              \n\t"
5003       "paddb %%mm7, %%mm6           \n\t"
5004       "cmpl %%ecx, %%ebx            \n\t"
5005       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
5006       "jb up_loop                   \n\t" //  -8 to offset addl ebx
5007
5008       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
5009       "jz up_end                    \n\t"
5010
5011       "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
5012       "jb up_lt8                    \n\t" //  [added by lcreeve at netins.net]
5013
5014       "addl %%edx, %%ecx            \n\t"
5015       "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
5016       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
5017       "jz up_lt8                    \n\t"
           // here: ecx = end offset of the 8-byte phase, edx = 0..7 tail bytes
5018
5019    "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
5020       "movq (%%esi,%%ebx,), %%mm1   \n\t"
5021       "movq (%%edi,%%ebx,), %%mm0   \n\t"
5022       "addl $8, %%ebx               \n\t"
5023       "paddb %%mm1, %%mm0           \n\t"
5024       "cmpl %%ecx, %%ebx            \n\t"
5025       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
5026       "jb up_lpA                    \n\t" //  offset add ebx
5027       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
5028       "jz up_end                    \n\t"
5029
5030    "up_lt8:                         \n\t"
5031       "xorl %%eax, %%eax            \n\t"
5032       "addl %%edx, %%ecx            \n\t" // move over byte count into counter
5033
5034    "up_lp2:                         \n\t" // use x86 regs for remaining bytes
5035       "movb (%%edi,%%ebx,), %%al    \n\t"
5036       "addb (%%esi,%%ebx,), %%al    \n\t"
5037       "incl %%ebx                   \n\t"
5038       "cmpl %%ecx, %%ebx            \n\t"
5039       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
5040       "jb up_lp2                    \n\t" //  offset inc ebx
5041
5042    "up_end:                         \n\t"
           // EMMS empties the MMX state so that x87 floating-point code can be
           // executed again after this routine
5043       "EMMS                         \n\t" // conversion of filtered row complete
5044 #ifdef __PIC__
5045       "popl %%ebx                   \n\t"
5046 #endif
5047
           // The three inputs are tied ("0", "1", "2") to dummy outputs so that
           // edx/esi/edi are described to the compiler as overwritten; they must
           // therefore not appear in the clobber list as well.
5048       : "=d" (dummy_value_d),   // 0      // output regs (dummy)
5049         "=S" (dummy_value_S),   // 1
5050         "=D" (dummy_value_D)    // 2
5051
5052       : "0" (len),              // edx    // input regs
5053         "1" (prev_row),         // esi
5054         "2" (row)               // edi
5055
5056       : "%eax", "%ecx"            // clobber list (no input regs!)
5057 #ifndef __PIC__
5058       , "%ebx"
5059 #endif
5060
5061 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5062       , "%mm0", "%mm1", "%mm2", "%mm3"
5063       , "%mm4", "%mm5", "%mm6", "%mm7"
5064 #endif
5065    );
5066
5067 } // end of png_read_filter_row_mmx_up()
5068
5069 #endif /* PNG_MMX_CODE_SUPPORTED */
5070
5071
5072
5073
5074 /*===========================================================================*/
5075 /*                                                                           */
5076 /*                   P N G _ R E A D _ F I L T E R _ R O W                   */
5077 /*                                                                           */
5078 /*===========================================================================*/
5079
5080
5081 /* Optimized png_read_filter_row routines */
5082
5083 void /* PRIVATE */
5084 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5085    row, png_bytep prev_row, int filter)
5086 {
        /* Reverses one PNG row filter in place.
         *
         *   png_ptr   read for asm_flags, the two mmx_*_threshold fields and
         *             (debug builds) row_number; also handed to png_warning()
         *   row_info  supplies pixel_depth (bits per pixel) and rowbytes
         *   row       filtered bytes on entry, reconstructed bytes on return
         *   prev_row  row above; read by the Up, Avg and Paeth cases
         *   filter    compared against the PNG_FILTER_VALUE_* constants
         *
         * Each filter case first tries the MMX routine (when it is compiled in
         * and the run-time tests pass) and otherwise runs the plain C loop.  The
         * "else" that joins the two sits inside the #if, so with the MMX branch
         * compiled out the C block is simply unconditional.
         */
5087 #ifdef PNG_DEBUG
        /* longest text written below is "Paeth-MMX" / "Paeth-x86": 9 characters
         * plus the terminating NUL, i.e. this buffer is exactly full */
5088    char filnm[10];
5089 #endif
5090
5091 #if defined(PNG_MMX_CODE_SUPPORTED)
5092 /* GRR:  these are superseded by png_ptr->asm_flags: */
5093 #define UseMMX_sub    1   // GRR:  converted 20000730
5094 #define UseMMX_up     1   // GRR:  converted 20000729
5095 #define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
5096 #define UseMMX_paeth  1   // GRR:  converted 20000828
5097
        /* NOTE(review): the value 2 presumably means "MMX support not probed
         * yet"; png_mmx_support() is called only for its effect, its int return
         * value is discarded -- confirm against its definition */
5098    if (_mmx_supported == 2) {
5099        /* this should have happened in png_init_mmx_flags() already */
5100 #if !defined(PNG_1_0_X)
5101        png_warning(png_ptr, "asm_flags may not have been initialized");
5102 #endif
5103        png_mmx_support();
5104    }
5105 #endif /* PNG_MMX_CODE_SUPPORTED */
5106
5107 #ifdef PNG_DEBUG
5108    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
        /* Build a short "<filter>-<MMX|x86>" tag for the trace line.  The #if
         * conditions around each "MMX" test mirror the ones guarding the
         * corresponding MMX call in the second switch below.  filter is matched
         * against the literals 0..4 here; presumably these equal the
         * PNG_FILTER_VALUE_* constants -- confirm. */
5109    switch (filter)
5110    {
5111       case 0: sprintf(filnm, "none");
5112          break;
5113       case 1: sprintf(filnm, "sub-%s",
5114 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5115 #if !defined(PNG_1_0_X)
5116         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5117 #endif
5118 #endif
5119 "x86");
5120          break;
5121       case 2: sprintf(filnm, "up-%s",
5122 #ifdef PNG_MMX_CODE_SUPPORTED
5123 #if !defined(PNG_1_0_X)
5124         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5125 #endif
5126 #endif
5127  "x86");
5128          break;
5129       case 3: sprintf(filnm, "avg-%s",
5130 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5131 #if !defined(PNG_1_0_X)
5132         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5133 #endif
5134 #endif
5135  "x86");
5136          break;
5137       case 4: sprintf(filnm, "Paeth-%s",
5138 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5139 #if !defined(PNG_1_0_X)
5140         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5141 #endif
5142 #endif
5143 "x86");
5144          break;
5145       default: sprintf(filnm, "unknw");
5146          break;
5147    }
5148    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5149    png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5150    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5151       (int)((row_info->pixel_depth + 7) >> 3));
5152    png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5153 #endif /* PNG_DEBUG */
5154
5155    switch (filter)
5156    {
5157       case PNG_FILTER_VALUE_NONE:
               /* row is stored unfiltered; nothing to undo */
5158          break;
5159
5160       case PNG_FILTER_VALUE_SUB:
               /* The MMX path for Sub (and for Avg and Paeth below) is compiled
                * only when PNG_THREAD_UNSAFE_OK is defined as well; the Up case
                * needs PNG_MMX_CODE_SUPPORTED alone. */
5161 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5162 #if !defined(PNG_1_0_X)
5163          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5164              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5165              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5166 #else
5167          if (_mmx_supported)
5168 #endif
5169          {
5170             png_read_filter_row_mmx_sub(row_info, row);
5171          }
5172          else
5173 #endif /* PNG_MMX_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
5174          {
                  /* bpp is the pixel size in bits rounded up to whole bytes.
                   * The first bpp bytes have no left neighbour and are left as
                   * they are; every later byte gets the already reconstructed
                   * byte bpp positions before it added, modulo 256. */
5175             png_uint_32 i;
5176             png_uint_32 istop = row_info->rowbytes;
5177             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5178             png_bytep rp = row + bpp;
5179             png_bytep lp = row;
5180
5181             for (i = bpp; i < istop; i++)
5182             {
5183                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5184                rp++;
5185             }
5186          }  /* end !UseMMX_sub */
5187          break;
5188
5189       case PNG_FILTER_VALUE_UP:
5190 #if defined(PNG_MMX_CODE_SUPPORTED)
5191 #if !defined(PNG_1_0_X)
5192          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5193              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5194              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5195 #else
5196          if (_mmx_supported)
5197 #endif
5198          {
5199             png_read_filter_row_mmx_up(row_info, row, prev_row);
5200          }
5201           else
5202 #endif /* PNG_MMX_CODE_SUPPORTED */
5203          {
                  /* every byte gets the byte at the same offset in the previous
                   * row added, modulo 256 */
5204             png_uint_32 i;
5205             png_uint_32 istop = row_info->rowbytes;
5206             png_bytep rp = row;
5207             png_bytep pp = prev_row;
5208
5209             for (i = 0; i < istop; ++i)
5210             {
5211                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5212                rp++;
5213             }
5214          }  /* end !UseMMX_up */
5215          break;
5216
5217       case PNG_FILTER_VALUE_AVG:
5218 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5219 #if !defined(PNG_1_0_X)
5220          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5221              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5222              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5223 #else
5224          if (_mmx_supported)
5225 #endif
5226          {
5227             png_read_filter_row_mmx_avg(row_info, row, prev_row);
5228          }
5229          else
5230 #endif /* PNG_MMX_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
5231          {
                  /* NOTE(review): istop is rowbytes - bpp in an unsigned type,
                   * so it would wrap if rowbytes were ever smaller than bpp;
                   * presumably that cannot happen for a valid row -- confirm. */
5232             png_uint_32 i;
5233             png_bytep rp = row;
5234             png_bytep pp = prev_row;
5235             png_bytep lp = row;
5236             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5237             png_uint_32 istop = row_info->rowbytes - bpp;
5238
                  /* first pixel: no left neighbour, predictor is above / 2 */
5239             for (i = 0; i < bpp; i++)
5240             {
5241                *rp = (png_byte)(((int)(*rp) +
5242                   ((int)(*pp++) >> 1)) & 0xff);
5243                rp++;
5244             }
5245
                  /* remaining bytes: predictor is (above + left) >> 1, where
                   * left (lp) trails the current byte (rp) by bpp bytes */
5246             for (i = 0; i < istop; i++)
5247             {
5248                *rp = (png_byte)(((int)(*rp) +
5249                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5250                rp++;
5251             }
5252          }  /* end !UseMMX_avg */
5253          break;
5254
5255       case PNG_FILTER_VALUE_PAETH:
5256 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5257 #if !defined(PNG_1_0_X)
5258          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5259              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5260              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5261 #else
5262          if (_mmx_supported)
5263 #endif
5264          {
5265             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5266          }
5267          else
5268 #endif /* PNG_MMX_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
5269          {
                  /* rp/pp walk the current and previous row; lp/cp start at the
                   * beginning of the same two rows and therefore trail rp/pp by
                   * bpp bytes once the first loop has run (left and upper-left
                   * neighbours).  Same unsigned rowbytes - bpp caveat as in the
                   * Avg case -- NOTE(review). */
5270             png_uint_32 i;
5271             png_bytep rp = row;
5272             png_bytep pp = prev_row;
5273             png_bytep lp = row;
5274             png_bytep cp = prev_row;
5275             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5276             png_uint_32 istop = row_info->rowbytes - bpp;
5277
                  /* first pixel: only the byte above is added */
5278             for (i = 0; i < bpp; i++)
5279             {
5280                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5281                rp++;
5282             }
5283
5284             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
5285             {
5286                int a, b, c, pa, pb, pc, p;
5287
                     /* a = left, b = above, c = upper-left */
5288                a = *lp++;
5289                b = *pp++;
5290                c = *cp++;
5291
                     /* With the estimate a + b - c, the distances to a, b and c
                      * are |b - c|, |a - c| and |a + b - 2c|; p and pc hold the
                      * two signed differences those are built from. */
5292                p = b - c;
5293                pc = a - c;
5294
5295 #ifdef PNG_USE_ABS
5296                pa = abs(p);
5297                pb = abs(pc);
5298                pc = abs(p + pc);
5299 #else
5300                pa = p < 0 ? -p : p;
5301                pb = pc < 0 ? -pc : pc;
5302                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5303 #endif
5304
5305                /*
5306                   if (pa <= pb && pa <= pc)
5307                      p = a;
5308                   else if (pb <= pc)
5309                      p = b;
5310                   else
5311                      p = c;
5312                 */
5313
                     /* pick the nearest neighbour; ties go to a, then b, then c
                      * (single-expression form of the chain shown above) */
5314                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5315
5316                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5317                rp++;
5318             }
5319          }  /* end !UseMMX_paeth */
5320          break;
5321
5322       default:
               /* unknown filter value: warn and zero only the first byte of the
                * row; the remaining bytes of row are left untouched */
5323          png_warning(png_ptr, "Ignoring bad row-filter type");
5324          *row=0;
5325          break;
5326    }
5327 }
5328
5329 #endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
5330
5331
5332 /*===========================================================================*/
5333 /*                                                                           */
5334 /*                      P N G _ M M X _ S U P P O R T                        */
5335 /*                                                                           */
5336 /*===========================================================================*/
5337
5338 /* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
5339  *             (2) all instructions compile with gcc 2.7.2.3 and later
5340  *             (3) the function is moved down here to prevent gcc from
5341  *                  inlining it in multiple places and then barfing be-
5342  *                  cause the ".NOT_SUPPORTED" label is multiply defined
5343  *             [is there a way to signal that a *single* function should
5344  *              not be inlined?  is there a way to modify the label for
5345  *              each inlined instance, e.g., by appending _1, _2, etc.?
5346  *              maybe if don't use leading "." in label name? (nope...sigh)]
5347  */
5348
     /* png_mmx_support() - report whether MMX instructions may be used.
      *
      * When PNG_MMX_CODE_SUPPORTED is defined, the inline assembly below
      * probes the CPU in three steps:
      *   1. toggle bit 21 (the ID bit) of EFLAGS and read EFLAGS back; if
      *      the bit did not change, CPUID is unavailable -> not supported;
      *   2. execute CPUID with eax = 0 and require the returned eax to be
      *      at least 1, so that CPUID function 1 can be queried;
      *   3. execute CPUID with eax = 1 and test bit 23 (0x800000) of edx.
      * The asm leaves 1 (MMX supported) or 0 (not supported) in eax, which
      * is copied to `result` and then stored in _mmx_supported (declared
      * outside this excerpt).  ebx, ecx and edx are saved and restored by
      * hand with pushl/popl, which is why the asm has no clobber list.
      *
      * When PNG_MMX_CODE_SUPPORTED is not defined, no probe is run and
      * _mmx_supported is simply set to 0.
      *
      * Takes no arguments.  Returns the value just stored in
      * _mmx_supported (1 or 0).
      *
      * NOTE(review): GRR note (3) above refers to a ".NOT_SUPPORTED" label,
      * but the asm now uses the numeric local labels "0:" and "1:".  GNU as
      * allows numeric local labels to be defined more than once, so the
      * inlining concern may no longer apply -- confirm before relying on
      * the function's placement.
      */
5349 int PNGAPI
5350 png_mmx_support(void)
5351 {
5352 #if defined(PNG_MMX_CODE_SUPPORTED)
5353     int result;   /* receives eax from the asm: 1 = MMX present, 0 = absent */
5354     __asm__ __volatile__ (
5355         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
5356         "pushl %%ecx          \n\t"  // so does ecx...
5357         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
5358 //      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
5359 //      "pushf                \n\t"  // 16-bit pushf
5360         "pushfl               \n\t"  // save Eflag to stack
5361         "popl %%eax           \n\t"  // get Eflag from stack into eax
5362         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
5363         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5364         "pushl %%eax          \n\t"  // save modified Eflag back to stack
5365 //      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
5366 //      "popf                 \n\t"  // 16-bit popf
5367         "popfl                \n\t"  // restore modified value to Eflag reg
5368         "pushfl               \n\t"  // save Eflag to stack
5369         "popl %%eax           \n\t"  // get Eflag from stack
5370         "pushl %%ecx          \n\t"  // save original Eflag to stack
5371         "popfl                \n\t"  // restore original Eflag
5372         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
5373         "jz 0f                \n\t"  // if same, CPUID instr. is not supported
5374
5375         "xorl %%eax, %%eax    \n\t"  // set eax to zero
5376 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
5377         "cpuid                \n\t"  // get the CPU identification info
5378         "cmpl $1, %%eax       \n\t"  // make sure eax returned a value >= 1
5379         "jl 0f                \n\t"  // if eax is zero, MMX is not supported
5380
5381         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
5382         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
5383                                      // faster than the instruction "mov eax, 1"
5384         "cpuid                \n\t"  // get the CPU identification info again
5385         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5386         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
5387         "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
5388
5389         "movl $1, %%eax       \n\t"  // set return value to 1
5390         "jmp  1f              \n\t"  // DONE:  have MMX support
5391
5392     "0:                       \n\t"  // .NOT_SUPPORTED: target label for jump instructions
5393         "movl $0, %%eax       \n\t"  // set return value to 0
5394     "1:                       \n\t"  // .RETURN: target label for jump instructions
5395         "popl %%edx           \n\t"  // restore edx
5396         "popl %%ecx           \n\t"  // restore ecx
5397         "popl %%ebx           \n\t"  // restore ebx
5398
5399 //      "ret                  \n\t"  // DONE:  no MMX support
5400                                      // (fall through to standard C "ret")
5401
5402         : "=a" (result)              // output list: eax is written to result
5403
5404         :                            // any variables used on input (none)
5405
5406                                      // no clobber list
5407 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
5408 //      , "memory"   // if write to a variable gcc thought was in a reg
5409 //      , "cc"       // "condition codes" (flag bits)
5410     );
5411     _mmx_supported = result;   /* record the outcome of the probe */
5412 #else
5413     _mmx_supported = 0;        /* MMX code not compiled in: report no support */
5414 #endif /* PNG_MMX_CODE_SUPPORTED */
5415
5416     return _mmx_supported;
5417 }
5418
5419
5420 #endif /* PNG_USE_PNGGCCRD */