core: use the default font for the main splash message if not requested otherwise
[fbsplash.git] / core / libs / libpng-1.2.18 / pnggccrd.c
blobfab523c0bdcea61bbc5c0e98b8658f60f5831e87
2 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
4 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
6 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
7 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
8 * for Intel's performance analysis of the MMX vs. non-MMX code.
10 * Last changed in libpng 1.2.15 January 5, 2007
11 * For conditions of distribution and use, see copyright notice in png.h
12 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
13 * Copyright (c) 1998, Intel Corporation
15 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
16 * Interface to libpng contributed by Gilles Vollant, 1999.
17 * GNU C port by Greg Roelofs, 1999-2001.
19 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
21 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
23 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
25 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
26 * is required to assemble the newer MMX instructions such as movq.
27 * For djgpp, see
29 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
31 * (or a later version in the same directory). For Linux, check your
32 * distribution's web site(s) or try these links:
34 * http://rufus.w3.org/linux/RPM/binutils.html
35 * http://www.debian.org/Packages/stable/devel/binutils.html
36 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
37 * binutils.tgz
39 * For other platforms, see the main GNU site:
41 * ftp://ftp.gnu.org/pub/gnu/binutils/
43 * Version 2.5.2l.15 is definitely too old...
47 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
48 * =====================================
50 * 19991006:
51 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
53 * 19991007:
54 * - additional optimizations (possible or definite):
55 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
56 * - write MMX code for 48-bit case (pixel_bytes == 6)
57 * - figure out what's up with 24-bit case (pixel_bytes == 3):
58 * why subtract 8 from width_mmx in the pass 4/5 case?
59 * (only width_mmx case) (near line 1606)
60 * x [DONE] replace pixel_bytes within each block with the true
61 * constant value (or are compilers smart enough to do that?)
62 * - rewrite all MMX interlacing code so it's aligned with
63 * the *beginning* of the row buffer, not the end. This
64 * would not only allow one to eliminate half of the memory
65 * writes for odd passes (that is, pass == odd), it may also
66 * eliminate some unaligned-data-access exceptions (assuming
67 * there's a penalty for not aligning 64-bit accesses on
68 * 64-bit boundaries). The only catch is that the "leftover"
69 * pixel(s) at the end of the row would have to be saved,
70 * but there are enough unused MMX registers in every case,
71 * so this is not a problem. A further benefit is that the
72 * post-MMX cleanup code (C code) in at least some of the
73 * cases could be done within the assembler block.
74 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
75 * inconsistent, and don't match the MMX Programmer's Reference
76 * Manual conventions anyway. They should be changed to
77 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
78 * was lowest in memory (e.g., corresponding to a left pixel)
79 * and b7 is the byte that was highest (e.g., a right pixel).
81 * 19991016:
82 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
83 * want globals prefixed by underscores when referencing them--
84 * i.e., if the variable is const4, then refer to it as const4,
85 * not _const4. This seems to be a djgpp-specific requirement.
86 * Also, such variables apparently *must* be declared outside
87 * of functions; neither static nor automatic variables work if
88 * defined within the scope of a single function, but both
89 * static and truly global (multi-module) variables work fine.
91 * 19991023:
92 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
93 * - switched from string-concatenation-with-macros to cleaner method of
94 * renaming global variables for djgpp--i.e., always use prefixes in
95 * inlined assembler code (== strings) and conditionally rename the
96 * variables, not the other way around. Hence _const4, _mask8_0, etc.
98 * 19991024:
99 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
100 * This one was severely weird: even though mmxsupport() doesn't touch
101 * ebx (where "row" pointer was stored), it nevertheless managed to zero
102 * the register (even in static/non-fPIC code--see below), which in turn
103 * caused png_do_read_interlace() to return prematurely on the first row of
104 * interlaced images (i.e., without expanding the interlaced pixels).
105 * Inspection of the generated assembly code didn't turn up any clues,
106 * although it did point at a minor optimization (i.e., get rid of
107 * mmx_supported_local variable and just use eax). Possibly the CPUID
108 * instruction is more destructive than it looks? (Not yet checked.)
109 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
110 * listings... Apparently register spillage has to do with ebx, since
111 * it's used to index the global offset table. Commenting it out of the
112 * input-reg lists in png_combine_row() eliminated compiler barfage, so
113 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
115 * 19991107:
116 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
117 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
119 * 19991120:
120 * - made "diff" variable (now "_dif") global to simplify conversion of
121 * filtering routines (running out of regs, sigh). "diff" is still used
122 * in interlacing routines, however.
123 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
124 * macro determines which is used); original not yet tested.
126 * 20000213:
127 * - when compiling with gcc, be sure to use -fomit-frame-pointer
129 * 20000319:
130 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
131 * pass == 4 or 5, that caused visible corruption of interlaced images
133 * 20000623:
134 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
135 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
136 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
137 * Chuck Wilson supplied a patch involving dummy output registers. See
138 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
139 * for the original (anonymous) SourceForge bug report.
141 * 20000706:
142 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
143 * pnggccrd.c: In function `png_combine_row':
144 * pnggccrd.c:525: more than 10 operands in `asm'
145 * pnggccrd.c:669: more than 10 operands in `asm'
146 * pnggccrd.c:828: more than 10 operands in `asm'
147 * pnggccrd.c:994: more than 10 operands in `asm'
148 * pnggccrd.c:1177: more than 10 operands in `asm'
149 * They are all the same problem and can be worked around by using the
150 * global _unmask variable unconditionally, not just in the -fPIC case.
151 * Reportedly earlier versions of gcc also have the problem with more than
152 * 10 operands; they just don't report it. Much strangeness ensues, etc.
154 * 20000729:
155 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
156 * MMX routine); began converting png_read_filter_row_mmx_sub()
157 * - to finish remaining sections:
158 * - clean up indentation and comments
159 * - preload local variables
160 * - add output and input regs (order of former determines numerical
161 * mapping of latter)
162 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
163 * - remove "$" from addressing of Shift and Mask variables [20000823]
165 * 20000731:
166 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
168 * 20000822:
169 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
170 * shared-library (-fPIC) version! Code works just fine as part of static
171 * library. Damn damn damn damn damn, should have tested that sooner.
172 * ebx is getting clobbered again (explicitly this time); need to save it
173 * on stack or rewrite asm code to avoid using it altogether. Blargh!
175 * 20000823:
176 * - first section was trickiest; all remaining sections have ebx -> edx now.
177 * (-fPIC works again.) Also added missing underscores to various Shift*
178 * and *Mask* globals and got rid of leading "$" signs.
180 * 20000826:
181 * - added visual separators to help navigate microscopic printed copies
182 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
183 * on png_read_filter_row_mmx_avg()
185 * 20000828:
186 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
187 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
188 * cleaned up/shortened in either routine, but functionality is complete
189 * and seems to be working fine.
191 * 20000829:
192 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
193 * as an input reg (with dummy output variables, etc.), then it *cannot*
194 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
195 * is simple enough...
197 * 20000914:
198 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
199 * correctly (but 48-bit RGB just fine)
201 * 20000916:
202 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
203 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
204 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
205 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
207 * 20010101:
208 * - added new png_init_mmx_flags() function (here only because it needs to
209 * call mmxsupport(), which should probably become global png_mmxsupport());
210 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
212 * 20010103:
213 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
214 * and made it public; moved png_init_mmx_flags() to png.c as internal func
216 * 20010104:
217 * - removed dependency on png_read_filter_row_c() (C code already duplicated
218 * within MMX version of png_read_filter_row()) so no longer necessary to
219 * compile it into pngrutil.o
221 * 20010310:
222 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
224 * 20020304:
225 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
227 * 20040724:
228 * - more tinkering with clobber list at lines 4529 and 5033, to get
229 * it to compile on gcc-3.4.
231 * STILL TO DO:
232 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
233 * - write MMX code for 48-bit case (pixel_bytes == 6)
234 * - figure out what's up with 24-bit case (pixel_bytes == 3):
235 * why subtract 8 from width_mmx in the pass 4/5 case?
236 * (only width_mmx case) (near line 1606)
237 * - rewrite all MMX interlacing code so it's aligned with beginning
238 * of the row buffer, not the end (see 19991007 for details)
239 * x pick one version of mmxsupport() and get rid of the other
240 * - add error messages to any remaining bogus default cases
241 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
242 * x add support for runtime enable/disable/query of various MMX routines
243 */
245 #define PNG_INTERNAL
246 #include "png.h"
248 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
250 int PNGAPI png_mmx_support(void);
252 #ifdef PNG_USE_LOCAL_ARRAYS
253 const static int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
254 const static int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
255 const static int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
256 #endif
258 #if defined(PNG_MMX_CODE_SUPPORTED)
259 /* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
260 * so define them without: */
261 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
262 defined(__OS2__)
263 # define _mmx_supported mmx_supported
264 # define _const4 const4
265 # define _const6 const6
266 # define _mask8_0 mask8_0
267 # define _mask16_1 mask16_1
268 # define _mask16_0 mask16_0
269 # define _mask24_2 mask24_2
270 # define _mask24_1 mask24_1
271 # define _mask24_0 mask24_0
272 # define _mask32_3 mask32_3
273 # define _mask32_2 mask32_2
274 # define _mask32_1 mask32_1
275 # define _mask32_0 mask32_0
276 # define _mask48_5 mask48_5
277 # define _mask48_4 mask48_4
278 # define _mask48_3 mask48_3
279 # define _mask48_2 mask48_2
280 # define _mask48_1 mask48_1
281 # define _mask48_0 mask48_0
282 # define _LBCarryMask LBCarryMask
283 # define _HBClearMask HBClearMask
284 # define _ActiveMask ActiveMask
285 # define _ActiveMask2 ActiveMask2
286 # define _ActiveMaskEnd ActiveMaskEnd
287 # define _ShiftBpp ShiftBpp
288 # define _ShiftRem ShiftRem
289 #ifdef PNG_THREAD_UNSAFE_OK
290 # define _unmask unmask
291 # define _FullLength FullLength
292 # define _MMXLength MMXLength
293 # define _dif dif
294 # define _patemp patemp
295 # define _pbtemp pbtemp
296 # define _pctemp pctemp
297 #endif
298 #endif
/* These constants are used in the inlined MMX assembly code.
   Ignore gcc's "At top level: defined but not used" warnings. */

/* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
 *  since that case uses the %ebx register for indexing the Global Offset Table
 *  and there were no other registers available.  But gcc 2.95 and later emit
 *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
 *  in the non-PIC case, so we'll just use the global unconditionally now.
 */
#ifdef PNG_THREAD_UNSAFE_OK
static int _unmask;
#endif

/* Bit-select masks for png_combine_row().  The assembler replicates the
 * inverted 8-bit combine mask into all eight bytes of an MMX register, ANDs
 * that with one of these constants and compares the result with zero, giving
 * an all-ones or all-zeros select pattern per byte.  In the 16- and 24-bit
 * cases _maskNN_k is applied to the k-th group of eight bytes handled per
 * loop iteration (offsets 0, 8, 16 from the row pointers); the 32- and 48-bit
 * names presumably follow the same pattern. */
static const unsigned long long _mask8_0  = 0x0102040810204080LL;

static const unsigned long long _mask16_1 = 0x0101020204040808LL;
static const unsigned long long _mask16_0 = 0x1010202040408080LL;

static const unsigned long long _mask24_2 = 0x0101010202020404LL;
static const unsigned long long _mask24_1 = 0x0408080810101020LL;
static const unsigned long long _mask24_0 = 0x2020404040808080LL;

static const unsigned long long _mask32_3 = 0x0101010102020202LL;
static const unsigned long long _mask32_2 = 0x0404040408080808LL;
static const unsigned long long _mask32_1 = 0x1010101020202020LL;
static const unsigned long long _mask32_0 = 0x4040404080808080LL;

static const unsigned long long _mask48_5 = 0x0101010101010202LL;
static const unsigned long long _mask48_4 = 0x0202020204040404LL;
static const unsigned long long _mask48_3 = 0x0404080808080808LL;
static const unsigned long long _mask48_2 = 0x1010101010102020LL;
static const unsigned long long _mask48_1 = 0x2020202040404040LL;
static const unsigned long long _mask48_0 = 0x4040808080808080LL;

/* _const4 keeps the low three bytes of a quadword, _const6 the low byte. */
static const unsigned long long _const4   = 0x0000000000FFFFFFLL;
//static const unsigned long long _const5 = 0x000000FFFFFF0000LL;  // NOT USED
static const unsigned long long _const6   = 0x00000000000000FFLL;

// These are used in the row-filter routines and should/would be local
//  variables if not for gcc addressing limitations.
// WARNING: Their presence probably defeats the thread safety of libpng.

#ifdef PNG_THREAD_UNSAFE_OK
static png_uint_32  _FullLength;
static png_uint_32  _MMXLength;
static int          _dif;
static int          _patemp;    // temp variables for Paeth routine
static int          _pbtemp;
static int          _pctemp;
#endif
352 void /* PRIVATE */
353 png_squelch_warnings(void)
355 #ifdef PNG_THREAD_UNSAFE_OK
356 _dif = _dif;
357 _patemp = _patemp;
358 _pbtemp = _pbtemp;
359 _pctemp = _pctemp;
360 _MMXLength = _MMXLength;
361 #endif
362 _const4 = _const4;
363 _const6 = _const6;
364 _mask8_0 = _mask8_0;
365 _mask16_1 = _mask16_1;
366 _mask16_0 = _mask16_0;
367 _mask24_2 = _mask24_2;
368 _mask24_1 = _mask24_1;
369 _mask24_0 = _mask24_0;
370 _mask32_3 = _mask32_3;
371 _mask32_2 = _mask32_2;
372 _mask32_1 = _mask32_1;
373 _mask32_0 = _mask32_0;
374 _mask48_5 = _mask48_5;
375 _mask48_4 = _mask48_4;
376 _mask48_3 = _mask48_3;
377 _mask48_2 = _mask48_2;
378 _mask48_1 = _mask48_1;
379 _mask48_0 = _mask48_0;
381 #endif /* PNG_MMX_CODE_SUPPORTED */
384 static int _mmx_supported = 2;
386 /*===========================================================================*/
387 /* */
388 /* P N G _ C O M B I N E _ R O W */
389 /* */
390 /*===========================================================================*/
392 #if defined(PNG_HAVE_MMX_COMBINE_ROW)
394 #define BPP2 2
395 #define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
396 #define BPP4 4
397 #define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
398 #define BPP8 8
400 /* Combines the row recently read in with the previous row.
401 This routine takes care of alpha and transparency if requested.
402 This routine also handles the two methods of progressive display
403 of interlaced images, depending on the mask value.
404 The mask value describes which pixels are to be combined with
405 the row. The pattern always repeats every 8 pixels, so just 8
406 bits are needed. A one indicates the pixel is to be combined; a
407 zero indicates the pixel is to be skipped. This is in addition
408 to any alpha or transparency value associated with the pixel.
409 If you want all pixels to be combined, pass 0xff (255) in mask. */
411 /* Use this routine for the x86 platform - it uses a faster MMX routine
412 if the machine supports MMX. */
414 void /* PRIVATE */
415 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
417 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
419 #if defined(PNG_MMX_CODE_SUPPORTED)
420 if (_mmx_supported == 2) {
421 #if !defined(PNG_1_0_X)
422 /* this should have happened in png_init_mmx_flags() already */
423 png_warning(png_ptr, "asm_flags may not have been initialized");
424 #endif
425 png_mmx_support();
427 #endif
429 if (mask == 0xff)
431 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
432 png_memcpy(row, png_ptr->row_buf + 1,
433 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
435 else /* (png_combine_row() is never called with mask == 0) */
437 switch (png_ptr->row_info.pixel_depth)
439 case 1: /* png_ptr->row_info.pixel_depth */
441 png_bytep sp;
442 png_bytep dp;
443 int s_inc, s_start, s_end;
444 int m;
445 int shift;
446 png_uint_32 i;
448 sp = png_ptr->row_buf + 1;
449 dp = row;
450 m = 0x80;
451 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
452 if (png_ptr->transformations & PNG_PACKSWAP)
454 s_start = 0;
455 s_end = 7;
456 s_inc = 1;
458 else
459 #endif
461 s_start = 7;
462 s_end = 0;
463 s_inc = -1;
466 shift = s_start;
468 for (i = 0; i < png_ptr->width; i++)
470 if (m & mask)
472 int value;
474 value = (*sp >> shift) & 0x1;
475 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
476 *dp |= (png_byte)(value << shift);
479 if (shift == s_end)
481 shift = s_start;
482 sp++;
483 dp++;
485 else
486 shift += s_inc;
488 if (m == 1)
489 m = 0x80;
490 else
491 m >>= 1;
493 break;
496 case 2: /* png_ptr->row_info.pixel_depth */
498 png_bytep sp;
499 png_bytep dp;
500 int s_start, s_end, s_inc;
501 int m;
502 int shift;
503 png_uint_32 i;
504 int value;
506 sp = png_ptr->row_buf + 1;
507 dp = row;
508 m = 0x80;
509 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
510 if (png_ptr->transformations & PNG_PACKSWAP)
512 s_start = 0;
513 s_end = 6;
514 s_inc = 2;
516 else
517 #endif
519 s_start = 6;
520 s_end = 0;
521 s_inc = -2;
524 shift = s_start;
526 for (i = 0; i < png_ptr->width; i++)
528 if (m & mask)
530 value = (*sp >> shift) & 0x3;
531 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
532 *dp |= (png_byte)(value << shift);
535 if (shift == s_end)
537 shift = s_start;
538 sp++;
539 dp++;
541 else
542 shift += s_inc;
543 if (m == 1)
544 m = 0x80;
545 else
546 m >>= 1;
548 break;
551 case 4: /* png_ptr->row_info.pixel_depth */
553 png_bytep sp;
554 png_bytep dp;
555 int s_start, s_end, s_inc;
556 int m;
557 int shift;
558 png_uint_32 i;
559 int value;
561 sp = png_ptr->row_buf + 1;
562 dp = row;
563 m = 0x80;
564 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
565 if (png_ptr->transformations & PNG_PACKSWAP)
567 s_start = 0;
568 s_end = 4;
569 s_inc = 4;
571 else
572 #endif
574 s_start = 4;
575 s_end = 0;
576 s_inc = -4;
578 shift = s_start;
580 for (i = 0; i < png_ptr->width; i++)
582 if (m & mask)
584 value = (*sp >> shift) & 0xf;
585 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
586 *dp |= (png_byte)(value << shift);
589 if (shift == s_end)
591 shift = s_start;
592 sp++;
593 dp++;
595 else
596 shift += s_inc;
597 if (m == 1)
598 m = 0x80;
599 else
600 m >>= 1;
602 break;
605 case 8: /* png_ptr->row_info.pixel_depth */
607 png_bytep srcptr;
608 png_bytep dstptr;
610 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
611 #if !defined(PNG_1_0_X)
612 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
613 /* && _mmx_supported */ )
614 #else
615 if (_mmx_supported)
616 #endif
618 png_uint_32 len;
619 int diff;
620 int dummy_value_a; // fix 'forbidden register spilled' error
621 int dummy_value_d;
622 int dummy_value_c;
623 int dummy_value_S;
624 int dummy_value_D;
625 _unmask = ~mask; // global variable for -fPIC version
626 srcptr = png_ptr->row_buf + 1;
627 dstptr = row;
628 len = png_ptr->width &~7; // reduce to multiple of 8
629 diff = (int) (png_ptr->width & 7); // amount lost
631 __asm__ __volatile__ (
632 "movd _unmask, %%mm7 \n\t" // load bit pattern
633 "psubb %%mm6, %%mm6 \n\t" // zero mm6
634 "punpcklbw %%mm7, %%mm7 \n\t"
635 "punpcklwd %%mm7, %%mm7 \n\t"
636 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
638 "movq _mask8_0, %%mm0 \n\t"
639 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
640 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
642 // preload "movl len, %%ecx \n\t" // load length of line
643 // preload "movl srcptr, %%esi \n\t" // load source
644 // preload "movl dstptr, %%edi \n\t" // load dest
646 "cmpl $0, %%ecx \n\t" // len == 0 ?
647 "je mainloop8end \n\t"
649 "mainloop8: \n\t"
650 "movq (%%esi), %%mm4 \n\t" // *srcptr
651 "pand %%mm0, %%mm4 \n\t"
652 "movq %%mm0, %%mm6 \n\t"
653 "pandn (%%edi), %%mm6 \n\t" // *dstptr
654 "por %%mm6, %%mm4 \n\t"
655 "movq %%mm4, (%%edi) \n\t"
656 "addl $8, %%esi \n\t" // inc by 8 bytes processed
657 "addl $8, %%edi \n\t"
658 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
659 "ja mainloop8 \n\t"
661 "mainloop8end: \n\t"
662 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
663 "movl %%eax, %%ecx \n\t"
664 "cmpl $0, %%ecx \n\t"
665 "jz end8 \n\t"
666 // preload "movl mask, %%edx \n\t"
667 "sall $24, %%edx \n\t" // make low byte, high byte
669 "secondloop8: \n\t"
670 "sall %%edx \n\t" // move high bit to CF
671 "jnc skip8 \n\t" // if CF = 0
672 "movb (%%esi), %%al \n\t"
673 "movb %%al, (%%edi) \n\t"
675 "skip8: \n\t"
676 "incl %%esi \n\t"
677 "incl %%edi \n\t"
678 "decl %%ecx \n\t"
679 "jnz secondloop8 \n\t"
681 "end8: \n\t"
682 "EMMS \n\t" // DONE
684 : "=a" (dummy_value_a), // output regs (dummy)
685 "=d" (dummy_value_d),
686 "=c" (dummy_value_c),
687 "=S" (dummy_value_S),
688 "=D" (dummy_value_D)
690 : "3" (srcptr), // esi // input regs
691 "4" (dstptr), // edi
692 "0" (diff), // eax
693 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
694 "2" (len), // ecx
695 "1" (mask) // edx
697 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
698 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
699 #endif
702 else /* mmx _not supported - Use modified C routine */
703 #endif /* PNG_MMX_CODE_SUPPORTED */
705 register png_uint_32 i;
706 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
707 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
708 register int stride = png_pass_inc[png_ptr->pass];
709 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
710 register int rep_bytes = png_pass_width[png_ptr->pass];
711 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
712 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
713 int diff = (int) (png_ptr->width & 7); /* amount lost */
714 register png_uint_32 final_val = len; /* GRR bugfix */
716 srcptr = png_ptr->row_buf + 1 + initial_val;
717 dstptr = row + initial_val;
719 for (i = initial_val; i < final_val; i += stride)
721 png_memcpy(dstptr, srcptr, rep_bytes);
722 srcptr += stride;
723 dstptr += stride;
725 if (diff) /* number of leftover pixels: 3 for pngtest */
727 final_val+=diff /* *BPP1 */ ;
728 for (; i < final_val; i += stride)
730 if (rep_bytes > (int)(final_val-i))
731 rep_bytes = (int)(final_val-i);
732 png_memcpy(dstptr, srcptr, rep_bytes);
733 srcptr += stride;
734 dstptr += stride;
738 } /* end of else (_mmx_supported) */
740 break;
741 } /* end 8 bpp */
743 case 16: /* png_ptr->row_info.pixel_depth */
745 png_bytep srcptr;
746 png_bytep dstptr;
748 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
749 #if !defined(PNG_1_0_X)
750 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
751 /* && _mmx_supported */ )
752 #else
753 if (_mmx_supported)
754 #endif
756 png_uint_32 len;
757 int diff;
758 int dummy_value_a; // fix 'forbidden register spilled' error
759 int dummy_value_d;
760 int dummy_value_c;
761 int dummy_value_S;
762 int dummy_value_D;
763 _unmask = ~mask; // global variable for -fPIC version
764 srcptr = png_ptr->row_buf + 1;
765 dstptr = row;
766 len = png_ptr->width &~7; // reduce to multiple of 8
767 diff = (int) (png_ptr->width & 7); // amount lost //
769 __asm__ __volatile__ (
770 "movd _unmask, %%mm7 \n\t" // load bit pattern
771 "psubb %%mm6, %%mm6 \n\t" // zero mm6
772 "punpcklbw %%mm7, %%mm7 \n\t"
773 "punpcklwd %%mm7, %%mm7 \n\t"
774 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
776 "movq _mask16_0, %%mm0 \n\t"
777 "movq _mask16_1, %%mm1 \n\t"
779 "pand %%mm7, %%mm0 \n\t"
780 "pand %%mm7, %%mm1 \n\t"
782 "pcmpeqb %%mm6, %%mm0 \n\t"
783 "pcmpeqb %%mm6, %%mm1 \n\t"
785 // preload "movl len, %%ecx \n\t" // load length of line
786 // preload "movl srcptr, %%esi \n\t" // load source
787 // preload "movl dstptr, %%edi \n\t" // load dest
789 "cmpl $0, %%ecx \n\t"
790 "jz mainloop16end \n\t"
792 "mainloop16: \n\t"
793 "movq (%%esi), %%mm4 \n\t"
794 "pand %%mm0, %%mm4 \n\t"
795 "movq %%mm0, %%mm6 \n\t"
796 "movq (%%edi), %%mm7 \n\t"
797 "pandn %%mm7, %%mm6 \n\t"
798 "por %%mm6, %%mm4 \n\t"
799 "movq %%mm4, (%%edi) \n\t"
801 "movq 8(%%esi), %%mm5 \n\t"
802 "pand %%mm1, %%mm5 \n\t"
803 "movq %%mm1, %%mm7 \n\t"
804 "movq 8(%%edi), %%mm6 \n\t"
805 "pandn %%mm6, %%mm7 \n\t"
806 "por %%mm7, %%mm5 \n\t"
807 "movq %%mm5, 8(%%edi) \n\t"
809 "addl $16, %%esi \n\t" // inc by 16 bytes processed
810 "addl $16, %%edi \n\t"
811 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
812 "ja mainloop16 \n\t"
814 "mainloop16end: \n\t"
815 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
816 "movl %%eax, %%ecx \n\t"
817 "cmpl $0, %%ecx \n\t"
818 "jz end16 \n\t"
819 // preload "movl mask, %%edx \n\t"
820 "sall $24, %%edx \n\t" // make low byte, high byte
822 "secondloop16: \n\t"
823 "sall %%edx \n\t" // move high bit to CF
824 "jnc skip16 \n\t" // if CF = 0
825 "movw (%%esi), %%ax \n\t"
826 "movw %%ax, (%%edi) \n\t"
828 "skip16: \n\t"
829 "addl $2, %%esi \n\t"
830 "addl $2, %%edi \n\t"
831 "decl %%ecx \n\t"
832 "jnz secondloop16 \n\t"
834 "end16: \n\t"
835 "EMMS \n\t" // DONE
837 : "=a" (dummy_value_a), // output regs (dummy)
838 "=c" (dummy_value_c),
839 "=d" (dummy_value_d),
840 "=S" (dummy_value_S),
841 "=D" (dummy_value_D)
843 : "0" (diff), // eax // input regs
844 // was (unmask) " " RESERVED // ebx // Global Offset Table idx
845 "1" (len), // ecx
846 "2" (mask), // edx
847 "3" (srcptr), // esi
848 "4" (dstptr) // edi
850 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
851 : "%mm0", "%mm1", "%mm4" // clobber list
852 , "%mm5", "%mm6", "%mm7"
853 #endif
856 else /* mmx _not supported - Use modified C routine */
857 #endif /* PNG_MMX_CODE_SUPPORTED */
859 register png_uint_32 i;
860 png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
861 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
862 register int stride = BPP2 * png_pass_inc[png_ptr->pass];
863 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
864 register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
865 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
866 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
867 int diff = (int) (png_ptr->width & 7); /* amount lost */
868 register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
870 srcptr = png_ptr->row_buf + 1 + initial_val;
871 dstptr = row + initial_val;
873 for (i = initial_val; i < final_val; i += stride)
875 png_memcpy(dstptr, srcptr, rep_bytes);
876 srcptr += stride;
877 dstptr += stride;
879 if (diff) /* number of leftover pixels: 3 for pngtest */
881 final_val+=diff*BPP2;
882 for (; i < final_val; i += stride)
884 if (rep_bytes > (int)(final_val-i))
885 rep_bytes = (int)(final_val-i);
886 png_memcpy(dstptr, srcptr, rep_bytes);
887 srcptr += stride;
888 dstptr += stride;
891 } /* end of else (_mmx_supported) */
893 break;
894 } /* end 16 bpp */
896 case 24: /* png_ptr->row_info.pixel_depth */
898 png_bytep srcptr;
899 png_bytep dstptr;
901 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
902 #if !defined(PNG_1_0_X)
903 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
904 /* && _mmx_supported */ )
905 #else
906 if (_mmx_supported)
907 #endif
909 png_uint_32 len;
910 int diff;
911 int dummy_value_a; // fix 'forbidden register spilled' error
912 int dummy_value_d;
913 int dummy_value_c;
914 int dummy_value_S;
915 int dummy_value_D;
916 _unmask = ~mask; // global variable for -fPIC version
917 srcptr = png_ptr->row_buf + 1;
918 dstptr = row;
919 len = png_ptr->width &~7; // reduce to multiple of 8
920 diff = (int) (png_ptr->width & 7); // amount lost //
922 __asm__ __volatile__ (
923 "movd _unmask, %%mm7 \n\t" // load bit pattern
924 "psubb %%mm6, %%mm6 \n\t" // zero mm6
925 "punpcklbw %%mm7, %%mm7 \n\t"
926 "punpcklwd %%mm7, %%mm7 \n\t"
927 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
929 "movq _mask24_0, %%mm0 \n\t"
930 "movq _mask24_1, %%mm1 \n\t"
931 "movq _mask24_2, %%mm2 \n\t"
933 "pand %%mm7, %%mm0 \n\t"
934 "pand %%mm7, %%mm1 \n\t"
935 "pand %%mm7, %%mm2 \n\t"
937 "pcmpeqb %%mm6, %%mm0 \n\t"
938 "pcmpeqb %%mm6, %%mm1 \n\t"
939 "pcmpeqb %%mm6, %%mm2 \n\t"
941 // preload "movl len, %%ecx \n\t" // load length of line
942 // preload "movl srcptr, %%esi \n\t" // load source
943 // preload "movl dstptr, %%edi \n\t" // load dest
945 "cmpl $0, %%ecx \n\t"
946 "jz mainloop24end \n\t"
948 "mainloop24: \n\t"
949 "movq (%%esi), %%mm4 \n\t"
950 "pand %%mm0, %%mm4 \n\t"
951 "movq %%mm0, %%mm6 \n\t"
952 "movq (%%edi), %%mm7 \n\t"
953 "pandn %%mm7, %%mm6 \n\t"
954 "por %%mm6, %%mm4 \n\t"
955 "movq %%mm4, (%%edi) \n\t"
957 "movq 8(%%esi), %%mm5 \n\t"
958 "pand %%mm1, %%mm5 \n\t"
959 "movq %%mm1, %%mm7 \n\t"
960 "movq 8(%%edi), %%mm6 \n\t"
961 "pandn %%mm6, %%mm7 \n\t"
962 "por %%mm7, %%mm5 \n\t"
963 "movq %%mm5, 8(%%edi) \n\t"
965 "movq 16(%%esi), %%mm6 \n\t"
966 "pand %%mm2, %%mm6 \n\t"
967 "movq %%mm2, %%mm4 \n\t"
968 "movq 16(%%edi), %%mm7 \n\t"
969 "pandn %%mm7, %%mm4 \n\t"
970 "por %%mm4, %%mm6 \n\t"
971 "movq %%mm6, 16(%%edi) \n\t"
973 "addl $24, %%esi \n\t" // inc by 24 bytes processed
974 "addl $24, %%edi \n\t"
975 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
977 "ja mainloop24 \n\t"
979 "mainloop24end: \n\t"
980 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
981 "movl %%eax, %%ecx \n\t"
982 "cmpl $0, %%ecx \n\t"
983 "jz end24 \n\t"
984 // preload "movl mask, %%edx \n\t"
985 "sall $24, %%edx \n\t" // make low byte, high byte
987 "secondloop24: \n\t"
988 "sall %%edx \n\t" // move high bit to CF
989 "jnc skip24 \n\t" // if CF = 0
990 "movw (%%esi), %%ax \n\t"
991 "movw %%ax, (%%edi) \n\t"
992 "xorl %%eax, %%eax \n\t"
993 "movb 2(%%esi), %%al \n\t"
994 "movb %%al, 2(%%edi) \n\t"
996 "skip24: \n\t"
997 "addl $3, %%esi \n\t"
998 "addl $3, %%edi \n\t"
999 "decl %%ecx \n\t"
1000 "jnz secondloop24 \n\t"
1002 "end24: \n\t"
1003 "EMMS \n\t" // DONE
1005 : "=a" (dummy_value_a), // output regs (dummy)
1006 "=d" (dummy_value_d),
1007 "=c" (dummy_value_c),
1008 "=S" (dummy_value_S),
1009 "=D" (dummy_value_D)
1011 : "3" (srcptr), // esi // input regs
1012 "4" (dstptr), // edi
1013 "0" (diff), // eax
1014 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1015 "2" (len), // ecx
1016 "1" (mask) // edx
1018 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1019 : "%mm0", "%mm1", "%mm2" // clobber list
1020 , "%mm4", "%mm5", "%mm6", "%mm7"
1021 #endif
1024 else /* mmx _not supported - Use modified C routine */
1025 #endif /* PNG_MMX_CODE_SUPPORTED */
1027 register png_uint_32 i;
1028 png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1029 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1030 register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1031 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1032 register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1033 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1034 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1035 int diff = (int) (png_ptr->width & 7); /* amount lost */
1036 register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1038 srcptr = png_ptr->row_buf + 1 + initial_val;
1039 dstptr = row + initial_val;
1041 for (i = initial_val; i < final_val; i += stride)
1043 png_memcpy(dstptr, srcptr, rep_bytes);
1044 srcptr += stride;
1045 dstptr += stride;
1047 if (diff) /* number of leftover pixels: 3 for pngtest */
1049 final_val+=diff*BPP3;
1050 for (; i < final_val; i += stride)
1052 if (rep_bytes > (int)(final_val-i))
1053 rep_bytes = (int)(final_val-i);
1054 png_memcpy(dstptr, srcptr, rep_bytes);
1055 srcptr += stride;
1056 dstptr += stride;
1059 } /* end of else (_mmx_supported) */
1061 break;
1062 } /* end 24 bpp */
1064 case 32: /* png_ptr->row_info.pixel_depth */
1066 png_bytep srcptr;
1067 png_bytep dstptr;
1069 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1070 #if !defined(PNG_1_0_X)
1071 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1072 /* && _mmx_supported */ )
1073 #else
1074 if (_mmx_supported)
1075 #endif
1077 png_uint_32 len;
1078 int diff;
1079 int dummy_value_a; // fix 'forbidden register spilled' error
1080 int dummy_value_d;
1081 int dummy_value_c;
1082 int dummy_value_S;
1083 int dummy_value_D;
1084 _unmask = ~mask; // global variable for -fPIC version
1085 srcptr = png_ptr->row_buf + 1;
1086 dstptr = row;
1087 len = png_ptr->width &~7; // reduce to multiple of 8
1088 diff = (int) (png_ptr->width & 7); // amount lost //
1090 __asm__ __volatile__ (
1091 "movd _unmask, %%mm7 \n\t" // load bit pattern
1092 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1093 "punpcklbw %%mm7, %%mm7 \n\t"
1094 "punpcklwd %%mm7, %%mm7 \n\t"
1095 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1097 "movq _mask32_0, %%mm0 \n\t"
1098 "movq _mask32_1, %%mm1 \n\t"
1099 "movq _mask32_2, %%mm2 \n\t"
1100 "movq _mask32_3, %%mm3 \n\t"
1102 "pand %%mm7, %%mm0 \n\t"
1103 "pand %%mm7, %%mm1 \n\t"
1104 "pand %%mm7, %%mm2 \n\t"
1105 "pand %%mm7, %%mm3 \n\t"
1107 "pcmpeqb %%mm6, %%mm0 \n\t"
1108 "pcmpeqb %%mm6, %%mm1 \n\t"
1109 "pcmpeqb %%mm6, %%mm2 \n\t"
1110 "pcmpeqb %%mm6, %%mm3 \n\t"
1112 // preload "movl len, %%ecx \n\t" // load length of line
1113 // preload "movl srcptr, %%esi \n\t" // load source
1114 // preload "movl dstptr, %%edi \n\t" // load dest
1116 "cmpl $0, %%ecx \n\t" // lcr
1117 "jz mainloop32end \n\t"
1119 "mainloop32: \n\t"
1120 "movq (%%esi), %%mm4 \n\t"
1121 "pand %%mm0, %%mm4 \n\t"
1122 "movq %%mm0, %%mm6 \n\t"
1123 "movq (%%edi), %%mm7 \n\t"
1124 "pandn %%mm7, %%mm6 \n\t"
1125 "por %%mm6, %%mm4 \n\t"
1126 "movq %%mm4, (%%edi) \n\t"
1128 "movq 8(%%esi), %%mm5 \n\t"
1129 "pand %%mm1, %%mm5 \n\t"
1130 "movq %%mm1, %%mm7 \n\t"
1131 "movq 8(%%edi), %%mm6 \n\t"
1132 "pandn %%mm6, %%mm7 \n\t"
1133 "por %%mm7, %%mm5 \n\t"
1134 "movq %%mm5, 8(%%edi) \n\t"
1136 "movq 16(%%esi), %%mm6 \n\t"
1137 "pand %%mm2, %%mm6 \n\t"
1138 "movq %%mm2, %%mm4 \n\t"
1139 "movq 16(%%edi), %%mm7 \n\t"
1140 "pandn %%mm7, %%mm4 \n\t"
1141 "por %%mm4, %%mm6 \n\t"
1142 "movq %%mm6, 16(%%edi) \n\t"
1144 "movq 24(%%esi), %%mm7 \n\t"
1145 "pand %%mm3, %%mm7 \n\t"
1146 "movq %%mm3, %%mm5 \n\t"
1147 "movq 24(%%edi), %%mm4 \n\t"
1148 "pandn %%mm4, %%mm5 \n\t"
1149 "por %%mm5, %%mm7 \n\t"
1150 "movq %%mm7, 24(%%edi) \n\t"
1152 "addl $32, %%esi \n\t" // inc by 32 bytes processed
1153 "addl $32, %%edi \n\t"
1154 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1155 "ja mainloop32 \n\t"
1157 "mainloop32end: \n\t"
1158 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1159 "movl %%eax, %%ecx \n\t"
1160 "cmpl $0, %%ecx \n\t"
1161 "jz end32 \n\t"
1162 // preload "movl mask, %%edx \n\t"
1163 "sall $24, %%edx \n\t" // low byte => high byte
1165 "secondloop32: \n\t"
1166 "sall %%edx \n\t" // move high bit to CF
1167 "jnc skip32 \n\t" // if CF = 0
1168 "movl (%%esi), %%eax \n\t"
1169 "movl %%eax, (%%edi) \n\t"
1171 "skip32: \n\t"
1172 "addl $4, %%esi \n\t"
1173 "addl $4, %%edi \n\t"
1174 "decl %%ecx \n\t"
1175 "jnz secondloop32 \n\t"
1177 "end32: \n\t"
1178 "EMMS \n\t" // DONE
1180 : "=a" (dummy_value_a), // output regs (dummy)
1181 "=d" (dummy_value_d),
1182 "=c" (dummy_value_c),
1183 "=S" (dummy_value_S),
1184 "=D" (dummy_value_D)
1186 : "3" (srcptr), // esi // input regs
1187 "4" (dstptr), // edi
1188 "0" (diff), // eax
1189 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1190 "2" (len), // ecx
1191 "1" (mask) // edx
1193 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1194 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1195 , "%mm4", "%mm5", "%mm6", "%mm7"
1196 #endif
1199 else /* mmx _not supported - Use modified C routine */
1200 #endif /* PNG_MMX_CODE_SUPPORTED */
1202 register png_uint_32 i;
1203 png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1204 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1205 register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1206 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1207 register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1208 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1209 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1210 int diff = (int) (png_ptr->width & 7); /* amount lost */
1211 register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1213 srcptr = png_ptr->row_buf + 1 + initial_val;
1214 dstptr = row + initial_val;
1216 for (i = initial_val; i < final_val; i += stride)
1218 png_memcpy(dstptr, srcptr, rep_bytes);
1219 srcptr += stride;
1220 dstptr += stride;
1222 if (diff) /* number of leftover pixels: 3 for pngtest */
1224 final_val+=diff*BPP4;
1225 for (; i < final_val; i += stride)
1227 if (rep_bytes > (int)(final_val-i))
1228 rep_bytes = (int)(final_val-i);
1229 png_memcpy(dstptr, srcptr, rep_bytes);
1230 srcptr += stride;
1231 dstptr += stride;
1234 } /* end of else (_mmx_supported) */
1236 break;
1237 } /* end 32 bpp */
1239 case 48: /* png_ptr->row_info.pixel_depth */
1241 png_bytep srcptr;
1242 png_bytep dstptr;
1244 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1245 #if !defined(PNG_1_0_X)
1246 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1247 /* && _mmx_supported */ )
1248 #else
1249 if (_mmx_supported)
1250 #endif
1252 png_uint_32 len;
1253 int diff;
1254 int dummy_value_a; // fix 'forbidden register spilled' error
1255 int dummy_value_d;
1256 int dummy_value_c;
1257 int dummy_value_S;
1258 int dummy_value_D;
1259 _unmask = ~mask; // global variable for -fPIC version
1260 srcptr = png_ptr->row_buf + 1;
1261 dstptr = row;
1262 len = png_ptr->width &~7; // reduce to multiple of 8
1263 diff = (int) (png_ptr->width & 7); // amount lost //
1265 __asm__ __volatile__ (
1266 "movd _unmask, %%mm7 \n\t" // load bit pattern
1267 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1268 "punpcklbw %%mm7, %%mm7 \n\t"
1269 "punpcklwd %%mm7, %%mm7 \n\t"
1270 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1272 "movq _mask48_0, %%mm0 \n\t"
1273 "movq _mask48_1, %%mm1 \n\t"
1274 "movq _mask48_2, %%mm2 \n\t"
1275 "movq _mask48_3, %%mm3 \n\t"
1276 "movq _mask48_4, %%mm4 \n\t"
1277 "movq _mask48_5, %%mm5 \n\t"
1279 "pand %%mm7, %%mm0 \n\t"
1280 "pand %%mm7, %%mm1 \n\t"
1281 "pand %%mm7, %%mm2 \n\t"
1282 "pand %%mm7, %%mm3 \n\t"
1283 "pand %%mm7, %%mm4 \n\t"
1284 "pand %%mm7, %%mm5 \n\t"
1286 "pcmpeqb %%mm6, %%mm0 \n\t"
1287 "pcmpeqb %%mm6, %%mm1 \n\t"
1288 "pcmpeqb %%mm6, %%mm2 \n\t"
1289 "pcmpeqb %%mm6, %%mm3 \n\t"
1290 "pcmpeqb %%mm6, %%mm4 \n\t"
1291 "pcmpeqb %%mm6, %%mm5 \n\t"
1293 // preload "movl len, %%ecx \n\t" // load length of line
1294 // preload "movl srcptr, %%esi \n\t" // load source
1295 // preload "movl dstptr, %%edi \n\t" // load dest
1297 "cmpl $0, %%ecx \n\t"
1298 "jz mainloop48end \n\t"
1300 "mainloop48: \n\t"
1301 "movq (%%esi), %%mm7 \n\t"
1302 "pand %%mm0, %%mm7 \n\t"
1303 "movq %%mm0, %%mm6 \n\t"
1304 "pandn (%%edi), %%mm6 \n\t"
1305 "por %%mm6, %%mm7 \n\t"
1306 "movq %%mm7, (%%edi) \n\t"
1308 "movq 8(%%esi), %%mm6 \n\t"
1309 "pand %%mm1, %%mm6 \n\t"
1310 "movq %%mm1, %%mm7 \n\t"
1311 "pandn 8(%%edi), %%mm7 \n\t"
1312 "por %%mm7, %%mm6 \n\t"
1313 "movq %%mm6, 8(%%edi) \n\t"
1315 "movq 16(%%esi), %%mm6 \n\t"
1316 "pand %%mm2, %%mm6 \n\t"
1317 "movq %%mm2, %%mm7 \n\t"
1318 "pandn 16(%%edi), %%mm7 \n\t"
1319 "por %%mm7, %%mm6 \n\t"
1320 "movq %%mm6, 16(%%edi) \n\t"
1322 "movq 24(%%esi), %%mm7 \n\t"
1323 "pand %%mm3, %%mm7 \n\t"
1324 "movq %%mm3, %%mm6 \n\t"
1325 "pandn 24(%%edi), %%mm6 \n\t"
1326 "por %%mm6, %%mm7 \n\t"
1327 "movq %%mm7, 24(%%edi) \n\t"
1329 "movq 32(%%esi), %%mm6 \n\t"
1330 "pand %%mm4, %%mm6 \n\t"
1331 "movq %%mm4, %%mm7 \n\t"
1332 "pandn 32(%%edi), %%mm7 \n\t"
1333 "por %%mm7, %%mm6 \n\t"
1334 "movq %%mm6, 32(%%edi) \n\t"
1336 "movq 40(%%esi), %%mm7 \n\t"
1337 "pand %%mm5, %%mm7 \n\t"
1338 "movq %%mm5, %%mm6 \n\t"
1339 "pandn 40(%%edi), %%mm6 \n\t"
1340 "por %%mm6, %%mm7 \n\t"
1341 "movq %%mm7, 40(%%edi) \n\t"
1343 "addl $48, %%esi \n\t" // inc by 48 bytes processed
1344 "addl $48, %%edi \n\t"
1345 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1347 "ja mainloop48 \n\t"
1349 "mainloop48end: \n\t"
1350 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1351 "movl %%eax, %%ecx \n\t"
1352 "cmpl $0, %%ecx \n\t"
1353 "jz end48 \n\t"
1354 // preload "movl mask, %%edx \n\t"
1355 "sall $24, %%edx \n\t" // make low byte, high byte
1357 "secondloop48: \n\t"
1358 "sall %%edx \n\t" // move high bit to CF
1359 "jnc skip48 \n\t" // if CF = 0
1360 "movl (%%esi), %%eax \n\t"
1361 "movl %%eax, (%%edi) \n\t"
1363 "skip48: \n\t"
1364 "addl $4, %%esi \n\t"
1365 "addl $4, %%edi \n\t"
1366 "decl %%ecx \n\t"
1367 "jnz secondloop48 \n\t"
1369 "end48: \n\t"
1370 "EMMS \n\t" // DONE
1372 : "=a" (dummy_value_a), // output regs (dummy)
1373 "=d" (dummy_value_d),
1374 "=c" (dummy_value_c),
1375 "=S" (dummy_value_S),
1376 "=D" (dummy_value_D)
1378 : "3" (srcptr), // esi // input regs
1379 "4" (dstptr), // edi
1380 "0" (diff), // eax
1381 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1382 "2" (len), // ecx
1383 "1" (mask) // edx
1385 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1386 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1387 , "%mm4", "%mm5", "%mm6", "%mm7"
1388 #endif
1391 else /* mmx _not supported - Use modified C routine */
1392 #endif /* PNG_MMX_CODE_SUPPORTED */
1394 register png_uint_32 i;
1395 png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1396 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1397 register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1398 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1399 register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1400 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1401 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1402 int diff = (int) (png_ptr->width & 7); /* amount lost */
1403 register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1405 srcptr = png_ptr->row_buf + 1 + initial_val;
1406 dstptr = row + initial_val;
1408 for (i = initial_val; i < final_val; i += stride)
1410 png_memcpy(dstptr, srcptr, rep_bytes);
1411 srcptr += stride;
1412 dstptr += stride;
1414 if (diff) /* number of leftover pixels: 3 for pngtest */
1416 final_val+=diff*BPP6;
1417 for (; i < final_val; i += stride)
1419 if (rep_bytes > (int)(final_val-i))
1420 rep_bytes = (int)(final_val-i);
1421 png_memcpy(dstptr, srcptr, rep_bytes);
1422 srcptr += stride;
1423 dstptr += stride;
1426 } /* end of else (_mmx_supported) */
1428 break;
1429 } /* end 48 bpp */
1431 case 64: /* png_ptr->row_info.pixel_depth */
1433 png_bytep srcptr;
1434 png_bytep dstptr;
1435 register png_uint_32 i;
1436 png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1437 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1438 register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1439 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1440 register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1441 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1442 png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1443 int diff = (int) (png_ptr->width & 7); /* amount lost */
1444 register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1446 srcptr = png_ptr->row_buf + 1 + initial_val;
1447 dstptr = row + initial_val;
1449 for (i = initial_val; i < final_val; i += stride)
1451 png_memcpy(dstptr, srcptr, rep_bytes);
1452 srcptr += stride;
1453 dstptr += stride;
1455 if (diff) /* number of leftover pixels: 3 for pngtest */
1457 final_val+=diff*BPP8;
1458 for (; i < final_val; i += stride)
1460 if (rep_bytes > (int)(final_val-i))
1461 rep_bytes = (int)(final_val-i);
1462 png_memcpy(dstptr, srcptr, rep_bytes);
1463 srcptr += stride;
1464 dstptr += stride;
1468 break;
1469 } /* end 64 bpp */
1471 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1473 /* this should never happen */
1474 png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1475 break;
1477 } /* end switch (png_ptr->row_info.pixel_depth) */
1479 } /* end if (non-trivial mask) */
1481 } /* end png_combine_row() */
1483 #endif /* PNG_HAVE_MMX_COMBINE_ROW */
1488 /*===========================================================================*/
1489 /* */
1490 /* P N G _ D O _ R E A D _ I N T E R L A C E */
1491 /* */
1492 /*===========================================================================*/
1494 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1495 #if defined(PNG_HAVE_MMX_READ_INTERLACE)
/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
 * has taken place.  [GRR: what other steps come before and/or after?]
 */
1501 void /* PRIVATE */
1502 png_do_read_interlace(png_structp png_ptr)
1504 png_row_infop row_info = &(png_ptr->row_info);
1505 png_bytep row = png_ptr->row_buf + 1;
1506 int pass = png_ptr->pass;
1507 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1508 png_uint_32 transformations = png_ptr->transformations;
1509 #endif
1511 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1513 #if defined(PNG_MMX_CODE_SUPPORTED)
1514 if (_mmx_supported == 2) {
1515 #if !defined(PNG_1_0_X)
1516 /* this should have happened in png_init_mmx_flags() already */
1517 png_warning(png_ptr, "asm_flags may not have been initialized");
1518 #endif
1519 png_mmx_support();
1521 #endif
1523 if (row != NULL && row_info != NULL)
1525 png_uint_32 final_width;
1527 final_width = row_info->width * png_pass_inc[pass];
1529 switch (row_info->pixel_depth)
1531 case 1:
1533 png_bytep sp, dp;
1534 int sshift, dshift;
1535 int s_start, s_end, s_inc;
1536 png_byte v;
1537 png_uint_32 i;
1538 int j;
1540 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1541 dp = row + (png_size_t)((final_width - 1) >> 3);
1542 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1543 if (transformations & PNG_PACKSWAP)
1545 sshift = (int)((row_info->width + 7) & 7);
1546 dshift = (int)((final_width + 7) & 7);
1547 s_start = 7;
1548 s_end = 0;
1549 s_inc = -1;
1551 else
1552 #endif
1554 sshift = 7 - (int)((row_info->width + 7) & 7);
1555 dshift = 7 - (int)((final_width + 7) & 7);
1556 s_start = 0;
1557 s_end = 7;
1558 s_inc = 1;
1561 for (i = row_info->width; i; i--)
1563 v = (png_byte)((*sp >> sshift) & 0x1);
1564 for (j = 0; j < png_pass_inc[pass]; j++)
1566 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1567 *dp |= (png_byte)(v << dshift);
1568 if (dshift == s_end)
1570 dshift = s_start;
1571 dp--;
1573 else
1574 dshift += s_inc;
1576 if (sshift == s_end)
1578 sshift = s_start;
1579 sp--;
1581 else
1582 sshift += s_inc;
1584 break;
1587 case 2:
1589 png_bytep sp, dp;
1590 int sshift, dshift;
1591 int s_start, s_end, s_inc;
1592 png_uint_32 i;
1594 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1595 dp = row + (png_size_t)((final_width - 1) >> 2);
1596 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1597 if (transformations & PNG_PACKSWAP)
1599 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1600 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1601 s_start = 6;
1602 s_end = 0;
1603 s_inc = -2;
1605 else
1606 #endif
1608 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1609 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1610 s_start = 0;
1611 s_end = 6;
1612 s_inc = 2;
1615 for (i = row_info->width; i; i--)
1617 png_byte v;
1618 int j;
1620 v = (png_byte)((*sp >> sshift) & 0x3);
1621 for (j = 0; j < png_pass_inc[pass]; j++)
1623 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1624 *dp |= (png_byte)(v << dshift);
1625 if (dshift == s_end)
1627 dshift = s_start;
1628 dp--;
1630 else
1631 dshift += s_inc;
1633 if (sshift == s_end)
1635 sshift = s_start;
1636 sp--;
1638 else
1639 sshift += s_inc;
1641 break;
1644 case 4:
1646 png_bytep sp, dp;
1647 int sshift, dshift;
1648 int s_start, s_end, s_inc;
1649 png_uint_32 i;
1651 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1652 dp = row + (png_size_t)((final_width - 1) >> 1);
1653 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1654 if (transformations & PNG_PACKSWAP)
1656 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1657 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1658 s_start = 4;
1659 s_end = 0;
1660 s_inc = -4;
1662 else
1663 #endif
1665 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1666 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1667 s_start = 0;
1668 s_end = 4;
1669 s_inc = 4;
1672 for (i = row_info->width; i; i--)
1674 png_byte v;
1675 int j;
1677 v = (png_byte)((*sp >> sshift) & 0xf);
1678 for (j = 0; j < png_pass_inc[pass]; j++)
1680 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1681 *dp |= (png_byte)(v << dshift);
1682 if (dshift == s_end)
1684 dshift = s_start;
1685 dp--;
1687 else
1688 dshift += s_inc;
1690 if (sshift == s_end)
1692 sshift = s_start;
1693 sp--;
1695 else
1696 sshift += s_inc;
1698 break;
1701 /*====================================================================*/
1703 default: /* 8-bit or larger (this is where the routine is modified) */
1705 #if 0
1706 // static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1707 // static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1708 // unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1709 // unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1710 #endif
1711 png_bytep sptr, dp;
1712 png_uint_32 i;
1713 png_size_t pixel_bytes;
1714 int width = (int)row_info->width;
1716 pixel_bytes = (row_info->pixel_depth >> 3);
1718 /* point sptr at the last pixel in the pre-expanded row: */
1719 sptr = row + (width - 1) * pixel_bytes;
1721 /* point dp at the last pixel position in the expanded row: */
1722 dp = row + (final_width - 1) * pixel_bytes;
1724 /* New code by Nirav Chhatrapati - Intel Corporation */
1726 #if defined(PNG_MMX_CODE_SUPPORTED)
1727 #if !defined(PNG_1_0_X)
1728 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1729 /* && _mmx_supported */ )
1730 #else
1731 if (_mmx_supported)
1732 #endif
1734 //--------------------------------------------------------------
1735 if (pixel_bytes == 3)
1737 if (((pass == 0) || (pass == 1)) && width)
1739 int dummy_value_c; // fix 'forbidden register spilled'
1740 int dummy_value_S;
1741 int dummy_value_D;
1742 int dummy_value_a;
1744 __asm__ __volatile__ (
1745 "subl $21, %%edi \n\t"
1746 // (png_pass_inc[pass] - 1)*pixel_bytes
1748 ".loop3_pass0: \n\t"
1749 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1750 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1751 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1752 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1753 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1754 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1755 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1756 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1757 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1758 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1759 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1760 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1761 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1762 "movq %%mm4, 16(%%edi) \n\t"
1763 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1764 "movq %%mm3, 8(%%edi) \n\t"
1765 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1766 "subl $3, %%esi \n\t"
1767 "movq %%mm0, (%%edi) \n\t"
1768 "subl $24, %%edi \n\t"
1769 "decl %%ecx \n\t"
1770 "jnz .loop3_pass0 \n\t"
1771 "EMMS \n\t" // DONE
1773 : "=c" (dummy_value_c), // output regs (dummy)
1774 "=S" (dummy_value_S),
1775 "=D" (dummy_value_D),
1776 "=a" (dummy_value_a)
1779 : "1" (sptr), // esi // input regs
1780 "2" (dp), // edi
1781 "0" (width), // ecx
1782 "3" (&_const4) // %1(?) (0x0000000000FFFFFFLL)
1784 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1785 : "%mm0", "%mm1", "%mm2" // clobber list
1786 , "%mm3", "%mm4"
1787 #endif
1790 else if (((pass == 2) || (pass == 3)) && width)
1792 int dummy_value_c; // fix 'forbidden register spilled'
1793 int dummy_value_S;
1794 int dummy_value_D;
1795 int dummy_value_a;
1797 __asm__ __volatile__ (
1798 "subl $9, %%edi \n\t"
1799 // (png_pass_inc[pass] - 1)*pixel_bytes
1801 ".loop3_pass2: \n\t"
1802 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1803 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1804 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1805 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1806 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1807 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1808 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1809 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1810 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1811 "movq %%mm0, 4(%%edi) \n\t"
1812 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1813 "subl $3, %%esi \n\t"
1814 "movd %%mm0, (%%edi) \n\t"
1815 "subl $12, %%edi \n\t"
1816 "decl %%ecx \n\t"
1817 "jnz .loop3_pass2 \n\t"
1818 "EMMS \n\t" // DONE
1820 : "=c" (dummy_value_c), // output regs (dummy)
1821 "=S" (dummy_value_S),
1822 "=D" (dummy_value_D),
1823 "=a" (dummy_value_a)
1825 : "1" (sptr), // esi // input regs
1826 "2" (dp), // edi
1827 "0" (width), // ecx
1828 "3" (&_const4) // (0x0000000000FFFFFFLL)
1830 #if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1831 : "%mm0", "%mm1", "%mm2" // clobber list
1832 #endif
1835 else if (width) /* && ((pass == 4) || (pass == 5)) */
1837 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1838 if (width_mmx < 0)
1839 width_mmx = 0;
1840 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1841 if (width_mmx)
1843 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1844 // sptr points at last pixel in pre-expanded row
1845 // dp points at last pixel position in expanded row
1846 int dummy_value_c; // fix 'forbidden register spilled'
1847 int dummy_value_S;
1848 int dummy_value_D;
1849 int dummy_value_a;
1850 int dummy_value_d;
1852 __asm__ __volatile__ (
1853 "subl $3, %%esi \n\t"
1854 "subl $9, %%edi \n\t"
1855 // (png_pass_inc[pass] + 1)*pixel_bytes
1857 ".loop3_pass4: \n\t"
1858 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1859 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1860 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1861 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1862 "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
1863 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1864 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1865 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1866 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1867 "movq %%mm0, (%%edi) \n\t"
1868 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1869 "pand (%4), %%mm3 \n\t" // z z z z z z z 5
1870 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1871 "subl $6, %%esi \n\t"
1872 "movd %%mm2, 8(%%edi) \n\t"
1873 "subl $12, %%edi \n\t"
1874 "subl $2, %%ecx \n\t"
1875 "jnz .loop3_pass4 \n\t"
1876 "EMMS \n\t" // DONE
1878 : "=c" (dummy_value_c), // output regs (dummy)
1879 "=S" (dummy_value_S),
1880 "=D" (dummy_value_D),
1881 "=a" (dummy_value_a),
1882 "=d" (dummy_value_d)
1884 : "1" (sptr), // esi // input regs
1885 "2" (dp), // edi
1886 "0" (width_mmx), // ecx
1887 "3" (&_const4), // 0x0000000000FFFFFFLL
1888 "4" (&_const6) // 0x00000000000000FFLL
1890 #if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1891 : "%mm0", "%mm1" // clobber list
1892 , "%mm2", "%mm3"
1893 #endif
1897 sptr -= width_mmx*3;
1898 dp -= width_mmx*6;
1899 for (i = width; i; i--)
1901 png_byte v[8];
1902 int j;
1904 png_memcpy(v, sptr, 3);
1905 for (j = 0; j < png_pass_inc[pass]; j++)
1907 png_memcpy(dp, v, 3);
1908 dp -= 3;
1910 sptr -= 3;
1913 } /* end of pixel_bytes == 3 */
1915 //--------------------------------------------------------------
1916 else if (pixel_bytes == 1)
1918 if (((pass == 0) || (pass == 1)) && width)
1920 int width_mmx = ((width >> 2) << 2);
1921 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1922 if (width_mmx)
1924 int dummy_value_c; // fix 'forbidden register spilled'
1925 int dummy_value_S;
1926 int dummy_value_D;
1928 __asm__ __volatile__ (
1929 "subl $3, %%esi \n\t"
1930 "subl $31, %%edi \n\t"
1932 ".loop1_pass0: \n\t"
1933 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1934 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1935 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1936 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1937 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1938 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1939 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1940 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1941 "movq %%mm0, (%%edi) \n\t"
1942 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1943 "movq %%mm3, 8(%%edi) \n\t"
1944 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1945 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1946 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1947 "movq %%mm2, 16(%%edi) \n\t"
1948 "subl $4, %%esi \n\t"
1949 "movq %%mm4, 24(%%edi) \n\t"
1950 "subl $32, %%edi \n\t"
1951 "subl $4, %%ecx \n\t"
1952 "jnz .loop1_pass0 \n\t"
1953 "EMMS \n\t" // DONE
1955 : "=c" (dummy_value_c), // output regs (dummy)
1956 "=S" (dummy_value_S),
1957 "=D" (dummy_value_D)
1959 : "1" (sptr), // esi // input regs
1960 "2" (dp), // edi
1961 "0" (width_mmx) // ecx
1963 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1964 : "%mm0", "%mm1", "%mm2" // clobber list
1965 , "%mm3", "%mm4"
1966 #endif
1970 sptr -= width_mmx;
1971 dp -= width_mmx*8;
1972 for (i = width; i; i--)
1974 int j;
1976 /* I simplified this part in version 1.0.4e
1977 * here and in several other instances where
1978 * pixel_bytes == 1 -- GR-P
1980 * Original code:
1982 * png_byte v[8];
1983 * png_memcpy(v, sptr, pixel_bytes);
1984 * for (j = 0; j < png_pass_inc[pass]; j++)
1986 * png_memcpy(dp, v, pixel_bytes);
1987 * dp -= pixel_bytes;
1989 * sptr -= pixel_bytes;
1991 * Replacement code is in the next three lines:
1994 for (j = 0; j < png_pass_inc[pass]; j++)
1996 *dp-- = *sptr;
1998 --sptr;
2001 else if (((pass == 2) || (pass == 3)) && width)
2003 int width_mmx = ((width >> 2) << 2);
2004 width -= width_mmx; // 0-3 pixels => 0-3 bytes
2005 if (width_mmx)
2007 int dummy_value_c; // fix 'forbidden register spilled'
2008 int dummy_value_S;
2009 int dummy_value_D;
2011 __asm__ __volatile__ (
2012 "subl $3, %%esi \n\t"
2013 "subl $15, %%edi \n\t"
2015 ".loop1_pass2: \n\t"
2016 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2017 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2018 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
2019 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
2020 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
2021 "movq %%mm0, (%%edi) \n\t"
2022 "subl $4, %%esi \n\t"
2023 "movq %%mm1, 8(%%edi) \n\t"
2024 "subl $16, %%edi \n\t"
2025 "subl $4, %%ecx \n\t"
2026 "jnz .loop1_pass2 \n\t"
2027 "EMMS \n\t" // DONE
2029 : "=c" (dummy_value_c), // output regs (dummy)
2030 "=S" (dummy_value_S),
2031 "=D" (dummy_value_D)
2033 : "1" (sptr), // esi // input regs
2034 "2" (dp), // edi
2035 "0" (width_mmx) // ecx
2037 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2038 : "%mm0", "%mm1" // clobber list
2039 #endif
2043 sptr -= width_mmx;
2044 dp -= width_mmx*4;
2045 for (i = width; i; i--)
2047 int j;
2049 for (j = 0; j < png_pass_inc[pass]; j++)
2051 *dp-- = *sptr;
2053 --sptr;
2056 else if (width) /* && ((pass == 4) || (pass == 5)) */
2058 int width_mmx = ((width >> 3) << 3);
2059 width -= width_mmx; // 0-3 pixels => 0-3 bytes
2060 if (width_mmx)
2062 int dummy_value_c; // fix 'forbidden register spilled'
2063 int dummy_value_S;
2064 int dummy_value_D;
2066 __asm__ __volatile__ (
2067 "subl $7, %%esi \n\t"
2068 "subl $15, %%edi \n\t"
2070 ".loop1_pass4: \n\t"
2071 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2072 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2073 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2074 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2075 "movq %%mm1, 8(%%edi) \n\t"
2076 "subl $8, %%esi \n\t"
2077 "movq %%mm0, (%%edi) \n\t"
2078 "subl $16, %%edi \n\t"
2079 "subl $8, %%ecx \n\t"
2080 "jnz .loop1_pass4 \n\t"
2081 "EMMS \n\t" // DONE
2083 : "=c" (dummy_value_c), // output regs (none)
2084 "=S" (dummy_value_S),
2085 "=D" (dummy_value_D)
2087 : "1" (sptr), // esi // input regs
2088 "2" (dp), // edi
2089 "0" (width_mmx) // ecx
2091 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2092 : "%mm0", "%mm1" // clobber list
2093 #endif
2097 sptr -= width_mmx;
2098 dp -= width_mmx*2;
2099 for (i = width; i; i--)
2101 int j;
2103 for (j = 0; j < png_pass_inc[pass]; j++)
2105 *dp-- = *sptr;
2107 --sptr;
2110 } /* end of pixel_bytes == 1 */
2112 //--------------------------------------------------------------
2113 else if (pixel_bytes == 2)
2115 if (((pass == 0) || (pass == 1)) && width)
2117 int width_mmx = ((width >> 1) << 1);
2118 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2119 if (width_mmx)
2121 int dummy_value_c; // fix 'forbidden register spilled'
2122 int dummy_value_S;
2123 int dummy_value_D;
2125 __asm__ __volatile__ (
2126 "subl $2, %%esi \n\t"
2127 "subl $30, %%edi \n\t"
2129 ".loop2_pass0: \n\t"
2130 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2131 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2132 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2133 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2134 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2135 "movq %%mm0, (%%edi) \n\t"
2136 "movq %%mm0, 8(%%edi) \n\t"
2137 "movq %%mm1, 16(%%edi) \n\t"
2138 "subl $4, %%esi \n\t"
2139 "movq %%mm1, 24(%%edi) \n\t"
2140 "subl $32, %%edi \n\t"
2141 "subl $2, %%ecx \n\t"
2142 "jnz .loop2_pass0 \n\t"
2143 "EMMS \n\t" // DONE
2145 : "=c" (dummy_value_c), // output regs (dummy)
2146 "=S" (dummy_value_S),
2147 "=D" (dummy_value_D)
2149 : "1" (sptr), // esi // input regs
2150 "2" (dp), // edi
2151 "0" (width_mmx) // ecx
2153 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2154 : "%mm0", "%mm1" // clobber list
2155 #endif
2159 sptr -= (width_mmx*2 - 2); // sign fixed
2160 dp -= (width_mmx*16 - 2); // sign fixed
2161 for (i = width; i; i--)
2163 png_byte v[8];
2164 int j;
2165 sptr -= 2;
2166 png_memcpy(v, sptr, 2);
2167 for (j = 0; j < png_pass_inc[pass]; j++)
2169 dp -= 2;
2170 png_memcpy(dp, v, 2);
2174 else if (((pass == 2) || (pass == 3)) && width)
2176 int width_mmx = ((width >> 1) << 1) ;
2177 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2178 if (width_mmx)
2180 int dummy_value_c; // fix 'forbidden register spilled'
2181 int dummy_value_S;
2182 int dummy_value_D;
2184 __asm__ __volatile__ (
2185 "subl $2, %%esi \n\t"
2186 "subl $14, %%edi \n\t"
2188 ".loop2_pass2: \n\t"
2189 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2190 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2191 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2192 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2193 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2194 "movq %%mm0, (%%edi) \n\t"
2195 "subl $4, %%esi \n\t"
2196 "movq %%mm1, 8(%%edi) \n\t"
2197 "subl $16, %%edi \n\t"
2198 "subl $2, %%ecx \n\t"
2199 "jnz .loop2_pass2 \n\t"
2200 "EMMS \n\t" // DONE
2202 : "=c" (dummy_value_c), // output regs (dummy)
2203 "=S" (dummy_value_S),
2204 "=D" (dummy_value_D)
2206 : "1" (sptr), // esi // input regs
2207 "2" (dp), // edi
2208 "0" (width_mmx) // ecx
2210 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2211 : "%mm0", "%mm1" // clobber list
2212 #endif
2216 sptr -= (width_mmx*2 - 2); // sign fixed
2217 dp -= (width_mmx*8 - 2); // sign fixed
2218 for (i = width; i; i--)
2220 png_byte v[8];
2221 int j;
2222 sptr -= 2;
2223 png_memcpy(v, sptr, 2);
2224 for (j = 0; j < png_pass_inc[pass]; j++)
2226 dp -= 2;
2227 png_memcpy(dp, v, 2);
2231 else if (width) // pass == 4 or 5
2233 int width_mmx = ((width >> 1) << 1) ;
2234 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2235 if (width_mmx)
2237 int dummy_value_c; // fix 'forbidden register spilled'
2238 int dummy_value_S;
2239 int dummy_value_D;
2241 __asm__ __volatile__ (
2242 "subl $2, %%esi \n\t"
2243 "subl $6, %%edi \n\t"
2245 ".loop2_pass4: \n\t"
2246 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2247 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2248 "subl $4, %%esi \n\t"
2249 "movq %%mm0, (%%edi) \n\t"
2250 "subl $8, %%edi \n\t"
2251 "subl $2, %%ecx \n\t"
2252 "jnz .loop2_pass4 \n\t"
2253 "EMMS \n\t" // DONE
2255 : "=c" (dummy_value_c), // output regs (dummy)
2256 "=S" (dummy_value_S),
2257 "=D" (dummy_value_D)
2259 : "1" (sptr), // esi // input regs
2260 "2" (dp), // edi
2261 "0" (width_mmx) // ecx
2263 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2264 : "%mm0" // clobber list
2265 #endif
2269 sptr -= (width_mmx*2 - 2); // sign fixed
2270 dp -= (width_mmx*4 - 2); // sign fixed
2271 for (i = width; i; i--)
2273 png_byte v[8];
2274 int j;
2275 sptr -= 2;
2276 png_memcpy(v, sptr, 2);
2277 for (j = 0; j < png_pass_inc[pass]; j++)
2279 dp -= 2;
2280 png_memcpy(dp, v, 2);
2284 } /* end of pixel_bytes == 2 */
2286 //--------------------------------------------------------------
2287 else if (pixel_bytes == 4)
2289 if (((pass == 0) || (pass == 1)) && width)
2291 int width_mmx = ((width >> 1) << 1);
2292 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2293 if (width_mmx)
2295 int dummy_value_c; // fix 'forbidden register spilled'
2296 int dummy_value_S;
2297 int dummy_value_D;
2299 __asm__ __volatile__ (
2300 "subl $4, %%esi \n\t"
2301 "subl $60, %%edi \n\t"
2303 ".loop4_pass0: \n\t"
2304 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2305 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2306 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2307 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2308 "movq %%mm0, (%%edi) \n\t"
2309 "movq %%mm0, 8(%%edi) \n\t"
2310 "movq %%mm0, 16(%%edi) \n\t"
2311 "movq %%mm0, 24(%%edi) \n\t"
2312 "movq %%mm1, 32(%%edi) \n\t"
2313 "movq %%mm1, 40(%%edi) \n\t"
2314 "movq %%mm1, 48(%%edi) \n\t"
2315 "subl $8, %%esi \n\t"
2316 "movq %%mm1, 56(%%edi) \n\t"
2317 "subl $64, %%edi \n\t"
2318 "subl $2, %%ecx \n\t"
2319 "jnz .loop4_pass0 \n\t"
2320 "EMMS \n\t" // DONE
2322 : "=c" (dummy_value_c), // output regs (dummy)
2323 "=S" (dummy_value_S),
2324 "=D" (dummy_value_D)
2326 : "1" (sptr), // esi // input regs
2327 "2" (dp), // edi
2328 "0" (width_mmx) // ecx
2330 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2331 : "%mm0", "%mm1" // clobber list
2332 #endif
2336 sptr -= (width_mmx*4 - 4); // sign fixed
2337 dp -= (width_mmx*32 - 4); // sign fixed
2338 for (i = width; i; i--)
2340 png_byte v[8];
2341 int j;
2342 sptr -= 4;
2343 png_memcpy(v, sptr, 4);
2344 for (j = 0; j < png_pass_inc[pass]; j++)
2346 dp -= 4;
2347 png_memcpy(dp, v, 4);
2351 else if (((pass == 2) || (pass == 3)) && width)
2353 int width_mmx = ((width >> 1) << 1);
2354 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2355 if (width_mmx)
2357 int dummy_value_c; // fix 'forbidden register spilled'
2358 int dummy_value_S;
2359 int dummy_value_D;
2361 __asm__ __volatile__ (
2362 "subl $4, %%esi \n\t"
2363 "subl $28, %%edi \n\t"
2365 ".loop4_pass2: \n\t"
2366 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2367 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2368 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2369 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2370 "movq %%mm0, (%%edi) \n\t"
2371 "movq %%mm0, 8(%%edi) \n\t"
2372 "movq %%mm1, 16(%%edi) \n\t"
2373 "movq %%mm1, 24(%%edi) \n\t"
2374 "subl $8, %%esi \n\t"
2375 "subl $32, %%edi \n\t"
2376 "subl $2, %%ecx \n\t"
2377 "jnz .loop4_pass2 \n\t"
2378 "EMMS \n\t" // DONE
2380 : "=c" (dummy_value_c), // output regs (dummy)
2381 "=S" (dummy_value_S),
2382 "=D" (dummy_value_D)
2384 : "1" (sptr), // esi // input regs
2385 "2" (dp), // edi
2386 "0" (width_mmx) // ecx
2388 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2389 : "%mm0", "%mm1" // clobber list
2390 #endif
2394 sptr -= (width_mmx*4 - 4); // sign fixed
2395 dp -= (width_mmx*16 - 4); // sign fixed
2396 for (i = width; i; i--)
2398 png_byte v[8];
2399 int j;
2400 sptr -= 4;
2401 png_memcpy(v, sptr, 4);
2402 for (j = 0; j < png_pass_inc[pass]; j++)
2404 dp -= 4;
2405 png_memcpy(dp, v, 4);
2409 else if (width) // pass == 4 or 5
2411 int width_mmx = ((width >> 1) << 1) ;
2412 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2413 if (width_mmx)
2415 int dummy_value_c; // fix 'forbidden register spilled'
2416 int dummy_value_S;
2417 int dummy_value_D;
2419 __asm__ __volatile__ (
2420 "subl $4, %%esi \n\t"
2421 "subl $12, %%edi \n\t"
2423 ".loop4_pass4: \n\t"
2424 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2425 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2426 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2427 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2428 "movq %%mm0, (%%edi) \n\t"
2429 "subl $8, %%esi \n\t"
2430 "movq %%mm1, 8(%%edi) \n\t"
2431 "subl $16, %%edi \n\t"
2432 "subl $2, %%ecx \n\t"
2433 "jnz .loop4_pass4 \n\t"
2434 "EMMS \n\t" // DONE
2436 : "=c" (dummy_value_c), // output regs (dummy)
2437 "=S" (dummy_value_S),
2438 "=D" (dummy_value_D)
2440 : "1" (sptr), // esi // input regs
2441 "2" (dp), // edi
2442 "0" (width_mmx) // ecx
2444 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2445 : "%mm0", "%mm1" // clobber list
2446 #endif
2450 sptr -= (width_mmx*4 - 4); // sign fixed
2451 dp -= (width_mmx*8 - 4); // sign fixed
2452 for (i = width; i; i--)
2454 png_byte v[8];
2455 int j;
2456 sptr -= 4;
2457 png_memcpy(v, sptr, 4);
2458 for (j = 0; j < png_pass_inc[pass]; j++)
2460 dp -= 4;
2461 png_memcpy(dp, v, 4);
2465 } /* end of pixel_bytes == 4 */
2467 //--------------------------------------------------------------
2468 else if (pixel_bytes == 8)
2470 // GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2471 // GRR NOTE: no need to combine passes here!
2472 if (((pass == 0) || (pass == 1)) && width)
2474 int dummy_value_c; // fix 'forbidden register spilled'
2475 int dummy_value_S;
2476 int dummy_value_D;
2478 // source is 8-byte RRGGBBAA
2479 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2480 __asm__ __volatile__ (
2481 "subl $56, %%edi \n\t" // start of last block
2483 ".loop8_pass0: \n\t"
2484 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2485 "movq %%mm0, (%%edi) \n\t"
2486 "movq %%mm0, 8(%%edi) \n\t"
2487 "movq %%mm0, 16(%%edi) \n\t"
2488 "movq %%mm0, 24(%%edi) \n\t"
2489 "movq %%mm0, 32(%%edi) \n\t"
2490 "movq %%mm0, 40(%%edi) \n\t"
2491 "movq %%mm0, 48(%%edi) \n\t"
2492 "subl $8, %%esi \n\t"
2493 "movq %%mm0, 56(%%edi) \n\t"
2494 "subl $64, %%edi \n\t"
2495 "decl %%ecx \n\t"
2496 "jnz .loop8_pass0 \n\t"
2497 "EMMS \n\t" // DONE
2499 : "=c" (dummy_value_c), // output regs (dummy)
2500 "=S" (dummy_value_S),
2501 "=D" (dummy_value_D)
2503 : "1" (sptr), // esi // input regs
2504 "2" (dp), // edi
2505 "0" (width) // ecx
2507 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2508 : "%mm0" // clobber list
2509 #endif
2512 else if (((pass == 2) || (pass == 3)) && width)
2514 // source is 8-byte RRGGBBAA
2515 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2516 // (recall that expansion is _in place_: sptr and dp
2517 // both point at locations within same row buffer)
2519 int dummy_value_c; // fix 'forbidden register spilled'
2520 int dummy_value_S;
2521 int dummy_value_D;
2523 __asm__ __volatile__ (
2524 "subl $24, %%edi \n\t" // start of last block
2526 ".loop8_pass2: \n\t"
2527 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2528 "movq %%mm0, (%%edi) \n\t"
2529 "movq %%mm0, 8(%%edi) \n\t"
2530 "movq %%mm0, 16(%%edi) \n\t"
2531 "subl $8, %%esi \n\t"
2532 "movq %%mm0, 24(%%edi) \n\t"
2533 "subl $32, %%edi \n\t"
2534 "decl %%ecx \n\t"
2535 "jnz .loop8_pass2 \n\t"
2536 "EMMS \n\t" // DONE
2538 : "=c" (dummy_value_c), // output regs (dummy)
2539 "=S" (dummy_value_S),
2540 "=D" (dummy_value_D)
2542 : "1" (sptr), // esi // input regs
2543 "2" (dp), // edi
2544 "0" (width) // ecx
2546 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2547 : "%mm0" // clobber list
2548 #endif
2552 else if (width) // pass == 4 or 5
2554 // source is 8-byte RRGGBBAA
2555 // dest is 16-byte RRGGBBAA RRGGBBAA
2557 int dummy_value_c; // fix 'forbidden register spilled'
2558 int dummy_value_S;
2559 int dummy_value_D;
2561 __asm__ __volatile__ (
2562 "subl $8, %%edi \n\t" // start of last block
2564 ".loop8_pass4: \n\t"
2565 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2566 "movq %%mm0, (%%edi) \n\t"
2567 "subl $8, %%esi \n\t"
2568 "movq %%mm0, 8(%%edi) \n\t"
2569 "subl $16, %%edi \n\t"
2570 "decl %%ecx \n\t"
2571 "jnz .loop8_pass4 \n\t"
2572 "EMMS \n\t" // DONE
2574 : "=c" (dummy_value_c), // output regs (dummy)
2575 "=S" (dummy_value_S),
2576 "=D" (dummy_value_D)
2578 : "1" (sptr), // esi // input regs
2579 "2" (dp), // edi
2580 "0" (width) // ecx
2582 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2583 : "%mm0" // clobber list
2584 #endif
2589 } /* end of pixel_bytes == 8 */
2591 //--------------------------------------------------------------
2592 else if (pixel_bytes == 6)
2594 for (i = width; i; i--)
2596 png_byte v[8];
2597 int j;
2598 png_memcpy(v, sptr, 6);
2599 for (j = 0; j < png_pass_inc[pass]; j++)
2601 png_memcpy(dp, v, 6);
2602 dp -= 6;
2604 sptr -= 6;
2606 } /* end of pixel_bytes == 6 */
2608 //--------------------------------------------------------------
2609 else
2611 for (i = width; i; i--)
2613 png_byte v[8];
2614 int j;
2615 png_memcpy(v, sptr, pixel_bytes);
2616 for (j = 0; j < png_pass_inc[pass]; j++)
2618 png_memcpy(dp, v, pixel_bytes);
2619 dp -= pixel_bytes;
2621 sptr-= pixel_bytes;
2624 } // end of _mmx_supported ========================================
2626 else /* MMX not supported: use modified C code - takes advantage
2627 * of inlining of png_memcpy for a constant */
2628 /* GRR 19991007: does it? or should pixel_bytes in each
2629 * block be replaced with immediate value (e.g., 1)? */
2630 /* GRR 19991017: replaced with constants in each case */
2631 #endif /* PNG_MMX_CODE_SUPPORTED */
2633 if (pixel_bytes == 1)
2635 for (i = width; i; i--)
2637 int j;
2638 for (j = 0; j < png_pass_inc[pass]; j++)
2640 *dp-- = *sptr;
2642 --sptr;
2645 else if (pixel_bytes == 3)
2647 for (i = width; i; i--)
2649 png_byte v[8];
2650 int j;
2651 png_memcpy(v, sptr, 3);
2652 for (j = 0; j < png_pass_inc[pass]; j++)
2654 png_memcpy(dp, v, 3);
2655 dp -= 3;
2657 sptr -= 3;
2660 else if (pixel_bytes == 2)
2662 for (i = width; i; i--)
2664 png_byte v[8];
2665 int j;
2666 png_memcpy(v, sptr, 2);
2667 for (j = 0; j < png_pass_inc[pass]; j++)
2669 png_memcpy(dp, v, 2);
2670 dp -= 2;
2672 sptr -= 2;
2675 else if (pixel_bytes == 4)
2677 for (i = width; i; i--)
2679 png_byte v[8];
2680 int j;
2681 png_memcpy(v, sptr, 4);
2682 for (j = 0; j < png_pass_inc[pass]; j++)
2684 #ifdef PNG_DEBUG
2685 if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2687 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2688 row, dp, row+png_ptr->row_buf_size);
2689 printf("row_buf=%d\n",png_ptr->row_buf_size);
2691 #endif
2692 png_memcpy(dp, v, 4);
2693 dp -= 4;
2695 sptr -= 4;
2698 else if (pixel_bytes == 6)
2700 for (i = width; i; i--)
2702 png_byte v[8];
2703 int j;
2704 png_memcpy(v, sptr, 6);
2705 for (j = 0; j < png_pass_inc[pass]; j++)
2707 png_memcpy(dp, v, 6);
2708 dp -= 6;
2710 sptr -= 6;
2713 else if (pixel_bytes == 8)
2715 for (i = width; i; i--)
2717 png_byte v[8];
2718 int j;
2719 png_memcpy(v, sptr, 8);
2720 for (j = 0; j < png_pass_inc[pass]; j++)
2722 png_memcpy(dp, v, 8);
2723 dp -= 8;
2725 sptr -= 8;
2728 else /* GRR: should never be reached */
2730 for (i = width; i; i--)
2732 png_byte v[8];
2733 int j;
2734 png_memcpy(v, sptr, pixel_bytes);
2735 for (j = 0; j < png_pass_inc[pass]; j++)
2737 png_memcpy(dp, v, pixel_bytes);
2738 dp -= pixel_bytes;
2740 sptr -= pixel_bytes;
2744 } /* end if (MMX not supported) */
2745 break;
2747 } /* end switch (row_info->pixel_depth) */
2749 row_info->width = final_width;
2751 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2754 } /* end png_do_read_interlace() */
2756 #endif /* PNG_HAVE_MMX_READ_INTERLACE */
2757 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2761 #if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
2762 #if defined(PNG_MMX_CODE_SUPPORTED)
2764 // These variables are utilized in the functions below. They are declared
2765 // globally here to ensure alignment on 8-byte boundaries.
// 64-bit constants and scratch values shared between the C code and the MMX
// inline assembly of the filter routines.  The asm strings refer to these
// objects directly by symbol name (e.g. "movq _ActiveMask, %%mm7" and
// "psrlq _ShiftRem, %%mm2"), so the identifiers must stay in sync with every
// asm string that mentions them.
//
//   _LBCarryMask - 0x01 in every byte; ANDed with pixel data to isolate the
//                  least-significant bit of each byte before the bytes are
//                  halved (the "LBCarry" terms in the Avg filter loops).
//   _HBClearMask - 0x7f in every byte; ANDed after a 64-bit "psrlq $1" to
//                  clear bit 7 of each byte, which that shift fills with the
//                  low bit of the neighbouring byte.
//   _ActiveMask  - byte mask selecting the first "active group" of bytes;
//                  assigned per bytes-per-pixel value by
//                  png_read_filter_row_mmx_avg before its asm blocks run.
//   _ShiftBpp    - shift count in bits (bpp * 8), used as a psllq operand.
//   _ShiftRem    - 64 - _ShiftBpp, used as a psrlq operand.
//   _ActiveMask2, _ActiveMaskEnd - not referenced in the visible part of the
//                  file; presumably used by other filter routines
//                  (TODO confirm).
//
// NOTE: only the two masks with initializers are constant.  _ActiveMask,
// _ShiftBpp and _ShiftRem are overwritten at run time by
// png_read_filter_row_mmx_avg, which is compiled only when
// PNG_THREAD_UNSAFE_OK is defined.
2767 union uAll {
2768 long long use;   // 64-bit integer view: assigned from C, loaded by the asm
2769 double align;    // never accessed in the visible code; per this file's own
                      // comment it is here to request 8-byte alignment
                      // NOTE(review): confirm the target ABI really aligns
                      // these objects on 8 bytes
2770 } _LBCarryMask = {0x0101010101010101LL},
2771 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2772 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2774 #ifdef PNG_THREAD_UNSAFE_OK
2775 //===========================================================================//
2776 // //
2777 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2778 // //
2779 //===========================================================================//
2781 // Optimized code for PNG Average filter decoder
2783 static void /* PRIVATE */
2784 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2785 png_bytep prev_row)
2787 int bpp;
2788 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2789 int dummy_value_S;
2790 int dummy_value_D;
2792 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2793 _FullLength = row_info->rowbytes; // # of bytes to filter
2795 __asm__ __volatile__ (
2796 // initialize address pointers and offset
2797 #ifdef __PIC__
2798 "pushl %%ebx \n\t" // save index to Global Offset Table
2799 #endif
2800 //pre "movl row, %%edi \n\t" // edi: Avg(x)
2801 "xorl %%ebx, %%ebx \n\t" // ebx: x
2802 "movl %%edi, %%edx \n\t"
2803 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2804 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2805 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2807 "xorl %%eax,%%eax \n\t"
2809 // Compute the Raw value for the first bpp bytes
2810 // Raw(x) = Avg(x) + (Prior(x)/2)
2811 "avg_rlp: \n\t"
2812 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2813 "incl %%ebx \n\t"
2814 "shrb %%al \n\t" // divide by 2
2815 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2816 //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2817 "cmpl %%ecx, %%ebx \n\t"
2818 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2819 "jb avg_rlp \n\t" // mov does not affect flags
2821 // get # of bytes to alignment
2822 "movl %%edi, _dif \n\t" // take start of row
2823 "addl %%ebx, _dif \n\t" // add bpp
2824 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2825 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2826 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2827 "jz avg_go \n\t" // alignment
2829 // fix alignment
2830 // Compute the Raw value for the bytes up to the alignment boundary
2831 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2832 "xorl %%ecx, %%ecx \n\t"
2834 "avg_lp1: \n\t"
2835 "xorl %%eax, %%eax \n\t"
2836 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2837 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2838 "addw %%cx, %%ax \n\t"
2839 "incl %%ebx \n\t"
2840 "shrw %%ax \n\t" // divide by 2
2841 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2842 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2843 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2844 "jb avg_lp1 \n\t" // repeat until at alignment boundary
2846 "avg_go: \n\t"
2847 "movl _FullLength, %%eax \n\t"
2848 "movl %%eax, %%ecx \n\t"
2849 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2850 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2851 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2852 "movl %%ecx, _MMXLength \n\t"
2853 #ifdef __PIC__
2854 "popl %%ebx \n\t" // restore index to Global Offset Table
2855 #endif
2857 : "=c" (dummy_value_c), // output regs (dummy)
2858 "=S" (dummy_value_S),
2859 "=D" (dummy_value_D)
2861 : "0" (bpp), // ecx // input regs
2862 "1" (prev_row), // esi
2863 "2" (row) // edi
2865 : "%eax", "%edx" // clobber list
2866 #ifndef __PIC__
2867 , "%ebx"
2868 #endif
2869 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2870 // (seems to work fine without...)
2873 // now do the math for the rest of the row
2874 switch (bpp)
2876 case 3:
2878 _ActiveMask.use = 0x0000000000ffffffLL;
2879 _ShiftBpp.use = 24; // == 3 * 8
2880 _ShiftRem.use = 40; // == 64 - 24
2882 __asm__ __volatile__ (
2883 // re-init address pointers and offset
2884 "movq _ActiveMask, %%mm7 \n\t"
2885 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2886 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2887 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2888 "movq _HBClearMask, %%mm4 \n\t"
2889 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2891 // prime the pump: load the first Raw(x-bpp) data set
2892 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2893 // (correct pos. in loop below)
2894 "avg_3lp: \n\t"
2895 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2896 "movq %%mm5, %%mm3 \n\t"
2897 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2898 // data
2899 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2900 "movq %%mm7, %%mm6 \n\t"
2901 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2902 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2903 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2904 // byte
2905 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2906 // each byte
2907 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2908 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2909 // LBCarrys
2910 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2911 // where both
2912 // lsb's were == 1 (only valid for active group)
2913 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2914 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2915 // byte
2916 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2917 // for each byte
2918 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2919 // bytes to add to Avg
2920 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2921 // Avg for each Active
2922 // byte
2923 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2924 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
2925 // bytes 3-5
2926 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2927 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2928 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2929 // LBCarrys
2930 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2931 // where both
2932 // lsb's were == 1 (only valid for active group)
2933 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2934 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2935 // byte
2936 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2937 // for each byte
2938 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2939 // bytes to add to Avg
2940 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2941 // Avg for each Active
2942 // byte
2944 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2945 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
2946 // two
2947 // bytes
2948 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2949 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2950 // Data only needs to be shifted once here to
2951 // get the correct x-bpp offset.
2952 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2953 // LBCarrys
2954 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2955 // where both
2956 // lsb's were == 1 (only valid for active group)
2957 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2958 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2959 // byte
2960 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2961 // for each byte
2962 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2963 // bytes to add to Avg
2964 "addl $8, %%ecx \n\t"
2965 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2966 // Avg for each Active
2967 // byte
2968 // now ready to write back to memory
2969 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2970 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2971 "cmpl _MMXLength, %%ecx \n\t"
2972 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2973 "jb avg_3lp \n\t"
2975 : "=S" (dummy_value_S), // output regs (dummy)
2976 "=D" (dummy_value_D)
2978 : "0" (prev_row), // esi // input regs
2979 "1" (row) // edi
2981 : "%ecx" // clobber list
2982 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2983 , "%mm0", "%mm1", "%mm2", "%mm3"
2984 , "%mm4", "%mm5", "%mm6", "%mm7"
2985 #endif
2988 break; // end 3 bpp
2990 case 6:
2991 case 4:
2992 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2993 //case 5: // GRR BOGUS
2995 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2996 // appropriate inactive bytes
2997 _ShiftBpp.use = bpp << 3;
2998 _ShiftRem.use = 64 - _ShiftBpp.use;
3000 __asm__ __volatile__ (
3001 "movq _HBClearMask, %%mm4 \n\t"
3003 // re-init address pointers and offset
3004 "movl _dif, %%ecx \n\t" // ecx: x = offset to
3005 // alignment boundary
3007 // load _ActiveMask and clear all bytes except for 1st active group
3008 "movq _ActiveMask, %%mm7 \n\t"
3009 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3010 "psrlq _ShiftRem, %%mm7 \n\t"
3011 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3012 "movq %%mm7, %%mm6 \n\t"
3013 "movq _LBCarryMask, %%mm5 \n\t"
3014 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
3015 // group
3017 // prime the pump: load the first Raw(x-bpp) data set
3018 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3019 // (we correct pos. in loop below)
3020 "avg_4lp: \n\t"
3021 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3022 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3023 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3024 // add (Prev_row/2) to average
3025 "movq %%mm5, %%mm3 \n\t"
3026 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3027 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3028 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3029 // byte
3030 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3031 // each byte
3032 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3033 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3034 // LBCarrys
3035 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3036 // where both
3037 // lsb's were == 1 (only valid for active group)
3038 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3039 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3040 // byte
3041 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3042 // for each byte
3043 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
3044 // bytes to add to Avg
3045 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3046 // for each Active
3047 // byte
3048 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3049 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3050 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3051 "addl $8, %%ecx \n\t"
3052 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3053 // LBCarrys
3054 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3055 // where both
3056 // lsb's were == 1 (only valid for active group)
3057 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3058 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3059 // byte
3060 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3061 // for each byte
3062 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3063 // bytes to add to Avg
3064 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3065 // Avg for each Active
3066 // byte
3067 "cmpl _MMXLength, %%ecx \n\t"
3068 // now ready to write back to memory
3069 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3070 // prep Raw(x-bpp) for next loop
3071 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3072 "jb avg_4lp \n\t"
3074 : "=S" (dummy_value_S), // output regs (dummy)
3075 "=D" (dummy_value_D)
3077 : "0" (prev_row), // esi // input regs
3078 "1" (row) // edi
3080 : "%ecx" // clobber list
3081 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3082 , "%mm0", "%mm1", "%mm2", "%mm3"
3083 , "%mm4", "%mm5", "%mm6", "%mm7"
3084 #endif
3087 break; // end 4,6 bpp
3089 case 2:
3091 _ActiveMask.use = 0x000000000000ffffLL;
3092 _ShiftBpp.use = 16; // == 2 * 8
3093 _ShiftRem.use = 48; // == 64 - 16
3095 __asm__ __volatile__ (
3096 // load _ActiveMask
3097 "movq _ActiveMask, %%mm7 \n\t"
3098 // re-init address pointers and offset
3099 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
3100 // boundary
3101 "movq _LBCarryMask, %%mm5 \n\t"
3102 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3103 "movq _HBClearMask, %%mm4 \n\t"
3104 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3106 // prime the pump: load the first Raw(x-bpp) data set
3107 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3108 // (we correct pos. in loop below)
3109 "avg_2lp: \n\t"
3110 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3111 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3112 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3113 // add (Prev_row/2) to average
3114 "movq %%mm5, %%mm3 \n\t"
3115 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3116 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3117 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3118 // byte
3119 "movq %%mm7, %%mm6 \n\t"
3120 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3121 // each byte
3123 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3124 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3125 // LBCarrys
3126 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3127 // where both
3128 // lsb's were == 1 (only valid
3129 // for active group)
3130 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3131 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3132 // byte
3133 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3134 // for each byte
3135 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3136 // bytes to add to Avg
3137 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3138 // for each Active byte
3140 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3141 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3142 // bytes 2 & 3
3143 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3144 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3145 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3146 // LBCarrys
3147 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3148 // where both
3149 // lsb's were == 1 (only valid
3150 // for active group)
3151 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3152 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3153 // byte
3154 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3155 // for each byte
3156 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3157 // bytes to add to Avg
3158 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3159 // Avg for each Active byte
3161 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3162 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3163 // bytes 4 & 5
3164 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3165 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3166 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3167 // LBCarrys
3168 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3169 // where both lsb's were == 1
3170 // (only valid for active group)
3171 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3172 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3173 // byte
3174 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3175 // for each byte
3176 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3177 // bytes to add to Avg
3178 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3179 // Avg for each Active byte
3181 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3182 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3183 // bytes 6 & 7
3184 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3185 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3186 "addl $8, %%ecx \n\t"
3187 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3188 // LBCarrys
3189 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3190 // where both
3191 // lsb's were == 1 (only valid
3192 // for active group)
3193 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3194 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3195 // byte
3196 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3197 // for each byte
3198 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3199 // bytes to add to Avg
3200 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3201 // Avg for each Active byte
3203 "cmpl _MMXLength, %%ecx \n\t"
3204 // now ready to write back to memory
3205 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3206 // prep Raw(x-bpp) for next loop
3207 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3208 "jb avg_2lp \n\t"
3210 : "=S" (dummy_value_S), // output regs (dummy)
3211 "=D" (dummy_value_D)
3213 : "0" (prev_row), // esi // input regs
3214 "1" (row) // edi
3216 : "%ecx" // clobber list
3217 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3218 , "%mm0", "%mm1", "%mm2", "%mm3"
3219 , "%mm4", "%mm5", "%mm6", "%mm7"
3220 #endif
3223 break; // end 2 bpp
3225 case 1:
3227 __asm__ __volatile__ (
3228 // re-init address pointers and offset
3229 #ifdef __PIC__
3230 "pushl %%ebx \n\t" // save Global Offset Table index
3231 #endif
3232 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
3233 // boundary
3234 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3235 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3236 "jnb avg_1end \n\t"
3237 // do Avg decode for remaining bytes
3238 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3239 "movl %%edi, %%edx \n\t"
3240 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3241 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3242 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3243 // in loop below
3244 "avg_1lp: \n\t"
3245 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3246 "xorl %%eax, %%eax \n\t"
3247 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3248 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3249 "addw %%cx, %%ax \n\t"
3250 "incl %%ebx \n\t"
3251 "shrw %%ax \n\t" // divide by 2
3252 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3253 // inc ebx
3254 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3255 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3256 // mov does not affect flags; -1 to offset inc ebx
3257 "jb avg_1lp \n\t"
3259 "avg_1end: \n\t"
3260 #ifdef __PIC__
3261 "popl %%ebx \n\t" // Global Offset Table index
3262 #endif
3264 : "=c" (dummy_value_c), // output regs (dummy)
3265 "=S" (dummy_value_S),
3266 "=D" (dummy_value_D)
3268 : "0" (bpp), // ecx // input regs
3269 "1" (prev_row), // esi
3270 "2" (row) // edi
3272 : "%eax", "%edx" // clobber list
3273 #ifndef __PIC__
3274 , "%ebx"
3275 #endif
3278 return; // end 1 bpp
3280 case 8:
3282 __asm__ __volatile__ (
3283 // re-init address pointers and offset
3284 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
3285 "movq _LBCarryMask, %%mm5 \n\t" // boundary
3286 // preload "movl row, %%edi \n\t" // edi: Avg(x)
3287 "movq _HBClearMask, %%mm4 \n\t"
3288 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3290 // prime the pump: load the first Raw(x-bpp) data set
3291 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3292 // (NO NEED to correct pos. in loop below)
3294 "avg_8lp: \n\t"
3295 "movq (%%edi,%%ecx,), %%mm0 \n\t"
3296 "movq %%mm5, %%mm3 \n\t"
3297 "movq (%%esi,%%ecx,), %%mm1 \n\t"
3298 "addl $8, %%ecx \n\t"
3299 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3300 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3301 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3302 // where both lsb's were == 1
3303 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3304 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3305 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3306 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3307 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3308 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3309 "cmpl _MMXLength, %%ecx \n\t"
3310 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3311 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3312 "jb avg_8lp \n\t"
3314 : "=S" (dummy_value_S), // output regs (dummy)
3315 "=D" (dummy_value_D)
3317 : "0" (prev_row), // esi // input regs
3318 "1" (row) // edi
3320 : "%ecx" // clobber list
3321 #if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3322 , "%mm0", "%mm1", "%mm2"
3323 , "%mm3", "%mm4", "%mm5"
3324 #endif
3327 break; // end 8 bpp
3329 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3332 #ifdef PNG_DEBUG
3333 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3334 png_debug(1,
3335 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3336 #endif
3338 #if 0
3339 __asm__ __volatile__ (
3340 "movq _LBCarryMask, %%mm5 \n\t"
3341 // re-init address pointers and offset
3342 "movl _dif, %%ebx \n\t" // ebx: x = offset to
3343 // alignment boundary
3344 "movl row, %%edi \n\t" // edi: Avg(x)
3345 "movq _HBClearMask, %%mm4 \n\t"
3346 "movl %%edi, %%edx \n\t"
3347 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3348 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3349 "avg_Alp: \n\t"
3350 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3351 "movq %%mm5, %%mm3 \n\t"
3352 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3353 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3354 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3355 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3356 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3357 // where both lsb's were == 1
3358 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3359 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3360 // byte
3361 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
3362 // byte
3363 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3364 // byte
3365 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3366 // each byte
3367 "addl $8, %%ebx \n\t"
3368 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3369 // byte
3370 "cmpl _MMXLength, %%ebx \n\t"
3371 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3372 "jb avg_Alp \n\t"
3374 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3376 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3378 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3380 #endif /* 0 - NEVER REACHED */
3382 break;
3384 } // end switch (bpp)
3386 __asm__ __volatile__ (
3387 // MMX acceleration complete; now do clean-up
3388 // check if any remaining bytes left to decode
3389 #ifdef __PIC__
3390 "pushl %%ebx \n\t" // save index to Global Offset Table
3391 #endif
3392 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3393 //pre "movl row, %%edi \n\t" // edi: Avg(x)
3394 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3395 "jnb avg_end \n\t"
3397 // do Avg decode for remaining bytes
3398 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3399 "movl %%edi, %%edx \n\t"
3400 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3401 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3402 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3404 "avg_lp2: \n\t"
3405 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3406 "xorl %%eax, %%eax \n\t"
3407 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3408 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3409 "addw %%cx, %%ax \n\t"
3410 "incl %%ebx \n\t"
3411 "shrw %%ax \n\t" // divide by 2
3412 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3413 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3414 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3415 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3417 "avg_end: \n\t"
3418 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3419 #ifdef __PIC__
3420 "popl %%ebx \n\t" // restore index to Global Offset Table
3421 #endif
3423 : "=c" (dummy_value_c), // output regs (dummy)
3424 "=S" (dummy_value_S),
3425 "=D" (dummy_value_D)
3427 : "0" (bpp), // ecx // input regs
3428 "1" (prev_row), // esi
3429 "2" (row) // edi
3431 : "%eax", "%edx" // clobber list
3432 #ifndef __PIC__
3433 , "%ebx"
3434 #endif
3437 } /* end png_read_filter_row_mmx_avg() */
3438 #endif
3442 #ifdef PNG_THREAD_UNSAFE_OK
3443 //===========================================================================//
3444 // //
3445 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3446 // //
3447 //===========================================================================//
3449 // Optimized code for PNG Paeth filter decoder
3451 static void /* PRIVATE */
3452 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3453 png_bytep prev_row)
3455 int bpp;
3456 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3457 int dummy_value_S;
3458 int dummy_value_D;
3460 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3461 _FullLength = row_info->rowbytes; // # of bytes to filter
3463 __asm__ __volatile__ (
3464 #ifdef __PIC__
3465 "pushl %%ebx \n\t" // save index to Global Offset Table
3466 #endif
3467 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3468 //pre "movl row, %%edi \n\t"
3469 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3470 //pre "movl prev_row, %%esi \n\t"
3471 "xorl %%eax, %%eax \n\t"
3473 // Compute the Raw value for the first bpp bytes
3474 // Note: the formula works out to be always
3475 // Raw(x) = Paeth(x) + Prior(x) where x < bpp
3476 "paeth_rlp: \n\t"
3477 "movb (%%edi,%%ebx,), %%al \n\t"
3478 "addb (%%esi,%%ebx,), %%al \n\t"
3479 "incl %%ebx \n\t"
3480 //pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3481 "cmpl %%ecx, %%ebx \n\t"
3482 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3483 "jb paeth_rlp \n\t"
3484 // get # of bytes to alignment
3485 "movl %%edi, _dif \n\t" // take start of row
3486 "addl %%ebx, _dif \n\t" // add bpp
3487 "xorl %%ecx, %%ecx \n\t"
3488 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3489 // boundary
3490 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3491 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3492 // at alignment
3493 "jz paeth_go \n\t"
3494 // fix alignment
3496 "paeth_lp1: \n\t"
3497 "xorl %%eax, %%eax \n\t"
3498 // pav = p - a = (a + b - c) - a = b - c
3499 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3500 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3501 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3502 "movl %%eax, _patemp \n\t" // Save pav for later use
3503 "xorl %%eax, %%eax \n\t"
3504 // pbv = p - b = (a + b - c) - b = a - c
3505 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3506 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3507 "movl %%eax, %%ecx \n\t"
3508 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3509 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3510 // pc = abs(pcv)
3511 "testl $0x80000000, %%eax \n\t"
3512 "jz paeth_pca \n\t"
3513 "negl %%eax \n\t" // reverse sign of neg values
3515 "paeth_pca: \n\t"
3516 "movl %%eax, _pctemp \n\t" // save pc for later use
3517 // pb = abs(pbv)
3518 "testl $0x80000000, %%ecx \n\t"
3519 "jz paeth_pba \n\t"
3520 "negl %%ecx \n\t" // reverse sign of neg values
3522 "paeth_pba: \n\t"
3523 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3524 // pa = abs(pav)
3525 "movl _patemp, %%eax \n\t"
3526 "testl $0x80000000, %%eax \n\t"
3527 "jz paeth_paa \n\t"
3528 "negl %%eax \n\t" // reverse sign of neg values
3530 "paeth_paa: \n\t"
3531 "movl %%eax, _patemp \n\t" // save pa for later use
3532 // test if pa <= pb
3533 "cmpl %%ecx, %%eax \n\t"
3534 "jna paeth_abb \n\t"
3535 // pa > pb; now test if pb <= pc
3536 "cmpl _pctemp, %%ecx \n\t"
3537 "jna paeth_bbc \n\t"
3538 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3539 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3540 "jmp paeth_paeth \n\t"
3542 "paeth_bbc: \n\t"
3543 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3544 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3545 "jmp paeth_paeth \n\t"
3547 "paeth_abb: \n\t"
3548 // pa <= pb; now test if pa <= pc
3549 "cmpl _pctemp, %%eax \n\t"
3550 "jna paeth_abc \n\t"
3551 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3552 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3553 "jmp paeth_paeth \n\t"
3555 "paeth_abc: \n\t"
3556 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3557 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3559 "paeth_paeth: \n\t"
3560 "incl %%ebx \n\t"
3561 "incl %%edx \n\t"
3562 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3563 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3564 "cmpl _dif, %%ebx \n\t"
3565 "jb paeth_lp1 \n\t"
3567 "paeth_go: \n\t"
3568 "movl _FullLength, %%ecx \n\t"
3569 "movl %%ecx, %%eax \n\t"
3570 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3571 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3572 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3573 "movl %%ecx, _MMXLength \n\t"
3574 #ifdef __PIC__
3575 "popl %%ebx \n\t" // restore index to Global Offset Table
3576 #endif
3578 : "=c" (dummy_value_c), // output regs (dummy)
3579 "=S" (dummy_value_S),
3580 "=D" (dummy_value_D)
3582 : "0" (bpp), // ecx // input regs
3583 "1" (prev_row), // esi
3584 "2" (row) // edi
3586 : "%eax", "%edx" // clobber list
3587 #ifndef __PIC__
3588 , "%ebx"
3589 #endif
3592 // now do the math for the rest of the row
3593 switch (bpp)
3595 case 3:
3597 _ActiveMask.use = 0x0000000000ffffffLL;
3598 _ActiveMaskEnd.use = 0xffff000000000000LL;
3599 _ShiftBpp.use = 24; // == bpp(3) * 8
3600 _ShiftRem.use = 40; // == 64 - 24
3602 __asm__ __volatile__ (
3603 "movl _dif, %%ecx \n\t"
3604 // preload "movl row, %%edi \n\t"
3605 // preload "movl prev_row, %%esi \n\t"
3606 "pxor %%mm0, %%mm0 \n\t"
3607 // prime the pump: load the first Raw(x-bpp) data set
3608 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3609 "paeth_3lp: \n\t"
3610 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3611 // 3 bytes
3612 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3613 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3614 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3615 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3616 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3617 // 3 bytes
3618 // pav = p - a = (a + b - c) - a = b - c
3619 "movq %%mm2, %%mm4 \n\t"
3620 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3621 // pbv = p - b = (a + b - c) - b = a - c
3622 "movq %%mm1, %%mm5 \n\t"
3623 "psubw %%mm3, %%mm4 \n\t"
3624 "pxor %%mm7, %%mm7 \n\t"
3625 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3626 "movq %%mm4, %%mm6 \n\t"
3627 "psubw %%mm3, %%mm5 \n\t"
3629 // pa = abs(p-a) = abs(pav)
3630 // pb = abs(p-b) = abs(pbv)
3631 // pc = abs(p-c) = abs(pcv)
3632 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3633 "paddw %%mm5, %%mm6 \n\t"
3634 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3635 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3636 "psubw %%mm0, %%mm4 \n\t"
3637 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3638 "psubw %%mm0, %%mm4 \n\t"
3639 "psubw %%mm7, %%mm5 \n\t"
3640 "pxor %%mm0, %%mm0 \n\t"
3641 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3642 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3643 "psubw %%mm7, %%mm5 \n\t"
3644 "psubw %%mm0, %%mm6 \n\t"
3645 // test pa <= pb
3646 "movq %%mm4, %%mm7 \n\t"
3647 "psubw %%mm0, %%mm6 \n\t"
3648 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3649 "movq %%mm7, %%mm0 \n\t"
3650 // use mm7 mask to merge pa & pb
3651 "pand %%mm7, %%mm5 \n\t"
3652 // use mm0 mask copy to merge a & b
3653 "pand %%mm0, %%mm2 \n\t"
3654 "pandn %%mm4, %%mm7 \n\t"
3655 "pandn %%mm1, %%mm0 \n\t"
3656 "paddw %%mm5, %%mm7 \n\t"
3657 "paddw %%mm2, %%mm0 \n\t"
3658 // test ((pa <= pb)? pa:pb) <= pc
3659 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3660 "pxor %%mm1, %%mm1 \n\t"
3661 "pand %%mm7, %%mm3 \n\t"
3662 "pandn %%mm0, %%mm7 \n\t"
3663 "paddw %%mm3, %%mm7 \n\t"
3664 "pxor %%mm0, %%mm0 \n\t"
3665 "packuswb %%mm1, %%mm7 \n\t"
3666 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3667 "pand _ActiveMask, %%mm7 \n\t"
3668 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3669 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3670 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3671 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3672 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3673 // Raw(x-bpp)
3674 // now do Paeth for 2nd set of bytes (3-5)
3675 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3676 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3677 "pxor %%mm7, %%mm7 \n\t"
3678 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3679 // pbv = p - b = (a + b - c) - b = a - c
3680 "movq %%mm1, %%mm5 \n\t"
3681 // pav = p - a = (a + b - c) - a = b - c
3682 "movq %%mm2, %%mm4 \n\t"
3683 "psubw %%mm3, %%mm5 \n\t"
3684 "psubw %%mm3, %%mm4 \n\t"
3685 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3686 // pav + pbv = pbv + pav
3687 "movq %%mm5, %%mm6 \n\t"
3688 "paddw %%mm4, %%mm6 \n\t"
3690 // pa = abs(p-a) = abs(pav)
3691 // pb = abs(p-b) = abs(pbv)
3692 // pc = abs(p-c) = abs(pcv)
3693 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3694 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3695 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3696 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3697 "psubw %%mm0, %%mm5 \n\t"
3698 "psubw %%mm7, %%mm4 \n\t"
3699 "psubw %%mm0, %%mm5 \n\t"
3700 "psubw %%mm7, %%mm4 \n\t"
3701 "pxor %%mm0, %%mm0 \n\t"
3702 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3703 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3704 "psubw %%mm0, %%mm6 \n\t"
3705 // test pa <= pb
3706 "movq %%mm4, %%mm7 \n\t"
3707 "psubw %%mm0, %%mm6 \n\t"
3708 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3709 "movq %%mm7, %%mm0 \n\t"
3710 // use mm7 mask to merge pa & pb
3711 "pand %%mm7, %%mm5 \n\t"
3712 // use mm0 mask copy to merge a & b
3713 "pand %%mm0, %%mm2 \n\t"
3714 "pandn %%mm4, %%mm7 \n\t"
3715 "pandn %%mm1, %%mm0 \n\t"
3716 "paddw %%mm5, %%mm7 \n\t"
3717 "paddw %%mm2, %%mm0 \n\t"
3718 // test ((pa <= pb)? pa:pb) <= pc
3719 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3720 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3721 "pand %%mm7, %%mm3 \n\t"
3722 "pandn %%mm0, %%mm7 \n\t"
3723 "pxor %%mm1, %%mm1 \n\t"
3724 "paddw %%mm3, %%mm7 \n\t"
3725 "pxor %%mm0, %%mm0 \n\t"
3726 "packuswb %%mm1, %%mm7 \n\t"
3727 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3728 "pand _ActiveMask, %%mm7 \n\t"
3729 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3730 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3731 // 3 bytes
3732 // pav = p - a = (a + b - c) - a = b - c
3733 "movq %%mm2, %%mm4 \n\t"
3734 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3735 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3736 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3737 "movq %%mm7, %%mm1 \n\t"
3738 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3739 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3740 // now mm1 will be used as Raw(x-bpp)
3741 // now do Paeth for 3rd, and final, set of bytes (6-7)
3742 "pxor %%mm7, %%mm7 \n\t"
3743 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3744 "psubw %%mm3, %%mm4 \n\t"
3745 // pbv = p - b = (a + b - c) - b = a - c
3746 "movq %%mm1, %%mm5 \n\t"
3747 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3748 "movq %%mm4, %%mm6 \n\t"
3749 "psubw %%mm3, %%mm5 \n\t"
3750 "pxor %%mm0, %%mm0 \n\t"
3751 "paddw %%mm5, %%mm6 \n\t"
3753 // pa = abs(p-a) = abs(pav)
3754 // pb = abs(p-b) = abs(pbv)
3755 // pc = abs(p-c) = abs(pcv)
3756 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3757 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3758 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3759 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3760 "psubw %%mm0, %%mm4 \n\t"
3761 "psubw %%mm7, %%mm5 \n\t"
3762 "psubw %%mm0, %%mm4 \n\t"
3763 "psubw %%mm7, %%mm5 \n\t"
3764 "pxor %%mm0, %%mm0 \n\t"
3765 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3766 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3767 "psubw %%mm0, %%mm6 \n\t"
3768 // test pa <= pb
3769 "movq %%mm4, %%mm7 \n\t"
3770 "psubw %%mm0, %%mm6 \n\t"
3771 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3772 "movq %%mm7, %%mm0 \n\t"
3773 // use mm0 mask copy to merge a & b
3774 "pand %%mm0, %%mm2 \n\t"
3775 // use mm7 mask to merge pa & pb
3776 "pand %%mm7, %%mm5 \n\t"
3777 "pandn %%mm1, %%mm0 \n\t"
3778 "pandn %%mm4, %%mm7 \n\t"
3779 "paddw %%mm2, %%mm0 \n\t"
3780 "paddw %%mm5, %%mm7 \n\t"
3781 // test ((pa <= pb)? pa:pb) <= pc
3782 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3783 "pand %%mm7, %%mm3 \n\t"
3784 "pandn %%mm0, %%mm7 \n\t"
3785 "paddw %%mm3, %%mm7 \n\t"
3786 "pxor %%mm1, %%mm1 \n\t"
3787 "packuswb %%mm7, %%mm1 \n\t"
3788 // step ecx to next set of 8 bytes and repeat loop til done
3789 "addl $8, %%ecx \n\t"
3790 "pand _ActiveMaskEnd, %%mm1 \n\t"
3791 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3792 // Raw(x)
3794 "cmpl _MMXLength, %%ecx \n\t"
3795 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3796 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3797 // mm1 will be used as Raw(x-bpp) next loop
3798 // mm3 ready to be used as Prior(x-bpp) next loop
3799 "jb paeth_3lp \n\t"
3801 : "=S" (dummy_value_S), // output regs (dummy)
3802 "=D" (dummy_value_D)
3804 : "0" (prev_row), // esi // input regs
3805 "1" (row) // edi
3807 : "%ecx" // clobber list
3808 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3809 , "%mm0", "%mm1", "%mm2", "%mm3"
3810 , "%mm4", "%mm5", "%mm6", "%mm7"
3811 #endif
3814 break; // end 3 bpp
3816 case 6:
3817 //case 7: // GRR BOGUS
3818 //case 5: // GRR BOGUS
3820 _ActiveMask.use = 0x00000000ffffffffLL;
3821 _ActiveMask2.use = 0xffffffff00000000LL;
3822 _ShiftBpp.use = bpp << 3; // == bpp * 8
3823 _ShiftRem.use = 64 - _ShiftBpp.use;
3825 __asm__ __volatile__ (
3826 "movl _dif, %%ecx \n\t"
3827 // preload "movl row, %%edi \n\t"
3828 // preload "movl prev_row, %%esi \n\t"
3829 // prime the pump: load the first Raw(x-bpp) data set
3830 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3831 "pxor %%mm0, %%mm0 \n\t"
3833 "paeth_6lp: \n\t"
3834 // must shift to position Raw(x-bpp) data
3835 "psrlq _ShiftRem, %%mm1 \n\t"
3836 // do first set of 4 bytes
3837 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3838 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3839 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3840 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3841 // must shift to position Prior(x-bpp) data
3842 "psrlq _ShiftRem, %%mm3 \n\t"
3843 // pav = p - a = (a + b - c) - a = b - c
3844 "movq %%mm2, %%mm4 \n\t"
3845 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3846 // pbv = p - b = (a + b - c) - b = a - c
3847 "movq %%mm1, %%mm5 \n\t"
3848 "psubw %%mm3, %%mm4 \n\t"
3849 "pxor %%mm7, %%mm7 \n\t"
3850 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3851 "movq %%mm4, %%mm6 \n\t"
3852 "psubw %%mm3, %%mm5 \n\t"
3853 // pa = abs(p-a) = abs(pav)
3854 // pb = abs(p-b) = abs(pbv)
3855 // pc = abs(p-c) = abs(pcv)
3856 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3857 "paddw %%mm5, %%mm6 \n\t"
3858 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3859 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3860 "psubw %%mm0, %%mm4 \n\t"
3861 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3862 "psubw %%mm0, %%mm4 \n\t"
3863 "psubw %%mm7, %%mm5 \n\t"
3864 "pxor %%mm0, %%mm0 \n\t"
3865 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3866 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3867 "psubw %%mm7, %%mm5 \n\t"
3868 "psubw %%mm0, %%mm6 \n\t"
3869 // test pa <= pb
3870 "movq %%mm4, %%mm7 \n\t"
3871 "psubw %%mm0, %%mm6 \n\t"
3872 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3873 "movq %%mm7, %%mm0 \n\t"
3874 // use mm7 mask to merge pa & pb
3875 "pand %%mm7, %%mm5 \n\t"
3876 // use mm0 mask copy to merge a & b
3877 "pand %%mm0, %%mm2 \n\t"
3878 "pandn %%mm4, %%mm7 \n\t"
3879 "pandn %%mm1, %%mm0 \n\t"
3880 "paddw %%mm5, %%mm7 \n\t"
3881 "paddw %%mm2, %%mm0 \n\t"
3882 // test ((pa <= pb)? pa:pb) <= pc
3883 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3884 "pxor %%mm1, %%mm1 \n\t"
3885 "pand %%mm7, %%mm3 \n\t"
3886 "pandn %%mm0, %%mm7 \n\t"
3887 "paddw %%mm3, %%mm7 \n\t"
3888 "pxor %%mm0, %%mm0 \n\t"
3889 "packuswb %%mm1, %%mm7 \n\t"
3890 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3891 "pand _ActiveMask, %%mm7 \n\t"
3892 "psrlq _ShiftRem, %%mm3 \n\t"
3893 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3894 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3895 "movq %%mm2, %%mm6 \n\t"
3896 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3897 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3898 "psllq _ShiftBpp, %%mm6 \n\t"
3899 "movq %%mm7, %%mm5 \n\t"
3900 "psrlq _ShiftRem, %%mm1 \n\t"
3901 "por %%mm6, %%mm3 \n\t"
3902 "psllq _ShiftBpp, %%mm5 \n\t"
3903 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3904 "por %%mm5, %%mm1 \n\t"
3905 // do second set of 4 bytes
3906 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3907 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3908 // pav = p - a = (a + b - c) - a = b - c
3909 "movq %%mm2, %%mm4 \n\t"
3910 // pbv = p - b = (a + b - c) - b = a - c
3911 "movq %%mm1, %%mm5 \n\t"
3912 "psubw %%mm3, %%mm4 \n\t"
3913 "pxor %%mm7, %%mm7 \n\t"
3914 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3915 "movq %%mm4, %%mm6 \n\t"
3916 "psubw %%mm3, %%mm5 \n\t"
3917 // pa = abs(p-a) = abs(pav)
3918 // pb = abs(p-b) = abs(pbv)
3919 // pc = abs(p-c) = abs(pcv)
3920 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3921 "paddw %%mm5, %%mm6 \n\t"
3922 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3923 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3924 "psubw %%mm0, %%mm4 \n\t"
3925 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3926 "psubw %%mm0, %%mm4 \n\t"
3927 "psubw %%mm7, %%mm5 \n\t"
3928 "pxor %%mm0, %%mm0 \n\t"
3929 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3930 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3931 "psubw %%mm7, %%mm5 \n\t"
3932 "psubw %%mm0, %%mm6 \n\t"
3933 // test pa <= pb
3934 "movq %%mm4, %%mm7 \n\t"
3935 "psubw %%mm0, %%mm6 \n\t"
3936 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3937 "movq %%mm7, %%mm0 \n\t"
3938 // use mm7 mask to merge pa & pb
3939 "pand %%mm7, %%mm5 \n\t"
3940 // use mm0 mask copy to merge a & b
3941 "pand %%mm0, %%mm2 \n\t"
3942 "pandn %%mm4, %%mm7 \n\t"
3943 "pandn %%mm1, %%mm0 \n\t"
3944 "paddw %%mm5, %%mm7 \n\t"
3945 "paddw %%mm2, %%mm0 \n\t"
3946 // test ((pa <= pb)? pa:pb) <= pc
3947 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3948 "pxor %%mm1, %%mm1 \n\t"
3949 "pand %%mm7, %%mm3 \n\t"
3950 "pandn %%mm0, %%mm7 \n\t"
3951 "pxor %%mm1, %%mm1 \n\t"
3952 "paddw %%mm3, %%mm7 \n\t"
3953 "pxor %%mm0, %%mm0 \n\t"
3954 // step ecx to next set of 8 bytes and repeat loop til done
3955 "addl $8, %%ecx \n\t"
3956 "packuswb %%mm7, %%mm1 \n\t"
3957 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3958 "cmpl _MMXLength, %%ecx \n\t"
3959 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3960 // mm1 will be used as Raw(x-bpp) next loop
3961 "jb paeth_6lp \n\t"
3963 : "=S" (dummy_value_S), // output regs (dummy)
3964 "=D" (dummy_value_D)
3966 : "0" (prev_row), // esi // input regs
3967 "1" (row) // edi
3969 : "%ecx" // clobber list
3970 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3971 , "%mm0", "%mm1", "%mm2", "%mm3"
3972 , "%mm4", "%mm5", "%mm6", "%mm7"
3973 #endif
3976 break; // end 6 bpp
3978 case 4:
3980 _ActiveMask.use = 0x00000000ffffffffLL;
3982 __asm__ __volatile__ (
3983 "movl _dif, %%ecx \n\t"
3984 // preload "movl row, %%edi \n\t"
3985 // preload "movl prev_row, %%esi \n\t"
3986 "pxor %%mm0, %%mm0 \n\t"
3987 // prime the pump: load the first Raw(x-bpp) data set
3988 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3989 // a=Raw(x-bpp) bytes
3990 "paeth_4lp: \n\t"
3991 // do first set of 4 bytes
3992 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3993 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3994 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3995 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3996 // pav = p - a = (a + b - c) - a = b - c
3997 "movq %%mm2, %%mm4 \n\t"
3998 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3999 // pbv = p - b = (a + b - c) - b = a - c
4000 "movq %%mm1, %%mm5 \n\t"
4001 "psubw %%mm3, %%mm4 \n\t"
4002 "pxor %%mm7, %%mm7 \n\t"
4003 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4004 "movq %%mm4, %%mm6 \n\t"
4005 "psubw %%mm3, %%mm5 \n\t"
4006 // pa = abs(p-a) = abs(pav)
4007 // pb = abs(p-b) = abs(pbv)
4008 // pc = abs(p-c) = abs(pcv)
4009 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4010 "paddw %%mm5, %%mm6 \n\t"
4011 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4012 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4013 "psubw %%mm0, %%mm4 \n\t"
4014 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4015 "psubw %%mm0, %%mm4 \n\t"
4016 "psubw %%mm7, %%mm5 \n\t"
4017 "pxor %%mm0, %%mm0 \n\t"
4018 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4019 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4020 "psubw %%mm7, %%mm5 \n\t"
4021 "psubw %%mm0, %%mm6 \n\t"
4022 // test pa <= pb
4023 "movq %%mm4, %%mm7 \n\t"
4024 "psubw %%mm0, %%mm6 \n\t"
4025 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4026 "movq %%mm7, %%mm0 \n\t"
4027 // use mm7 mask to merge pa & pb
4028 "pand %%mm7, %%mm5 \n\t"
4029 // use mm0 mask copy to merge a & b
4030 "pand %%mm0, %%mm2 \n\t"
4031 "pandn %%mm4, %%mm7 \n\t"
4032 "pandn %%mm1, %%mm0 \n\t"
4033 "paddw %%mm5, %%mm7 \n\t"
4034 "paddw %%mm2, %%mm0 \n\t"
4035 // test ((pa <= pb)? pa:pb) <= pc
4036 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4037 "pxor %%mm1, %%mm1 \n\t"
4038 "pand %%mm7, %%mm3 \n\t"
4039 "pandn %%mm0, %%mm7 \n\t"
4040 "paddw %%mm3, %%mm7 \n\t"
4041 "pxor %%mm0, %%mm0 \n\t"
4042 "packuswb %%mm1, %%mm7 \n\t"
4043 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
4044 "pand _ActiveMask, %%mm7 \n\t"
4045 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4046 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4047 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4048 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4049 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4050 // do second set of 4 bytes
4051 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4052 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4053 // pav = p - a = (a + b - c) - a = b - c
4054 "movq %%mm2, %%mm4 \n\t"
4055 // pbv = p - b = (a + b - c) - b = a - c
4056 "movq %%mm1, %%mm5 \n\t"
4057 "psubw %%mm3, %%mm4 \n\t"
4058 "pxor %%mm7, %%mm7 \n\t"
4059 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4060 "movq %%mm4, %%mm6 \n\t"
4061 "psubw %%mm3, %%mm5 \n\t"
4062 // pa = abs(p-a) = abs(pav)
4063 // pb = abs(p-b) = abs(pbv)
4064 // pc = abs(p-c) = abs(pcv)
4065 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4066 "paddw %%mm5, %%mm6 \n\t"
4067 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4068 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4069 "psubw %%mm0, %%mm4 \n\t"
4070 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4071 "psubw %%mm0, %%mm4 \n\t"
4072 "psubw %%mm7, %%mm5 \n\t"
4073 "pxor %%mm0, %%mm0 \n\t"
4074 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4075 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4076 "psubw %%mm7, %%mm5 \n\t"
4077 "psubw %%mm0, %%mm6 \n\t"
4078 // test pa <= pb
4079 "movq %%mm4, %%mm7 \n\t"
4080 "psubw %%mm0, %%mm6 \n\t"
4081 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4082 "movq %%mm7, %%mm0 \n\t"
4083 // use mm7 mask to merge pa & pb
4084 "pand %%mm7, %%mm5 \n\t"
4085 // use mm0 mask copy to merge a & b
4086 "pand %%mm0, %%mm2 \n\t"
4087 "pandn %%mm4, %%mm7 \n\t"
4088 "pandn %%mm1, %%mm0 \n\t"
4089 "paddw %%mm5, %%mm7 \n\t"
4090 "paddw %%mm2, %%mm0 \n\t"
4091 // test ((pa <= pb)? pa:pb) <= pc
4092 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4093 "pxor %%mm1, %%mm1 \n\t"
4094 "pand %%mm7, %%mm3 \n\t"
4095 "pandn %%mm0, %%mm7 \n\t"
4096 "pxor %%mm1, %%mm1 \n\t"
4097 "paddw %%mm3, %%mm7 \n\t"
4098 "pxor %%mm0, %%mm0 \n\t"
4099 // step ecx to next set of 8 bytes and repeat loop til done
4100 "addl $8, %%ecx \n\t"
4101 "packuswb %%mm7, %%mm1 \n\t"
4102 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4103 "cmpl _MMXLength, %%ecx \n\t"
4104 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4105 // mm1 will be used as Raw(x-bpp) next loop
4106 "jb paeth_4lp \n\t"
4108 : "=S" (dummy_value_S), // output regs (dummy)
4109 "=D" (dummy_value_D)
4111 : "0" (prev_row), // esi // input regs
4112 "1" (row) // edi
4114 : "%ecx" // clobber list
4115 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4116 , "%mm0", "%mm1", "%mm2", "%mm3"
4117 , "%mm4", "%mm5", "%mm6", "%mm7"
4118 #endif
4121 break; // end 4 bpp
4123 case 8: // bpp == 8
4125 _ActiveMask.use = 0x00000000ffffffffLL;
4127 __asm__ __volatile__ (
4128 "movl _dif, %%ecx \n\t"
4129 // preload "movl row, %%edi \n\t"
4130 // preload "movl prev_row, %%esi \n\t"
4131 "pxor %%mm0, %%mm0 \n\t"
4132 // prime the pump: load the first Raw(x-bpp) data set
4133 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4134 // a=Raw(x-bpp) bytes
4135 "paeth_8lp: \n\t"
4136 // do first set of 4 bytes
4137 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4138 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4139 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4140 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4141 // pav = p - a = (a + b - c) - a = b - c
4142 "movq %%mm2, %%mm4 \n\t"
4143 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4144 // pbv = p - b = (a + b - c) - b = a - c
4145 "movq %%mm1, %%mm5 \n\t"
4146 "psubw %%mm3, %%mm4 \n\t"
4147 "pxor %%mm7, %%mm7 \n\t"
4148 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4149 "movq %%mm4, %%mm6 \n\t"
4150 "psubw %%mm3, %%mm5 \n\t"
4151 // pa = abs(p-a) = abs(pav)
4152 // pb = abs(p-b) = abs(pbv)
4153 // pc = abs(p-c) = abs(pcv)
4154 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4155 "paddw %%mm5, %%mm6 \n\t"
4156 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4157 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4158 "psubw %%mm0, %%mm4 \n\t"
4159 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4160 "psubw %%mm0, %%mm4 \n\t"
4161 "psubw %%mm7, %%mm5 \n\t"
4162 "pxor %%mm0, %%mm0 \n\t"
4163 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4164 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4165 "psubw %%mm7, %%mm5 \n\t"
4166 "psubw %%mm0, %%mm6 \n\t"
4167 // test pa <= pb
4168 "movq %%mm4, %%mm7 \n\t"
4169 "psubw %%mm0, %%mm6 \n\t"
4170 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4171 "movq %%mm7, %%mm0 \n\t"
4172 // use mm7 mask to merge pa & pb
4173 "pand %%mm7, %%mm5 \n\t"
4174 // use mm0 mask copy to merge a & b
4175 "pand %%mm0, %%mm2 \n\t"
4176 "pandn %%mm4, %%mm7 \n\t"
4177 "pandn %%mm1, %%mm0 \n\t"
4178 "paddw %%mm5, %%mm7 \n\t"
4179 "paddw %%mm2, %%mm0 \n\t"
4180 // test ((pa <= pb)? pa:pb) <= pc
4181 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4182 "pxor %%mm1, %%mm1 \n\t"
4183 "pand %%mm7, %%mm3 \n\t"
4184 "pandn %%mm0, %%mm7 \n\t"
4185 "paddw %%mm3, %%mm7 \n\t"
4186 "pxor %%mm0, %%mm0 \n\t"
4187 "packuswb %%mm1, %%mm7 \n\t"
4188 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4189 "pand _ActiveMask, %%mm7 \n\t"
4190 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4191 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4192 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4193 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4194 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4196 // do second set of 4 bytes
4197 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4198 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4199 // pav = p - a = (a + b - c) - a = b - c
4200 "movq %%mm2, %%mm4 \n\t"
4201 // pbv = p - b = (a + b - c) - b = a - c
4202 "movq %%mm1, %%mm5 \n\t"
4203 "psubw %%mm3, %%mm4 \n\t"
4204 "pxor %%mm7, %%mm7 \n\t"
4205 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4206 "movq %%mm4, %%mm6 \n\t"
4207 "psubw %%mm3, %%mm5 \n\t"
4208 // pa = abs(p-a) = abs(pav)
4209 // pb = abs(p-b) = abs(pbv)
4210 // pc = abs(p-c) = abs(pcv)
4211 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4212 "paddw %%mm5, %%mm6 \n\t"
4213 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4214 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4215 "psubw %%mm0, %%mm4 \n\t"
4216 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4217 "psubw %%mm0, %%mm4 \n\t"
4218 "psubw %%mm7, %%mm5 \n\t"
4219 "pxor %%mm0, %%mm0 \n\t"
4220 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4221 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4222 "psubw %%mm7, %%mm5 \n\t"
4223 "psubw %%mm0, %%mm6 \n\t"
4224 // test pa <= pb
4225 "movq %%mm4, %%mm7 \n\t"
4226 "psubw %%mm0, %%mm6 \n\t"
4227 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4228 "movq %%mm7, %%mm0 \n\t"
4229 // use mm7 mask to merge pa & pb
4230 "pand %%mm7, %%mm5 \n\t"
4231 // use mm0 mask copy to merge a & b
4232 "pand %%mm0, %%mm2 \n\t"
4233 "pandn %%mm4, %%mm7 \n\t"
4234 "pandn %%mm1, %%mm0 \n\t"
4235 "paddw %%mm5, %%mm7 \n\t"
4236 "paddw %%mm2, %%mm0 \n\t"
4237 // test ((pa <= pb)? pa:pb) <= pc
4238 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4239 "pxor %%mm1, %%mm1 \n\t"
4240 "pand %%mm7, %%mm3 \n\t"
4241 "pandn %%mm0, %%mm7 \n\t"
4242 "pxor %%mm1, %%mm1 \n\t"
4243 "paddw %%mm3, %%mm7 \n\t"
4244 "pxor %%mm0, %%mm0 \n\t"
4245 // step ecx to next set of 8 bytes and repeat loop til done
4246 "addl $8, %%ecx \n\t"
4247 "packuswb %%mm7, %%mm1 \n\t"
4248 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4249 "cmpl _MMXLength, %%ecx \n\t"
4250 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4251 // mm1 will be used as Raw(x-bpp) next loop
4252 "jb paeth_8lp \n\t"
4254 : "=S" (dummy_value_S), // output regs (dummy)
4255 "=D" (dummy_value_D)
4257 : "0" (prev_row), // esi // input regs
4258 "1" (row) // edi
4260 : "%ecx" // clobber list
4261 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4262 , "%mm0", "%mm1", "%mm2", "%mm3"
4263 , "%mm4", "%mm5", "%mm6", "%mm7"
4264 #endif
4267 break; // end 8 bpp
4269 case 1: // bpp = 1
4270 case 2: // bpp = 2
4271 default: // bpp > 8
4273 __asm__ __volatile__ (
4274 #ifdef __PIC__
4275 "pushl %%ebx \n\t" // save Global Offset Table index
4276 #endif
4277 "movl _dif, %%ebx \n\t"
4278 "cmpl _FullLength, %%ebx \n\t"
4279 "jnb paeth_dend \n\t"
4281 // preload "movl row, %%edi \n\t"
4282 // preload "movl prev_row, %%esi \n\t"
4283 // do Paeth decode for remaining bytes
4284 "movl %%ebx, %%edx \n\t"
4285 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4286 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4287 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4289 "paeth_dlp: \n\t"
4290 "xorl %%eax, %%eax \n\t"
4291 // pav = p - a = (a + b - c) - a = b - c
4292 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4293 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4294 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4295 "movl %%eax, _patemp \n\t" // Save pav for later use
4296 "xorl %%eax, %%eax \n\t"
4297 // pbv = p - b = (a + b - c) - b = a - c
4298 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4299 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4300 "movl %%eax, %%ecx \n\t"
4301 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4302 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4303 // pc = abs(pcv)
4304 "testl $0x80000000, %%eax \n\t"
4305 "jz paeth_dpca \n\t"
4306 "negl %%eax \n\t" // reverse sign of neg values
4308 "paeth_dpca: \n\t"
4309 "movl %%eax, _pctemp \n\t" // save pc for later use
4310 // pb = abs(pbv)
4311 "testl $0x80000000, %%ecx \n\t"
4312 "jz paeth_dpba \n\t"
4313 "negl %%ecx \n\t" // reverse sign of neg values
4315 "paeth_dpba: \n\t"
4316 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4317 // pa = abs(pav)
4318 "movl _patemp, %%eax \n\t"
4319 "testl $0x80000000, %%eax \n\t"
4320 "jz paeth_dpaa \n\t"
4321 "negl %%eax \n\t" // reverse sign of neg values
4323 "paeth_dpaa: \n\t"
4324 "movl %%eax, _patemp \n\t" // save pa for later use
4325 // test if pa <= pb
4326 "cmpl %%ecx, %%eax \n\t"
4327 "jna paeth_dabb \n\t"
4328 // pa > pb; now test if pb <= pc
4329 "cmpl _pctemp, %%ecx \n\t"
4330 "jna paeth_dbbc \n\t"
4331 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4332 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4333 "jmp paeth_dpaeth \n\t"
4335 "paeth_dbbc: \n\t"
4336 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4337 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4338 "jmp paeth_dpaeth \n\t"
4340 "paeth_dabb: \n\t"
4341 // pa <= pb; now test if pa <= pc
4342 "cmpl _pctemp, %%eax \n\t"
4343 "jna paeth_dabc \n\t"
4344 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4345 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4346 "jmp paeth_dpaeth \n\t"
4348 "paeth_dabc: \n\t"
4349 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4350 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4352 "paeth_dpaeth: \n\t"
4353 "incl %%ebx \n\t"
4354 "incl %%edx \n\t"
4355 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4356 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4357 "cmpl _FullLength, %%ebx \n\t"
4358 "jb paeth_dlp \n\t"
4360 "paeth_dend: \n\t"
4361 #ifdef __PIC__
4362 "popl %%ebx \n\t" // index to Global Offset Table
4363 #endif
4365 : "=c" (dummy_value_c), // output regs (dummy)
4366 "=S" (dummy_value_S),
4367 "=D" (dummy_value_D)
4369 : "0" (bpp), // ecx // input regs
4370 "1" (prev_row), // esi
4371 "2" (row) // edi
4373 : "%eax", "%edx" // clobber list
4374 #ifndef __PIC__
4375 , "%ebx"
4376 #endif
4379 return; // No need to go further with this one
4381 } // end switch (bpp)
4383 __asm__ __volatile__ (
4384 // MMX acceleration complete; now do clean-up
4385 // check if any remaining bytes left to decode
4386 #ifdef __PIC__
4387 "pushl %%ebx \n\t" // save index to Global Offset Table
4388 #endif
4389 "movl _MMXLength, %%ebx \n\t"
4390 "cmpl _FullLength, %%ebx \n\t"
4391 "jnb paeth_end \n\t"
4392 //pre "movl row, %%edi \n\t"
4393 //pre "movl prev_row, %%esi \n\t"
4394 // do Paeth decode for remaining bytes
4395 "movl %%ebx, %%edx \n\t"
4396 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4397 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4398 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4400 "paeth_lp2: \n\t"
4401 "xorl %%eax, %%eax \n\t"
4402 // pav = p - a = (a + b - c) - a = b - c
4403 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4404 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4405 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4406 "movl %%eax, _patemp \n\t" // Save pav for later use
4407 "xorl %%eax, %%eax \n\t"
4408 // pbv = p - b = (a + b - c) - b = a - c
4409 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4410 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4411 "movl %%eax, %%ecx \n\t"
4412 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4413 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4414 // pc = abs(pcv)
4415 "testl $0x80000000, %%eax \n\t"
4416 "jz paeth_pca2 \n\t"
4417 "negl %%eax \n\t" // reverse sign of neg values
4419 "paeth_pca2: \n\t"
4420 "movl %%eax, _pctemp \n\t" // save pc for later use
4421 // pb = abs(pbv)
4422 "testl $0x80000000, %%ecx \n\t"
4423 "jz paeth_pba2 \n\t"
4424 "negl %%ecx \n\t" // reverse sign of neg values
4426 "paeth_pba2: \n\t"
4427 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4428 // pa = abs(pav)
4429 "movl _patemp, %%eax \n\t"
4430 "testl $0x80000000, %%eax \n\t"
4431 "jz paeth_paa2 \n\t"
4432 "negl %%eax \n\t" // reverse sign of neg values
4434 "paeth_paa2: \n\t"
4435 "movl %%eax, _patemp \n\t" // save pa for later use
4436 // test if pa <= pb
4437 "cmpl %%ecx, %%eax \n\t"
4438 "jna paeth_abb2 \n\t"
4439 // pa > pb; now test if pb <= pc
4440 "cmpl _pctemp, %%ecx \n\t"
4441 "jna paeth_bbc2 \n\t"
4442 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4443 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4444 "jmp paeth_paeth2 \n\t"
4446 "paeth_bbc2: \n\t"
4447 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4448 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4449 "jmp paeth_paeth2 \n\t"
4451 "paeth_abb2: \n\t"
4452 // pa <= pb; now test if pa <= pc
4453 "cmpl _pctemp, %%eax \n\t"
4454 "jna paeth_abc2 \n\t"
4455 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4456 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4457 "jmp paeth_paeth2 \n\t"
4459 "paeth_abc2: \n\t"
4460 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4461 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4463 "paeth_paeth2: \n\t"
4464 "incl %%ebx \n\t"
4465 "incl %%edx \n\t"
4466 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4467 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4468 "cmpl _FullLength, %%ebx \n\t"
4469 "jb paeth_lp2 \n\t"
4471 "paeth_end: \n\t"
4472 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4473 #ifdef __PIC__
4474 "popl %%ebx \n\t" // restore index to Global Offset Table
4475 #endif
4477 : "=c" (dummy_value_c), // output regs (dummy)
4478 "=S" (dummy_value_S),
4479 "=D" (dummy_value_D)
4481 : "0" (bpp), // ecx // input regs
4482 "1" (prev_row), // esi
4483 "2" (row) // edi
4485 : "%eax", "%edx" // clobber list (no input regs!)
4486 #ifndef __PIC__
4487 , "%ebx"
4488 #endif
4491 } /* end png_read_filter_row_mmx_paeth() */
4492 #endif
4497 #ifdef PNG_THREAD_UNSAFE_OK
4498 //===========================================================================//
4499 // //
4500 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4501 // //
4502 //===========================================================================//
4504 // Optimized code for PNG Sub filter decoder
4506 static void /* PRIVATE */
4507 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4509 int bpp;
4510 int dummy_value_a;
4511 int dummy_value_D;
4513 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4514 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4516 __asm__ __volatile__ (
4517 //pre "movl row, %%edi \n\t"
4518 "movl %%edi, %%esi \n\t" // lp = row
4519 //pre "movl bpp, %%eax \n\t"
4520 "addl %%eax, %%edi \n\t" // rp = row + bpp
4521 //irr "xorl %%eax, %%eax \n\t"
4522 // get # of bytes to alignment
4523 "movl %%edi, _dif \n\t" // take start of row
4524 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4525 // alignment boundary
4526 "xorl %%ecx, %%ecx \n\t"
4527 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4528 "subl %%edi, _dif \n\t" // subtract from start ==> value
4529 "jz sub_go \n\t" // ecx at alignment
4531 "sub_lp1: \n\t" // fix alignment
4532 "movb (%%esi,%%ecx,), %%al \n\t"
4533 "addb %%al, (%%edi,%%ecx,) \n\t"
4534 "incl %%ecx \n\t"
4535 "cmpl _dif, %%ecx \n\t"
4536 "jb sub_lp1 \n\t"
4538 "sub_go: \n\t"
4539 "movl _FullLength, %%eax \n\t"
4540 "movl %%eax, %%edx \n\t"
4541 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4542 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4543 "subl %%edx, %%eax \n\t" // drop over bytes from length
4544 "movl %%eax, _MMXLength \n\t"
4546 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4547 "=D" (dummy_value_D) // 1
4549 : "0" (bpp), // eax // input regs
4550 "1" (row) // edi
4552 : "%esi", "%ecx", "%edx" // clobber list
4554 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4555 , "%mm0", "%mm1", "%mm2", "%mm3"
4556 , "%mm4", "%mm5", "%mm6", "%mm7"
4557 #endif
4560 // now do the math for the rest of the row
4561 switch (bpp)
4563 case 3:
4565 _ActiveMask.use = 0x0000ffffff000000LL;
4566 _ShiftBpp.use = 24; // == 3 * 8
4567 _ShiftRem.use = 40; // == 64 - 24
4569 __asm__ __volatile__ (
4570 // preload "movl row, %%edi \n\t"
4571 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4572 // active byte group
4573 "movl %%edi, %%esi \n\t" // lp = row
4574 // preload "movl bpp, %%eax \n\t"
4575 "addl %%eax, %%edi \n\t" // rp = row + bpp
4576 "movq %%mm7, %%mm6 \n\t"
4577 "movl _dif, %%edx \n\t"
4578 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4579 // 3rd active byte group
4580 // prime the pump: load the first Raw(x-bpp) data set
4581 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4583 "sub_3lp: \n\t" // shift data for adding first
4584 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4585 // shift clears inactive bytes)
4586 // add 1st active group
4587 "movq (%%edi,%%edx,), %%mm0 \n\t"
4588 "paddb %%mm1, %%mm0 \n\t"
4590 // add 2nd active group
4591 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4592 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4593 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4594 "paddb %%mm1, %%mm0 \n\t"
4596 // add 3rd active group
4597 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4598 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4599 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4600 "addl $8, %%edx \n\t"
4601 "paddb %%mm1, %%mm0 \n\t"
4603 "cmpl _MMXLength, %%edx \n\t"
4604 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4605 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4606 "jb sub_3lp \n\t"
4608 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4609 "=D" (dummy_value_D) // 1
4611 : "0" (bpp), // eax // input regs
4612 "1" (row) // edi
4614 : "%edx", "%esi" // clobber list
4615 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4616 , "%mm0", "%mm1", "%mm6", "%mm7"
4617 #endif
4620 break;
4622 case 1:
4624 __asm__ __volatile__ (
4625 "movl _dif, %%edx \n\t"
4626 // preload "movl row, %%edi \n\t"
4627 "cmpl _FullLength, %%edx \n\t"
4628 "jnb sub_1end \n\t"
4629 "movl %%edi, %%esi \n\t" // lp = row
4630 "xorl %%eax, %%eax \n\t"
4631 // preload "movl bpp, %%eax \n\t"
4632 "addl %%eax, %%edi \n\t" // rp = row + bpp
4634 "sub_1lp: \n\t"
4635 "movb (%%esi,%%edx,), %%al \n\t"
4636 "addb %%al, (%%edi,%%edx,) \n\t"
4637 "incl %%edx \n\t"
4638 "cmpl _FullLength, %%edx \n\t"
4639 "jb sub_1lp \n\t"
4641 "sub_1end: \n\t"
4643 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4644 "=D" (dummy_value_D) // 1
4646 : "0" (bpp), // eax // input regs
4647 "1" (row) // edi
4649 : "%edx", "%esi" // clobber list
4652 return;
4654 case 6:
4655 case 4:
4656 //case 7: // GRR BOGUS
4657 //case 5: // GRR BOGUS
4659 _ShiftBpp.use = bpp << 3;
4660 _ShiftRem.use = 64 - _ShiftBpp.use;
4662 __asm__ __volatile__ (
4663 // preload "movl row, %%edi \n\t"
4664 "movl _dif, %%edx \n\t"
4665 "movl %%edi, %%esi \n\t" // lp = row
4666 // preload "movl bpp, %%eax \n\t"
4667 "addl %%eax, %%edi \n\t" // rp = row + bpp
4669 // prime the pump: load the first Raw(x-bpp) data set
4670 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4672 "sub_4lp: \n\t" // shift data for adding first
4673 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4674 // shift clears inactive bytes)
4675 "movq (%%edi,%%edx,), %%mm0 \n\t"
4676 "paddb %%mm1, %%mm0 \n\t"
4678 // add 2nd active group
4679 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4680 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4681 "addl $8, %%edx \n\t"
4682 "paddb %%mm1, %%mm0 \n\t"
4684 "cmpl _MMXLength, %%edx \n\t"
4685 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4686 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4687 "jb sub_4lp \n\t"
4689 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4690 "=D" (dummy_value_D) // 1
4692 : "0" (bpp), // eax // input regs
4693 "1" (row) // edi
4695 : "%edx", "%esi" // clobber list
4696 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4697 , "%mm0", "%mm1"
4698 #endif
4701 break;
4703 case 2:
4705 _ActiveMask.use = 0x00000000ffff0000LL;
4706 _ShiftBpp.use = 16; // == 2 * 8
4707 _ShiftRem.use = 48; // == 64 - 16
4709 __asm__ __volatile__ (
4710 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4711 // active byte group
4712 "movl _dif, %%edx \n\t"
4713 "movq %%mm7, %%mm6 \n\t"
4714 // preload "movl row, %%edi \n\t"
4715 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4716 // 3rd active byte group
4717 "movl %%edi, %%esi \n\t" // lp = row
4718 "movq %%mm6, %%mm5 \n\t"
4719 // preload "movl bpp, %%eax \n\t"
4720 "addl %%eax, %%edi \n\t" // rp = row + bpp
4721 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4722 // 4th active byte group
4723 // prime the pump: load the first Raw(x-bpp) data set
4724 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4726 "sub_2lp: \n\t" // shift data for adding first
4727 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4728 // shift clears inactive bytes)
4729 // add 1st active group
4730 "movq (%%edi,%%edx,), %%mm0 \n\t"
4731 "paddb %%mm1, %%mm0 \n\t"
4733 // add 2nd active group
4734 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4735 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4736 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4737 "paddb %%mm1, %%mm0 \n\t"
4739 // add 3rd active group
4740 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4741 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4742 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4743 "paddb %%mm1, %%mm0 \n\t"
4745 // add 4th active group
4746 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4747 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4748 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4749 "addl $8, %%edx \n\t"
4750 "paddb %%mm1, %%mm0 \n\t"
4751 "cmpl _MMXLength, %%edx \n\t"
4752 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4753 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4754 "jb sub_2lp \n\t"
4756 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4757 "=D" (dummy_value_D) // 1
4759 : "0" (bpp), // eax // input regs
4760 "1" (row) // edi
4762 : "%edx", "%esi" // clobber list
4763 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4764 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4765 #endif
4768 break;
4770 case 8:
4772 __asm__ __volatile__ (
4773 // preload "movl row, %%edi \n\t"
4774 "movl _dif, %%edx \n\t"
4775 "movl %%edi, %%esi \n\t" // lp = row
4776 // preload "movl bpp, %%eax \n\t"
4777 "addl %%eax, %%edi \n\t" // rp = row + bpp
4778 "movl _MMXLength, %%ecx \n\t"
4780 // prime the pump: load the first Raw(x-bpp) data set
4781 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4782 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4784 "sub_8lp: \n\t"
4785 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4786 "paddb %%mm7, %%mm0 \n\t"
4787 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4788 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4790 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4791 // This will be repeated for each group of 8 bytes with the 8th
4792 // group being used as the Raw(x-bpp) for the 1st group of the
4793 // next loop.
4795 "paddb %%mm0, %%mm1 \n\t"
4796 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4797 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4798 "paddb %%mm1, %%mm2 \n\t"
4799 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4800 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4801 "paddb %%mm2, %%mm3 \n\t"
4802 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4803 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4804 "paddb %%mm3, %%mm4 \n\t"
4805 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4806 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4807 "paddb %%mm4, %%mm5 \n\t"
4808 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4809 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4810 "paddb %%mm5, %%mm6 \n\t"
4811 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4812 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4813 "addl $64, %%edx \n\t"
4814 "paddb %%mm6, %%mm7 \n\t"
4815 "cmpl %%ecx, %%edx \n\t"
4816 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4817 "jb sub_8lp \n\t"
4819 "cmpl _MMXLength, %%edx \n\t"
4820 "jnb sub_8lt8 \n\t"
4822 "sub_8lpA: \n\t"
4823 "movq (%%edi,%%edx,), %%mm0 \n\t"
4824 "addl $8, %%edx \n\t"
4825 "paddb %%mm7, %%mm0 \n\t"
4826 "cmpl _MMXLength, %%edx \n\t"
4827 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4828 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4829 // to mm1 to be new Raw(x-bpp)
4830 // for next loop
4831 "jb sub_8lpA \n\t"
4833 "sub_8lt8: \n\t"
4835 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4836 "=D" (dummy_value_D) // 1
4838 : "0" (bpp), // eax // input regs
4839 "1" (row) // edi
4841 : "%ecx", "%edx", "%esi" // clobber list
4842 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4843 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4844 #endif
4847 break;
4849 default: // bpp greater than 8 bytes GRR BOGUS
4851 __asm__ __volatile__ (
4852 "movl _dif, %%edx \n\t"
4853 // preload "movl row, %%edi \n\t"
4854 "movl %%edi, %%esi \n\t" // lp = row
4855 // preload "movl bpp, %%eax \n\t"
4856 "addl %%eax, %%edi \n\t" // rp = row + bpp
4858 "sub_Alp: \n\t"
4859 "movq (%%edi,%%edx,), %%mm0 \n\t"
4860 "movq (%%esi,%%edx,), %%mm1 \n\t"
4861 "addl $8, %%edx \n\t"
4862 "paddb %%mm1, %%mm0 \n\t"
4863 "cmpl _MMXLength, %%edx \n\t"
4864 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4865 // -8 to offset addl edx
4866 "jb sub_Alp \n\t"
4868 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4869 "=D" (dummy_value_D) // 1
4871 : "0" (bpp), // eax // input regs
4872 "1" (row) // edi
4874 : "%edx", "%esi" // clobber list
4875 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4876 , "%mm0", "%mm1"
4877 #endif
4880 break;
4882 } // end switch (bpp)
4884 __asm__ __volatile__ (
4885 "movl _MMXLength, %%edx \n\t"
4886 //pre "movl row, %%edi \n\t"
4887 "cmpl _FullLength, %%edx \n\t"
4888 "jnb sub_end \n\t"
4890 "movl %%edi, %%esi \n\t" // lp = row
4891 //pre "movl bpp, %%eax \n\t"
4892 "addl %%eax, %%edi \n\t" // rp = row + bpp
4893 "xorl %%eax, %%eax \n\t"
4895 "sub_lp2: \n\t"
4896 "movb (%%esi,%%edx,), %%al \n\t"
4897 "addb %%al, (%%edi,%%edx,) \n\t"
4898 "incl %%edx \n\t"
4899 "cmpl _FullLength, %%edx \n\t"
4900 "jb sub_lp2 \n\t"
4902 "sub_end: \n\t"
4903 "EMMS \n\t" // end MMX instructions
4905 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4906 "=D" (dummy_value_D) // 1
4908 : "0" (bpp), // eax // input regs
4909 "1" (row) // edi
4911 : "%edx", "%esi" // clobber list
4914 } // end of png_read_filter_row_mmx_sub()
4915 #endif
4920 //===========================================================================//
4921 // //
4922 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4923 // //
4924 //===========================================================================//
4926 // Optimized code for PNG Up filter decoder
4928 static void /* PRIVATE */
4929 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4930 png_bytep prev_row)
4932 png_uint_32 len;
4933 int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4934 int dummy_value_S;
4935 int dummy_value_D;
4937 len = row_info->rowbytes; // number of bytes to filter
4939 __asm__ __volatile__ (
4940 //pre "movl row, %%edi \n\t"
4941 // get # of bytes to alignment
4942 #ifdef __PIC__
4943 "pushl %%ebx \n\t"
4944 #endif
4945 "movl %%edi, %%ecx \n\t"
4946 "xorl %%ebx, %%ebx \n\t"
4947 "addl $0x7, %%ecx \n\t"
4948 "xorl %%eax, %%eax \n\t"
4949 "andl $0xfffffff8, %%ecx \n\t"
4950 //pre "movl prev_row, %%esi \n\t"
4951 "subl %%edi, %%ecx \n\t"
4952 "jz up_go \n\t"
4954 "up_lp1: \n\t" // fix alignment
4955 "movb (%%edi,%%ebx,), %%al \n\t"
4956 "addb (%%esi,%%ebx,), %%al \n\t"
4957 "incl %%ebx \n\t"
4958 "cmpl %%ecx, %%ebx \n\t"
4959 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4960 "jb up_lp1 \n\t" // offset incl ebx
4962 "up_go: \n\t"
4963 //pre "movl len, %%edx \n\t"
4964 "movl %%edx, %%ecx \n\t"
4965 "subl %%ebx, %%edx \n\t" // subtract alignment fix
4966 "andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4967 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4969 // unrolled loop - use all MMX registers and interleave to reduce
4970 // number of branch instructions (loops) and reduce partial stalls
4971 "up_loop: \n\t"
4972 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4973 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4974 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4975 "paddb %%mm1, %%mm0 \n\t"
4976 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4977 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4978 "paddb %%mm3, %%mm2 \n\t"
4979 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4980 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4981 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4982 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4983 "paddb %%mm5, %%mm4 \n\t"
4984 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4985 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4986 "paddb %%mm7, %%mm6 \n\t"
4987 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4988 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4989 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4990 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4991 "paddb %%mm1, %%mm0 \n\t"
4992 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4993 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4994 "paddb %%mm3, %%mm2 \n\t"
4995 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4996 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4997 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4998 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4999 "paddb %%mm5, %%mm4 \n\t"
5000 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
5001 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
5002 "addl $64, %%ebx \n\t"
5003 "paddb %%mm7, %%mm6 \n\t"
5004 "cmpl %%ecx, %%ebx \n\t"
5005 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
5006 "jb up_loop \n\t" // -8 to offset addl ebx
5008 "cmpl $0, %%edx \n\t" // test for bytes over mult of 64
5009 "jz up_end \n\t"
5011 "cmpl $8, %%edx \n\t" // test for less than 8 bytes
5012 "jb up_lt8 \n\t" // [added by lcreeve at netins.net]
5014 "addl %%edx, %%ecx \n\t"
5015 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
5016 "subl %%edx, %%ecx \n\t" // drop over bytes from length
5017 "jz up_lt8 \n\t"
5019 "up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
5020 "movq (%%esi,%%ebx,), %%mm1 \n\t"
5021 "movq (%%edi,%%ebx,), %%mm0 \n\t"
5022 "addl $8, %%ebx \n\t"
5023 "paddb %%mm1, %%mm0 \n\t"
5024 "cmpl %%ecx, %%ebx \n\t"
5025 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
5026 "jb up_lpA \n\t" // offset add ebx
5027 "cmpl $0, %%edx \n\t" // test for bytes over mult of 8
5028 "jz up_end \n\t"
5030 "up_lt8: \n\t"
5031 "xorl %%eax, %%eax \n\t"
5032 "addl %%edx, %%ecx \n\t" // move over byte count into counter
5034 "up_lp2: \n\t" // use x86 regs for remaining bytes
5035 "movb (%%edi,%%ebx,), %%al \n\t"
5036 "addb (%%esi,%%ebx,), %%al \n\t"
5037 "incl %%ebx \n\t"
5038 "cmpl %%ecx, %%ebx \n\t"
5039 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
5040 "jb up_lp2 \n\t" // offset inc ebx
5042 "up_end: \n\t"
5043 "EMMS \n\t" // conversion of filtered row complete
5044 #ifdef __PIC__
5045 "popl %%ebx \n\t"
5046 #endif
5048 : "=d" (dummy_value_d), // 0 // output regs (dummy)
5049 "=S" (dummy_value_S), // 1
5050 "=D" (dummy_value_D) // 2
5052 : "0" (len), // edx // input regs
5053 "1" (prev_row), // esi
5054 "2" (row) // edi
5056 : "%eax", "%ecx" // clobber list (no input regs!)
5057 #ifndef __PIC__
5058 , "%ebx"
5059 #endif
5061 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5062 , "%mm0", "%mm1", "%mm2", "%mm3"
5063 , "%mm4", "%mm5", "%mm6", "%mm7"
5064 #endif
5067 } // end of png_read_filter_row_mmx_up()
5069 #endif /* PNG_MMX_CODE_SUPPORTED */
5074 /*===========================================================================*/
5075 /* */
5076 /* P N G _ R E A D _ F I L T E R _ R O W */
5077 /* */
5078 /*===========================================================================*/
5081 /* Optimized png_read_filter_row routines */
5083 void /* PRIVATE */
5084 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5085 row, png_bytep prev_row, int filter)
5087 #ifdef PNG_DEBUG
5088 char filnm[10];
5089 #endif
5091 #if defined(PNG_MMX_CODE_SUPPORTED)
5092 /* GRR: these are superseded by png_ptr->asm_flags: */
5093 #define UseMMX_sub 1 // GRR: converted 20000730
5094 #define UseMMX_up 1 // GRR: converted 20000729
5095 #define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
5096 #define UseMMX_paeth 1 // GRR: converted 20000828
5098 if (_mmx_supported == 2) {
5099 /* this should have happened in png_init_mmx_flags() already */
5100 #if !defined(PNG_1_0_X)
5101 png_warning(png_ptr, "asm_flags may not have been initialized");
5102 #endif
5103 png_mmx_support();
5105 #endif /* PNG_MMX_CODE_SUPPORTED */
5107 #ifdef PNG_DEBUG
5108 png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5109 switch (filter)
5111 case 0: sprintf(filnm, "none");
5112 break;
5113 case 1: sprintf(filnm, "sub-%s",
5114 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5115 #if !defined(PNG_1_0_X)
5116 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5117 #endif
5118 #endif
5119 "x86");
5120 break;
5121 case 2: sprintf(filnm, "up-%s",
5122 #ifdef PNG_MMX_CODE_SUPPORTED
5123 #if !defined(PNG_1_0_X)
5124 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5125 #endif
5126 #endif
5127 "x86");
5128 break;
5129 case 3: sprintf(filnm, "avg-%s",
5130 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5131 #if !defined(PNG_1_0_X)
5132 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5133 #endif
5134 #endif
5135 "x86");
5136 break;
5137 case 4: sprintf(filnm, "Paeth-%s",
5138 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5139 #if !defined(PNG_1_0_X)
5140 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5141 #endif
5142 #endif
5143 "x86");
5144 break;
5145 default: sprintf(filnm, "unknw");
5146 break;
5148 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5149 png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5150 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5151 (int)((row_info->pixel_depth + 7) >> 3));
5152 png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5153 #endif /* PNG_DEBUG */
5155 switch (filter)
5157 case PNG_FILTER_VALUE_NONE:
5158 break;
5160 case PNG_FILTER_VALUE_SUB:
5161 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5162 #if !defined(PNG_1_0_X)
5163 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5164 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5165 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5166 #else
5167 if (_mmx_supported)
5168 #endif
5170 png_read_filter_row_mmx_sub(row_info, row);
5172 else
5173 #endif /* PNG_MMX_CODE_SUPPORTED */
5175 png_uint_32 i;
5176 png_uint_32 istop = row_info->rowbytes;
5177 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5178 png_bytep rp = row + bpp;
5179 png_bytep lp = row;
5181 for (i = bpp; i < istop; i++)
5183 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5184 rp++;
5186 } /* end !UseMMX_sub */
5187 break;
5189 case PNG_FILTER_VALUE_UP:
5190 #if defined(PNG_MMX_CODE_SUPPORTED)
5191 #if !defined(PNG_1_0_X)
5192 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5193 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5194 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5195 #else
5196 if (_mmx_supported)
5197 #endif
5199 png_read_filter_row_mmx_up(row_info, row, prev_row);
5201 else
5202 #endif /* PNG_MMX_CODE_SUPPORTED */
5204 png_uint_32 i;
5205 png_uint_32 istop = row_info->rowbytes;
5206 png_bytep rp = row;
5207 png_bytep pp = prev_row;
5209 for (i = 0; i < istop; ++i)
5211 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5212 rp++;
5214 } /* end !UseMMX_up */
5215 break;
5217 case PNG_FILTER_VALUE_AVG:
5218 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5219 #if !defined(PNG_1_0_X)
5220 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5221 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5222 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5223 #else
5224 if (_mmx_supported)
5225 #endif
5227 png_read_filter_row_mmx_avg(row_info, row, prev_row);
5229 else
5230 #endif /* PNG_MMX_CODE_SUPPORTED */
5232 png_uint_32 i;
5233 png_bytep rp = row;
5234 png_bytep pp = prev_row;
5235 png_bytep lp = row;
5236 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5237 png_uint_32 istop = row_info->rowbytes - bpp;
5239 for (i = 0; i < bpp; i++)
5241 *rp = (png_byte)(((int)(*rp) +
5242 ((int)(*pp++) >> 1)) & 0xff);
5243 rp++;
5246 for (i = 0; i < istop; i++)
5248 *rp = (png_byte)(((int)(*rp) +
5249 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5250 rp++;
5252 } /* end !UseMMX_avg */
5253 break;
5255 case PNG_FILTER_VALUE_PAETH:
5256 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5257 #if !defined(PNG_1_0_X)
5258 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5259 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5260 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5261 #else
5262 if (_mmx_supported)
5263 #endif
5265 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5267 else
5268 #endif /* PNG_MMX_CODE_SUPPORTED */
5270 png_uint_32 i;
5271 png_bytep rp = row;
5272 png_bytep pp = prev_row;
5273 png_bytep lp = row;
5274 png_bytep cp = prev_row;
5275 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5276 png_uint_32 istop = row_info->rowbytes - bpp;
5278 for (i = 0; i < bpp; i++)
5280 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5281 rp++;
5284 for (i = 0; i < istop; i++) /* use leftover rp,pp */
5286 int a, b, c, pa, pb, pc, p;
5288 a = *lp++;
5289 b = *pp++;
5290 c = *cp++;
5292 p = b - c;
5293 pc = a - c;
5295 #ifdef PNG_USE_ABS
5296 pa = abs(p);
5297 pb = abs(pc);
5298 pc = abs(p + pc);
5299 #else
5300 pa = p < 0 ? -p : p;
5301 pb = pc < 0 ? -pc : pc;
5302 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5303 #endif
5306 if (pa <= pb && pa <= pc)
5307 p = a;
5308 else if (pb <= pc)
5309 p = b;
5310 else
5311 p = c;
5314 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5316 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5317 rp++;
5319 } /* end !UseMMX_paeth */
5320 break;
5322 default:
5323 png_warning(png_ptr, "Ignoring bad row-filter type");
5324 *row=0;
5325 break;
5329 #endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
5332 /*===========================================================================*/
5333 /* */
5334 /* P N G _ M M X _ S U P P O R T */
5335 /* */
5336 /*===========================================================================*/
/* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
 *             (2) all instructions compile with gcc 2.7.2.3 and later
 *             (3) the function is moved down here to prevent gcc from
 *                  inlining it in multiple places and then barfing be-
 *                  cause the ".NOT_SUPPORTED" label is multiply defined
 *                  [is there a way to signal that a *single* function should
 *                  not be inlined?  is there a way to modify the label for
 *                  each inlined instance, e.g., by appending _1, _2, etc.?
 *                  maybe if we don't use a leading "." in the label name?
 *                  (nope...sigh)]
 */
5349 int PNGAPI
5350 png_mmx_support(void)
5352 #if defined(PNG_MMX_CODE_SUPPORTED)
5353 int result;
5354 __asm__ __volatile__ (
5355 "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
5356 "pushl %%ecx \n\t" // so does ecx...
5357 "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
5358 // ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
5359 // "pushf \n\t" // 16-bit pushf
5360 "pushfl \n\t" // save Eflag to stack
5361 "popl %%eax \n\t" // get Eflag from stack into eax
5362 "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
5363 "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5364 "pushl %%eax \n\t" // save modified Eflag back to stack
5365 // ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
5366 // "popf \n\t" // 16-bit popf
5367 "popfl \n\t" // restore modified value to Eflag reg
5368 "pushfl \n\t" // save Eflag to stack
5369 "popl %%eax \n\t" // get Eflag from stack
5370 "pushl %%ecx \n\t" // save original Eflag to stack
5371 "popfl \n\t" // restore original Eflag
5372 "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
5373 "jz 0f \n\t" // if same, CPUID instr. is not supported
5375 "xorl %%eax, %%eax \n\t" // set eax to zero
5376 // ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
5377 "cpuid \n\t" // get the CPU identification info
5378 "cmpl $1, %%eax \n\t" // make sure eax return non-zero value
5379 "jl 0f \n\t" // if eax is zero, MMX is not supported
5381 "xorl %%eax, %%eax \n\t" // set eax to zero and...
5382 "incl %%eax \n\t" // ...increment eax to 1. This pair is
5383 // faster than the instruction "mov eax, 1"
5384 "cpuid \n\t" // get the CPU identification info again
5385 "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5386 "cmpl $0, %%edx \n\t" // 0 = MMX not supported
5387 "jz 0f \n\t" // non-zero = yes, MMX IS supported
5389 "movl $1, %%eax \n\t" // set return value to 1
5390 "jmp 1f \n\t" // DONE: have MMX support
5392 "0: \n\t" // .NOT_SUPPORTED: target label for jump instructions
5393 "movl $0, %%eax \n\t" // set return value to 0
5394 "1: \n\t" // .RETURN: target label for jump instructions
5395 "popl %%edx \n\t" // restore edx
5396 "popl %%ecx \n\t" // restore ecx
5397 "popl %%ebx \n\t" // restore ebx
5399 // "ret \n\t" // DONE: no MMX support
5400 // (fall through to standard C "ret")
5402 : "=a" (result) // output list
5404 : // any variables used on input (none)
5406 // no clobber list
5407 // , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
5408 // , "memory" // if write to a variable gcc thought was in a reg
5409 // , "cc" // "condition codes" (flag bits)
5411 _mmx_supported = result;
5412 #else
5413 _mmx_supported = 0;
5414 #endif /* PNG_MMX_CODE_SUPPORTED */
5416 return _mmx_supported;
5420 #endif /* PNG_USE_PNGGCCRD */