2 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
4 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
6 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
7 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
8 * for Intel's performance analysis of the MMX vs. non-MMX code.
10 * Last changed in libpng 1.2.15 January 5, 2007
11 * For conditions of distribution and use, see copyright notice in png.h
12 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
13 * Copyright (c) 1998, Intel Corporation
15 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
16 * Interface to libpng contributed by Gilles Vollant, 1999.
17 * GNU C port by Greg Roelofs, 1999-2001.
19 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
21 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
23 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
25 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
26 * is required to assemble the newer MMX instructions such as movq.
29 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
31 * (or a later version in the same directory). For Linux, check your
32 * distribution's web site(s) or try these links:
34 * http://rufus.w3.org/linux/RPM/binutils.html
35 * http://www.debian.org/Packages/stable/devel/binutils.html
36 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
39 * For other platforms, see the main GNU site:
41 * ftp://ftp.gnu.org/pub/gnu/binutils/
43 * Version 2.5.2l.15 is definitely too old...
47 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
48 * =====================================
51 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
54 * - additional optimizations (possible or definite):
55 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
56 * - write MMX code for 48-bit case (pixel_bytes == 6)
57 * - figure out what's up with 24-bit case (pixel_bytes == 3):
58 * why subtract 8 from width_mmx in the pass 4/5 case?
59 * (only width_mmx case) (near line 1606)
60 * x [DONE] replace pixel_bytes within each block with the true
61 * constant value (or are compilers smart enough to do that?)
62 * - rewrite all MMX interlacing code so it's aligned with
63 * the *beginning* of the row buffer, not the end. This
64 * would not only allow one to eliminate half of the memory
65 * writes for odd passes (that is, pass == odd), it may also
66 * eliminate some unaligned-data-access exceptions (assuming
67 * there's a penalty for not aligning 64-bit accesses on
68 * 64-bit boundaries). The only catch is that the "leftover"
69 * pixel(s) at the end of the row would have to be saved,
70 * but there are enough unused MMX registers in every case,
71 * so this is not a problem. A further benefit is that the
72 * post-MMX cleanup code (C code) in at least some of the
73 * cases could be done within the assembler block.
74 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
75 * inconsistent, and don't match the MMX Programmer's Reference
76 * Manual conventions anyway. They should be changed to
77 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
78 * was lowest in memory (e.g., corresponding to a left pixel)
79 * and b7 is the byte that was highest (e.g., a right pixel).
82 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
83 * want globals prefixed by underscores when referencing them--
84 * i.e., if the variable is const4, then refer to it as const4,
85 * not _const4. This seems to be a djgpp-specific requirement.
86 * Also, such variables apparently *must* be declared outside
87 * of functions; neither static nor automatic variables work if
88 * defined within the scope of a single function, but both
89 * static and truly global (multi-module) variables work fine.
92 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
93 * - switched from string-concatenation-with-macros to cleaner method of
94 * renaming global variables for djgpp--i.e., always use prefixes in
95 * inlined assembler code (== strings) and conditionally rename the
96 * variables, not the other way around. Hence _const4, _mask8_0, etc.
99 * - fixed mmxsupport()/png_do_read_interlace() first-row bug
100 * This one was severely weird: even though mmxsupport() doesn't touch
101 * ebx (where "row" pointer was stored), it nevertheless managed to zero
102 * the register (even in static/non-fPIC code--see below), which in turn
103 * caused png_do_read_interlace() to return prematurely on the first row of
104 * interlaced images (i.e., without expanding the interlaced pixels).
105 * Inspection of the generated assembly code didn't turn up any clues,
106 * although it did point at a minor optimization (i.e., get rid of
107 * mmx_supported_local variable and just use eax). Possibly the CPUID
108 * instruction is more destructive than it looks? (Not yet checked.)
109 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
110 * listings... Apparently register spillage has to do with ebx, since
111 * it's used to index the global offset table. Commenting it out of the
112 * input-reg lists in png_combine_row() eliminated compiler barfage, so
113 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
116 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
117 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
120 * - made "diff" variable (now "_dif") global to simplify conversion of
121 * filtering routines (running out of regs, sigh). "diff" is still used
122 * in interlacing routines, however.
123 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
124 * macro determines which is used); original not yet tested.
127 * - when compiling with gcc, be sure to use -fomit-frame-pointer
130 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
131 * pass == 4 or 5, that caused visible corruption of interlaced images
134 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
135 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
136 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
137 * Chuck Wilson supplied a patch involving dummy output registers. See
138 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
139 * for the original (anonymous) SourceForge bug report.
142 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
143 * pnggccrd.c: In function `png_combine_row':
144 * pnggccrd.c:525: more than 10 operands in `asm'
145 * pnggccrd.c:669: more than 10 operands in `asm'
146 * pnggccrd.c:828: more than 10 operands in `asm'
147 * pnggccrd.c:994: more than 10 operands in `asm'
148 * pnggccrd.c:1177: more than 10 operands in `asm'
149 * They are all the same problem and can be worked around by using the
150 * global _unmask variable unconditionally, not just in the -fPIC case.
151 * Reportedly earlier versions of gcc also have the problem with more than
152 * 10 operands; they just don't report it. Much strangeness ensues, etc.
155 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
156 * MMX routine); began converting png_read_filter_row_mmx_sub()
157 * - to finish remaining sections:
158 * - clean up indentation and comments
159 * - preload local variables
160 * - add output and input regs (order of former determines numerical
162 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
163 * - remove "$" from addressing of Shift and Mask variables [20000823]
166 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
169 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
170 * shared-library (-fPIC) version! Code works just fine as part of static
171 * library. Damn damn damn damn damn, should have tested that sooner.
172 * ebx is getting clobbered again (explicitly this time); need to save it
173 * on stack or rewrite asm code to avoid using it altogether. Blargh!
176 * - first section was trickiest; all remaining sections have ebx -> edx now.
177 * (-fPIC works again.) Also added missing underscores to various Shift*
178 * and *Mask* globals and got rid of leading "$" signs.
181 * - added visual separators to help navigate microscopic printed copies
182 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
183 * on png_read_filter_row_mmx_avg()
186 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
187 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
188 * cleaned up/shortened in either routine, but functionality is complete
189 * and seems to be working fine.
192 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
193 * as an input reg (with dummy output variables, etc.), then it *cannot*
194 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
195 * is simple enough...
198 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
199 * correctly (but 48-bit RGB just fine)
202 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
203 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
204 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
205 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
208 * - added new png_init_mmx_flags() function (here only because it needs to
209 * call mmxsupport(), which should probably become global png_mmxsupport());
210 * modified other MMX routines to run conditionally (png_ptr->asm_flags)
213 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
214 * and made it public; moved png_init_mmx_flags() to png.c as internal func
217 * - removed dependency on png_read_filter_row_c() (C code already duplicated
218 * within MMX version of png_read_filter_row()) so no longer necessary to
219 * compile it into pngrutil.o
222 * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
225 * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
228 * - more tinkering with clobber list at lines 4529 and 5033, to get
229 * it to compile on gcc-3.4.
232 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
233 * - write MMX code for 48-bit case (pixel_bytes == 6)
234 * - figure out what's up with 24-bit case (pixel_bytes == 3):
235 * why subtract 8 from width_mmx in the pass 4/5 case?
236 * (only width_mmx case) (near line 1606)
237 * - rewrite all MMX interlacing code so it's aligned with beginning
238 * of the row buffer, not the end (see 19991007 for details)
239 * x pick one version of mmxsupport() and get rid of the other
240 * - add error messages to any remaining bogus default cases
241 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
242 * x add support for runtime enable/disable/query of various MMX routines
248 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
/* Forward declaration: runtime probe for MMX capability.  Per the 20020118
 * changelog note above, it also auto-sets the _mmx_supported flag. */
250 int PNGAPI
png_mmx_support(void);
252 #ifdef PNG_USE_LOCAL_ARRAYS
253 const static int FARDATA png_pass_start
[7] = {0, 4, 0, 2, 0, 1, 0};
254 const static int FARDATA png_pass_inc
[7] = {8, 8, 4, 4, 2, 2, 1};
255 const static int FARDATA png_pass_width
[7] = {8, 4, 4, 2, 2, 1, 1};
258 #if defined(PNG_MMX_CODE_SUPPORTED)
259 /* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
260 * so define them without: */
261 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
263 # define _mmx_supported mmx_supported
264 # define _const4 const4
265 # define _const6 const6
266 # define _mask8_0 mask8_0
267 # define _mask16_1 mask16_1
268 # define _mask16_0 mask16_0
269 # define _mask24_2 mask24_2
270 # define _mask24_1 mask24_1
271 # define _mask24_0 mask24_0
272 # define _mask32_3 mask32_3
273 # define _mask32_2 mask32_2
274 # define _mask32_1 mask32_1
275 # define _mask32_0 mask32_0
276 # define _mask48_5 mask48_5
277 # define _mask48_4 mask48_4
278 # define _mask48_3 mask48_3
279 # define _mask48_2 mask48_2
280 # define _mask48_1 mask48_1
281 # define _mask48_0 mask48_0
282 # define _LBCarryMask LBCarryMask
283 # define _HBClearMask HBClearMask
284 # define _ActiveMask ActiveMask
285 # define _ActiveMask2 ActiveMask2
286 # define _ActiveMaskEnd ActiveMaskEnd
287 # define _ShiftBpp ShiftBpp
288 # define _ShiftRem ShiftRem
289 #ifdef PNG_THREAD_UNSAFE_OK
290 # define _unmask unmask
291 # define _FullLength FullLength
292 # define _MMXLength MMXLength
294 # define _patemp patemp
295 # define _pbtemp pbtemp
296 # define _pctemp pctemp
301 /* These constants are used in the inlined MMX assembly code.
302 Ignore gcc's "At top level: defined but not used" warnings. */
304 /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
305 * since that case uses the %ebx register for indexing the Global Offset Table
306 * and there were no other registers available. But gcc 2.95 and later emit
307 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
308 * in the non-PIC case, so we'll just use the global unconditionally now.
310 #ifdef PNG_THREAD_UNSAFE_OK
/* 64-bit mask constants for the inlined MMX code in png_combine_row().
 * Each value replicates the 8-bit interlace mask so that, after the
 * punpck/pand/pcmpeqb sequence in the asm blocks below, one mask byte
 * lines up with one byte of pixel data for the given pixel depth.
 * Ignore gcc's "At top level: defined but not used" warnings (see note
 * above); png_squelch_warnings() exists to reference them. */
static const unsigned long long _mask8_0  = 0x0102040810204080LL;

static const unsigned long long _mask16_1 = 0x0101020204040808LL;
static const unsigned long long _mask16_0 = 0x1010202040408080LL;

static const unsigned long long _mask24_2 = 0x0101010202020404LL;
static const unsigned long long _mask24_1 = 0x0408080810101020LL;
static const unsigned long long _mask24_0 = 0x2020404040808080LL;

static const unsigned long long _mask32_3 = 0x0101010102020202LL;
static const unsigned long long _mask32_2 = 0x0404040408080808LL;
static const unsigned long long _mask32_1 = 0x1010101020202020LL;
static const unsigned long long _mask32_0 = 0x4040404080808080LL;

static const unsigned long long _mask48_5 = 0x0101010101010202LL;
static const unsigned long long _mask48_4 = 0x0202020204040404LL;
static const unsigned long long _mask48_3 = 0x0404080808080808LL;
static const unsigned long long _mask48_2 = 0x1010101010102020LL;
static const unsigned long long _mask48_1 = 0x2020202040404040LL;
static const unsigned long long _mask48_0 = 0x4040808080808080LL;

/* Constants used by the row-filter routines. */
static const unsigned long long _const4   = 0x0000000000FFFFFFLL;
//static const unsigned long long _const5 = 0x000000FFFFFF0000LL;  // NOT USED
static const unsigned long long _const6   = 0x00000000000000FFLL;
339 // These are used in the row-filter routines and should/would be local
340 // variables if not for gcc addressing limitations.
341 // WARNING: Their presence probably defeats the thread safety of libpng.
343 #ifdef PNG_THREAD_UNSAFE_OK
344 static png_uint_32 _FullLength
;
345 static png_uint_32 _MMXLength
;
347 static int _patemp
; // temp variables for Paeth routine
/* png_squelch_warnings: performs harmless self-assignments on the
 * file-scope MMX globals purely so gcc does not emit "At top level:
 * defined but not used" warnings for them (see the note near the
 * constant definitions above).  No runtime effect is intended.
 * NOTE(review): this listing is fragmented -- the return type line,
 * several self-assignments (e.g. _mask8_0, _FullLength), and the
 * braces appear to have been lost in extraction; verify against the
 * original pnggccrd.c before relying on the exact statement list. */
353 png_squelch_warnings(void)
355 #ifdef PNG_THREAD_UNSAFE_OK
/* each statement below is a no-op self-assignment whose only purpose is
 * to "use" the corresponding global */
360 _MMXLength
= _MMXLength
;
365 _mask16_1
= _mask16_1
;
366 _mask16_0
= _mask16_0
;
367 _mask24_2
= _mask24_2
;
368 _mask24_1
= _mask24_1
;
369 _mask24_0
= _mask24_0
;
370 _mask32_3
= _mask32_3
;
371 _mask32_2
= _mask32_2
;
372 _mask32_1
= _mask32_1
;
373 _mask32_0
= _mask32_0
;
374 _mask48_5
= _mask48_5
;
375 _mask48_4
= _mask48_4
;
376 _mask48_3
= _mask48_3
;
377 _mask48_2
= _mask48_2
;
378 _mask48_1
= _mask48_1
;
379 _mask48_0
= _mask48_0
;
381 #endif /* PNG_MMX_CODE_SUPPORTED */
/* MMX-availability flag: 2 means "not yet determined" (png_combine_row()
 * warns when it sees 2, since png_init_mmx_flags() should already have run);
 * png_mmx_support() is expected to set it to the real 0/1 answer. */
static int _mmx_supported = 2;
386 /*===========================================================================*/
388 /* P N G _ C O M B I N E _ R O W */
390 /*===========================================================================*/
392 #if defined(PNG_HAVE_MMX_COMBINE_ROW)
/* Bytes-per-pixel shorthands (a.k.a. pixel_bytes), defined only to help
 * avoid cut-and-paste errors in the per-depth cases of png_combine_row(). */
#define BPP3 3
#define BPP6 6
400 /* Combines the row recently read in with the previous row.
401 This routine takes care of alpha and transparency if requested.
402 This routine also handles the two methods of progressive display
403 of interlaced images, depending on the mask value.
404 The mask value describes which pixels are to be combined with
405 the row. The pattern always repeats every 8 pixels, so just 8
406 bits are needed. A one indicates the pixel is to be combined; a
407 zero indicates the pixel is to be skipped. This is in addition
408 to any alpha or transparency value associated with the pixel.
409 If you want all pixels to be combined, pass 0xff (255) in mask. */
411 /* Use this routine for the x86 platform - it uses a faster MMX routine
412 if the machine supports MMX. */
415 png_combine_row(png_structp png_ptr
, png_bytep row
, int mask
)
417 png_debug(1, "in png_combine_row (pnggccrd.c)\n");
419 #if defined(PNG_MMX_CODE_SUPPORTED)
420 if (_mmx_supported
== 2) {
421 #if !defined(PNG_1_0_X)
422 /* this should have happened in png_init_mmx_flags() already */
423 png_warning(png_ptr
, "asm_flags may not have been initialized");
431 png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
432 png_memcpy(row
, png_ptr
->row_buf
+ 1,
433 (png_size_t
)PNG_ROWBYTES(png_ptr
->row_info
.pixel_depth
,png_ptr
->width
));
435 else /* (png_combine_row() is never called with mask == 0) */
437 switch (png_ptr
->row_info
.pixel_depth
)
439 case 1: /* png_ptr->row_info.pixel_depth */
443 int s_inc
, s_start
, s_end
;
448 sp
= png_ptr
->row_buf
+ 1;
451 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
452 if (png_ptr
->transformations
& PNG_PACKSWAP
)
468 for (i
= 0; i
< png_ptr
->width
; i
++)
474 value
= (*sp
>> shift
) & 0x1;
475 *dp
&= (png_byte
)((0x7f7f >> (7 - shift
)) & 0xff);
476 *dp
|= (png_byte
)(value
<< shift
);
496 case 2: /* png_ptr->row_info.pixel_depth */
500 int s_start
, s_end
, s_inc
;
506 sp
= png_ptr
->row_buf
+ 1;
509 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
510 if (png_ptr
->transformations
& PNG_PACKSWAP
)
526 for (i
= 0; i
< png_ptr
->width
; i
++)
530 value
= (*sp
>> shift
) & 0x3;
531 *dp
&= (png_byte
)((0x3f3f >> (6 - shift
)) & 0xff);
532 *dp
|= (png_byte
)(value
<< shift
);
551 case 4: /* png_ptr->row_info.pixel_depth */
555 int s_start
, s_end
, s_inc
;
561 sp
= png_ptr
->row_buf
+ 1;
564 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
565 if (png_ptr
->transformations
& PNG_PACKSWAP
)
580 for (i
= 0; i
< png_ptr
->width
; i
++)
584 value
= (*sp
>> shift
) & 0xf;
585 *dp
&= (png_byte
)((0xf0f >> (4 - shift
)) & 0xff);
586 *dp
|= (png_byte
)(value
<< shift
);
605 case 8: /* png_ptr->row_info.pixel_depth */
610 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
611 #if !defined(PNG_1_0_X)
612 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
613 /* && _mmx_supported */ )
620 int dummy_value_a
; // fix 'forbidden register spilled' error
625 _unmask
= ~mask
; // global variable for -fPIC version
626 srcptr
= png_ptr
->row_buf
+ 1;
628 len
= png_ptr
->width
&~7; // reduce to multiple of 8
629 diff
= (int) (png_ptr
->width
& 7); // amount lost
631 __asm__
__volatile__ (
632 "movd _unmask, %%mm7 \n\t" // load bit pattern
633 "psubb %%mm6, %%mm6 \n\t" // zero mm6
634 "punpcklbw %%mm7, %%mm7 \n\t"
635 "punpcklwd %%mm7, %%mm7 \n\t"
636 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
638 "movq _mask8_0, %%mm0 \n\t"
639 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
640 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
642 // preload "movl len, %%ecx \n\t" // load length of line
643 // preload "movl srcptr, %%esi \n\t" // load source
644 // preload "movl dstptr, %%edi \n\t" // load dest
646 "cmpl $0, %%ecx \n\t" // len == 0 ?
647 "je mainloop8end \n\t"
650 "movq (%%esi), %%mm4 \n\t" // *srcptr
651 "pand %%mm0, %%mm4 \n\t"
652 "movq %%mm0, %%mm6 \n\t"
653 "pandn (%%edi), %%mm6 \n\t" // *dstptr
654 "por %%mm6, %%mm4 \n\t"
655 "movq %%mm4, (%%edi) \n\t"
656 "addl $8, %%esi \n\t" // inc by 8 bytes processed
657 "addl $8, %%edi \n\t"
658 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
662 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
663 "movl %%eax, %%ecx \n\t"
664 "cmpl $0, %%ecx \n\t"
666 // preload "movl mask, %%edx \n\t"
667 "sall $24, %%edx \n\t" // make low byte, high byte
670 "sall %%edx \n\t" // move high bit to CF
671 "jnc skip8 \n\t" // if CF = 0
672 "movb (%%esi), %%al \n\t"
673 "movb %%al, (%%edi) \n\t"
679 "jnz secondloop8 \n\t"
684 : "=a" (dummy_value_a
), // output regs (dummy)
685 "=d" (dummy_value_d
),
686 "=c" (dummy_value_c
),
687 "=S" (dummy_value_S
),
690 : "3" (srcptr
), // esi // input regs
693 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
697 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
698 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
702 else /* mmx _not supported - Use modified C routine */
703 #endif /* PNG_MMX_CODE_SUPPORTED */
705 register png_uint_32 i
;
706 png_uint_32 initial_val
= png_pass_start
[png_ptr
->pass
];
707 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
708 register int stride
= png_pass_inc
[png_ptr
->pass
];
709 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
710 register int rep_bytes
= png_pass_width
[png_ptr
->pass
];
711 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
712 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
713 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
714 register png_uint_32 final_val
= len
; /* GRR bugfix */
716 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
717 dstptr
= row
+ initial_val
;
719 for (i
= initial_val
; i
< final_val
; i
+= stride
)
721 png_memcpy(dstptr
, srcptr
, rep_bytes
);
725 if (diff
) /* number of leftover pixels: 3 for pngtest */
727 final_val
+=diff
/* *BPP1 */ ;
728 for (; i
< final_val
; i
+= stride
)
730 if (rep_bytes
> (int)(final_val
-i
))
731 rep_bytes
= (int)(final_val
-i
);
732 png_memcpy(dstptr
, srcptr
, rep_bytes
);
738 } /* end of else (_mmx_supported) */
743 case 16: /* png_ptr->row_info.pixel_depth */
748 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
749 #if !defined(PNG_1_0_X)
750 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
751 /* && _mmx_supported */ )
758 int dummy_value_a
; // fix 'forbidden register spilled' error
763 _unmask
= ~mask
; // global variable for -fPIC version
764 srcptr
= png_ptr
->row_buf
+ 1;
766 len
= png_ptr
->width
&~7; // reduce to multiple of 8
767 diff
= (int) (png_ptr
->width
& 7); // amount lost //
769 __asm__
__volatile__ (
770 "movd _unmask, %%mm7 \n\t" // load bit pattern
771 "psubb %%mm6, %%mm6 \n\t" // zero mm6
772 "punpcklbw %%mm7, %%mm7 \n\t"
773 "punpcklwd %%mm7, %%mm7 \n\t"
774 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
776 "movq _mask16_0, %%mm0 \n\t"
777 "movq _mask16_1, %%mm1 \n\t"
779 "pand %%mm7, %%mm0 \n\t"
780 "pand %%mm7, %%mm1 \n\t"
782 "pcmpeqb %%mm6, %%mm0 \n\t"
783 "pcmpeqb %%mm6, %%mm1 \n\t"
785 // preload "movl len, %%ecx \n\t" // load length of line
786 // preload "movl srcptr, %%esi \n\t" // load source
787 // preload "movl dstptr, %%edi \n\t" // load dest
789 "cmpl $0, %%ecx \n\t"
790 "jz mainloop16end \n\t"
793 "movq (%%esi), %%mm4 \n\t"
794 "pand %%mm0, %%mm4 \n\t"
795 "movq %%mm0, %%mm6 \n\t"
796 "movq (%%edi), %%mm7 \n\t"
797 "pandn %%mm7, %%mm6 \n\t"
798 "por %%mm6, %%mm4 \n\t"
799 "movq %%mm4, (%%edi) \n\t"
801 "movq 8(%%esi), %%mm5 \n\t"
802 "pand %%mm1, %%mm5 \n\t"
803 "movq %%mm1, %%mm7 \n\t"
804 "movq 8(%%edi), %%mm6 \n\t"
805 "pandn %%mm6, %%mm7 \n\t"
806 "por %%mm7, %%mm5 \n\t"
807 "movq %%mm5, 8(%%edi) \n\t"
809 "addl $16, %%esi \n\t" // inc by 16 bytes processed
810 "addl $16, %%edi \n\t"
811 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
814 "mainloop16end: \n\t"
815 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
816 "movl %%eax, %%ecx \n\t"
817 "cmpl $0, %%ecx \n\t"
819 // preload "movl mask, %%edx \n\t"
820 "sall $24, %%edx \n\t" // make low byte, high byte
823 "sall %%edx \n\t" // move high bit to CF
824 "jnc skip16 \n\t" // if CF = 0
825 "movw (%%esi), %%ax \n\t"
826 "movw %%ax, (%%edi) \n\t"
829 "addl $2, %%esi \n\t"
830 "addl $2, %%edi \n\t"
832 "jnz secondloop16 \n\t"
837 : "=a" (dummy_value_a
), // output regs (dummy)
838 "=c" (dummy_value_c
),
839 "=d" (dummy_value_d
),
840 "=S" (dummy_value_S
),
843 : "0" (diff
), // eax // input regs
844 // was (unmask) " " RESERVED // ebx // Global Offset Table idx
850 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
851 : "%mm0", "%mm1", "%mm4" // clobber list
852 , "%mm5", "%mm6", "%mm7"
856 else /* mmx _not supported - Use modified C routine */
857 #endif /* PNG_MMX_CODE_SUPPORTED */
859 register png_uint_32 i
;
860 png_uint_32 initial_val
= BPP2
* png_pass_start
[png_ptr
->pass
];
861 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
862 register int stride
= BPP2
* png_pass_inc
[png_ptr
->pass
];
863 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
864 register int rep_bytes
= BPP2
* png_pass_width
[png_ptr
->pass
];
865 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
866 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
867 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
868 register png_uint_32 final_val
= BPP2
* len
; /* GRR bugfix */
870 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
871 dstptr
= row
+ initial_val
;
873 for (i
= initial_val
; i
< final_val
; i
+= stride
)
875 png_memcpy(dstptr
, srcptr
, rep_bytes
);
879 if (diff
) /* number of leftover pixels: 3 for pngtest */
881 final_val
+=diff
*BPP2
;
882 for (; i
< final_val
; i
+= stride
)
884 if (rep_bytes
> (int)(final_val
-i
))
885 rep_bytes
= (int)(final_val
-i
);
886 png_memcpy(dstptr
, srcptr
, rep_bytes
);
891 } /* end of else (_mmx_supported) */
896 case 24: /* png_ptr->row_info.pixel_depth */
901 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
902 #if !defined(PNG_1_0_X)
903 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
904 /* && _mmx_supported */ )
911 int dummy_value_a
; // fix 'forbidden register spilled' error
916 _unmask
= ~mask
; // global variable for -fPIC version
917 srcptr
= png_ptr
->row_buf
+ 1;
919 len
= png_ptr
->width
&~7; // reduce to multiple of 8
920 diff
= (int) (png_ptr
->width
& 7); // amount lost //
922 __asm__
__volatile__ (
923 "movd _unmask, %%mm7 \n\t" // load bit pattern
924 "psubb %%mm6, %%mm6 \n\t" // zero mm6
925 "punpcklbw %%mm7, %%mm7 \n\t"
926 "punpcklwd %%mm7, %%mm7 \n\t"
927 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
929 "movq _mask24_0, %%mm0 \n\t"
930 "movq _mask24_1, %%mm1 \n\t"
931 "movq _mask24_2, %%mm2 \n\t"
933 "pand %%mm7, %%mm0 \n\t"
934 "pand %%mm7, %%mm1 \n\t"
935 "pand %%mm7, %%mm2 \n\t"
937 "pcmpeqb %%mm6, %%mm0 \n\t"
938 "pcmpeqb %%mm6, %%mm1 \n\t"
939 "pcmpeqb %%mm6, %%mm2 \n\t"
941 // preload "movl len, %%ecx \n\t" // load length of line
942 // preload "movl srcptr, %%esi \n\t" // load source
943 // preload "movl dstptr, %%edi \n\t" // load dest
945 "cmpl $0, %%ecx \n\t"
946 "jz mainloop24end \n\t"
949 "movq (%%esi), %%mm4 \n\t"
950 "pand %%mm0, %%mm4 \n\t"
951 "movq %%mm0, %%mm6 \n\t"
952 "movq (%%edi), %%mm7 \n\t"
953 "pandn %%mm7, %%mm6 \n\t"
954 "por %%mm6, %%mm4 \n\t"
955 "movq %%mm4, (%%edi) \n\t"
957 "movq 8(%%esi), %%mm5 \n\t"
958 "pand %%mm1, %%mm5 \n\t"
959 "movq %%mm1, %%mm7 \n\t"
960 "movq 8(%%edi), %%mm6 \n\t"
961 "pandn %%mm6, %%mm7 \n\t"
962 "por %%mm7, %%mm5 \n\t"
963 "movq %%mm5, 8(%%edi) \n\t"
965 "movq 16(%%esi), %%mm6 \n\t"
966 "pand %%mm2, %%mm6 \n\t"
967 "movq %%mm2, %%mm4 \n\t"
968 "movq 16(%%edi), %%mm7 \n\t"
969 "pandn %%mm7, %%mm4 \n\t"
970 "por %%mm4, %%mm6 \n\t"
971 "movq %%mm6, 16(%%edi) \n\t"
973 "addl $24, %%esi \n\t" // inc by 24 bytes processed
974 "addl $24, %%edi \n\t"
975 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
979 "mainloop24end: \n\t"
980 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
981 "movl %%eax, %%ecx \n\t"
982 "cmpl $0, %%ecx \n\t"
984 // preload "movl mask, %%edx \n\t"
985 "sall $24, %%edx \n\t" // make low byte, high byte
988 "sall %%edx \n\t" // move high bit to CF
989 "jnc skip24 \n\t" // if CF = 0
990 "movw (%%esi), %%ax \n\t"
991 "movw %%ax, (%%edi) \n\t"
992 "xorl %%eax, %%eax \n\t"
993 "movb 2(%%esi), %%al \n\t"
994 "movb %%al, 2(%%edi) \n\t"
997 "addl $3, %%esi \n\t"
998 "addl $3, %%edi \n\t"
1000 "jnz secondloop24 \n\t"
1005 : "=a" (dummy_value_a
), // output regs (dummy)
1006 "=d" (dummy_value_d
),
1007 "=c" (dummy_value_c
),
1008 "=S" (dummy_value_S
),
1009 "=D" (dummy_value_D
)
1011 : "3" (srcptr
), // esi // input regs
1012 "4" (dstptr
), // edi
1014 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1018 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1019 : "%mm0", "%mm1", "%mm2" // clobber list
1020 , "%mm4", "%mm5", "%mm6", "%mm7"
1024 else /* mmx _not supported - Use modified C routine */
1025 #endif /* PNG_MMX_CODE_SUPPORTED */
1027 register png_uint_32 i
;
1028 png_uint_32 initial_val
= BPP3
* png_pass_start
[png_ptr
->pass
];
1029 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1030 register int stride
= BPP3
* png_pass_inc
[png_ptr
->pass
];
1031 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1032 register int rep_bytes
= BPP3
* png_pass_width
[png_ptr
->pass
];
1033 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1034 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1035 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1036 register png_uint_32 final_val
= BPP3
* len
; /* GRR bugfix */
1038 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1039 dstptr
= row
+ initial_val
;
1041 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1043 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1047 if (diff
) /* number of leftover pixels: 3 for pngtest */
1049 final_val
+=diff
*BPP3
;
1050 for (; i
< final_val
; i
+= stride
)
1052 if (rep_bytes
> (int)(final_val
-i
))
1053 rep_bytes
= (int)(final_val
-i
);
1054 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1059 } /* end of else (_mmx_supported) */
1064 case 32: /* png_ptr->row_info.pixel_depth */
1069 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1070 #if !defined(PNG_1_0_X)
1071 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
1072 /* && _mmx_supported */ )
1079 int dummy_value_a
; // fix 'forbidden register spilled' error
1084 _unmask
= ~mask
; // global variable for -fPIC version
1085 srcptr
= png_ptr
->row_buf
+ 1;
1087 len
= png_ptr
->width
&~7; // reduce to multiple of 8
1088 diff
= (int) (png_ptr
->width
& 7); // amount lost //
1090 __asm__
__volatile__ (
1091 "movd _unmask, %%mm7 \n\t" // load bit pattern
1092 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1093 "punpcklbw %%mm7, %%mm7 \n\t"
1094 "punpcklwd %%mm7, %%mm7 \n\t"
1095 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1097 "movq _mask32_0, %%mm0 \n\t"
1098 "movq _mask32_1, %%mm1 \n\t"
1099 "movq _mask32_2, %%mm2 \n\t"
1100 "movq _mask32_3, %%mm3 \n\t"
1102 "pand %%mm7, %%mm0 \n\t"
1103 "pand %%mm7, %%mm1 \n\t"
1104 "pand %%mm7, %%mm2 \n\t"
1105 "pand %%mm7, %%mm3 \n\t"
1107 "pcmpeqb %%mm6, %%mm0 \n\t"
1108 "pcmpeqb %%mm6, %%mm1 \n\t"
1109 "pcmpeqb %%mm6, %%mm2 \n\t"
1110 "pcmpeqb %%mm6, %%mm3 \n\t"
1112 // preload "movl len, %%ecx \n\t" // load length of line
1113 // preload "movl srcptr, %%esi \n\t" // load source
1114 // preload "movl dstptr, %%edi \n\t" // load dest
1116 "cmpl $0, %%ecx \n\t" // lcr
1117 "jz mainloop32end \n\t"
1120 "movq (%%esi), %%mm4 \n\t"
1121 "pand %%mm0, %%mm4 \n\t"
1122 "movq %%mm0, %%mm6 \n\t"
1123 "movq (%%edi), %%mm7 \n\t"
1124 "pandn %%mm7, %%mm6 \n\t"
1125 "por %%mm6, %%mm4 \n\t"
1126 "movq %%mm4, (%%edi) \n\t"
1128 "movq 8(%%esi), %%mm5 \n\t"
1129 "pand %%mm1, %%mm5 \n\t"
1130 "movq %%mm1, %%mm7 \n\t"
1131 "movq 8(%%edi), %%mm6 \n\t"
1132 "pandn %%mm6, %%mm7 \n\t"
1133 "por %%mm7, %%mm5 \n\t"
1134 "movq %%mm5, 8(%%edi) \n\t"
1136 "movq 16(%%esi), %%mm6 \n\t"
1137 "pand %%mm2, %%mm6 \n\t"
1138 "movq %%mm2, %%mm4 \n\t"
1139 "movq 16(%%edi), %%mm7 \n\t"
1140 "pandn %%mm7, %%mm4 \n\t"
1141 "por %%mm4, %%mm6 \n\t"
1142 "movq %%mm6, 16(%%edi) \n\t"
1144 "movq 24(%%esi), %%mm7 \n\t"
1145 "pand %%mm3, %%mm7 \n\t"
1146 "movq %%mm3, %%mm5 \n\t"
1147 "movq 24(%%edi), %%mm4 \n\t"
1148 "pandn %%mm4, %%mm5 \n\t"
1149 "por %%mm5, %%mm7 \n\t"
1150 "movq %%mm7, 24(%%edi) \n\t"
1152 "addl $32, %%esi \n\t" // inc by 32 bytes processed
1153 "addl $32, %%edi \n\t"
1154 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1155 "ja mainloop32 \n\t"
1157 "mainloop32end: \n\t"
1158 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1159 "movl %%eax, %%ecx \n\t"
1160 "cmpl $0, %%ecx \n\t"
1162 // preload "movl mask, %%edx \n\t"
1163 "sall $24, %%edx \n\t" // low byte => high byte
1165 "secondloop32: \n\t"
1166 "sall %%edx \n\t" // move high bit to CF
1167 "jnc skip32 \n\t" // if CF = 0
1168 "movl (%%esi), %%eax \n\t"
1169 "movl %%eax, (%%edi) \n\t"
1172 "addl $4, %%esi \n\t"
1173 "addl $4, %%edi \n\t"
1175 "jnz secondloop32 \n\t"
1180 : "=a" (dummy_value_a
), // output regs (dummy)
1181 "=d" (dummy_value_d
),
1182 "=c" (dummy_value_c
),
1183 "=S" (dummy_value_S
),
1184 "=D" (dummy_value_D
)
1186 : "3" (srcptr
), // esi // input regs
1187 "4" (dstptr
), // edi
1189 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1193 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1194 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1195 , "%mm4", "%mm5", "%mm6", "%mm7"
1199 else /* mmx _not supported - Use modified C routine */
1200 #endif /* PNG_MMX_CODE_SUPPORTED */
1202 register png_uint_32 i
;
1203 png_uint_32 initial_val
= BPP4
* png_pass_start
[png_ptr
->pass
];
1204 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1205 register int stride
= BPP4
* png_pass_inc
[png_ptr
->pass
];
1206 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1207 register int rep_bytes
= BPP4
* png_pass_width
[png_ptr
->pass
];
1208 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1209 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1210 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1211 register png_uint_32 final_val
= BPP4
* len
; /* GRR bugfix */
1213 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1214 dstptr
= row
+ initial_val
;
1216 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1218 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1222 if (diff
) /* number of leftover pixels: 3 for pngtest */
1224 final_val
+=diff
*BPP4
;
1225 for (; i
< final_val
; i
+= stride
)
1227 if (rep_bytes
> (int)(final_val
-i
))
1228 rep_bytes
= (int)(final_val
-i
);
1229 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1234 } /* end of else (_mmx_supported) */
1239 case 48: /* png_ptr->row_info.pixel_depth */
1244 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1245 #if !defined(PNG_1_0_X)
1246 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
1247 /* && _mmx_supported */ )
1254 int dummy_value_a
; // fix 'forbidden register spilled' error
1259 _unmask
= ~mask
; // global variable for -fPIC version
1260 srcptr
= png_ptr
->row_buf
+ 1;
1262 len
= png_ptr
->width
&~7; // reduce to multiple of 8
1263 diff
= (int) (png_ptr
->width
& 7); // amount lost //
1265 __asm__
__volatile__ (
1266 "movd _unmask, %%mm7 \n\t" // load bit pattern
1267 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1268 "punpcklbw %%mm7, %%mm7 \n\t"
1269 "punpcklwd %%mm7, %%mm7 \n\t"
1270 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1272 "movq _mask48_0, %%mm0 \n\t"
1273 "movq _mask48_1, %%mm1 \n\t"
1274 "movq _mask48_2, %%mm2 \n\t"
1275 "movq _mask48_3, %%mm3 \n\t"
1276 "movq _mask48_4, %%mm4 \n\t"
1277 "movq _mask48_5, %%mm5 \n\t"
1279 "pand %%mm7, %%mm0 \n\t"
1280 "pand %%mm7, %%mm1 \n\t"
1281 "pand %%mm7, %%mm2 \n\t"
1282 "pand %%mm7, %%mm3 \n\t"
1283 "pand %%mm7, %%mm4 \n\t"
1284 "pand %%mm7, %%mm5 \n\t"
1286 "pcmpeqb %%mm6, %%mm0 \n\t"
1287 "pcmpeqb %%mm6, %%mm1 \n\t"
1288 "pcmpeqb %%mm6, %%mm2 \n\t"
1289 "pcmpeqb %%mm6, %%mm3 \n\t"
1290 "pcmpeqb %%mm6, %%mm4 \n\t"
1291 "pcmpeqb %%mm6, %%mm5 \n\t"
1293 // preload "movl len, %%ecx \n\t" // load length of line
1294 // preload "movl srcptr, %%esi \n\t" // load source
1295 // preload "movl dstptr, %%edi \n\t" // load dest
1297 "cmpl $0, %%ecx \n\t"
1298 "jz mainloop48end \n\t"
1301 "movq (%%esi), %%mm7 \n\t"
1302 "pand %%mm0, %%mm7 \n\t"
1303 "movq %%mm0, %%mm6 \n\t"
1304 "pandn (%%edi), %%mm6 \n\t"
1305 "por %%mm6, %%mm7 \n\t"
1306 "movq %%mm7, (%%edi) \n\t"
1308 "movq 8(%%esi), %%mm6 \n\t"
1309 "pand %%mm1, %%mm6 \n\t"
1310 "movq %%mm1, %%mm7 \n\t"
1311 "pandn 8(%%edi), %%mm7 \n\t"
1312 "por %%mm7, %%mm6 \n\t"
1313 "movq %%mm6, 8(%%edi) \n\t"
1315 "movq 16(%%esi), %%mm6 \n\t"
1316 "pand %%mm2, %%mm6 \n\t"
1317 "movq %%mm2, %%mm7 \n\t"
1318 "pandn 16(%%edi), %%mm7 \n\t"
1319 "por %%mm7, %%mm6 \n\t"
1320 "movq %%mm6, 16(%%edi) \n\t"
1322 "movq 24(%%esi), %%mm7 \n\t"
1323 "pand %%mm3, %%mm7 \n\t"
1324 "movq %%mm3, %%mm6 \n\t"
1325 "pandn 24(%%edi), %%mm6 \n\t"
1326 "por %%mm6, %%mm7 \n\t"
1327 "movq %%mm7, 24(%%edi) \n\t"
1329 "movq 32(%%esi), %%mm6 \n\t"
1330 "pand %%mm4, %%mm6 \n\t"
1331 "movq %%mm4, %%mm7 \n\t"
1332 "pandn 32(%%edi), %%mm7 \n\t"
1333 "por %%mm7, %%mm6 \n\t"
1334 "movq %%mm6, 32(%%edi) \n\t"
1336 "movq 40(%%esi), %%mm7 \n\t"
1337 "pand %%mm5, %%mm7 \n\t"
1338 "movq %%mm5, %%mm6 \n\t"
1339 "pandn 40(%%edi), %%mm6 \n\t"
1340 "por %%mm6, %%mm7 \n\t"
1341 "movq %%mm7, 40(%%edi) \n\t"
1343 "addl $48, %%esi \n\t" // inc by 48 bytes processed
1344 "addl $48, %%edi \n\t"
1345 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1347 "ja mainloop48 \n\t"
1349 "mainloop48end: \n\t"
1350 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1351 "movl %%eax, %%ecx \n\t"
1352 "cmpl $0, %%ecx \n\t"
1354 // preload "movl mask, %%edx \n\t"
1355 "sall $24, %%edx \n\t" // make low byte, high byte
1357 "secondloop48: \n\t"
1358 "sall %%edx \n\t" // move high bit to CF
1359 "jnc skip48 \n\t" // if CF = 0
1360 "movl (%%esi), %%eax \n\t"
1361 "movl %%eax, (%%edi) \n\t"
1364 "addl $4, %%esi \n\t"
1365 "addl $4, %%edi \n\t"
1367 "jnz secondloop48 \n\t"
1372 : "=a" (dummy_value_a
), // output regs (dummy)
1373 "=d" (dummy_value_d
),
1374 "=c" (dummy_value_c
),
1375 "=S" (dummy_value_S
),
1376 "=D" (dummy_value_D
)
1378 : "3" (srcptr
), // esi // input regs
1379 "4" (dstptr
), // edi
1381 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1385 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1386 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1387 , "%mm4", "%mm5", "%mm6", "%mm7"
1391 else /* mmx _not supported - Use modified C routine */
1392 #endif /* PNG_MMX_CODE_SUPPORTED */
1394 register png_uint_32 i
;
1395 png_uint_32 initial_val
= BPP6
* png_pass_start
[png_ptr
->pass
];
1396 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1397 register int stride
= BPP6
* png_pass_inc
[png_ptr
->pass
];
1398 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1399 register int rep_bytes
= BPP6
* png_pass_width
[png_ptr
->pass
];
1400 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1401 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1402 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1403 register png_uint_32 final_val
= BPP6
* len
; /* GRR bugfix */
1405 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1406 dstptr
= row
+ initial_val
;
1408 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1410 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1414 if (diff
) /* number of leftover pixels: 3 for pngtest */
1416 final_val
+=diff
*BPP6
;
1417 for (; i
< final_val
; i
+= stride
)
1419 if (rep_bytes
> (int)(final_val
-i
))
1420 rep_bytes
= (int)(final_val
-i
);
1421 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1426 } /* end of else (_mmx_supported) */
1431 case 64: /* png_ptr->row_info.pixel_depth */
1435 register png_uint_32 i
;
1436 png_uint_32 initial_val
= BPP8
* png_pass_start
[png_ptr
->pass
];
1437 /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1438 register int stride
= BPP8
* png_pass_inc
[png_ptr
->pass
];
1439 /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1440 register int rep_bytes
= BPP8
* png_pass_width
[png_ptr
->pass
];
1441 /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1442 png_uint_32 len
= png_ptr
->width
&~7; /* reduce to mult. of 8 */
1443 int diff
= (int) (png_ptr
->width
& 7); /* amount lost */
1444 register png_uint_32 final_val
= BPP8
* len
; /* GRR bugfix */
1446 srcptr
= png_ptr
->row_buf
+ 1 + initial_val
;
1447 dstptr
= row
+ initial_val
;
1449 for (i
= initial_val
; i
< final_val
; i
+= stride
)
1451 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1455 if (diff
) /* number of leftover pixels: 3 for pngtest */
1457 final_val
+=diff
*BPP8
;
1458 for (; i
< final_val
; i
+= stride
)
1460 if (rep_bytes
> (int)(final_val
-i
))
1461 rep_bytes
= (int)(final_val
-i
);
1462 png_memcpy(dstptr
, srcptr
, rep_bytes
);
1471 default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1473 /* this should never happen */
1474 png_warning(png_ptr
, "Invalid row_info.pixel_depth in pnggccrd");
1477 } /* end switch (png_ptr->row_info.pixel_depth) */
1479 } /* end if (non-trivial mask) */
1481 } /* end png_combine_row() */
1483 #endif /* PNG_HAVE_MMX_COMBINE_ROW */
1488 /*===========================================================================*/
1490 /* P N G _ D O _ R E A D _ I N T E R L A C E */
1492 /*===========================================================================*/
1494 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1495 #if defined(PNG_HAVE_MMX_READ_INTERLACE)
1497 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1498 * has taken place. [GRR: what other steps come before and/or after?]
1502 png_do_read_interlace(png_structp png_ptr
)
1504 png_row_infop row_info
= &(png_ptr
->row_info
);
1505 png_bytep row
= png_ptr
->row_buf
+ 1;
1506 int pass
= png_ptr
->pass
;
1507 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1508 png_uint_32 transformations
= png_ptr
->transformations
;
1511 png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1513 #if defined(PNG_MMX_CODE_SUPPORTED)
1514 if (_mmx_supported
== 2) {
1515 #if !defined(PNG_1_0_X)
1516 /* this should have happened in png_init_mmx_flags() already */
1517 png_warning(png_ptr
, "asm_flags may not have been initialized");
1523 if (row
!= NULL
&& row_info
!= NULL
)
1525 png_uint_32 final_width
;
1527 final_width
= row_info
->width
* png_pass_inc
[pass
];
1529 switch (row_info
->pixel_depth
)
1535 int s_start
, s_end
, s_inc
;
1540 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 3);
1541 dp
= row
+ (png_size_t
)((final_width
- 1) >> 3);
1542 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1543 if (transformations
& PNG_PACKSWAP
)
1545 sshift
= (int)((row_info
->width
+ 7) & 7);
1546 dshift
= (int)((final_width
+ 7) & 7);
1554 sshift
= 7 - (int)((row_info
->width
+ 7) & 7);
1555 dshift
= 7 - (int)((final_width
+ 7) & 7);
1561 for (i
= row_info
->width
; i
; i
--)
1563 v
= (png_byte
)((*sp
>> sshift
) & 0x1);
1564 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1566 *dp
&= (png_byte
)((0x7f7f >> (7 - dshift
)) & 0xff);
1567 *dp
|= (png_byte
)(v
<< dshift
);
1568 if (dshift
== s_end
)
1576 if (sshift
== s_end
)
1591 int s_start
, s_end
, s_inc
;
1594 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 2);
1595 dp
= row
+ (png_size_t
)((final_width
- 1) >> 2);
1596 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1597 if (transformations
& PNG_PACKSWAP
)
1599 sshift
= (png_size_t
)(((row_info
->width
+ 3) & 3) << 1);
1600 dshift
= (png_size_t
)(((final_width
+ 3) & 3) << 1);
1608 sshift
= (png_size_t
)((3 - ((row_info
->width
+ 3) & 3)) << 1);
1609 dshift
= (png_size_t
)((3 - ((final_width
+ 3) & 3)) << 1);
1615 for (i
= row_info
->width
; i
; i
--)
1620 v
= (png_byte
)((*sp
>> sshift
) & 0x3);
1621 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1623 *dp
&= (png_byte
)((0x3f3f >> (6 - dshift
)) & 0xff);
1624 *dp
|= (png_byte
)(v
<< dshift
);
1625 if (dshift
== s_end
)
1633 if (sshift
== s_end
)
1648 int s_start
, s_end
, s_inc
;
1651 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 1);
1652 dp
= row
+ (png_size_t
)((final_width
- 1) >> 1);
1653 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1654 if (transformations
& PNG_PACKSWAP
)
1656 sshift
= (png_size_t
)(((row_info
->width
+ 1) & 1) << 2);
1657 dshift
= (png_size_t
)(((final_width
+ 1) & 1) << 2);
1665 sshift
= (png_size_t
)((1 - ((row_info
->width
+ 1) & 1)) << 2);
1666 dshift
= (png_size_t
)((1 - ((final_width
+ 1) & 1)) << 2);
1672 for (i
= row_info
->width
; i
; i
--)
1677 v
= (png_byte
)((*sp
>> sshift
) & 0xf);
1678 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1680 *dp
&= (png_byte
)((0xf0f >> (4 - dshift
)) & 0xff);
1681 *dp
|= (png_byte
)(v
<< dshift
);
1682 if (dshift
== s_end
)
1690 if (sshift
== s_end
)
1701 /*====================================================================*/
1703 default: /* 8-bit or larger (this is where the routine is modified) */
1706 // static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1707 // static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1708 // unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1709 // unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1713 png_size_t pixel_bytes
;
1714 int width
= (int)row_info
->width
;
1716 pixel_bytes
= (row_info
->pixel_depth
>> 3);
1718 /* point sptr at the last pixel in the pre-expanded row: */
1719 sptr
= row
+ (width
- 1) * pixel_bytes
;
1721 /* point dp at the last pixel position in the expanded row: */
1722 dp
= row
+ (final_width
- 1) * pixel_bytes
;
1724 /* New code by Nirav Chhatrapati - Intel Corporation */
1726 #if defined(PNG_MMX_CODE_SUPPORTED)
1727 #if !defined(PNG_1_0_X)
1728 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_INTERLACE
)
1729 /* && _mmx_supported */ )
1734 //--------------------------------------------------------------
1735 if (pixel_bytes
== 3)
1737 if (((pass
== 0) || (pass
== 1)) && width
)
1739 int dummy_value_c
; // fix 'forbidden register spilled'
1744 __asm__
__volatile__ (
1745 "subl $21, %%edi \n\t"
1746 // (png_pass_inc[pass] - 1)*pixel_bytes
1748 ".loop3_pass0: \n\t"
1749 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1750 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1751 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1752 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1753 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1754 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1755 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1756 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1757 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1758 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1759 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1760 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1761 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1762 "movq %%mm4, 16(%%edi) \n\t"
1763 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1764 "movq %%mm3, 8(%%edi) \n\t"
1765 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1766 "subl $3, %%esi \n\t"
1767 "movq %%mm0, (%%edi) \n\t"
1768 "subl $24, %%edi \n\t"
1770 "jnz .loop3_pass0 \n\t"
1773 : "=c" (dummy_value_c
), // output regs (dummy)
1774 "=S" (dummy_value_S
),
1775 "=D" (dummy_value_D
),
1776 "=a" (dummy_value_a
)
1779 : "1" (sptr
), // esi // input regs
1782 "3" (&_const4
) // %1(?) (0x0000000000FFFFFFLL)
1784 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1785 : "%mm0", "%mm1", "%mm2" // clobber list
1790 else if (((pass
== 2) || (pass
== 3)) && width
)
1792 int dummy_value_c
; // fix 'forbidden register spilled'
1797 __asm__
__volatile__ (
1798 "subl $9, %%edi \n\t"
1799 // (png_pass_inc[pass] - 1)*pixel_bytes
1801 ".loop3_pass2: \n\t"
1802 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1803 "pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1804 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1805 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1806 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1807 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1808 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1809 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1810 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1811 "movq %%mm0, 4(%%edi) \n\t"
1812 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1813 "subl $3, %%esi \n\t"
1814 "movd %%mm0, (%%edi) \n\t"
1815 "subl $12, %%edi \n\t"
1817 "jnz .loop3_pass2 \n\t"
1820 : "=c" (dummy_value_c
), // output regs (dummy)
1821 "=S" (dummy_value_S
),
1822 "=D" (dummy_value_D
),
1823 "=a" (dummy_value_a
)
1825 : "1" (sptr
), // esi // input regs
1828 "3" (&_const4
) // (0x0000000000FFFFFFLL)
1830 #if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1831 : "%mm0", "%mm1", "%mm2" // clobber list
1835 else if (width
) /* && ((pass == 4) || (pass == 5)) */
1837 int width_mmx
= ((width
>> 1) << 1) - 8; // GRR: huh?
1840 width
-= width_mmx
; // 8 or 9 pix, 24 or 27 bytes
1843 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1844 // sptr points at last pixel in pre-expanded row
1845 // dp points at last pixel position in expanded row
1846 int dummy_value_c
; // fix 'forbidden register spilled'
1852 __asm__
__volatile__ (
1853 "subl $3, %%esi \n\t"
1854 "subl $9, %%edi \n\t"
1855 // (png_pass_inc[pass] + 1)*pixel_bytes
1857 ".loop3_pass4: \n\t"
1858 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1859 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1860 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1861 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1862 "pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
1863 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1864 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1865 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1866 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1867 "movq %%mm0, (%%edi) \n\t"
1868 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1869 "pand (%4), %%mm3 \n\t" // z z z z z z z 5
1870 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1871 "subl $6, %%esi \n\t"
1872 "movd %%mm2, 8(%%edi) \n\t"
1873 "subl $12, %%edi \n\t"
1874 "subl $2, %%ecx \n\t"
1875 "jnz .loop3_pass4 \n\t"
1878 : "=c" (dummy_value_c
), // output regs (dummy)
1879 "=S" (dummy_value_S
),
1880 "=D" (dummy_value_D
),
1881 "=a" (dummy_value_a
),
1882 "=d" (dummy_value_d
)
1884 : "1" (sptr
), // esi // input regs
1886 "0" (width_mmx
), // ecx
1887 "3" (&_const4
), // 0x0000000000FFFFFFLL
1888 "4" (&_const6
) // 0x00000000000000FFLL
1890 #if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1891 : "%mm0", "%mm1" // clobber list
1897 sptr
-= width_mmx
*3;
1899 for (i
= width
; i
; i
--)
1904 png_memcpy(v
, sptr
, 3);
1905 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1907 png_memcpy(dp
, v
, 3);
1913 } /* end of pixel_bytes == 3 */
1915 //--------------------------------------------------------------
1916 else if (pixel_bytes
== 1)
1918 if (((pass
== 0) || (pass
== 1)) && width
)
1920 int width_mmx
= ((width
>> 2) << 2);
1921 width
-= width_mmx
; // 0-3 pixels => 0-3 bytes
1924 int dummy_value_c
; // fix 'forbidden register spilled'
1928 __asm__
__volatile__ (
1929 "subl $3, %%esi \n\t"
1930 "subl $31, %%edi \n\t"
1932 ".loop1_pass0: \n\t"
1933 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1934 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1935 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1936 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1937 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1938 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1939 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1940 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1941 "movq %%mm0, (%%edi) \n\t"
1942 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1943 "movq %%mm3, 8(%%edi) \n\t"
1944 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1945 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1946 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1947 "movq %%mm2, 16(%%edi) \n\t"
1948 "subl $4, %%esi \n\t"
1949 "movq %%mm4, 24(%%edi) \n\t"
1950 "subl $32, %%edi \n\t"
1951 "subl $4, %%ecx \n\t"
1952 "jnz .loop1_pass0 \n\t"
1955 : "=c" (dummy_value_c
), // output regs (dummy)
1956 "=S" (dummy_value_S
),
1957 "=D" (dummy_value_D
)
1959 : "1" (sptr
), // esi // input regs
1961 "0" (width_mmx
) // ecx
1963 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1964 : "%mm0", "%mm1", "%mm2" // clobber list
1972 for (i
= width
; i
; i
--)
1976 /* I simplified this part in version 1.0.4e
1977 * here and in several other instances where
1978 * pixel_bytes == 1 -- GR-P
1983 * png_memcpy(v, sptr, pixel_bytes);
1984 * for (j = 0; j < png_pass_inc[pass]; j++)
1986 * png_memcpy(dp, v, pixel_bytes);
1987 * dp -= pixel_bytes;
1989 * sptr -= pixel_bytes;
1991 * Replacement code is in the next three lines:
1994 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2001 else if (((pass
== 2) || (pass
== 3)) && width
)
2003 int width_mmx
= ((width
>> 2) << 2);
2004 width
-= width_mmx
; // 0-3 pixels => 0-3 bytes
2007 int dummy_value_c
; // fix 'forbidden register spilled'
2011 __asm__
__volatile__ (
2012 "subl $3, %%esi \n\t"
2013 "subl $15, %%edi \n\t"
2015 ".loop1_pass2: \n\t"
2016 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2017 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2018 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
2019 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
2020 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
2021 "movq %%mm0, (%%edi) \n\t"
2022 "subl $4, %%esi \n\t"
2023 "movq %%mm1, 8(%%edi) \n\t"
2024 "subl $16, %%edi \n\t"
2025 "subl $4, %%ecx \n\t"
2026 "jnz .loop1_pass2 \n\t"
2029 : "=c" (dummy_value_c
), // output regs (dummy)
2030 "=S" (dummy_value_S
),
2031 "=D" (dummy_value_D
)
2033 : "1" (sptr
), // esi // input regs
2035 "0" (width_mmx
) // ecx
2037 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2038 : "%mm0", "%mm1" // clobber list
2045 for (i
= width
; i
; i
--)
2049 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2056 else if (width
) /* && ((pass == 4) || (pass == 5)) */
2058 int width_mmx
= ((width
>> 3) << 3);
2059 width
-= width_mmx
; // 0-3 pixels => 0-3 bytes
2062 int dummy_value_c
; // fix 'forbidden register spilled'
2066 __asm__
__volatile__ (
2067 "subl $7, %%esi \n\t"
2068 "subl $15, %%edi \n\t"
2070 ".loop1_pass4: \n\t"
2071 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2072 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2073 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2074 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2075 "movq %%mm1, 8(%%edi) \n\t"
2076 "subl $8, %%esi \n\t"
2077 "movq %%mm0, (%%edi) \n\t"
2078 "subl $16, %%edi \n\t"
2079 "subl $8, %%ecx \n\t"
2080 "jnz .loop1_pass4 \n\t"
2083 : "=c" (dummy_value_c
), // output regs (none)
2084 "=S" (dummy_value_S
),
2085 "=D" (dummy_value_D
)
2087 : "1" (sptr
), // esi // input regs
2089 "0" (width_mmx
) // ecx
2091 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2092 : "%mm0", "%mm1" // clobber list
2099 for (i
= width
; i
; i
--)
2103 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2110 } /* end of pixel_bytes == 1 */
2112 //--------------------------------------------------------------
2113 else if (pixel_bytes
== 2)
2115 if (((pass
== 0) || (pass
== 1)) && width
)
2117 int width_mmx
= ((width
>> 1) << 1);
2118 width
-= width_mmx
; // 0,1 pixels => 0,2 bytes
2121 int dummy_value_c
; // fix 'forbidden register spilled'
2125 __asm__
__volatile__ (
2126 "subl $2, %%esi \n\t"
2127 "subl $30, %%edi \n\t"
2129 ".loop2_pass0: \n\t"
2130 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2131 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2132 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2133 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2134 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2135 "movq %%mm0, (%%edi) \n\t"
2136 "movq %%mm0, 8(%%edi) \n\t"
2137 "movq %%mm1, 16(%%edi) \n\t"
2138 "subl $4, %%esi \n\t"
2139 "movq %%mm1, 24(%%edi) \n\t"
2140 "subl $32, %%edi \n\t"
2141 "subl $2, %%ecx \n\t"
2142 "jnz .loop2_pass0 \n\t"
2145 : "=c" (dummy_value_c
), // output regs (dummy)
2146 "=S" (dummy_value_S
),
2147 "=D" (dummy_value_D
)
2149 : "1" (sptr
), // esi // input regs
2151 "0" (width_mmx
) // ecx
2153 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2154 : "%mm0", "%mm1" // clobber list
2159 sptr
-= (width_mmx
*2 - 2); // sign fixed
2160 dp
-= (width_mmx
*16 - 2); // sign fixed
2161 for (i
= width
; i
; i
--)
2166 png_memcpy(v
, sptr
, 2);
2167 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2170 png_memcpy(dp
, v
, 2);
2174 else if (((pass
== 2) || (pass
== 3)) && width
)
2176 int width_mmx
= ((width
>> 1) << 1) ;
2177 width
-= width_mmx
; // 0,1 pixels => 0,2 bytes
2180 int dummy_value_c
; // fix 'forbidden register spilled'
2184 __asm__
__volatile__ (
2185 "subl $2, %%esi \n\t"
2186 "subl $14, %%edi \n\t"
2188 ".loop2_pass2: \n\t"
2189 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2190 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2191 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2192 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2193 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2194 "movq %%mm0, (%%edi) \n\t"
2195 "subl $4, %%esi \n\t"
2196 "movq %%mm1, 8(%%edi) \n\t"
2197 "subl $16, %%edi \n\t"
2198 "subl $2, %%ecx \n\t"
2199 "jnz .loop2_pass2 \n\t"
2202 : "=c" (dummy_value_c
), // output regs (dummy)
2203 "=S" (dummy_value_S
),
2204 "=D" (dummy_value_D
)
2206 : "1" (sptr
), // esi // input regs
2208 "0" (width_mmx
) // ecx
2210 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2211 : "%mm0", "%mm1" // clobber list
2216 sptr
-= (width_mmx
*2 - 2); // sign fixed
2217 dp
-= (width_mmx
*8 - 2); // sign fixed
2218 for (i
= width
; i
; i
--)
2223 png_memcpy(v
, sptr
, 2);
2224 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2227 png_memcpy(dp
, v
, 2);
2231 else if (width
) // pass == 4 or 5
2233 int width_mmx
= ((width
>> 1) << 1) ;
2234 width
-= width_mmx
; // 0,1 pixels => 0,2 bytes
2237 int dummy_value_c
; // fix 'forbidden register spilled'
2241 __asm__
__volatile__ (
2242 "subl $2, %%esi \n\t"
2243 "subl $6, %%edi \n\t"
2245 ".loop2_pass4: \n\t"
2246 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2247 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2248 "subl $4, %%esi \n\t"
2249 "movq %%mm0, (%%edi) \n\t"
2250 "subl $8, %%edi \n\t"
2251 "subl $2, %%ecx \n\t"
2252 "jnz .loop2_pass4 \n\t"
2255 : "=c" (dummy_value_c
), // output regs (dummy)
2256 "=S" (dummy_value_S
),
2257 "=D" (dummy_value_D
)
2259 : "1" (sptr
), // esi // input regs
2261 "0" (width_mmx
) // ecx
2263 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2264 : "%mm0" // clobber list
2269 sptr
-= (width_mmx
*2 - 2); // sign fixed
2270 dp
-= (width_mmx
*4 - 2); // sign fixed
2271 for (i
= width
; i
; i
--)
2276 png_memcpy(v
, sptr
, 2);
2277 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2280 png_memcpy(dp
, v
, 2);
2284 } /* end of pixel_bytes == 2 */
2286 //--------------------------------------------------------------
2287 else if (pixel_bytes
== 4)
2289 if (((pass
== 0) || (pass
== 1)) && width
)
2291 int width_mmx
= ((width
>> 1) << 1);
2292 width
-= width_mmx
; // 0,1 pixels => 0,4 bytes
2295 int dummy_value_c
; // fix 'forbidden register spilled'
2299 __asm__
__volatile__ (
2300 "subl $4, %%esi \n\t"
2301 "subl $60, %%edi \n\t"
2303 ".loop4_pass0: \n\t"
2304 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2305 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2306 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2307 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2308 "movq %%mm0, (%%edi) \n\t"
2309 "movq %%mm0, 8(%%edi) \n\t"
2310 "movq %%mm0, 16(%%edi) \n\t"
2311 "movq %%mm0, 24(%%edi) \n\t"
2312 "movq %%mm1, 32(%%edi) \n\t"
2313 "movq %%mm1, 40(%%edi) \n\t"
2314 "movq %%mm1, 48(%%edi) \n\t"
2315 "subl $8, %%esi \n\t"
2316 "movq %%mm1, 56(%%edi) \n\t"
2317 "subl $64, %%edi \n\t"
2318 "subl $2, %%ecx \n\t"
2319 "jnz .loop4_pass0 \n\t"
2322 : "=c" (dummy_value_c
), // output regs (dummy)
2323 "=S" (dummy_value_S
),
2324 "=D" (dummy_value_D
)
2326 : "1" (sptr
), // esi // input regs
2328 "0" (width_mmx
) // ecx
2330 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2331 : "%mm0", "%mm1" // clobber list
2336 sptr
-= (width_mmx
*4 - 4); // sign fixed
2337 dp
-= (width_mmx
*32 - 4); // sign fixed
2338 for (i
= width
; i
; i
--)
2343 png_memcpy(v
, sptr
, 4);
2344 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2347 png_memcpy(dp
, v
, 4);
2351 else if (((pass
== 2) || (pass
== 3)) && width
)
2353 int width_mmx
= ((width
>> 1) << 1);
2354 width
-= width_mmx
; // 0,1 pixels => 0,4 bytes
2357 int dummy_value_c
; // fix 'forbidden register spilled'
2361 __asm__
__volatile__ (
2362 "subl $4, %%esi \n\t"
2363 "subl $28, %%edi \n\t"
2365 ".loop4_pass2: \n\t"
2366 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2367 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2368 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2369 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2370 "movq %%mm0, (%%edi) \n\t"
2371 "movq %%mm0, 8(%%edi) \n\t"
2372 "movq %%mm1, 16(%%edi) \n\t"
2373 "movq %%mm1, 24(%%edi) \n\t"
2374 "subl $8, %%esi \n\t"
2375 "subl $32, %%edi \n\t"
2376 "subl $2, %%ecx \n\t"
2377 "jnz .loop4_pass2 \n\t"
2380 : "=c" (dummy_value_c
), // output regs (dummy)
2381 "=S" (dummy_value_S
),
2382 "=D" (dummy_value_D
)
2384 : "1" (sptr
), // esi // input regs
2386 "0" (width_mmx
) // ecx
2388 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2389 : "%mm0", "%mm1" // clobber list
2394 sptr
-= (width_mmx
*4 - 4); // sign fixed
2395 dp
-= (width_mmx
*16 - 4); // sign fixed
2396 for (i
= width
; i
; i
--)
2401 png_memcpy(v
, sptr
, 4);
2402 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2405 png_memcpy(dp
, v
, 4);
2409 else if (width
) // pass == 4 or 5
2411 int width_mmx
= ((width
>> 1) << 1) ;
2412 width
-= width_mmx
; // 0,1 pixels => 0,4 bytes
2415 int dummy_value_c
; // fix 'forbidden register spilled'
2419 __asm__
__volatile__ (
2420 "subl $4, %%esi \n\t"
2421 "subl $12, %%edi \n\t"
2423 ".loop4_pass4: \n\t"
2424 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2425 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2426 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2427 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2428 "movq %%mm0, (%%edi) \n\t"
2429 "subl $8, %%esi \n\t"
2430 "movq %%mm1, 8(%%edi) \n\t"
2431 "subl $16, %%edi \n\t"
2432 "subl $2, %%ecx \n\t"
2433 "jnz .loop4_pass4 \n\t"
2436 : "=c" (dummy_value_c
), // output regs (dummy)
2437 "=S" (dummy_value_S
),
2438 "=D" (dummy_value_D
)
2440 : "1" (sptr
), // esi // input regs
2442 "0" (width_mmx
) // ecx
2444 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2445 : "%mm0", "%mm1" // clobber list
2450 sptr
-= (width_mmx
*4 - 4); // sign fixed
2451 dp
-= (width_mmx
*8 - 4); // sign fixed
2452 for (i
= width
; i
; i
--)
2457 png_memcpy(v
, sptr
, 4);
2458 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2461 png_memcpy(dp
, v
, 4);
2465 } /* end of pixel_bytes == 4 */
2467 //--------------------------------------------------------------
2468 else if (pixel_bytes
== 8)
2470 // GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2471 // GRR NOTE: no need to combine passes here!
2472 if (((pass
== 0) || (pass
== 1)) && width
)
2474 int dummy_value_c
; // fix 'forbidden register spilled'
2478 // source is 8-byte RRGGBBAA
2479 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2480 __asm__
__volatile__ (
2481 "subl $56, %%edi \n\t" // start of last block
2483 ".loop8_pass0: \n\t"
2484 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2485 "movq %%mm0, (%%edi) \n\t"
2486 "movq %%mm0, 8(%%edi) \n\t"
2487 "movq %%mm0, 16(%%edi) \n\t"
2488 "movq %%mm0, 24(%%edi) \n\t"
2489 "movq %%mm0, 32(%%edi) \n\t"
2490 "movq %%mm0, 40(%%edi) \n\t"
2491 "movq %%mm0, 48(%%edi) \n\t"
2492 "subl $8, %%esi \n\t"
2493 "movq %%mm0, 56(%%edi) \n\t"
2494 "subl $64, %%edi \n\t"
2496 "jnz .loop8_pass0 \n\t"
2499 : "=c" (dummy_value_c
), // output regs (dummy)
2500 "=S" (dummy_value_S
),
2501 "=D" (dummy_value_D
)
2503 : "1" (sptr
), // esi // input regs
2507 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2508 : "%mm0" // clobber list
2512 else if (((pass
== 2) || (pass
== 3)) && width
)
2514 // source is 8-byte RRGGBBAA
2515 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2516 // (recall that expansion is _in place_: sptr and dp
2517 // both point at locations within same row buffer)
2519 int dummy_value_c
; // fix 'forbidden register spilled'
2523 __asm__
__volatile__ (
2524 "subl $24, %%edi \n\t" // start of last block
2526 ".loop8_pass2: \n\t"
2527 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2528 "movq %%mm0, (%%edi) \n\t"
2529 "movq %%mm0, 8(%%edi) \n\t"
2530 "movq %%mm0, 16(%%edi) \n\t"
2531 "subl $8, %%esi \n\t"
2532 "movq %%mm0, 24(%%edi) \n\t"
2533 "subl $32, %%edi \n\t"
2535 "jnz .loop8_pass2 \n\t"
2538 : "=c" (dummy_value_c
), // output regs (dummy)
2539 "=S" (dummy_value_S
),
2540 "=D" (dummy_value_D
)
2542 : "1" (sptr
), // esi // input regs
2546 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2547 : "%mm0" // clobber list
2552 else if (width
) // pass == 4 or 5
2554 // source is 8-byte RRGGBBAA
2555 // dest is 16-byte RRGGBBAA RRGGBBAA
2557 int dummy_value_c
; // fix 'forbidden register spilled'
2561 __asm__
__volatile__ (
2562 "subl $8, %%edi \n\t" // start of last block
2564 ".loop8_pass4: \n\t"
2565 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2566 "movq %%mm0, (%%edi) \n\t"
2567 "subl $8, %%esi \n\t"
2568 "movq %%mm0, 8(%%edi) \n\t"
2569 "subl $16, %%edi \n\t"
2571 "jnz .loop8_pass4 \n\t"
2574 : "=c" (dummy_value_c
), // output regs (dummy)
2575 "=S" (dummy_value_S
),
2576 "=D" (dummy_value_D
)
2578 : "1" (sptr
), // esi // input regs
2582 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2583 : "%mm0" // clobber list
2589 } /* end of pixel_bytes == 8 */
2591 //--------------------------------------------------------------
2592 else if (pixel_bytes
== 6)
2594 for (i
= width
; i
; i
--)
2598 png_memcpy(v
, sptr
, 6);
2599 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2601 png_memcpy(dp
, v
, 6);
2606 } /* end of pixel_bytes == 6 */
2608 //--------------------------------------------------------------
2611 for (i
= width
; i
; i
--)
2615 png_memcpy(v
, sptr
, pixel_bytes
);
2616 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2618 png_memcpy(dp
, v
, pixel_bytes
);
2624 } // end of _mmx_supported ========================================
2626 else /* MMX not supported: use modified C code - takes advantage
2627 * of inlining of png_memcpy for a constant */
2628 /* GRR 19991007: does it? or should pixel_bytes in each
2629 * block be replaced with immediate value (e.g., 1)? */
2630 /* GRR 19991017: replaced with constants in each case */
2631 #endif /* PNG_MMX_CODE_SUPPORTED */
2633 if (pixel_bytes
== 1)
2635 for (i
= width
; i
; i
--)
2638 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2645 else if (pixel_bytes
== 3)
2647 for (i
= width
; i
; i
--)
2651 png_memcpy(v
, sptr
, 3);
2652 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2654 png_memcpy(dp
, v
, 3);
2660 else if (pixel_bytes
== 2)
2662 for (i
= width
; i
; i
--)
2666 png_memcpy(v
, sptr
, 2);
2667 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2669 png_memcpy(dp
, v
, 2);
2675 else if (pixel_bytes
== 4)
2677 for (i
= width
; i
; i
--)
2681 png_memcpy(v
, sptr
, 4);
2682 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2685 if (dp
< row
|| dp
+3 > row
+png_ptr
->row_buf_size
)
2687 printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2688 row
, dp
, row
+png_ptr
->row_buf_size
);
2689 printf("row_buf=%d\n",png_ptr
->row_buf_size
);
2692 png_memcpy(dp
, v
, 4);
2698 else if (pixel_bytes
== 6)
2700 for (i
= width
; i
; i
--)
2704 png_memcpy(v
, sptr
, 6);
2705 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2707 png_memcpy(dp
, v
, 6);
2713 else if (pixel_bytes
== 8)
2715 for (i
= width
; i
; i
--)
2719 png_memcpy(v
, sptr
, 8);
2720 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2722 png_memcpy(dp
, v
, 8);
2728 else /* GRR: should never be reached */
2730 for (i
= width
; i
; i
--)
2734 png_memcpy(v
, sptr
, pixel_bytes
);
2735 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
2737 png_memcpy(dp
, v
, pixel_bytes
);
2740 sptr
-= pixel_bytes
;
2744 } /* end if (MMX not supported) */
2747 } /* end switch (row_info->pixel_depth) */
2749 row_info
->width
= final_width
;
2751 row_info
->rowbytes
= PNG_ROWBYTES(row_info
->pixel_depth
,final_width
);
2754 } /* end png_do_read_interlace() */
2756 #endif /* PNG_HAVE_MMX_READ_INTERLACE */
2757 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2761 #if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
2762 #if defined(PNG_MMX_CODE_SUPPORTED)
// NOTE(review): the union/typedef that opens these declarations was lost in
// this extraction; each object is initialized with a 64-bit (LL) constant and
// is loaded with movq in the filter routines below, so each is presumably an
// 8-byte union used as an MMX mask -- TODO confirm against the full file.
2764 // These variables are utilized in the functions below. They are declared
2765 // globally here to ensure alignment on 8-byte boundaries.
2770 } _LBCarryMask
= {0x0101010101010101LL
}, // 0x01 in every byte: per-byte low-bit (carry) mask
2771 _HBClearMask
= {0x7f7f7f7f7f7f7f7fLL
}, // clears bit 7 of each byte after a per-byte psrlq $1
2772 _ActiveMask
, _ActiveMask2
, _ActiveMaskEnd
, _ShiftBpp
, _ShiftRem
; // uninitialized here; set per bpp case before each MMX loop
2774 #ifdef PNG_THREAD_UNSAFE_OK
2775 //===========================================================================//
2777 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2779 //===========================================================================//
2781 // Optimized code for PNG Average filter decoder
/*
 * png_read_filter_row_mmx_avg -- undo the PNG "Average" row filter in place.
 *
 * Reconstructs Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2) for every
 * byte of the row (for x < bpp, Raw(x-bpp) is 0, so Raw(x) = Avg(x) +
 * Prior(x)/2).  Structure, as visible below:
 *   1. scalar prologue: decode the first bpp bytes, then decode byte-by-byte
 *      up to an 8-byte alignment boundary; the boundary offset is stored in
 *      the global _dif and the 8-byte-multiple end offset in _MMXLength;
 *   2. a bpp-specific MMX loop processes 8 bytes per iteration up to
 *      _MMXLength, using _LBCarryMask/_HBClearMask to emulate a per-byte
 *      (a+b)/2 with carry, since MMX has no per-byte average-with-truncate
 *      of this exact form;
 *   3. scalar epilogue: finish bytes _MMXLength.._FullLength, then EMMS.
 *
 * Register roles in the scalar sections: edi = row (Avg/Raw), esi = prev_row
 * (Prior), ecx = bpp (preloaded via the "0"(bpp) constraint), edx = row-bpp,
 * ebx = byte index x (ebx is pushed/popped around use to preserve the PIC/GOT
 * pointer).  dummy_value_c/_S/_D absorb clobbered registers as fake outputs.
 *
 * NOTE(review): this extraction is missing original source lines (the
 * prev_row parameter, local declarations, asm labels such as avg_rlp/avg_lp1,
 * the switch(bpp) and case lines, and some braces), so the text below is not
 * compilable as-is; only comments have been added, no code tokens changed.
 */
2783 static void /* PRIVATE */
2784 png_read_filter_row_mmx_avg(png_row_infop row_info
, png_bytep row
,
2788 int dummy_value_c
; // fix 'forbidden register 2 (cx) was spilled' error
2792 bpp
= (row_info
->pixel_depth
+ 7) >> 3; // get # bytes per pixel
2793 _FullLength
= row_info
->rowbytes
; // # of bytes to filter
// --- scalar prologue: first bpp bytes + bytes up to 8-byte alignment ------
2795 __asm__
__volatile__ (
2796 // initialize address pointers and offset
2798 "pushl %%ebx                 \n\t" // save index to Global Offset Table
2800 //pre "movl row, %%edi          \n\t" // edi:  Avg(x)
2801 "xorl %%ebx, %%ebx           \n\t" // ebx:  x
2802 "movl %%edi, %%edx           \n\t"
2803 //pre "movl prev_row, %%esi     \n\t" // esi:  Prior(x)
2804 //pre "subl bpp, %%edx          \n\t" // (bpp is preloaded into ecx)
2805 "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2807 "xorl %%eax,%%eax            \n\t"
2809 // Compute the Raw value for the first bpp bytes
2810 //    Raw(x) = Avg(x) + (Prior(x)/2)
2812 "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
2814 "shrb %%al                   \n\t" // divide by 2
2815 "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
2816 //pre "cmpl bpp, %%ebx          \n\t" // (bpp is preloaded into ecx)
2817 "cmpl %%ecx, %%ebx           \n\t"
2818 "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
2819 "jb avg_rlp                  \n\t" // mov does not affect flags
2821 // get # of bytes to alignment
2822 "movl %%edi, _dif            \n\t" // take start of row
2823 "addl %%ebx, _dif            \n\t" // add bpp
2824 "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
2825 "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
2826 "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
2827 "jz avg_go                   \n\t" //  alignment
2830 // fix alignment
2830 // Compute the Raw value for the bytes up to the alignment boundary
2831 //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2832 "xorl %%ecx, %%ecx           \n\t"
2835 "xorl %%eax, %%eax           \n\t"
2836 "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2837 "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2838 "addw %%cx, %%ax             \n\t"
2840 "shrw %%ax                   \n\t" // divide by 2
2841 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2842 "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
2843 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2844 "jb avg_lp1                  \n\t" // repeat until at alignment boundary
2847 "movl _FullLength, %%eax     \n\t"
2848 "movl %%eax, %%ecx           \n\t"
2849 "subl %%ebx, %%eax           \n\t" // subtract alignment fix
2850 "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
2851 "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
2852 "movl %%ecx, _MMXLength      \n\t"
2854 "popl %%ebx                  \n\t" // restore index to Global Offset Table
2857 : "=c" (dummy_value_c
),            // output regs (dummy)
2858 "=S" (dummy_value_S
),
2859 "=D" (dummy_value_D
)
2861 : "0" (bpp
),       // ecx          // input regs
2862 "1" (prev_row
),  // esi
2865 : "%eax", "%edx"                   // clobber list
2869 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2870 // (seems to work fine without...)
2873 // now do the math for the rest of the row
// (review) this MMX block is the bpp == 3 case: _ShiftBpp = 3*8 bits, three
// "active groups" of 3 bytes each are folded per 8-byte iteration.
2878 _ActiveMask
.use
= 0x0000000000ffffffLL
;
2879 _ShiftBpp
.use
= 24;    // == 3 * 8
2880 _ShiftRem
.use
= 40;    // == 64 - 24
2882 __asm__
__volatile__ (
2883 // re-init address pointers and offset
2884 "movq _ActiveMask, %%mm7      \n\t"
2885 "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
2886 "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
2887 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
2888 "movq _HBClearMask, %%mm4     \n\t"
2889 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
2891 // prime the pump:  load the first Raw(x-bpp) data set
2892 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2893                                     // (correct pos. in loop below)
2895 "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
2896 "movq %%mm5, %%mm3            \n\t"
2897 "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp)
2899 "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
2900 "movq %%mm7, %%mm6            \n\t"
2901 "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
2902 "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
2903 "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
2905 "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
2907 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2908 "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2910 "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2912                                     //  lsb's were == 1 (only valid for active group)
2913 "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2914 "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2916 "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2918 "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1
2919                                     //  bytes to add to Avg
2920 "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2921                                     //  Avg for each Active
2923 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2924 "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover
2926 "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2927 "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2928 "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2930 "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2932                                     //  lsb's were == 1 (only valid for active group)
2933 "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2934 "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2936 "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2938 "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2939                                     //  bytes to add to Avg
2940 "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2941                                     //  Avg for each Active
2944 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2945 "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last
2948 "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2949 "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2950                                     // Data only needs to be shifted once here to
2951                                     // get the correct x-bpp offset.
2952 "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2954 "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2956                                     //  lsb's were == 1 (only valid for active group)
2957 "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2958 "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2960 "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2962 "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2963                                     //  bytes to add to Avg
2964 "addl $8, %%ecx               \n\t"
2965 "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2966                                     //  Avg for each Active
2968 // now ready to write back to memory
2969 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2970 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2971 "cmpl _MMXLength, %%ecx       \n\t"
2972 "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
2975 : "=S" (dummy_value_S
),             // output regs (dummy)
2976 "=D" (dummy_value_D
)
2978 : "0" (prev_row
),  // esi           // input regs
2981 : "%ecx"                            // clobber list
2982 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2983 , "%mm0", "%mm1", "%mm2", "%mm3"
2984 , "%mm4", "%mm5", "%mm6", "%mm7"
2992 //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
2993 //case 5:   // GRR BOGUS
// (review) bpp == 4 or 6 case: shift amounts derived from bpp at run time,
// two active groups per 8-byte iteration ("break; // end 4,6 bpp" below).
2995 _ActiveMask
.use
= 0xffffffffffffffffLL
; // use shift below to clear
2996                                         // appropriate inactive bytes
2997 _ShiftBpp
.use
= bpp
<< 3;
2998 _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
3000 __asm__
__volatile__ (
3001 "movq _HBClearMask, %%mm4    \n\t"
3003 // re-init address pointers and offset
3004 "movl _dif, %%ecx            \n\t" // ecx:  x = offset to
3005                                    // alignment boundary
3007 // load _ActiveMask and clear all bytes except for 1st active group
3008 "movq _ActiveMask, %%mm7     \n\t"
3009 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3010 "psrlq _ShiftRem, %%mm7      \n\t"
3011 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3012 "movq %%mm7, %%mm6           \n\t"
3013 "movq _LBCarryMask, %%mm5    \n\t"
3014 "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active
3017 // prime the pump:  load the first Raw(x-bpp) data set
3018 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3019                                     // (we correct pos. in loop below)
3021 "movq (%%edi,%%ecx,), %%mm0  \n\t"
3022 "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
3023 "movq (%%esi,%%ecx,), %%mm1  \n\t"
3024 // add (Prev_row/2) to average
3025 "movq %%mm5, %%mm3           \n\t"
3026 "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3027 "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3028 "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3030 "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3032 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3033 "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3035 "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3037                                    //  lsb's were == 1 (only valid for active group)
3038 "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3039 "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3041 "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3043 "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
3044                                    //  bytes to add to Avg
3045 "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3048 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3049 "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3050 "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3051 "addl $8, %%ecx              \n\t"
3052 "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3054 "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3056                                    //  lsb's were == 1 (only valid for active group)
3057 "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3058 "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3060 "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3062 "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3063                                    //  bytes to add to Avg
3064 "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3065                                    //  Avg for each Active
3067 "cmpl _MMXLength, %%ecx      \n\t"
3068 // now ready to write back to memory
3069 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3070 // prep Raw(x-bpp) for next loop
3071 "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3074 : "=S" (dummy_value_S
),            // output regs (dummy)
3075 "=D" (dummy_value_D
)
3077 : "0" (prev_row
),  // esi          // input regs
3080 : "%ecx"                           // clobber list
3081 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3082 , "%mm0", "%mm1", "%mm2", "%mm3"
3083 , "%mm4", "%mm5", "%mm6", "%mm7"
3087 break;  // end 4,6 bpp
// (review) bpp == 2 case: _ShiftBpp = 2*8 bits, four active groups of
// 2 bytes each are folded per 8-byte iteration.
3091 _ActiveMask
.use
= 0x000000000000ffffLL
;
3092 _ShiftBpp
.use
= 16;   // == 2 * 8
3093 _ShiftRem
.use
= 48;   // == 64 - 16
3095 __asm__
__volatile__ (
3097 "movq _ActiveMask, %%mm7     \n\t"
3098 // re-init address pointers and offset
3099 "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment
3101 "movq _LBCarryMask, %%mm5    \n\t"
3102 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3103 "movq _HBClearMask, %%mm4    \n\t"
3104 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3106 // prime the pump:  load the first Raw(x-bpp) data set
3107 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3108                                     // (we correct pos. in loop below)
3110 "movq (%%edi,%%ecx,), %%mm0  \n\t"
3111 "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
3112 "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX:  was psllq)
3113 // add (Prev_row/2) to average
3114 "movq %%mm5, %%mm3           \n\t"
3115 "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3116 "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3117 "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3119 "movq %%mm7, %%mm6           \n\t"
3120 "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3123 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3124 "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3126 "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3128                                    //  lsb's were == 1 (only valid
3129                                    //  for active group)
3130 "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3131 "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3133 "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3135 "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
3136                                    //  bytes to add to Avg
3137 "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3138                                    //  for each Active byte
3140 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3141 "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3143 "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3144 "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3145 "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3147 "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3149                                    //  lsb's were == 1 (only valid
3150                                    //  for active group)
3151 "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3152 "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3154 "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3156 "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3157                                    //  bytes to add to Avg
3158 "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3159                                    //  Avg for each Active byte
3161 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3162 "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3164 "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3165 "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3166 "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3168 "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3169                                    //  where both lsb's were == 1
3170                                    //  (only valid for active group)
3171 "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3172 "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3174 "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3176 "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3177                                    //  bytes to add to Avg
3178 "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3179                                    //  Avg for each Active byte
3181 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3182 "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3184 "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3185 "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3186 "addl $8, %%ecx              \n\t"
3187 "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3189 "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3191                                    //  lsb's were == 1 (only valid
3192                                    //  for active group)
3193 "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3194 "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3196 "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3198 "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3199                                    //  bytes to add to Avg
3200 "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3201                                    //  Avg for each Active byte
3203 "cmpl _MMXLength, %%ecx      \n\t"
3204 // now ready to write back to memory
3205 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3206 // prep Raw(x-bpp) for next loop
3207 "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3210 : "=S" (dummy_value_S
),            // output regs (dummy)
3211 "=D" (dummy_value_D
)
3213 : "0" (prev_row
),  // esi          // input regs
3216 : "%ecx"                           // clobber list
3217 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3218 , "%mm0", "%mm1", "%mm2", "%mm3"
3219 , "%mm4", "%mm5", "%mm6", "%mm7"
// (review) bpp == 1 case: remaining bytes are decoded with scalar code only
// (no MMX loop); note the early "return; // end 1 bpp" below.
3227 __asm__
__volatile__ (
3228 // re-init address pointers and offset
3230 "pushl %%ebx                 \n\t" // save Global Offset Table index
3232 "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment
3234 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
3235 "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3237 // do Paeth decode for remaining bytes
3238 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
3239 "movl %%edi, %%edx           \n\t"
3240 // preload  "subl bpp, %%edx              \n\t" // (bpp is preloaded into ecx)
3241 "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3242 "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
3245 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3246 "xorl %%eax, %%eax           \n\t"
3247 "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3248 "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3249 "addw %%cx, %%ax             \n\t"
3251 "shrw %%ax                   \n\t" // divide by 2
3252 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3254 "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3255 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3256                          // mov does not affect flags; -1 to offset inc ebx
3261 "popl %%ebx                  \n\t" // Global Offset Table index
3264 : "=c" (dummy_value_c
),            // output regs (dummy)
3265 "=S" (dummy_value_S
),
3266 "=D" (dummy_value_D
)
3268 : "0" (bpp
),       // ecx          // input regs
3269 "1" (prev_row
),  // esi
3272 : "%eax", "%edx"                   // clobber list
3278 return;  // end 1 bpp
// (review) bpp == 8 case: Raw(x-bpp) is exactly the previous quadword, so no
// shifting/masking of active groups is needed ("NO NEED to correct pos.").
3282 __asm__
__volatile__ (
3283 // re-init address pointers and offset
3284 "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
3285 "movq _LBCarryMask, %%mm5    \n\t" //            boundary
3286 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3287 "movq _HBClearMask, %%mm4    \n\t"
3288 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3290 // prime the pump:  load the first Raw(x-bpp) data set
3291 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3292                                     // (NO NEED to correct pos. in loop below)
3295 "movq (%%edi,%%ecx,), %%mm0  \n\t"
3296 "movq %%mm5, %%mm3           \n\t"
3297 "movq (%%esi,%%ecx,), %%mm1  \n\t"
3298 "addl $8, %%ecx              \n\t"
3299 "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3300 "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3301 "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3302                                    //  where both lsb's were == 1
3303 "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3304 "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
3305 "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
3306 "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
3307 "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
3308 "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3309 "cmpl _MMXLength, %%ecx      \n\t"
3310 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3311 "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
3314 : "=S" (dummy_value_S
),            // output regs (dummy)
3315 "=D" (dummy_value_D
)
3317 : "0" (prev_row
),  // esi          // input regs
3320 : "%ecx"                           // clobber list
3321 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3322 , "%mm0", "%mm1", "%mm2"
3323 , "%mm3", "%mm4", "%mm5"
3329 default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3333 // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
3335 "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
// (review) unreached reference implementation, kept under #if 0 (see
// "#endif /* 0 - NEVER REACHED */" after its asm block); it names C symbols
// (row, prev_row, bpp) directly and has no asm constraints wired up.
3339 __asm__
__volatile__ (
3340 "movq _LBCarryMask, %%mm5    \n\t"
3341 // re-init address pointers and offset
3342 "movl _dif, %%ebx            \n\t" // ebx:  x = offset to
3343                                    // alignment boundary
3344 "movl row, %%edi             \n\t" // edi:  Avg(x)
3345 "movq _HBClearMask, %%mm4    \n\t"
3346 "movl %%edi, %%edx           \n\t"
3347 "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3348 "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
3350 "movq (%%edi,%%ebx,), %%mm0  \n\t"
3351 "movq %%mm5, %%mm3           \n\t"
3352 "movq (%%esi,%%ebx,), %%mm1  \n\t"
3353 "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3354 "movq (%%edx,%%ebx,), %%mm2  \n\t"
3355 "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3356 "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3357                                    //  where both lsb's were == 1
3358 "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3359 "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3361 "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each
3363 "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3365 "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3367 "addl $8, %%ebx              \n\t"
3368 "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3370 "cmpl _MMXLength, %%ebx      \n\t"
3371 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3374 : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
3376 : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
3378 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3380 #endif /* 0 - NEVER REACHED */
3384 } // end switch (bpp)
// --- scalar epilogue: finish bytes _MMXLength.._FullLength, then EMMS -----
3386 __asm__
__volatile__ (
3387 // MMX acceleration complete; now do clean-up
3388 // check if any remaining bytes left to decode
3390 "pushl %%ebx                 \n\t" // save index to Global Offset Table
3392 "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
3393 //pre "movl row, %%edi          \n\t" // edi:  Avg(x)
3394 "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3397 // do Avg decode for remaining bytes
3398 //pre "movl prev_row, %%esi     \n\t" // esi:  Prior(x)
3399 "movl %%edi, %%edx           \n\t"
3400 //pre "subl bpp, %%edx          \n\t" // (bpp is preloaded into ecx)
3401 "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3402 "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
3405 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3406 "xorl %%eax, %%eax           \n\t"
3407 "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3408 "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3409 "addw %%cx, %%ax             \n\t"
3411 "shrw %%ax                   \n\t" // divide by 2
3412 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3413 "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3414 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3415 "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
3418 "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
3420 "popl %%ebx                  \n\t" // restore index to Global Offset Table
3423 : "=c" (dummy_value_c
),            // output regs (dummy)
3424 "=S" (dummy_value_S
),
3425 "=D" (dummy_value_D
)
3427 : "0" (bpp
),       // ecx          // input regs
3428 "1" (prev_row
),  // esi
3431 : "%eax", "%edx"                   // clobber list
3437 } /* end png_read_filter_row_mmx_avg() */
3442 #ifdef PNG_THREAD_UNSAFE_OK
3443 //===========================================================================//
3445 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3447 //===========================================================================//
3449 // Optimized code for PNG Paeth filter decoder
3451 static void /* PRIVATE */
3452 png_read_filter_row_mmx_paeth(png_row_infop row_info
, png_bytep row
,
3456 int dummy_value_c
; // fix 'forbidden register 2 (cx) was spilled' error
3460 bpp
= (row_info
->pixel_depth
+ 7) >> 3; // Get # bytes per pixel
3461 _FullLength
= row_info
->rowbytes
; // # of bytes to filter
3463 __asm__
__volatile__ (
3465 "pushl %%ebx \n\t" // save index to Global Offset Table
3467 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3468 //pre "movl row, %%edi \n\t"
3469 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3470 //pre "movl prev_row, %%esi \n\t"
3471 "xorl %%eax, %%eax \n\t"
3473 // Compute the Raw value for the first bpp bytes
3474 // Note: the formula works out to be always
3475 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
3477 "movb (%%edi,%%ebx,), %%al \n\t"
3478 "addb (%%esi,%%ebx,), %%al \n\t"
3480 //pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3481 "cmpl %%ecx, %%ebx \n\t"
3482 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3484 // get # of bytes to alignment
3485 "movl %%edi, _dif \n\t" // take start of row
3486 "addl %%ebx, _dif \n\t" // add bpp
3487 "xorl %%ecx, %%ecx \n\t"
3488 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3490 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3491 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3497 "xorl %%eax, %%eax \n\t"
3498 // pav = p - a = (a + b - c) - a = b - c
3499 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3500 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3501 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3502 "movl %%eax, _patemp \n\t" // Save pav for later use
3503 "xorl %%eax, %%eax \n\t"
3504 // pbv = p - b = (a + b - c) - b = a - c
3505 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3506 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3507 "movl %%eax, %%ecx \n\t"
3508 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3509 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3511 "testl $0x80000000, %%eax \n\t"
3513 "negl %%eax \n\t" // reverse sign of neg values
3516 "movl %%eax, _pctemp \n\t" // save pc for later use
3518 "testl $0x80000000, %%ecx \n\t"
3520 "negl %%ecx \n\t" // reverse sign of neg values
3523 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3525 "movl _patemp, %%eax \n\t"
3526 "testl $0x80000000, %%eax \n\t"
3528 "negl %%eax \n\t" // reverse sign of neg values
3531 "movl %%eax, _patemp \n\t" // save pa for later use
3533 "cmpl %%ecx, %%eax \n\t"
3534 "jna paeth_abb \n\t"
3535 // pa > pb; now test if pb <= pc
3536 "cmpl _pctemp, %%ecx \n\t"
3537 "jna paeth_bbc \n\t"
3538 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3539 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3540 "jmp paeth_paeth \n\t"
3543 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3544 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3545 "jmp paeth_paeth \n\t"
3548 // pa <= pb; now test if pa <= pc
3549 "cmpl _pctemp, %%eax \n\t"
3550 "jna paeth_abc \n\t"
3551 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3552 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3553 "jmp paeth_paeth \n\t"
3556 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3557 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3562 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3563 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3564 "cmpl _dif, %%ebx \n\t"
3568 "movl _FullLength, %%ecx \n\t"
3569 "movl %%ecx, %%eax \n\t"
3570 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3571 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3572 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3573 "movl %%ecx, _MMXLength \n\t"
3575 "popl %%ebx \n\t" // restore index to Global Offset Table
3578 : "=c" (dummy_value_c
), // output regs (dummy)
3579 "=S" (dummy_value_S
),
3580 "=D" (dummy_value_D
)
3582 : "0" (bpp
), // ecx // input regs
3583 "1" (prev_row
), // esi
3586 : "%eax", "%edx" // clobber list
3592 // now do the math for the rest of the row
3597 _ActiveMask
.use
= 0x0000000000ffffffLL
;
3598 _ActiveMaskEnd
.use
= 0xffff000000000000LL
;
3599 _ShiftBpp
.use
= 24; // == bpp(3) * 8
3600 _ShiftRem
.use
= 40; // == 64 - 24
3602 __asm__
__volatile__ (
3603 "movl _dif, %%ecx \n\t"
3604 // preload "movl row, %%edi \n\t"
3605 // preload "movl prev_row, %%esi \n\t"
3606 "pxor %%mm0, %%mm0 \n\t"
3607 // prime the pump: load the first Raw(x-bpp) data set
3608 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3610 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3612 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3613 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3614 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3615 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3616 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3618 // pav = p - a = (a + b - c) - a = b - c
3619 "movq %%mm2, %%mm4 \n\t"
3620 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3621 // pbv = p - b = (a + b - c) - b = a - c
3622 "movq %%mm1, %%mm5 \n\t"
3623 "psubw %%mm3, %%mm4 \n\t"
3624 "pxor %%mm7, %%mm7 \n\t"
3625 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3626 "movq %%mm4, %%mm6 \n\t"
3627 "psubw %%mm3, %%mm5 \n\t"
3629 // pa = abs(p-a) = abs(pav)
3630 // pb = abs(p-b) = abs(pbv)
3631 // pc = abs(p-c) = abs(pcv)
3632 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3633 "paddw %%mm5, %%mm6 \n\t"
3634 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3635 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3636 "psubw %%mm0, %%mm4 \n\t"
3637 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3638 "psubw %%mm0, %%mm4 \n\t"
3639 "psubw %%mm7, %%mm5 \n\t"
3640 "pxor %%mm0, %%mm0 \n\t"
3641 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3642 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3643 "psubw %%mm7, %%mm5 \n\t"
3644 "psubw %%mm0, %%mm6 \n\t"
3646 "movq %%mm4, %%mm7 \n\t"
3647 "psubw %%mm0, %%mm6 \n\t"
3648 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3649 "movq %%mm7, %%mm0 \n\t"
3650 // use mm7 mask to merge pa & pb
3651 "pand %%mm7, %%mm5 \n\t"
3652 // use mm0 mask copy to merge a & b
3653 "pand %%mm0, %%mm2 \n\t"
3654 "pandn %%mm4, %%mm7 \n\t"
3655 "pandn %%mm1, %%mm0 \n\t"
3656 "paddw %%mm5, %%mm7 \n\t"
3657 "paddw %%mm2, %%mm0 \n\t"
3658 // test ((pa <= pb)? pa:pb) <= pc
3659 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3660 "pxor %%mm1, %%mm1 \n\t"
3661 "pand %%mm7, %%mm3 \n\t"
3662 "pandn %%mm0, %%mm7 \n\t"
3663 "paddw %%mm3, %%mm7 \n\t"
3664 "pxor %%mm0, %%mm0 \n\t"
3665 "packuswb %%mm1, %%mm7 \n\t"
3666 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3667 "pand _ActiveMask, %%mm7 \n\t"
3668 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3669 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3670 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3671 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3672 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3674 // now do Paeth for 2nd set of bytes (3-5)
3675 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3676 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3677 "pxor %%mm7, %%mm7 \n\t"
3678 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3679 // pbv = p - b = (a + b - c) - b = a - c
3680 "movq %%mm1, %%mm5 \n\t"
3681 // pav = p - a = (a + b - c) - a = b - c
3682 "movq %%mm2, %%mm4 \n\t"
3683 "psubw %%mm3, %%mm5 \n\t"
3684 "psubw %%mm3, %%mm4 \n\t"
3685 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3686 // pav + pbv = pbv + pav
3687 "movq %%mm5, %%mm6 \n\t"
3688 "paddw %%mm4, %%mm6 \n\t"
3690 // pa = abs(p-a) = abs(pav)
3691 // pb = abs(p-b) = abs(pbv)
3692 // pc = abs(p-c) = abs(pcv)
3693 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3694 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3695 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3696 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3697 "psubw %%mm0, %%mm5 \n\t"
3698 "psubw %%mm7, %%mm4 \n\t"
3699 "psubw %%mm0, %%mm5 \n\t"
3700 "psubw %%mm7, %%mm4 \n\t"
3701 "pxor %%mm0, %%mm0 \n\t"
3702 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3703 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3704 "psubw %%mm0, %%mm6 \n\t"
3706 "movq %%mm4, %%mm7 \n\t"
3707 "psubw %%mm0, %%mm6 \n\t"
3708 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3709 "movq %%mm7, %%mm0 \n\t"
3710 // use mm7 mask to merge pa & pb
3711 "pand %%mm7, %%mm5 \n\t"
3712 // use mm0 mask copy to merge a & b
3713 "pand %%mm0, %%mm2 \n\t"
3714 "pandn %%mm4, %%mm7 \n\t"
3715 "pandn %%mm1, %%mm0 \n\t"
3716 "paddw %%mm5, %%mm7 \n\t"
3717 "paddw %%mm2, %%mm0 \n\t"
3718 // test ((pa <= pb)? pa:pb) <= pc
3719 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3720 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3721 "pand %%mm7, %%mm3 \n\t"
3722 "pandn %%mm0, %%mm7 \n\t"
3723 "pxor %%mm1, %%mm1 \n\t"
3724 "paddw %%mm3, %%mm7 \n\t"
3725 "pxor %%mm0, %%mm0 \n\t"
3726 "packuswb %%mm1, %%mm7 \n\t"
3727 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3728 "pand _ActiveMask, %%mm7 \n\t"
3729 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3730 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3732 // pav = p - a = (a + b - c) - a = b - c
3733 "movq %%mm2, %%mm4 \n\t"
3734 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3735 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3736 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3737 "movq %%mm7, %%mm1 \n\t"
3738 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3739 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3740 // now mm1 will be used as Raw(x-bpp)
3741 // now do Paeth for 3rd, and final, set of bytes (6-7)
3742 "pxor %%mm7, %%mm7 \n\t"
3743 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3744 "psubw %%mm3, %%mm4 \n\t"
3745 // pbv = p - b = (a + b - c) - b = a - c
3746 "movq %%mm1, %%mm5 \n\t"
3747 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3748 "movq %%mm4, %%mm6 \n\t"
3749 "psubw %%mm3, %%mm5 \n\t"
3750 "pxor %%mm0, %%mm0 \n\t"
3751 "paddw %%mm5, %%mm6 \n\t"
3753 // pa = abs(p-a) = abs(pav)
3754 // pb = abs(p-b) = abs(pbv)
3755 // pc = abs(p-c) = abs(pcv)
3756 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3757 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3758 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3759 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3760 "psubw %%mm0, %%mm4 \n\t"
3761 "psubw %%mm7, %%mm5 \n\t"
3762 "psubw %%mm0, %%mm4 \n\t"
3763 "psubw %%mm7, %%mm5 \n\t"
3764 "pxor %%mm0, %%mm0 \n\t"
3765 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3766 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3767 "psubw %%mm0, %%mm6 \n\t"
3769 "movq %%mm4, %%mm7 \n\t"
3770 "psubw %%mm0, %%mm6 \n\t"
3771 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3772 "movq %%mm7, %%mm0 \n\t"
3773 // use mm0 mask copy to merge a & b
3774 "pand %%mm0, %%mm2 \n\t"
3775 // use mm7 mask to merge pa & pb
3776 "pand %%mm7, %%mm5 \n\t"
3777 "pandn %%mm1, %%mm0 \n\t"
3778 "pandn %%mm4, %%mm7 \n\t"
3779 "paddw %%mm2, %%mm0 \n\t"
3780 "paddw %%mm5, %%mm7 \n\t"
3781 // test ((pa <= pb)? pa:pb) <= pc
3782 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3783 "pand %%mm7, %%mm3 \n\t"
3784 "pandn %%mm0, %%mm7 \n\t"
3785 "paddw %%mm3, %%mm7 \n\t"
3786 "pxor %%mm1, %%mm1 \n\t"
3787 "packuswb %%mm7, %%mm1 \n\t"
3788 // step ecx to next set of 8 bytes and repeat loop til done
3789 "addl $8, %%ecx \n\t"
3790 "pand _ActiveMaskEnd, %%mm1 \n\t"
3791 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3794 "cmpl _MMXLength, %%ecx \n\t"
3795 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3796 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3797 // mm1 will be used as Raw(x-bpp) next loop
3798 // mm3 ready to be used as Prior(x-bpp) next loop
3801 : "=S" (dummy_value_S
), // output regs (dummy)
3802 "=D" (dummy_value_D
)
3804 : "0" (prev_row
), // esi // input regs
3807 : "%ecx" // clobber list
3808 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3809 , "%mm0", "%mm1", "%mm2", "%mm3"
3810 , "%mm4", "%mm5", "%mm6", "%mm7"
3817 //case 7: // GRR BOGUS
3818 //case 5: // GRR BOGUS
3820 _ActiveMask
.use
= 0x00000000ffffffffLL
;
3821 _ActiveMask2
.use
= 0xffffffff00000000LL
;
3822 _ShiftBpp
.use
= bpp
<< 3; // == bpp * 8
3823 _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
3825 __asm__
__volatile__ (
3826 "movl _dif, %%ecx \n\t"
3827 // preload "movl row, %%edi \n\t"
3828 // preload "movl prev_row, %%esi \n\t"
3829 // prime the pump: load the first Raw(x-bpp) data set
3830 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3831 "pxor %%mm0, %%mm0 \n\t"
3834 // must shift to position Raw(x-bpp) data
3835 "psrlq _ShiftRem, %%mm1 \n\t"
3836 // do first set of 4 bytes
3837 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3838 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3839 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3840 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3841 // must shift to position Prior(x-bpp) data
3842 "psrlq _ShiftRem, %%mm3 \n\t"
3843 // pav = p - a = (a + b - c) - a = b - c
3844 "movq %%mm2, %%mm4 \n\t"
3845 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3846 // pbv = p - b = (a + b - c) - b = a - c
3847 "movq %%mm1, %%mm5 \n\t"
3848 "psubw %%mm3, %%mm4 \n\t"
3849 "pxor %%mm7, %%mm7 \n\t"
3850 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3851 "movq %%mm4, %%mm6 \n\t"
3852 "psubw %%mm3, %%mm5 \n\t"
3853 // pa = abs(p-a) = abs(pav)
3854 // pb = abs(p-b) = abs(pbv)
3855 // pc = abs(p-c) = abs(pcv)
3856 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3857 "paddw %%mm5, %%mm6 \n\t"
3858 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3859 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3860 "psubw %%mm0, %%mm4 \n\t"
3861 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3862 "psubw %%mm0, %%mm4 \n\t"
3863 "psubw %%mm7, %%mm5 \n\t"
3864 "pxor %%mm0, %%mm0 \n\t"
3865 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3866 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3867 "psubw %%mm7, %%mm5 \n\t"
3868 "psubw %%mm0, %%mm6 \n\t"
3870 "movq %%mm4, %%mm7 \n\t"
3871 "psubw %%mm0, %%mm6 \n\t"
3872 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3873 "movq %%mm7, %%mm0 \n\t"
3874 // use mm7 mask to merge pa & pb
3875 "pand %%mm7, %%mm5 \n\t"
3876 // use mm0 mask copy to merge a & b
3877 "pand %%mm0, %%mm2 \n\t"
3878 "pandn %%mm4, %%mm7 \n\t"
3879 "pandn %%mm1, %%mm0 \n\t"
3880 "paddw %%mm5, %%mm7 \n\t"
3881 "paddw %%mm2, %%mm0 \n\t"
3882 // test ((pa <= pb)? pa:pb) <= pc
3883 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3884 "pxor %%mm1, %%mm1 \n\t"
3885 "pand %%mm7, %%mm3 \n\t"
3886 "pandn %%mm0, %%mm7 \n\t"
3887 "paddw %%mm3, %%mm7 \n\t"
3888 "pxor %%mm0, %%mm0 \n\t"
3889 "packuswb %%mm1, %%mm7 \n\t"
3890 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3891 "pand _ActiveMask, %%mm7 \n\t"
3892 "psrlq _ShiftRem, %%mm3 \n\t"
3893 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3894 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3895 "movq %%mm2, %%mm6 \n\t"
3896 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3897 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3898 "psllq _ShiftBpp, %%mm6 \n\t"
3899 "movq %%mm7, %%mm5 \n\t"
3900 "psrlq _ShiftRem, %%mm1 \n\t"
3901 "por %%mm6, %%mm3 \n\t"
3902 "psllq _ShiftBpp, %%mm5 \n\t"
3903 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3904 "por %%mm5, %%mm1 \n\t"
3905 // do second set of 4 bytes
3906 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3907 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3908 // pav = p - a = (a + b - c) - a = b - c
3909 "movq %%mm2, %%mm4 \n\t"
3910 // pbv = p - b = (a + b - c) - b = a - c
3911 "movq %%mm1, %%mm5 \n\t"
3912 "psubw %%mm3, %%mm4 \n\t"
3913 "pxor %%mm7, %%mm7 \n\t"
3914 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3915 "movq %%mm4, %%mm6 \n\t"
3916 "psubw %%mm3, %%mm5 \n\t"
3917 // pa = abs(p-a) = abs(pav)
3918 // pb = abs(p-b) = abs(pbv)
3919 // pc = abs(p-c) = abs(pcv)
3920 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3921 "paddw %%mm5, %%mm6 \n\t"
3922 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3923 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3924 "psubw %%mm0, %%mm4 \n\t"
3925 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3926 "psubw %%mm0, %%mm4 \n\t"
3927 "psubw %%mm7, %%mm5 \n\t"
3928 "pxor %%mm0, %%mm0 \n\t"
3929 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3930 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3931 "psubw %%mm7, %%mm5 \n\t"
3932 "psubw %%mm0, %%mm6 \n\t"
3934 "movq %%mm4, %%mm7 \n\t"
3935 "psubw %%mm0, %%mm6 \n\t"
3936 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3937 "movq %%mm7, %%mm0 \n\t"
3938 // use mm7 mask to merge pa & pb
3939 "pand %%mm7, %%mm5 \n\t"
3940 // use mm0 mask copy to merge a & b
3941 "pand %%mm0, %%mm2 \n\t"
3942 "pandn %%mm4, %%mm7 \n\t"
3943 "pandn %%mm1, %%mm0 \n\t"
3944 "paddw %%mm5, %%mm7 \n\t"
3945 "paddw %%mm2, %%mm0 \n\t"
3946 // test ((pa <= pb)? pa:pb) <= pc
3947 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3948 "pxor %%mm1, %%mm1 \n\t"
3949 "pand %%mm7, %%mm3 \n\t"
3950 "pandn %%mm0, %%mm7 \n\t"
3951 "pxor %%mm1, %%mm1 \n\t"
3952 "paddw %%mm3, %%mm7 \n\t"
3953 "pxor %%mm0, %%mm0 \n\t"
3954 // step ecx to next set of 8 bytes and repeat loop til done
3955 "addl $8, %%ecx \n\t"
3956 "packuswb %%mm7, %%mm1 \n\t"
3957 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3958 "cmpl _MMXLength, %%ecx \n\t"
3959 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3960 // mm1 will be used as Raw(x-bpp) next loop
3963 : "=S" (dummy_value_S
), // output regs (dummy)
3964 "=D" (dummy_value_D
)
3966 : "0" (prev_row
), // esi // input regs
3969 : "%ecx" // clobber list
3970 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3971 , "%mm0", "%mm1", "%mm2", "%mm3"
3972 , "%mm4", "%mm5", "%mm6", "%mm7"
3980 _ActiveMask
.use
= 0x00000000ffffffffLL
;
3982 __asm__
__volatile__ (
3983 "movl _dif, %%ecx \n\t"
3984 // preload "movl row, %%edi \n\t"
3985 // preload "movl prev_row, %%esi \n\t"
3986 "pxor %%mm0, %%mm0 \n\t"
3987 // prime the pump: load the first Raw(x-bpp) data set
3988 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3989 // a=Raw(x-bpp) bytes
3991 // do first set of 4 bytes
3992 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3993 "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3994 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3995 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3996 // pav = p - a = (a + b - c) - a = b - c
3997 "movq %%mm2, %%mm4 \n\t"
3998 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3999 // pbv = p - b = (a + b - c) - b = a - c
4000 "movq %%mm1, %%mm5 \n\t"
4001 "psubw %%mm3, %%mm4 \n\t"
4002 "pxor %%mm7, %%mm7 \n\t"
4003 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4004 "movq %%mm4, %%mm6 \n\t"
4005 "psubw %%mm3, %%mm5 \n\t"
4006 // pa = abs(p-a) = abs(pav)
4007 // pb = abs(p-b) = abs(pbv)
4008 // pc = abs(p-c) = abs(pcv)
4009 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4010 "paddw %%mm5, %%mm6 \n\t"
4011 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4012 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4013 "psubw %%mm0, %%mm4 \n\t"
4014 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4015 "psubw %%mm0, %%mm4 \n\t"
4016 "psubw %%mm7, %%mm5 \n\t"
4017 "pxor %%mm0, %%mm0 \n\t"
4018 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4019 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4020 "psubw %%mm7, %%mm5 \n\t"
4021 "psubw %%mm0, %%mm6 \n\t"
4023 "movq %%mm4, %%mm7 \n\t"
4024 "psubw %%mm0, %%mm6 \n\t"
4025 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4026 "movq %%mm7, %%mm0 \n\t"
4027 // use mm7 mask to merge pa & pb
4028 "pand %%mm7, %%mm5 \n\t"
4029 // use mm0 mask copy to merge a & b
4030 "pand %%mm0, %%mm2 \n\t"
4031 "pandn %%mm4, %%mm7 \n\t"
4032 "pandn %%mm1, %%mm0 \n\t"
4033 "paddw %%mm5, %%mm7 \n\t"
4034 "paddw %%mm2, %%mm0 \n\t"
4035 // test ((pa <= pb)? pa:pb) <= pc
4036 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4037 "pxor %%mm1, %%mm1 \n\t"
4038 "pand %%mm7, %%mm3 \n\t"
4039 "pandn %%mm0, %%mm7 \n\t"
4040 "paddw %%mm3, %%mm7 \n\t"
4041 "pxor %%mm0, %%mm0 \n\t"
4042 "packuswb %%mm1, %%mm7 \n\t"
4043 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
4044 "pand _ActiveMask, %%mm7 \n\t"
4045 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4046 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4047 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4048 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4049 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4050 // do second set of 4 bytes
4051 "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4052 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4053 // pav = p - a = (a + b - c) - a = b - c
4054 "movq %%mm2, %%mm4 \n\t"
4055 // pbv = p - b = (a + b - c) - b = a - c
4056 "movq %%mm1, %%mm5 \n\t"
4057 "psubw %%mm3, %%mm4 \n\t"
4058 "pxor %%mm7, %%mm7 \n\t"
4059 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4060 "movq %%mm4, %%mm6 \n\t"
4061 "psubw %%mm3, %%mm5 \n\t"
4062 // pa = abs(p-a) = abs(pav)
4063 // pb = abs(p-b) = abs(pbv)
4064 // pc = abs(p-c) = abs(pcv)
4065 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4066 "paddw %%mm5, %%mm6 \n\t"
4067 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4068 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4069 "psubw %%mm0, %%mm4 \n\t"
4070 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4071 "psubw %%mm0, %%mm4 \n\t"
4072 "psubw %%mm7, %%mm5 \n\t"
4073 "pxor %%mm0, %%mm0 \n\t"
4074 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4075 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4076 "psubw %%mm7, %%mm5 \n\t"
4077 "psubw %%mm0, %%mm6 \n\t"
4079 "movq %%mm4, %%mm7 \n\t"
4080 "psubw %%mm0, %%mm6 \n\t"
4081 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4082 "movq %%mm7, %%mm0 \n\t"
4083 // use mm7 mask to merge pa & pb
4084 "pand %%mm7, %%mm5 \n\t"
4085 // use mm0 mask copy to merge a & b
4086 "pand %%mm0, %%mm2 \n\t"
4087 "pandn %%mm4, %%mm7 \n\t"
4088 "pandn %%mm1, %%mm0 \n\t"
4089 "paddw %%mm5, %%mm7 \n\t"
4090 "paddw %%mm2, %%mm0 \n\t"
4091 // test ((pa <= pb)? pa:pb) <= pc
4092 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4093 "pxor %%mm1, %%mm1 \n\t"
4094 "pand %%mm7, %%mm3 \n\t"
4095 "pandn %%mm0, %%mm7 \n\t"
4096 "pxor %%mm1, %%mm1 \n\t"
4097 "paddw %%mm3, %%mm7 \n\t"
4098 "pxor %%mm0, %%mm0 \n\t"
4099 // step ecx to next set of 8 bytes and repeat loop til done
4100 "addl $8, %%ecx \n\t"
4101 "packuswb %%mm7, %%mm1 \n\t"
4102 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4103 "cmpl _MMXLength, %%ecx \n\t"
4104 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4105 // mm1 will be used as Raw(x-bpp) next loop
4108 : "=S" (dummy_value_S
), // output regs (dummy)
4109 "=D" (dummy_value_D
)
4111 : "0" (prev_row
), // esi // input regs
4114 : "%ecx" // clobber list
4115 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4116 , "%mm0", "%mm1", "%mm2", "%mm3"
4117 , "%mm4", "%mm5", "%mm6", "%mm7"
4125 _ActiveMask
.use
= 0x00000000ffffffffLL
;
4127 __asm__
__volatile__ (
4128 "movl _dif, %%ecx \n\t"
4129 // preload "movl row, %%edi \n\t"
4130 // preload "movl prev_row, %%esi \n\t"
4131 "pxor %%mm0, %%mm0 \n\t"
4132 // prime the pump: load the first Raw(x-bpp) data set
4133 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4134 // a=Raw(x-bpp) bytes
4136 // do first set of 4 bytes
4137 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4138 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4139 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4140 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4141 // pav = p - a = (a + b - c) - a = b - c
4142 "movq %%mm2, %%mm4 \n\t"
4143 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4144 // pbv = p - b = (a + b - c) - b = a - c
4145 "movq %%mm1, %%mm5 \n\t"
4146 "psubw %%mm3, %%mm4 \n\t"
4147 "pxor %%mm7, %%mm7 \n\t"
4148 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4149 "movq %%mm4, %%mm6 \n\t"
4150 "psubw %%mm3, %%mm5 \n\t"
4151 // pa = abs(p-a) = abs(pav)
4152 // pb = abs(p-b) = abs(pbv)
4153 // pc = abs(p-c) = abs(pcv)
4154 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4155 "paddw %%mm5, %%mm6 \n\t"
4156 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4157 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4158 "psubw %%mm0, %%mm4 \n\t"
4159 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4160 "psubw %%mm0, %%mm4 \n\t"
4161 "psubw %%mm7, %%mm5 \n\t"
4162 "pxor %%mm0, %%mm0 \n\t"
4163 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4164 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4165 "psubw %%mm7, %%mm5 \n\t"
4166 "psubw %%mm0, %%mm6 \n\t"
4168 "movq %%mm4, %%mm7 \n\t"
4169 "psubw %%mm0, %%mm6 \n\t"
4170 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4171 "movq %%mm7, %%mm0 \n\t"
4172 // use mm7 mask to merge pa & pb
4173 "pand %%mm7, %%mm5 \n\t"
4174 // use mm0 mask copy to merge a & b
4175 "pand %%mm0, %%mm2 \n\t"
4176 "pandn %%mm4, %%mm7 \n\t"
4177 "pandn %%mm1, %%mm0 \n\t"
4178 "paddw %%mm5, %%mm7 \n\t"
4179 "paddw %%mm2, %%mm0 \n\t"
4180 // test ((pa <= pb)? pa:pb) <= pc
4181 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4182 "pxor %%mm1, %%mm1 \n\t"
4183 "pand %%mm7, %%mm3 \n\t"
4184 "pandn %%mm0, %%mm7 \n\t"
4185 "paddw %%mm3, %%mm7 \n\t"
4186 "pxor %%mm0, %%mm0 \n\t"
4187 "packuswb %%mm1, %%mm7 \n\t"
4188 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4189 "pand _ActiveMask, %%mm7 \n\t"
4190 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4191 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4192 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4193 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4194 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4196 // do second set of 4 bytes
4197 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4198 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4199 // pav = p - a = (a + b - c) - a = b - c
4200 "movq %%mm2, %%mm4 \n\t"
4201 // pbv = p - b = (a + b - c) - b = a - c
4202 "movq %%mm1, %%mm5 \n\t"
4203 "psubw %%mm3, %%mm4 \n\t"
4204 "pxor %%mm7, %%mm7 \n\t"
4205 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4206 "movq %%mm4, %%mm6 \n\t"
4207 "psubw %%mm3, %%mm5 \n\t"
4208 // pa = abs(p-a) = abs(pav)
4209 // pb = abs(p-b) = abs(pbv)
4210 // pc = abs(p-c) = abs(pcv)
4211 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4212 "paddw %%mm5, %%mm6 \n\t"
4213 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4214 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4215 "psubw %%mm0, %%mm4 \n\t"
4216 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4217 "psubw %%mm0, %%mm4 \n\t"
4218 "psubw %%mm7, %%mm5 \n\t"
4219 "pxor %%mm0, %%mm0 \n\t"
4220 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4221 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4222 "psubw %%mm7, %%mm5 \n\t"
4223 "psubw %%mm0, %%mm6 \n\t"
4225 "movq %%mm4, %%mm7 \n\t"
4226 "psubw %%mm0, %%mm6 \n\t"
4227 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4228 "movq %%mm7, %%mm0 \n\t"
4229 // use mm7 mask to merge pa & pb
4230 "pand %%mm7, %%mm5 \n\t"
4231 // use mm0 mask copy to merge a & b
4232 "pand %%mm0, %%mm2 \n\t"
4233 "pandn %%mm4, %%mm7 \n\t"
4234 "pandn %%mm1, %%mm0 \n\t"
4235 "paddw %%mm5, %%mm7 \n\t"
4236 "paddw %%mm2, %%mm0 \n\t"
4237 // test ((pa <= pb)? pa:pb) <= pc
4238 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4239 "pxor %%mm1, %%mm1 \n\t"
4240 "pand %%mm7, %%mm3 \n\t"
4241 "pandn %%mm0, %%mm7 \n\t"
4242 "pxor %%mm1, %%mm1 \n\t"
4243 "paddw %%mm3, %%mm7 \n\t"
4244 "pxor %%mm0, %%mm0 \n\t"
4245 // step ecx to next set of 8 bytes and repeat loop til done
4246 "addl $8, %%ecx \n\t"
4247 "packuswb %%mm7, %%mm1 \n\t"
4248 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4249 "cmpl _MMXLength, %%ecx \n\t"
4250 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4251 // mm1 will be used as Raw(x-bpp) next loop
4254 : "=S" (dummy_value_S
), // output regs (dummy)
4255 "=D" (dummy_value_D
)
4257 : "0" (prev_row
), // esi // input regs
4260 : "%ecx" // clobber list
4261 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4262 , "%mm0", "%mm1", "%mm2", "%mm3"
4263 , "%mm4", "%mm5", "%mm6", "%mm7"
4273 __asm__
__volatile__ (
4275 "pushl %%ebx \n\t" // save Global Offset Table index
4277 "movl _dif, %%ebx \n\t"
4278 "cmpl _FullLength, %%ebx \n\t"
4279 "jnb paeth_dend \n\t"
4281 // preload "movl row, %%edi \n\t"
4282 // preload "movl prev_row, %%esi \n\t"
4283 // do Paeth decode for remaining bytes
4284 "movl %%ebx, %%edx \n\t"
4285 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4286 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4287 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4290 "xorl %%eax, %%eax \n\t"
4291 // pav = p - a = (a + b - c) - a = b - c
4292 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4293 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4294 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4295 "movl %%eax, _patemp \n\t" // Save pav for later use
4296 "xorl %%eax, %%eax \n\t"
4297 // pbv = p - b = (a + b - c) - b = a - c
4298 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4299 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4300 "movl %%eax, %%ecx \n\t"
4301 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4302 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4304 "testl $0x80000000, %%eax \n\t"
4305 "jz paeth_dpca \n\t"
4306 "negl %%eax \n\t" // reverse sign of neg values
4309 "movl %%eax, _pctemp \n\t" // save pc for later use
4311 "testl $0x80000000, %%ecx \n\t"
4312 "jz paeth_dpba \n\t"
4313 "negl %%ecx \n\t" // reverse sign of neg values
4316 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4318 "movl _patemp, %%eax \n\t"
4319 "testl $0x80000000, %%eax \n\t"
4320 "jz paeth_dpaa \n\t"
4321 "negl %%eax \n\t" // reverse sign of neg values
4324 "movl %%eax, _patemp \n\t" // save pa for later use
4326 "cmpl %%ecx, %%eax \n\t"
4327 "jna paeth_dabb \n\t"
4328 // pa > pb; now test if pb <= pc
4329 "cmpl _pctemp, %%ecx \n\t"
4330 "jna paeth_dbbc \n\t"
4331 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4332 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4333 "jmp paeth_dpaeth \n\t"
4336 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4337 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4338 "jmp paeth_dpaeth \n\t"
4341 // pa <= pb; now test if pa <= pc
4342 "cmpl _pctemp, %%eax \n\t"
4343 "jna paeth_dabc \n\t"
4344 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4345 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4346 "jmp paeth_dpaeth \n\t"
4349 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4350 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4352 "paeth_dpaeth: \n\t"
4355 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4356 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4357 "cmpl _FullLength, %%ebx \n\t"
4362 "popl %%ebx \n\t" // index to Global Offset Table
4365 : "=c" (dummy_value_c
), // output regs (dummy)
4366 "=S" (dummy_value_S
),
4367 "=D" (dummy_value_D
)
4369 : "0" (bpp
), // ecx // input regs
4370 "1" (prev_row
), // esi
4373 : "%eax", "%edx" // clobber list
4379 return; // No need to go further with this one
4381 } // end switch (bpp)
4383 __asm__
__volatile__ (
4384 // MMX acceleration complete; now do clean-up
4385 // check if any remaining bytes left to decode
4387 "pushl %%ebx \n\t" // save index to Global Offset Table
4389 "movl _MMXLength, %%ebx \n\t"
4390 "cmpl _FullLength, %%ebx \n\t"
4391 "jnb paeth_end \n\t"
4392 //pre "movl row, %%edi \n\t"
4393 //pre "movl prev_row, %%esi \n\t"
4394 // do Paeth decode for remaining bytes
4395 "movl %%ebx, %%edx \n\t"
4396 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4397 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4398 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4401 "xorl %%eax, %%eax \n\t"
4402 // pav = p - a = (a + b - c) - a = b - c
4403 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4404 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4405 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4406 "movl %%eax, _patemp \n\t" // Save pav for later use
4407 "xorl %%eax, %%eax \n\t"
4408 // pbv = p - b = (a + b - c) - b = a - c
4409 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4410 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4411 "movl %%eax, %%ecx \n\t"
4412 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4413 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4415 "testl $0x80000000, %%eax \n\t"
4416 "jz paeth_pca2 \n\t"
4417 "negl %%eax \n\t" // reverse sign of neg values
4420 "movl %%eax, _pctemp \n\t" // save pc for later use
4422 "testl $0x80000000, %%ecx \n\t"
4423 "jz paeth_pba2 \n\t"
4424 "negl %%ecx \n\t" // reverse sign of neg values
4427 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4429 "movl _patemp, %%eax \n\t"
4430 "testl $0x80000000, %%eax \n\t"
4431 "jz paeth_paa2 \n\t"
4432 "negl %%eax \n\t" // reverse sign of neg values
4435 "movl %%eax, _patemp \n\t" // save pa for later use
4437 "cmpl %%ecx, %%eax \n\t"
4438 "jna paeth_abb2 \n\t"
4439 // pa > pb; now test if pb <= pc
4440 "cmpl _pctemp, %%ecx \n\t"
4441 "jna paeth_bbc2 \n\t"
4442 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4443 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4444 "jmp paeth_paeth2 \n\t"
4447 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4448 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4449 "jmp paeth_paeth2 \n\t"
4452 // pa <= pb; now test if pa <= pc
4453 "cmpl _pctemp, %%eax \n\t"
4454 "jna paeth_abc2 \n\t"
4455 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4456 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4457 "jmp paeth_paeth2 \n\t"
4460 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4461 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4463 "paeth_paeth2: \n\t"
4466 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4467 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4468 "cmpl _FullLength, %%ebx \n\t"
4472 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4474 "popl %%ebx \n\t" // restore index to Global Offset Table
4477 : "=c" (dummy_value_c
), // output regs (dummy)
4478 "=S" (dummy_value_S
),
4479 "=D" (dummy_value_D
)
4481 : "0" (bpp
), // ecx // input regs
4482 "1" (prev_row
), // esi
4485 : "%eax", "%edx" // clobber list (no input regs!)
4491 } /* end png_read_filter_row_mmx_paeth() */
4497 #ifdef PNG_THREAD_UNSAFE_OK
4498 //===========================================================================//
4500 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4502 //===========================================================================//
4504 // Optimized code for PNG Sub filter decoder
// NOTE(review): this chunk is an extraction-garbled copy of C + GNU inline
// asm (libpng's pnggccrd.c).  Statements are shattered across physical
// lines and the original file's line numbers (e.g. "4506") are fused into
// the code text.  All original bytes are preserved verbatim below; every
// added line is a comment only.
//
// png_read_filter_row_mmx_sub -- in-place decode of the PNG "Sub" filter:
// for each byte x >= bpp, Raw(x) = Sub(x) + Raw(x-bpp) (mod 256).
// Operates on `row`; row geometry comes from `row_info`.  Uses file-scope
// globals (_FullLength, _MMXLength, _dif, _ActiveMask, _ShiftBpp,
// _ShiftRem, dummy_value_a/_D) declared outside this chunk -- hence
// PNG_THREAD_UNSAFE_OK guards in the caller.  32-bit x86 only.
4506 static void /* PRIVATE */
4507 png_read_filter_row_mmx_sub(png_row_infop row_info
, png_bytep row
)
// bpp = bytes per pixel, rounded up from the row's bit depth.
4513 bpp
= (row_info
->pixel_depth
+ 7) >> 3; // calc number of bytes per pixel
4514 _FullLength
= row_info
->rowbytes
- bpp
; // number of bytes to filter
// asm #1: scalar byte loop until rp reaches an 8-byte alignment boundary,
// then _MMXLength = largest prefix of _FullLength that is a multiple of 8
// (the MMX loops below stop there; a scalar tail finishes the row).
4516 __asm__
__volatile__ (
4517 //pre     "movl row, %%edi             \n\t"
4518         "movl %%edi, %%esi           \n\t" // lp = row
4519 //pre     "movl bpp, %%eax             \n\t"
4520         "addl %%eax, %%edi           \n\t" // rp = row + bpp
4521 //irr     "xorl %%eax, %%eax           \n\t"
4522 // get # of bytes to alignment
4523         "movl %%edi, _dif            \n\t" // take start of row
4524         "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
4525                                            // alignment boundary
4526         "xorl %%ecx, %%ecx           \n\t"
4527         "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
4528         "subl %%edi, _dif            \n\t" // subtract from start ==> value
4529         "jz sub_go                   \n\t" //  ecx at alignment
4531      "sub_lp1:                       \n\t" // fix alignment
4532         "movb (%%esi,%%ecx,), %%al   \n\t"
4533         "addb %%al, (%%edi,%%ecx,)   \n\t"
4535         "cmpl _dif, %%ecx            \n\t"
// NOTE(review): extraction dropped lines here -- the `incl %%ecx`,
// `jb sub_lp1` branch and the `sub_go:` label are missing from this copy.
4539         "movl _FullLength, %%eax     \n\t"
4540         "movl %%eax, %%edx           \n\t"
4541         "subl %%ecx, %%edx           \n\t" // subtract alignment fix
4542         "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
4543         "subl %%edx, %%eax           \n\t" // drop over bytes from length
4544         "movl %%eax, _MMXLength      \n\t"
// Outputs are dummies: the asm works through memory (the row) and the
// globals; "0" ties bpp into eax as both input and output register.
4546         : "=a" (dummy_value_a
),    // 0      // output regs (dummy)
4547           "=D" (dummy_value_D
)     // 1
4549         : "0" (bpp
),       // eax    // input regs
4552         : "%esi", "%ecx", "%edx"           // clobber list
4554 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4555         , "%mm0", "%mm1", "%mm2", "%mm3"
4556         , "%mm4", "%mm5", "%mm6", "%mm7"
// NOTE(review): missing here -- #endif, close of asm #1, and the
// `switch (bpp)` / `case 3:` lines.  What follows is the bpp == 3 setup.
4560       // now do the math for the rest of the row
// Constants for the 3-byte/pixel loop: mask selects the 2nd 3-byte group
// inside a qword; shifts position Raw(x-3) under each Sub(x) group.
4565         _ActiveMask
.use
= 0x0000ffffff000000LL
;
4566         _ShiftBpp
.use
= 24;    // == 3 * 8
4567         _ShiftRem
.use
= 40;    // == 64 - 24
// asm #2 (bpp == 3 main loop, label sub_3lp): processes 8 bytes/iter;
// the previous qword supplies Raw(x-3) for the 1st group, then the running
// sum is shifted+masked twice to feed the 2nd and 3rd 3-byte groups.
4569         __asm__
__volatile__ (
4570 // preload  "movl row, %%edi              \n\t"
4571            "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
4572                                                //  active byte group
4573            "movl %%edi, %%esi            \n\t" // lp = row
4574 // preload  "movl bpp, %%eax              \n\t"
4575            "addl %%eax, %%edi            \n\t" // rp = row + bpp
4576            "movq %%mm7, %%mm6            \n\t"
4577            "movl _dif, %%edx             \n\t"
4578            "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4579                                                //  3rd active byte group
4580         // prime the pump:  load the first Raw(x-bpp) data set
4581            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4583         "sub_3lp:                        \n\t" // shift data for adding first
4584            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4585                                                //  shift clears inactive bytes)
4586         // add 1st active group
4587            "movq (%%edi,%%edx,), %%mm0   \n\t"
4588            "paddb %%mm1, %%mm0           \n\t"
4590         // add 2nd active group
4591            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4592            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4593            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4594            "paddb %%mm1, %%mm0           \n\t"
4596         // add 3rd active group
4597            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4598            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4599            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4600            "addl $8, %%edx               \n\t"
4601            "paddb %%mm1, %%mm0           \n\t"
4603            "cmpl _MMXLength, %%edx       \n\t"
4604            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4605            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
// NOTE(review): the `jb sub_3lp` back-branch line is missing in this copy.
4608            : "=a" (dummy_value_a
),   // 0      // output regs (dummy)
4609              "=D" (dummy_value_D
)    // 1
4611            : "0" (bpp
),      // eax    // input regs
4614            : "%edx", "%esi"                    // clobber list
4615 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4616            , "%mm0", "%mm1", "%mm6", "%mm7"
// asm #3: scalar cleanup shared pattern -- finish bytes from _dif (or
// _MMXLength) up to _FullLength one at a time.  The `jnb`/loop-label/`jb`
// lines were dropped by the extraction.
4624         __asm__
__volatile__ (
4625            "movl _dif, %%edx            \n\t"
4626 // preload  "movl row, %%edi             \n\t"
4627            "cmpl _FullLength, %%edx     \n\t"
4629            "movl %%edi, %%esi           \n\t" // lp = row
4630            "xorl %%eax, %%eax           \n\t"
4631 // preload  "movl bpp, %%eax             \n\t"
4632            "addl %%eax, %%edi           \n\t" // rp = row + bpp
4635            "movb (%%esi,%%edx,), %%al   \n\t"
4636            "addb %%al, (%%edi,%%edx,)   \n\t"
4638            "cmpl _FullLength, %%edx     \n\t"
4643            : "=a" (dummy_value_a
),   // 0      // output regs (dummy)
4644              "=D" (dummy_value_D
)    // 1
4646            : "0" (bpp
),      // eax    // input regs
4649            : "%edx", "%esi"                   // clobber list
// NOTE(review): missing here -- `break;` and the `case 6: case 4:` labels.
// The commented-out 7/5 cases below belong to that shared branch, where one
// shift+add per qword suffices because bpp >= 4.
4656       //case 7:   // GRR BOGUS
4657       //case 5:   // GRR BOGUS
4659         _ShiftBpp
.use
= bpp
<< 3;
4660         _ShiftRem
.use
= 64 - _ShiftBpp
.use
;
// asm #4 (bpp == 4 or 6 main loop, label sub_4lp): per 8-byte qword the
// previous qword is shifted down to supply Raw(x-bpp), then one more
// shifted self-add covers the second pixel group.
4662         __asm__
__volatile__ (
4663 // preload  "movl row, %%edi              \n\t"
4664            "movl _dif, %%edx             \n\t"
4665            "movl %%edi, %%esi            \n\t" // lp = row
4666 // preload  "movl bpp, %%eax              \n\t"
4667            "addl %%eax, %%edi            \n\t" // rp = row + bpp
4669         // prime the pump:  load the first Raw(x-bpp) data set
4670            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4672         "sub_4lp:                        \n\t" // shift data for adding first
4673            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4674                                                //  shift clears inactive bytes)
4675            "movq (%%edi,%%edx,), %%mm0   \n\t"
4676            "paddb %%mm1, %%mm0           \n\t"
4678         // add 2nd active group
4679            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4680            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4681            "addl $8, %%edx               \n\t"
4682            "paddb %%mm1, %%mm0           \n\t"
4684            "cmpl _MMXLength, %%edx       \n\t"
4685            "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4686            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
// NOTE(review): `jb sub_4lp` back-branch line missing in this copy.
4689            : "=a" (dummy_value_a
),   // 0      // output regs (dummy)
4690              "=D" (dummy_value_D
)    // 1
4692            : "0" (bpp
),      // eax    // input regs
4695            : "%edx", "%esi"                    // clobber list
4696 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
// NOTE(review): missing here -- mm-reg list, #endif, scalar cleanup and the
// `case 2:` label.  The constants below set up the 2-byte/pixel loop.
4705         _ActiveMask
.use
= 0x00000000ffff0000LL
;
4706         _ShiftBpp
.use
= 16;   // == 2 * 8
4707         _ShiftRem
.use
= 48;   // == 64 - 16
// asm #5 (bpp == 2 main loop, label sub_2lp): four 2-byte groups per
// qword; masks in mm7/mm6/mm5 select the 2nd/3rd/4th groups for the
// successive shifted self-adds.
4709         __asm__
__volatile__ (
4710            "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
4711                                                //  active byte group
4712            "movl _dif, %%edx             \n\t"
4713            "movq %%mm7, %%mm6            \n\t"
4714 // preload  "movl row, %%edi              \n\t"
4715            "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4716                                                //  3rd active byte group
4717            "movl %%edi, %%esi            \n\t" // lp = row
4718            "movq %%mm6, %%mm5            \n\t"
4719 // preload  "movl bpp, %%eax              \n\t"
4720            "addl %%eax, %%edi            \n\t" // rp = row + bpp
4721            "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
4722                                                //  4th active byte group
4723         // prime the pump:  load the first Raw(x-bpp) data set
4724            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4726         "sub_2lp:                        \n\t" // shift data for adding first
4727            "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4728                                                //  shift clears inactive bytes)
4729         // add 1st active group
4730            "movq (%%edi,%%edx,), %%mm0   \n\t"
4731            "paddb %%mm1, %%mm0           \n\t"
4733         // add 2nd active group
4734            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4735            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4736            "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4737            "paddb %%mm1, %%mm0           \n\t"
4739         // add 3rd active group
4740            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4741            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4742            "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4743            "paddb %%mm1, %%mm0           \n\t"
4745         // add 4th active group
4746            "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4747            "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4748            "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
4749            "addl $8, %%edx               \n\t"
4750            "paddb %%mm1, %%mm0           \n\t"
4751            "cmpl _MMXLength, %%edx       \n\t"
4752            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4753            "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
// NOTE(review): `jb sub_2lp` back-branch line missing in this copy.
4756            : "=a" (dummy_value_a
),   // 0      // output regs (dummy)
4757              "=D" (dummy_value_D
)    // 1
4759            : "0" (bpp
),      // eax    // input regs
4762            : "%edx", "%esi"                    // clobber list
4763 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4764            , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
// NOTE(review): missing here -- #endif, scalar cleanup, `break;`, and the
// `case 8:` label.  The unrolled loop below chains whole qwords, which is
// exactly the bpp == 8 case (Raw(x-8) is simply the previous qword).
4772         __asm__
__volatile__ (
4773 // preload  "movl row, %%edi              \n\t"
4774            "movl _dif, %%edx             \n\t"
4775            "movl %%edi, %%esi            \n\t" // lp = row
4776 // preload  "movl bpp, %%eax              \n\t"
4777            "addl %%eax, %%edi            \n\t" // rp = row + bpp
4778            "movl _MMXLength, %%ecx       \n\t"
4780         // prime the pump:  load the first Raw(x-bpp) data set
4781            "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4782            "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
// 64 bytes (8 qwords) per iteration; each qword's result is the next
// qword's Raw(x-8), carried in mm0..mm7 round-robin.
4785            "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
4786            "paddb %%mm7, %%mm0           \n\t"
4787            "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
4788            "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
4790         // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4791         // This will be repeated for each group of 8 bytes with the 8th
4792         // group being used as the Raw(x-bpp) for the 1st group of the
4795            "paddb %%mm0, %%mm1           \n\t"
4796            "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4797            "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
4798            "paddb %%mm1, %%mm2           \n\t"
4799            "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4800            "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4801            "paddb %%mm2, %%mm3           \n\t"
4802            "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4803            "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4804            "paddb %%mm3, %%mm4           \n\t"
4805            "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4806            "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4807            "paddb %%mm4, %%mm5           \n\t"
4808            "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4809            "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4810            "paddb %%mm5, %%mm6           \n\t"
4811            "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4812            "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4813            "addl $64, %%edx              \n\t"
4814            "paddb %%mm6, %%mm7           \n\t"
4815            "cmpl %%ecx, %%edx            \n\t"
4816            "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
// Tail of the unrolled loop: one qword at a time up to _MMXLength.
// NOTE(review): the loop labels and `jb` branches around these compares
// are missing in this copy.
4819            "cmpl _MMXLength, %%edx       \n\t"
4823            "movq (%%edi,%%edx,), %%mm0   \n\t"
4824            "addl $8, %%edx               \n\t"
4825            "paddb %%mm7, %%mm0           \n\t"
4826            "cmpl _MMXLength, %%edx       \n\t"
4827            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4828            "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
4829                                                //  to mm1 to be new Raw(x-bpp)
4835            : "=a" (dummy_value_a
),   // 0      // output regs (dummy)
4836              "=D" (dummy_value_D
)    // 1
4838            : "0" (bpp
),      // eax    // input regs
4841            : "%ecx", "%edx", "%esi"            // clobber list
4842 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4843            , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
// default: bpp > 8, so Raw(x-bpp) never overlaps the current qword and can
// be loaded directly through lp (esi).
4849       default:                // bpp greater than 8 bytes   GRR BOGUS
4851         __asm__
__volatile__ (
4852            "movl _dif, %%edx             \n\t"
4853 // preload  "movl row, %%edi              \n\t"
4854            "movl %%edi, %%esi            \n\t" // lp = row
4855 // preload  "movl bpp, %%eax              \n\t"
4856            "addl %%eax, %%edi            \n\t" // rp = row + bpp
4859            "movq (%%edi,%%edx,), %%mm0   \n\t"
4860            "movq (%%esi,%%edx,), %%mm1   \n\t"
4861            "addl $8, %%edx               \n\t"
4862            "paddb %%mm1, %%mm0           \n\t"
4863            "cmpl _MMXLength, %%edx       \n\t"
4864            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4865                                                //  -8 to offset addl edx
4868            : "=a" (dummy_value_a
),   // 0      // output regs (dummy)
4869              "=D" (dummy_value_D
)    // 1
4871            : "0" (bpp
),      // eax    // input regs
4874            : "%edx", "%esi"                    // clobber list
4875 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4882    } // end switch (bpp)
// Final scalar pass: bytes from _MMXLength to _FullLength, then EMMS to
// restore the x87/MMX state for surrounding FP code.
4884    __asm__
__volatile__ (
4885       "movl _MMXLength, %%edx      \n\t"
4886 //pre  "movl row, %%edi             \n\t"
4887       "cmpl _FullLength, %%edx     \n\t"
4890       "movl %%edi, %%esi           \n\t" // lp = row
4891 //pre  "movl bpp, %%eax             \n\t"
4892       "addl %%eax, %%edi           \n\t" // rp = row + bpp
4893       "xorl %%eax, %%eax           \n\t"
4896       "movb (%%esi,%%edx,), %%al   \n\t"
4897       "addb %%al, (%%edi,%%edx,)   \n\t"
4899       "cmpl _FullLength, %%edx     \n\t"
4903       "EMMS                        \n\t" // end MMX instructions
4905       : "=a" (dummy_value_a
),   // 0      // output regs (dummy)
4906         "=D" (dummy_value_D
)    // 1
4908       : "0" (bpp
),      // eax    // input regs
4911       : "%edx", "%esi"                   // clobber list
4914 } // end of png_read_filter_row_mmx_sub()
4920 //===========================================================================//
4922 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4924 //===========================================================================//
4926 // Optimized code for PNG Up filter decoder
// NOTE(review): extraction-garbled copy; original bytes preserved verbatim,
// added lines are comments only.
//
// png_read_filter_row_mmx_up -- in-place decode of the PNG "Up" filter:
// Raw(x) = Up(x) + Prior(x), i.e. each byte of `row` gets the byte directly
// above it (same offset in prev_row) added, mod 256.  32-bit x86 only.
// The third parameter (presumably `png_bytep prev_row` -- see the "=S"/
// prev_row constraints below) was dropped by the extraction.
4928 static void /* PRIVATE */
4929 png_read_filter_row_mmx_up(png_row_infop row_info
, png_bytep row
,
// edx is forced to a dummy output so gcc never allocates it for an input.
4933    int dummy_value_d
;   // fix 'forbidden register 3 (dx) was spilled' error
// len = whole row; Up has no x-bpp dependency, so every byte is filtered.
4937    len
= row_info
->rowbytes
;              // number of bytes to filter
4939    __asm__
__volatile__ (
4940 //pre  "movl row, %%edi             \n\t"
4941       // get # of bytes to alignment
// Scalar loop up to an 8-byte boundary (ebx = byte index throughout).
4945       "movl %%edi, %%ecx           \n\t"
4946       "xorl %%ebx, %%ebx           \n\t"
4947       "addl $0x7, %%ecx            \n\t"
4948       "xorl %%eax, %%eax           \n\t"
4949       "andl $0xfffffff8, %%ecx     \n\t"
4950 //pre  "movl prev_row, %%esi        \n\t"
4951       "subl %%edi, %%ecx           \n\t"
// NOTE(review): a `jz`-to-main-loop line appears to be missing here.
4954    "up_lp1:                        \n\t" // fix alignment
4955       "movb (%%edi,%%ebx,), %%al   \n\t"
4956       "addb (%%esi,%%ebx,), %%al   \n\t"
4958       "cmpl %%ecx, %%ebx           \n\t"
4959       "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4960       "jb up_lp1                   \n\t" //  offset incl ebx
// ecx = largest multiple-of-64 prefix; edx = leftover byte count.
4963 //pre  "movl len, %%edx             \n\t"
4964       "movl %%edx, %%ecx           \n\t"
4965       "subl %%ebx, %%edx           \n\t" // subtract alignment fix
4966       "andl $0x0000003f, %%edx     \n\t" // calc bytes over mult of 64
4967       "subl %%edx, %%ecx           \n\t" // drop over bytes from length
4969       // unrolled loop - use all MMX registers and interleave to reduce
4970       // number of branch instructions (loops) and reduce partial stalls
// NOTE(review): the `up_loop:` label line (target of the `jb up_loop`
// below) is missing in this copy.
4972       "movq (%%esi,%%ebx,), %%mm1  \n\t"
4973       "movq (%%edi,%%ebx,), %%mm0  \n\t"
4974       "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4975       "paddb %%mm1, %%mm0          \n\t"
4976       "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4977       "movq %%mm0, (%%edi,%%ebx,)  \n\t"
4978       "paddb %%mm3, %%mm2          \n\t"
4979       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4980       "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4981       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4982       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4983       "paddb %%mm5, %%mm4          \n\t"
4984       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4985       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4986       "paddb %%mm7, %%mm6          \n\t"
4987       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4988       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4989       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4990       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4991       "paddb %%mm1, %%mm0          \n\t"
4992       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4993       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4994       "paddb %%mm3, %%mm2          \n\t"
4995       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4996       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4997       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4998       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4999       "paddb %%mm5, %%mm4          \n\t"
5000       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
5001       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
5002       "addl $64, %%ebx             \n\t"
5003       "paddb %%mm7, %%mm6          \n\t"
5004       "cmpl %%ecx, %%ebx           \n\t"
5005       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
5006       "jb up_loop                  \n\t" //  -8 to offset addl ebx
// Leftovers (< 64): first whole qwords via up_lpA, then single bytes.
5008       "cmpl $0, %%edx              \n\t" // test for bytes over mult of 64
5011       "cmpl $8, %%edx              \n\t" // test for less than 8 bytes
5012       "jb up_lt8                   \n\t" //  [added by lcreeve at netins.net]
5014       "addl %%edx, %%ecx           \n\t"
5015       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
5016       "subl %%edx, %%ecx           \n\t" // drop over bytes from length
5019    "up_lpA:                        \n\t" // use MMX regs to update 8 bytes sim.
5020       "movq (%%esi,%%ebx,), %%mm1  \n\t"
5021       "movq (%%edi,%%ebx,), %%mm0  \n\t"
5022       "addl $8, %%ebx              \n\t"
5023       "paddb %%mm1, %%mm0          \n\t"
5024       "cmpl %%ecx, %%ebx           \n\t"
5025       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
5026       "jb up_lpA                   \n\t" //  offset add ebx
5027       "cmpl $0, %%edx              \n\t" // test for bytes over mult of 8
// NOTE(review): the `jz up_end` and `up_lt8:` label lines are missing here.
5031       "xorl %%eax, %%eax           \n\t"
5032       "addl %%edx, %%ecx           \n\t" // move over byte count into counter
5034    "up_lp2:                        \n\t" // use x86 regs for remaining bytes
5035       "movb (%%edi,%%ebx,), %%al   \n\t"
5036       "addb (%%esi,%%ebx,), %%al   \n\t"
5038       "cmpl %%ecx, %%ebx           \n\t"
5039       "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
5040       "jb up_lp2                   \n\t" //  offset inc ebx
5043       "EMMS                        \n\t" // conversion of filtered row complete
// Constraint tying: edx=len, esi=prev_row, edi=row (the "=D"/input pair
// for row appears partially dropped by the extraction).
5048       : "=d" (dummy_value_d
),   // 0      // output regs (dummy)
5049         "=S" (dummy_value_S
),   // 1
5050         "=D" (dummy_value_D
)    // 2
5052       : "0" (len
),      // edx    // input regs
5053         "1" (prev_row
),  // esi
5056       : "%eax", "%ecx"            // clobber list (no input regs!)
5061 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5062       , "%mm0", "%mm1", "%mm2", "%mm3"
5063       , "%mm4", "%mm5", "%mm6", "%mm7"
5067 } // end of png_read_filter_row_mmx_up()
5069 #endif /* PNG_MMX_CODE_SUPPORTED */
5074 /*===========================================================================*/
5076 /* P N G _ R E A D _ F I L T E R _ R O W */
5078 /*===========================================================================*/
5081 /* Optimized png_read_filter_row routines */
// NOTE(review): extraction-garbled copy; original bytes preserved verbatim,
// added lines are comments only.  The `void /* PRIVATE */` return-type line
// and the outer `switch (filter)` / brace lines were dropped.
//
// png_read_filter_row -- dispatcher: un-filters one row in place.  For each
// PNG filter type it either calls the MMX routine (when asm_flags and the
// bitdepth/rowbytes thresholds allow) or falls through to the portable C
// loop.  `prev_row` is the previous (already un-filtered) row.
5084 png_read_filter_row(png_structp png_ptr
, png_row_infop row_info
, png_bytep
5085    row
, png_bytep prev_row
, int filter
)
5091 #if defined(PNG_MMX_CODE_SUPPORTED)
5092 /* GRR:  these are superseded by png_ptr->asm_flags: */
5093 #define UseMMX_sub    1   // GRR:  converted 20000730
5094 #define UseMMX_up     1   // GRR:  converted 20000729
5095 #define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
5096 #define UseMMX_paeth  1   // GRR:  converted 20000828
// _mmx_supported == 2 means png_mmx_support() was never run (see below).
5098    if (_mmx_supported
== 2) {
5099        /* this should have happened in png_init_mmx_flags() already */
5100 #if !defined(PNG_1_0_X)
5101        png_warning(png_ptr
, "asm_flags may not have been initialized");
5105 #endif /* PNG_MMX_CODE_SUPPORTED */
5108    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
// Debug-only block: build a human-readable filter name into `filnm`
// (declared in the dropped PNG_DEBUG lines); the `switch (filter)` header
// and `break;` lines are missing from this copy.
5111       case 0: sprintf(filnm
, "none");
5113       case 1: sprintf(filnm
, "sub-%s",
5114 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5115 #if !defined(PNG_1_0_X)
5116           (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
)? "MMX" :
5121       case 2: sprintf(filnm
, "up-%s",
5122 #ifdef PNG_MMX_CODE_SUPPORTED
5123 #if !defined(PNG_1_0_X)
5124           (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
)? "MMX" :
5129       case 3: sprintf(filnm
, "avg-%s",
5130 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5131 #if !defined(PNG_1_0_X)
5132           (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
)? "MMX" :
5137       case 4: sprintf(filnm
, "Paeth-%s",
5138 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5139 #if !defined(PNG_1_0_X)
5140           (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
)? "MMX":
5145       default: sprintf(filnm
, "unknw");
5148    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr
->row_number
, filnm
);
5149    png_debug1(0, "row=0x%08lx, ", (unsigned long)row
);
5150    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info
->pixel_depth
,
5151       (int)((row_info
->pixel_depth
+ 7) >> 3));
5152    png_debug1(0,"rowbytes=%8ld\n", row_info
->rowbytes
);
5153 #endif /* PNG_DEBUG */
// Main dispatch: the `switch (filter)` header line is missing here; the
// cases below are its bodies.  NONE: row is already raw, nothing to do.
5157       case PNG_FILTER_VALUE_NONE
:
// SUB: MMX when asm_flags allow and both thresholds are met, else C loop.
5160       case PNG_FILTER_VALUE_SUB
:
5161 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5162 #if !defined(PNG_1_0_X)
5163          if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
) &&
5164              (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5165              (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5170             png_read_filter_row_mmx_sub(row_info
, row
);
5173 #endif /* PNG_MMX_CODE_SUPPORTED */
// C fallback for SUB: *rp += *lp, rp from row+bpp, lp from row.  The
// `png_bytep lp = row;` and loop-body brace lines were dropped here.
5176             png_uint_32 istop
= row_info
->rowbytes
;
5177             png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5178             png_bytep rp
= row
+ bpp
;
5181             for (i
= bpp
; i
< istop
; i
++)
5183                *rp
= (png_byte
)(((int)(*rp
) + (int)(*lp
++)) & 0xff);
5186          } /* end !UseMMX_sub */
// UP: same dispatch pattern (note: no THREAD_UNSAFE_OK needed -- the Up
// routine uses no static globals).
5189       case PNG_FILTER_VALUE_UP
:
5190 #if defined(PNG_MMX_CODE_SUPPORTED)
5191 #if !defined(PNG_1_0_X)
5192          if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
) &&
5193              (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5194              (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5199             png_read_filter_row_mmx_up(row_info
, row
, prev_row
);
5202 #endif /* PNG_MMX_CODE_SUPPORTED */
// C fallback for UP: *rp += *pp over the whole row (rp decl dropped).
5205             png_uint_32 istop
= row_info
->rowbytes
;
5207             png_bytep pp
= prev_row
;
5209             for (i
= 0; i
< istop
; ++i
)
5211                *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
5214          } /* end !UseMMX_up */
// AVG: Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2).
5217       case PNG_FILTER_VALUE_AVG
:
5218 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5219 #if !defined(PNG_1_0_X)
5220          if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
) &&
5221              (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5222              (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5227             png_read_filter_row_mmx_avg(row_info
, row
, prev_row
);
5230 #endif /* PNG_MMX_CODE_SUPPORTED */
// C fallback for AVG: first bpp bytes use only Prior(x)/2 (no left pixel),
// the rest average left and above.  rp/lp declarations dropped here.
5234             png_bytep pp
= prev_row
;
5236             png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5237             png_uint_32 istop
= row_info
->rowbytes
- bpp
;
5239             for (i
= 0; i
< bpp
; i
++)
5241                *rp
= (png_byte
)(((int)(*rp
) +
5242                   ((int)(*pp
++) >> 1)) & 0xff);
5246             for (i
= 0; i
< istop
; i
++)
5248                *rp
= (png_byte
)(((int)(*rp
) +
5249                   ((int)(*pp
++ + *lp
++) >> 1)) & 0xff);
5252          } /* end !UseMMX_avg */
// PAETH: predictor picks a, b, or c by smallest absolute difference.
5255       case PNG_FILTER_VALUE_PAETH
:
5256 #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5257 #if !defined(PNG_1_0_X)
5258          if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
) &&
5259              (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
5260              (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
5265             png_read_filter_row_mmx_paeth(row_info
, row
, prev_row
);
5268 #endif /* PNG_MMX_CODE_SUPPORTED */
// C fallback for PAETH.  First bpp bytes: only `b` (above) exists.
5272             png_bytep pp
= prev_row
;
5274             png_bytep cp
= prev_row
;
5275             png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
5276             png_uint_32 istop
= row_info
->rowbytes
- bpp
;
5278             for (i
= 0; i
< bpp
; i
++)
5280                *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
5284             for (i
= 0; i
< istop
; i
++)   /* use leftover rp,pp */
5286                int a
, b
, c
, pa
, pb
, pc
, p
;
// NOTE(review): the lines loading a/b/c and computing p = b - c (and
// pc = a - c) were dropped by the extraction; pa/pb/pc below are the
// standard Paeth absolute differences |p|, |pc|, |p + pc|.
5300                pa
= p
< 0 ? -p
: p
;
5301                pb
= pc
< 0 ? -pc
: pc
;
5302                pc
= (p
+ pc
) < 0 ? -(p
+ pc
) : p
+ pc
;
5306                if (pa <= pb && pa <= pc)
// Equivalent one-expression form of the predictor selection above.
5314                p
= (pa
<= pb
&& pa
<= pc
) ? a
: (pb
<= pc
) ? b
: c
;
5316                *rp
= (png_byte
)(((int)(*rp
) + p
) & 0xff);
5319          } /* end !UseMMX_paeth */
// default case of the filter switch: unknown filter byte is ignored with a
// warning (the `default:` label line was dropped).
5323          png_warning(png_ptr
, "Ignoring bad row-filter type");
5329 #endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
5332 /*===========================================================================*/
5334 /* P N G _ M M X _ S U P P O R T */
5336 /*===========================================================================*/
5338 /* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5339 * (2) all instructions compile with gcc 2.7.2.3 and later
5340 * (3) the function is moved down here to prevent gcc from
5341 * inlining it in multiple places and then barfing be-
5342 * cause the ".NOT_SUPPORTED" label is multiply defined
5343 * [is there a way to signal that a *single* function should
5344 * not be inlined? is there a way to modify the label for
5345 * each inlined instance, e.g., by appending _1, _2, etc.?
5346 * maybe if don't use leading "." in label name? (nope...sigh)]
// NOTE(review): extraction-garbled copy; original bytes preserved verbatim,
// added lines are comments only.  The return-type line and the declaration
// of the local `result` were dropped.
//
// png_mmx_support -- runtime MMX detection for 32-bit x86:
//   1) toggle EFLAGS.ID (bit 21) to prove CPUID exists;
//   2) CPUID leaf 0 to check max leaf >= 1;
//   3) CPUID leaf 1, test EDX bit 23 (MMX).
// Caches the answer in the global _mmx_supported (1 = yes, 0 = no) and
// returns it.  Uses pushl/popl/pushfl -- not valid in 64-bit mode.
5350 png_mmx_support(void)
5352 #if defined(PNG_MMX_CODE_SUPPORTED)
5354    __asm__
__volatile__ (
// ebx/ecx/edx are saved manually instead of being listed as clobbers
// (see the commented-out clobber list at the bottom).
5355       "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
5356       "pushl %%ecx          \n\t"  // so does ecx...
5357       "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
5358 //    ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
5359 //    "pushf                \n\t"  // 16-bit pushf
5360       "pushfl               \n\t"  // save Eflag to stack
5361       "popl %%eax           \n\t"  // get Eflag from stack into eax
5362       "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
5363       "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5364       "pushl %%eax          \n\t"  // save modified Eflag back to stack
5365 //    ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
5366 //    "popf                 \n\t"  // 16-bit popf
5367       "popfl                \n\t"  // restore modified value to Eflag reg
5368       "pushfl               \n\t"  // save Eflag to stack
5369       "popl %%eax           \n\t"  // get Eflag from stack
5370       "pushl %%ecx          \n\t"  // save original Eflag to stack
5371       "popfl                \n\t"  // restore original Eflag
5372       "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
5373       "jz 0f                \n\t"  // if same, CPUID instr. is not supported
5375       "xorl %%eax, %%eax    \n\t"  // set eax to zero
5376 //    ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
5377       "cpuid                \n\t"  // get the CPU identification info
5378       "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
5379       "jl 0f                \n\t"  // if eax is zero, MMX is not supported
5381       "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
5382       "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
5383                                    // faster than the instruction "mov eax, 1"
5384       "cpuid                \n\t"  // get the CPU identification info again
5385       "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5386       "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
5387       "jz 0f                \n\t"  // non-zero = yes, MMX IS supported
// NOTE(review): the comment on the jz above reads inverted relative to the
// jump condition -- `jz 0f` takes the NOT-supported path when edx == 0,
// which matches the label; only the inline comment wording is confusing.
5389       "movl $1, %%eax       \n\t"  // set return value to 1
5390       "jmp  1f              \n\t"  // DONE:  have MMX support
5392    "0:                      \n\t"  // .NOT_SUPPORTED: target label for jump instructions
5393       "movl $0, %%eax       \n\t"  // set return value to 0
5394    "1:                      \n\t"  // .RETURN: target label for jump instructions
5395       "popl %%edx           \n\t"  // restore edx
5396       "popl %%ecx           \n\t"  // restore ecx
5397       "popl %%ebx           \n\t"  // restore ebx
5399 //    "ret                  \n\t"  // DONE:  no MMX support
5400                                    // (fall through to standard C "ret")
5402       : "=a" (result
)              // output list
5404       :                            // any variables used on input (none)
5407 //    , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
5408 //    , "memory"   // if write to a variable gcc thought was in a reg
5409 //    , "cc"       // "condition codes" (flag bits)
// Cache the probe result in the global so callers can test it cheaply.
5411    _mmx_supported
= result
;
5414 #endif /* PNG_MMX_CODE_SUPPORTED */
5416    return _mmx_supported
;
5420 #endif /* PNG_USE_PNGGCCRD */