1 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
5 * libpng version 1.2.8 - December 3, 2004
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
22 * [runtime MMX configuration, GRR 20010102]
29 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
31 static int mmx_supported
=2;
37 int mmx_supported_local
= 0;
39 push ebx
//CPUID will trash these
43 pushfd
//Save Eflag to stack
44 pop eax
//Get Eflag from stack into eax
45 mov ecx
, eax
//Make another copy of Eflag in ecx
46 xor eax
, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
47 push eax
//Save modified Eflag back to stack
49 popfd
//Restore modified value back to Eflag reg
50 pushfd
//Save Eflag to stack
51 pop eax
//Get Eflag from stack
52 push ecx
// save original Eflag to stack
53 popfd
// restore original Eflag
54 xor eax
, ecx
//Compare the new Eflag with the original Eflag
55 jz NOT_SUPPORTED
//If the same, CPUID instruction is not supported,
56 //skip following instructions and jump to
59 xor eax
, eax
//Set eax to zero
61 _asm _emit
0x0f //CPUID instruction (two bytes opcode)
64 cmp eax
, 1 //make sure eax returned a non-zero value
65 jl NOT_SUPPORTED
//If eax is zero, mmx not supported
67 xor eax
, eax
//set eax to zero
68 inc eax
//Now increment eax to 1. This instruction is
69 //faster than the instruction "mov eax, 1"
71 _asm _emit
0x0f //CPUID instruction
74 and edx
, 0x00800000 //mask out all bits but mmx bit(24)
75 cmp edx
, 0 // 0 = mmx not supported
76 jz NOT_SUPPORTED
// non-zero = Yes, mmx IS supported
78 mov mmx_supported_local
, 1 //set return value to 1
81 mov eax
, mmx_supported_local
//move return value to eax
82 pop edx
//CPUID trashed these
87 //mmx_supported_local=0; // test code to force MMX to be treated as unsupported
88 //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
90 mmx_supported
= mmx_supported_local
;
91 return mmx_supported_local
;
94 /* Combines the row recently read in with the previous row.
95 This routine takes care of alpha and transparency if requested.
96 This routine also handles the two methods of progressive display
97 of interlaced images, depending on the mask value.
98 The mask value describes which pixels are to be combined with
99 the row. The pattern always repeats every 8 pixels, so just 8
100 bits are needed. A one indicates the pixel is to be combined; a
101 zero indicates the pixel is to be skipped. This is in addition
102 to any alpha or transparency value associated with the pixel. If
103 you want all pixels to be combined, pass 0xff (255) in mask. */
105 /* Use this routine for x86 platform - uses faster MMX routine if machine
109 png_combine_row(png_structp png_ptr
, png_bytep row
, int mask
)
111 #ifdef PNG_USE_LOCAL_ARRAYS
112 const int png_pass_inc
[7] = {8, 8, 4, 4, 2, 2, 1};
115 png_debug(1,"in png_combine_row_asm\n");
117 if (mmx_supported
== 2) {
118 #if !defined(PNG_1_0_X)
119 /* this should have happened in png_init_mmx_flags() already */
120 png_warning(png_ptr
, "asm_flags may not have been initialized");
127 png_memcpy(row
, png_ptr
->row_buf
+ 1,
128 (png_size_t
)PNG_ROWBYTES(png_ptr
->row_info
.pixel_depth
,
131 /* GRR: add "else if (mask == 0)" case?
132 * or does png_combine_row() not even get called in that case? */
135 switch (png_ptr
->row_info
.pixel_depth
)
141 int s_inc
, s_start
, s_end
;
146 sp
= png_ptr
->row_buf
+ 1;
149 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
150 if (png_ptr
->transformations
& PNG_PACKSWAP
)
166 for (i
= 0; i
< png_ptr
->width
; i
++)
172 value
= (*sp
>> shift
) & 0x1;
173 *dp
&= (png_byte
)((0x7f7f >> (7 - shift
)) & 0xff);
174 *dp
|= (png_byte
)(value
<< shift
);
198 int s_start
, s_end
, s_inc
;
204 sp
= png_ptr
->row_buf
+ 1;
207 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
208 if (png_ptr
->transformations
& PNG_PACKSWAP
)
224 for (i
= 0; i
< png_ptr
->width
; i
++)
228 value
= (*sp
>> shift
) & 0x3;
229 *dp
&= (png_byte
)((0x3f3f >> (6 - shift
)) & 0xff);
230 *dp
|= (png_byte
)(value
<< shift
);
253 int s_start
, s_end
, s_inc
;
259 sp
= png_ptr
->row_buf
+ 1;
262 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
263 if (png_ptr
->transformations
& PNG_PACKSWAP
)
278 for (i
= 0; i
< png_ptr
->width
; i
++)
282 value
= (*sp
>> shift
) & 0xf;
283 *dp
&= (png_byte
)((0xf0f >> (4 - shift
)) & 0xff);
284 *dp
|= (png_byte
)(value
<< shift
);
311 __int64 mask0
=0x0102040810204080;
313 #if !defined(PNG_1_0_X)
314 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
315 /* && mmx_supported */ )
320 srcptr
= png_ptr
->row_buf
+ 1;
324 len
= png_ptr
->width
&~7; //reduce to multiple of 8
325 diff
= png_ptr
->width
& 7; //amount lost
329 movd mm7
, unmask
//load bit pattern
330 psubb mm6
,mm6
//zero mm6
333 punpckldq mm7
,mm7
//fill register with 8 masks
337 pand mm0
,mm7
//nonzero if keep byte
338 pcmpeqb mm0
,mm6
//zeros->1s, v versa
340 mov ecx
,len
//load length of line (pixels)
341 mov esi
,srcptr
//load source
342 mov ebx
,dstptr
//load dest
354 add esi
,8 //inc by 8 bytes processed
356 sub ecx
,8 //dec by 8 pixels processed
366 sal edx
,24 //make low byte the high byte
369 sal edx
,1 //move high bit to CF
370 jnc skip8
//if CF = 0
383 else /* mmx not supported - use modified C routine */
385 register unsigned int incr1
, initial_val
, final_val
;
386 png_size_t pixel_bytes
;
388 register int disp
= png_pass_inc
[png_ptr
->pass
];
389 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
391 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
392 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
394 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
395 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
396 final_val
= png_ptr
->width
*pixel_bytes
;
397 incr1
= (disp
)*pixel_bytes
;
398 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
400 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
415 __int64 mask1
=0x0101020204040808,
416 mask0
=0x1010202040408080;
418 #if !defined(PNG_1_0_X)
419 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
420 /* && mmx_supported */ )
425 srcptr
= png_ptr
->row_buf
+ 1;
429 len
= (png_ptr
->width
)&~7;
430 diff
= (png_ptr
->width
)&7;
433 movd mm7
, unmask
//load bit pattern
434 psubb mm6
,mm6
//zero mm6
437 punpckldq mm7
,mm7
//fill register with 8 masks
448 mov ecx
,len
//load length of line
449 mov esi
,srcptr
//load source
450 mov ebx
,dstptr
//load dest
471 add esi
,16 //inc by 16 bytes processed
473 sub ecx
,8 //dec by 8 pixels processed
483 sal edx
,24 //make low byte the high byte
485 sal edx
,1 //move high bit to CF
486 jnc skip16
//if CF = 0
499 else /* mmx not supported - use modified C routine */
501 register unsigned int incr1
, initial_val
, final_val
;
502 png_size_t pixel_bytes
;
504 register int disp
= png_pass_inc
[png_ptr
->pass
];
505 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
507 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
508 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
510 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
511 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
512 final_val
= png_ptr
->width
*pixel_bytes
;
513 incr1
= (disp
)*pixel_bytes
;
514 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
516 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
532 __int64 mask2
=0x0101010202020404, //24bpp
533 mask1
=0x0408080810101020,
534 mask0
=0x2020404040808080;
536 srcptr
= png_ptr
->row_buf
+ 1;
540 len
= (png_ptr
->width
)&~7;
541 diff
= (png_ptr
->width
)&7;
543 #if !defined(PNG_1_0_X)
544 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
545 /* && mmx_supported */ )
552 movd mm7
, unmask
//load bit pattern
553 psubb mm6
,mm6
//zero mm6
556 punpckldq mm7
,mm7
//fill register with 8 masks
570 mov ecx
,len
//load length of line
571 mov esi
,srcptr
//load source
572 mov ebx
,dstptr
//load dest
602 add esi
,24 //inc by 24 bytes processed
604 sub ecx
,8 //dec by 8 pixels processed
614 sal edx
,24 //make low byte the high byte
616 sal edx
,1 //move high bit to CF
617 jnc skip24
//if CF = 0
634 else /* mmx not supported - use modified C routine */
636 register unsigned int incr1
, initial_val
, final_val
;
637 png_size_t pixel_bytes
;
639 register int disp
= png_pass_inc
[png_ptr
->pass
];
640 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
642 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
643 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
645 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
646 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
647 final_val
= png_ptr
->width
*pixel_bytes
;
648 incr1
= (disp
)*pixel_bytes
;
649 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
651 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
667 __int64 mask3
=0x0101010102020202, //32bpp
668 mask2
=0x0404040408080808,
669 mask1
=0x1010101020202020,
670 mask0
=0x4040404080808080;
672 srcptr
= png_ptr
->row_buf
+ 1;
676 len
= (png_ptr
->width
)&~7;
677 diff
= (png_ptr
->width
)&7;
679 #if !defined(PNG_1_0_X)
680 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
681 /* && mmx_supported */ )
688 movd mm7
, unmask
//load bit pattern
689 psubb mm6
,mm6
//zero mm6
692 punpckldq mm7
,mm7
//fill register with 8 masks
709 mov ecx
,len
//load length of line
710 mov esi
,srcptr
//load source
711 mov ebx
,dstptr
//load dest
749 add esi
,32 //inc by 32 bytes processed
751 sub ecx
,8 //dec by 8 pixels processed
761 sal edx
,24 //make low byte the high byte
763 sal edx
,1 //move high bit to CF
764 jnc skip32
//if CF = 0
778 else /* mmx _not supported - Use modified C routine */
780 register unsigned int incr1
, initial_val
, final_val
;
781 png_size_t pixel_bytes
;
783 register int disp
= png_pass_inc
[png_ptr
->pass
];
784 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
786 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
787 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
789 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
790 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
791 final_val
= png_ptr
->width
*pixel_bytes
;
792 incr1
= (disp
)*pixel_bytes
;
793 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
795 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
811 __int64 mask5
=0x0101010101010202,
812 mask4
=0x0202020204040404,
813 mask3
=0x0404080808080808,
814 mask2
=0x1010101010102020,
815 mask1
=0x2020202040404040,
816 mask0
=0x4040808080808080;
818 #if !defined(PNG_1_0_X)
819 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_COMBINE_ROW
)
820 /* && mmx_supported */ )
825 srcptr
= png_ptr
->row_buf
+ 1;
829 len
= (png_ptr
->width
)&~7;
830 diff
= (png_ptr
->width
)&7;
833 movd mm7
, unmask
//load bit pattern
834 psubb mm6
,mm6
//zero mm6
837 punpckldq mm7
,mm7
//fill register with 8 masks
860 mov ecx
,len
//load length of line
861 mov esi
,srcptr
//load source
862 mov ebx
,dstptr
//load dest
910 add esi
,48 //inc by 48 bytes processed
912 sub ecx
,8 //dec by 8 pixels processed
922 sal edx
,24 //make low byte the high byte
925 sal edx
,1 //move high bit to CF
926 jnc skip48
//if CF = 0
940 else /* mmx _not supported - Use modified C routine */
942 register unsigned int incr1
, initial_val
, final_val
;
943 png_size_t pixel_bytes
;
945 register int disp
= png_pass_inc
[png_ptr
->pass
];
946 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
948 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
949 srcptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
951 dstptr
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
952 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
953 final_val
= png_ptr
->width
*pixel_bytes
;
954 incr1
= (disp
)*pixel_bytes
;
955 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
957 png_memcpy(dstptr
, srcptr
, pixel_bytes
);
970 png_size_t pixel_bytes
;
971 int offset_table
[7] = {0, 4, 0, 2, 0, 1, 0};
973 register int disp
= png_pass_inc
[png_ptr
->pass
]; // get the offset
974 register unsigned int incr1
, initial_val
, final_val
;
976 pixel_bytes
= (png_ptr
->row_info
.pixel_depth
>> 3);
977 sptr
= png_ptr
->row_buf
+ 1 + offset_table
[png_ptr
->pass
]*
979 dp
= row
+ offset_table
[png_ptr
->pass
]*pixel_bytes
;
980 initial_val
= offset_table
[png_ptr
->pass
]*pixel_bytes
;
981 final_val
= png_ptr
->width
*pixel_bytes
;
982 incr1
= (disp
)*pixel_bytes
;
983 for (i
= initial_val
; i
< final_val
; i
+= incr1
)
985 png_memcpy(dp
, sptr
, pixel_bytes
);
991 } /* end switch (png_ptr->row_info.pixel_depth) */
992 } /* end if (non-trivial mask) */
994 } /* end png_combine_row() */
997 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1000 png_do_read_interlace(png_structp png_ptr
)
1002 png_row_infop row_info
= &(png_ptr
->row_info
);
1003 png_bytep row
= png_ptr
->row_buf
+ 1;
1004 int pass
= png_ptr
->pass
;
1005 png_uint_32 transformations
= png_ptr
->transformations
;
1006 #ifdef PNG_USE_LOCAL_ARRAYS
1007 const int png_pass_inc
[7] = {8, 8, 4, 4, 2, 2, 1};
1010 png_debug(1,"in png_do_read_interlace\n");
1012 if (mmx_supported
== 2) {
1013 #if !defined(PNG_1_0_X)
1014 /* this should have happened in png_init_mmx_flags() already */
1015 png_warning(png_ptr
, "asm_flags may not have been initialized");
1020 if (row
!= NULL
&& row_info
!= NULL
)
1022 png_uint_32 final_width
;
1024 final_width
= row_info
->width
* png_pass_inc
[pass
];
1026 switch (row_info
->pixel_depth
)
1032 int s_start
, s_end
, s_inc
;
1037 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 3);
1038 dp
= row
+ (png_size_t
)((final_width
- 1) >> 3);
1039 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1040 if (transformations
& PNG_PACKSWAP
)
1042 sshift
= (int)((row_info
->width
+ 7) & 7);
1043 dshift
= (int)((final_width
+ 7) & 7);
1051 sshift
= 7 - (int)((row_info
->width
+ 7) & 7);
1052 dshift
= 7 - (int)((final_width
+ 7) & 7);
1058 for (i
= row_info
->width
; i
; i
--)
1060 v
= (png_byte
)((*sp
>> sshift
) & 0x1);
1061 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1063 *dp
&= (png_byte
)((0x7f7f >> (7 - dshift
)) & 0xff);
1064 *dp
|= (png_byte
)(v
<< dshift
);
1065 if (dshift
== s_end
)
1073 if (sshift
== s_end
)
1088 int s_start
, s_end
, s_inc
;
1091 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 2);
1092 dp
= row
+ (png_size_t
)((final_width
- 1) >> 2);
1093 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1094 if (transformations
& PNG_PACKSWAP
)
1096 sshift
= (png_size_t
)(((row_info
->width
+ 3) & 3) << 1);
1097 dshift
= (png_size_t
)(((final_width
+ 3) & 3) << 1);
1105 sshift
= (png_size_t
)((3 - ((row_info
->width
+ 3) & 3)) << 1);
1106 dshift
= (png_size_t
)((3 - ((final_width
+ 3) & 3)) << 1);
1112 for (i
= row_info
->width
; i
; i
--)
1117 v
= (png_byte
)((*sp
>> sshift
) & 0x3);
1118 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1120 *dp
&= (png_byte
)((0x3f3f >> (6 - dshift
)) & 0xff);
1121 *dp
|= (png_byte
)(v
<< dshift
);
1122 if (dshift
== s_end
)
1130 if (sshift
== s_end
)
1145 int s_start
, s_end
, s_inc
;
1148 sp
= row
+ (png_size_t
)((row_info
->width
- 1) >> 1);
1149 dp
= row
+ (png_size_t
)((final_width
- 1) >> 1);
1150 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1151 if (transformations
& PNG_PACKSWAP
)
1153 sshift
= (png_size_t
)(((row_info
->width
+ 1) & 1) << 2);
1154 dshift
= (png_size_t
)(((final_width
+ 1) & 1) << 2);
1162 sshift
= (png_size_t
)((1 - ((row_info
->width
+ 1) & 1)) << 2);
1163 dshift
= (png_size_t
)((1 - ((final_width
+ 1) & 1)) << 2);
1169 for (i
= row_info
->width
; i
; i
--)
1174 v
= (png_byte
)((*sp
>> sshift
) & 0xf);
1175 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1177 *dp
&= (png_byte
)((0xf0f >> (4 - dshift
)) & 0xff);
1178 *dp
|= (png_byte
)(v
<< dshift
);
1179 if (dshift
== s_end
)
1187 if (sshift
== s_end
)
1198 default: // This is the place where the routine is modified
1200 __int64 const4
= 0x0000000000FFFFFF;
1201 // __int64 const5 = 0x000000FFFFFF0000; // unused...
1202 __int64 const6
= 0x00000000000000FF;
1205 png_size_t pixel_bytes
;
1206 int width
= row_info
->width
;
1208 pixel_bytes
= (row_info
->pixel_depth
>> 3);
1210 sptr
= row
+ (width
- 1) * pixel_bytes
;
1211 dp
= row
+ (final_width
- 1) * pixel_bytes
;
1212 // New code by Nirav Chhatrapati - Intel Corporation
1214 // NOTE: there is NO MMX code for 48-bit and 64-bit images
1216 // use MMX routine if machine supports it
1217 #if !defined(PNG_1_0_X)
1218 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_INTERLACE
)
1219 /* && mmx_supported */ )
1224 if (pixel_bytes
== 3)
1226 if (((pass
== 0) || (pass
== 1)) && width
)
1233 sub edi
, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1235 movd mm0
, [esi
] ; X X X X X v2 v1 v0
1236 pand mm0
, const4
; 0 0 0 0 0 v2 v1 v0
1237 movq mm1
, mm0
; 0 0 0 0 0 v2 v1 v0
1238 psllq mm0
, 16 ; 0 0 0 v2 v1 v0
0 0
1239 movq mm2
, mm0
; 0 0 0 v2 v1 v0
0 0
1240 psllq mm0
, 24 ; v2 v1 v0
0 0 0 0 0
1241 psrlq mm1
, 8 ; 0 0 0 0 0 0 v2 v1
1242 por mm0
, mm2
; v2 v1 v0 v2 v1 v0
0 0
1243 por mm0
, mm1
; v2 v1 v0 v2 v1 v0 v2 v1
1244 movq mm3
, mm0
; v2 v1 v0 v2 v1 v0 v2 v1
1245 psllq mm0
, 16 ; v0 v2 v1 v0 v2 v1
0 0
1246 movq mm4
, mm3
; v2 v1 v0 v2 v1 v0 v2 v1
1247 punpckhdq mm3
, mm0
; v0 v2 v1 v0 v2 v1 v0 v2
1249 psrlq mm0
, 32 ; 0 0 0 0 v0 v2 v1 v0
1251 punpckldq mm0
, mm4
; v1 v0 v2 v1 v0 v2 v1 v0
1261 else if (((pass
== 2) || (pass
== 3)) && width
)
1268 sub edi
, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1270 movd mm0
, [esi
] ; X X X X X v2 v1 v0
1271 pand mm0
, const4
; 0 0 0 0 0 v2 v1 v0
1272 movq mm1
, mm0
; 0 0 0 0 0 v2 v1 v0
1273 psllq mm0
, 16 ; 0 0 0 v2 v1 v0
0 0
1274 movq mm2
, mm0
; 0 0 0 v2 v1 v0
0 0
1275 psllq mm0
, 24 ; v2 v1 v0
0 0 0 0 0
1276 psrlq mm1
, 8 ; 0 0 0 0 0 0 v2 v1
1277 por mm0
, mm2
; v2 v1 v0 v2 v1 v0
0 0
1278 por mm0
, mm1
; v2 v1 v0 v2 v1 v0 v2 v1
1279 movq
[edi
+4], mm0
; move to memory
1280 psrlq mm0
, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1281 movd
[edi
], mm0
; move to memory
1289 else if (width
) /* && ((pass == 4) || (pass == 5)) */
1291 int width_mmx
= ((width
>> 1) << 1) - 8;
1294 width
-= width_mmx
; // 8 or 9 pix, 24 or 27 bytes
1305 movq mm0
, [esi
] ; X X v2 v1 v0 v5 v4 v3
1306 movq mm7
, mm0
; X X v2 v1 v0 v5 v4 v3
1307 movq mm6
, mm0
; X X v2 v1 v0 v5 v4 v3
1308 psllq mm0
, 24 ; v1 v0 v5 v4 v3
0 0 0
1309 pand mm7
, const4
; 0 0 0 0 0 v5 v4 v3
1310 psrlq mm6
, 24 ; 0 0 0 X X v2 v1 v0
1311 por mm0
, mm7
; v1 v0 v5 v4 v3 v5 v4 v3
1312 movq mm5
, mm6
; 0 0 0 X X v2 v1 v0
1313 psllq mm6
, 8 ; 0 0 X X v2 v1 v0
0
1314 movq
[edi
], mm0
; move quad to memory
1315 psrlq mm5
, 16 ; 0 0 0 0 0 X X v2
1316 pand mm5
, const6
; 0 0 0 0 0 0 0 v2
1317 por mm6
, mm5
; 0 0 X X v2 v1 v0 v2
1318 movd
[edi
+8], mm6
; move
double to memory
1327 sptr
-= width_mmx
*3;
1329 for (i
= width
; i
; i
--)
1334 png_memcpy(v
, sptr
, 3);
1335 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1337 png_memcpy(dp
, v
, 3);
1343 } /* end of pixel_bytes == 3 */
1345 else if (pixel_bytes
== 1)
1347 if (((pass
== 0) || (pass
== 1)) && width
)
1349 int width_mmx
= ((width
>> 2) << 2);
1361 movd mm0
, [esi
] ; X X X X v0 v1 v2 v3
1362 movq mm1
, mm0
; X X X X v0 v1 v2 v3
1363 punpcklbw mm0
, mm0
; v0 v0 v1 v1 v2 v2 v3 v3
1364 movq mm2
, mm0
; v0 v0 v1 v1 v2 v2 v3 v3
1365 punpcklwd mm0
, mm0
; v2 v2 v2 v2 v3 v3 v3 v3
1366 movq mm3
, mm0
; v2 v2 v2 v2 v3 v3 v3 v3
1367 punpckldq mm0
, mm0
; v3 v3 v3 v3 v3 v3 v3 v3
1368 punpckhdq mm3
, mm3
; v2 v2 v2 v2 v2 v2 v2 v2
1369 movq
[edi
], mm0
; move to memory v3
1370 punpckhwd mm2
, mm2
; v0 v0 v0 v0 v1 v1 v1 v1
1371 movq
[edi
+8], mm3
; move to memory v2
1372 movq mm4
, mm2
; v0 v0 v0 v0 v1 v1 v1 v1
1373 punpckldq mm2
, mm2
; v1 v1 v1 v1 v1 v1 v1 v1
1374 punpckhdq mm4
, mm4
; v0 v0 v0 v0 v0 v0 v0 v0
1375 movq
[edi
+16], mm2
; move to memory v1
1376 movq
[edi
+24], mm4
; move to memory v0
1387 for (i
= width
; i
; i
--)
1391 /* I simplified this part in version 1.0.4e
1392 * here and in several other instances where
1393 * pixel_bytes == 1 -- GR-P
1398 * png_memcpy(v, sptr, pixel_bytes);
1399 * for (j = 0; j < png_pass_inc[pass]; j++)
1401 * png_memcpy(dp, v, pixel_bytes);
1402 * dp -= pixel_bytes;
1404 * sptr -= pixel_bytes;
1406 * Replacement code is in the next three lines:
1409 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1414 else if (((pass
== 2) || (pass
== 3)) && width
)
1416 int width_mmx
= ((width
>> 2) << 2);
1428 movd mm0
, [esi
] ; X X X X v0 v1 v2 v3
1429 punpcklbw mm0
, mm0
; v0 v0 v1 v1 v2 v2 v3 v3
1430 movq mm1
, mm0
; v0 v0 v1 v1 v2 v2 v3 v3
1431 punpcklwd mm0
, mm0
; v2 v2 v2 v2 v3 v3 v3 v3
1432 punpckhwd mm1
, mm1
; v0 v0 v0 v0 v1 v1 v1 v1
1433 movq
[edi
], mm0
; move to memory v2
and v3
1435 movq
[edi
+8], mm1
; move to memory v1
and v0
1445 for (i
= width
; i
; i
--)
1449 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1456 else if (width
) /* && ((pass == 4) || (pass == 5))) */
1458 int width_mmx
= ((width
>> 3) << 3);
1470 movq mm0
, [esi
] ; v0 v1 v2 v3 v4 v5 v6 v7
1471 movq mm1
, mm0
; v0 v1 v2 v3 v4 v5 v6 v7
1472 punpcklbw mm0
, mm0
; v4 v4 v5 v5 v6 v6 v7 v7
1473 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1474 punpckhbw mm1
, mm1
;v0 v0 v1 v1 v2 v2 v3 v3
1475 movq
[edi
+8], mm1
; move to memory v0 v1 v2
and v3
1477 movq
[edi
], mm0
; move to memory v4 v5 v6
and v7
1488 for (i
= width
; i
; i
--)
1492 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1499 } /* end of pixel_bytes == 1 */
1501 else if (pixel_bytes
== 2)
1503 if (((pass
== 0) || (pass
== 1)) && width
)
1505 int width_mmx
= ((width
>> 1) << 1);
1517 movd mm0
, [esi
] ; X X X X v1 v0 v3 v2
1518 punpcklwd mm0
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1519 movq mm1
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1520 punpckldq mm0
, mm0
; v3 v2 v3 v2 v3 v2 v3 v2
1521 punpckhdq mm1
, mm1
; v1 v0 v1 v0 v1 v0 v1 v0
1524 movq
[edi
+ 16], mm1
1525 movq
[edi
+ 24], mm1
1534 sptr
-= (width_mmx
*2 - 2); // sign fixed
1535 dp
-= (width_mmx
*16 - 2); // sign fixed
1536 for (i
= width
; i
; i
--)
1541 png_memcpy(v
, sptr
, 2);
1542 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1545 png_memcpy(dp
, v
, 2);
1549 else if (((pass
== 2) || (pass
== 3)) && width
)
1551 int width_mmx
= ((width
>> 1) << 1) ;
1563 movd mm0
, [esi
] ; X X X X v1 v0 v3 v2
1564 punpcklwd mm0
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1565 movq mm1
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1566 punpckldq mm0
, mm0
; v3 v2 v3 v2 v3 v2 v3 v2
1567 punpckhdq mm1
, mm1
; v1 v0 v1 v0 v1 v0 v1 v0
1579 sptr
-= (width_mmx
*2 - 2); // sign fixed
1580 dp
-= (width_mmx
*8 - 2); // sign fixed
1581 for (i
= width
; i
; i
--)
1586 png_memcpy(v
, sptr
, 2);
1587 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1590 png_memcpy(dp
, v
, 2);
1594 else if (width
) // pass == 4 or 5
1596 int width_mmx
= ((width
>> 1) << 1) ;
1608 movd mm0
, [esi
] ; X X X X v1 v0 v3 v2
1609 punpcklwd mm0
, mm0
; v1 v0 v1 v0 v3 v2 v3 v2
1619 sptr
-= (width_mmx
*2 - 2); // sign fixed
1620 dp
-= (width_mmx
*4 - 2); // sign fixed
1621 for (i
= width
; i
; i
--)
1626 png_memcpy(v
, sptr
, 2);
1627 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1630 png_memcpy(dp
, v
, 2);
1634 } /* end of pixel_bytes == 2 */
1636 else if (pixel_bytes
== 4)
1638 if (((pass
== 0) || (pass
== 1)) && width
)
1640 int width_mmx
= ((width
>> 1) << 1) ;
1652 movq mm0
, [esi
] ; v3 v2 v1 v0 v7 v6 v5 v4
1653 movq mm1
, mm0
; v3 v2 v1 v0 v7 v6 v5 v4
1654 punpckldq mm0
, mm0
; v7 v6 v5 v4 v7 v6 v5 v4
1655 punpckhdq mm1
, mm1
; v3 v2 v1 v0 v3 v2 v1 v0
1658 movq
[edi
+ 16], mm0
1659 movq
[edi
+ 24], mm0
1661 movq
[edi
+ 40], mm1
1664 movq
[edi
+ 56], mm1
1672 sptr
-= (width_mmx
*4 - 4); // sign fixed
1673 dp
-= (width_mmx
*32 - 4); // sign fixed
1674 for (i
= width
; i
; i
--)
1679 png_memcpy(v
, sptr
, 4);
1680 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1683 png_memcpy(dp
, v
, 4);
1687 else if (((pass
== 2) || (pass
== 3)) && width
)
1689 int width_mmx
= ((width
>> 1) << 1) ;
1701 movq mm0
, [esi
] ; v3 v2 v1 v0 v7 v6 v5 v4
1702 movq mm1
, mm0
; v3 v2 v1 v0 v7 v6 v5 v4
1703 punpckldq mm0
, mm0
; v7 v6 v5 v4 v7 v6 v5 v4
1704 punpckhdq mm1
, mm1
; v3 v2 v1 v0 v3 v2 v1 v0
1708 movq
[edi
+ 24], mm1
1717 sptr
-= (width_mmx
*4 - 4); // sign fixed
1718 dp
-= (width_mmx
*16 - 4); // sign fixed
1719 for (i
= width
; i
; i
--)
1724 png_memcpy(v
, sptr
, 4);
1725 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1728 png_memcpy(dp
, v
, 4);
1732 else if (width
) // pass == 4 or 5
1734 int width_mmx
= ((width
>> 1) << 1) ;
1746 movq mm0
, [esi
] ; v3 v2 v1 v0 v7 v6 v5 v4
1747 movq mm1
, mm0
; v3 v2 v1 v0 v7 v6 v5 v4
1748 punpckldq mm0
, mm0
; v7 v6 v5 v4 v7 v6 v5 v4
1749 punpckhdq mm1
, mm1
; v3 v2 v1 v0 v3 v2 v1 v0
1760 sptr
-= (width_mmx
*4 - 4); // sign fixed
1761 dp
-= (width_mmx
*8 - 4); // sign fixed
1762 for (i
= width
; i
; i
--)
1767 png_memcpy(v
, sptr
, 4);
1768 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1771 png_memcpy(dp
, v
, 4);
1776 } /* end of pixel_bytes == 4 */
1778 else if (pixel_bytes
== 6)
1780 for (i
= width
; i
; i
--)
1784 png_memcpy(v
, sptr
, 6);
1785 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1787 png_memcpy(dp
, v
, 6);
1792 } /* end of pixel_bytes == 6 */
1796 for (i
= width
; i
; i
--)
1800 png_memcpy(v
, sptr
, pixel_bytes
);
1801 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1803 png_memcpy(dp
, v
, pixel_bytes
);
1809 } /* end of mmx_supported */
1811 else /* MMX not supported: use modified C code - takes advantage
1812 * of inlining of memcpy for a constant */
1814 if (pixel_bytes
== 1)
1816 for (i
= width
; i
; i
--)
1819 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1824 else if (pixel_bytes
== 3)
1826 for (i
= width
; i
; i
--)
1830 png_memcpy(v
, sptr
, pixel_bytes
);
1831 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1833 png_memcpy(dp
, v
, pixel_bytes
);
1836 sptr
-= pixel_bytes
;
1839 else if (pixel_bytes
== 2)
1841 for (i
= width
; i
; i
--)
1845 png_memcpy(v
, sptr
, pixel_bytes
);
1846 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1848 png_memcpy(dp
, v
, pixel_bytes
);
1851 sptr
-= pixel_bytes
;
1854 else if (pixel_bytes
== 4)
1856 for (i
= width
; i
; i
--)
1860 png_memcpy(v
, sptr
, pixel_bytes
);
1861 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1863 png_memcpy(dp
, v
, pixel_bytes
);
1866 sptr
-= pixel_bytes
;
1869 else if (pixel_bytes
== 6)
1871 for (i
= width
; i
; i
--)
1875 png_memcpy(v
, sptr
, pixel_bytes
);
1876 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1878 png_memcpy(dp
, v
, pixel_bytes
);
1881 sptr
-= pixel_bytes
;
1886 for (i
= width
; i
; i
--)
1890 png_memcpy(v
, sptr
, pixel_bytes
);
1891 for (j
= 0; j
< png_pass_inc
[pass
]; j
++)
1893 png_memcpy(dp
, v
, pixel_bytes
);
1896 sptr
-= pixel_bytes
;
1900 } /* end of MMX not supported */
1903 } /* end switch (row_info->pixel_depth) */
1905 row_info
->width
= final_width
;
1907 row_info
->rowbytes
= PNG_ROWBYTES(row_info
->pixel_depth
,final_width
);
1912 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1915 // These variables are utilized in the functions below. They are declared
1916 // globally here to ensure alignment on 8-byte boundaries.
1921 } LBCarryMask
= {0x0101010101010101},
1922 HBClearMask
= {0x7f7f7f7f7f7f7f7f},
1923 ActiveMask
, ActiveMask2
, ActiveMaskEnd
, ShiftBpp
, ShiftRem
;
1926 // Optimized code for PNG Average filter decoder
1928 png_read_filter_row_mmx_avg(png_row_infop row_info
, png_bytep row
1929 , png_bytep prev_row
)
1932 png_uint_32 FullLength
;
1933 png_uint_32 MMXLength
;
1937 bpp
= (row_info
->pixel_depth
+ 7) >> 3; // Get # bytes per pixel
1938 FullLength
= row_info
->rowbytes
; // # of bytes to filter
1940 // Init address pointers and offset
1941 mov edi
, row
// edi ==> Avg(x)
1942 xor ebx
, ebx
// ebx ==> x
1944 mov esi
, prev_row
// esi ==> Prior(x)
1945 sub edx
, bpp
// edx ==> Raw(x-bpp)
1948 // Compute the Raw value for the first bpp bytes
1949 // Raw(x) = Avg(x) + (Prior(x)/2)
1951 mov al
, [esi
+ ebx
] // Load al with Prior(x)
1953 shr al
, 1 // divide by 2
1954 add al
, [edi
+ebx
-1] // Add Avg(x); -1 to offset inc ebx
1956 mov
[edi
+ebx
-1], al
// Write back Raw(x);
1957 // mov does not affect flags; -1 to offset inc ebx
1959 // get # of bytes to alignment
1960 mov diff
, edi
// take start of row
1961 add diff
, ebx
// add bpp
1962 add diff
, 0xf // add 7 + 8 to incr past alignment boundary
1963 and diff
, 0xfffffff8 // mask to alignment boundary
1964 sub diff
, edi
// subtract from start ==> value ebx at alignment
1967 // Compute the Raw value for the bytes up to the alignment boundary
1968 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1972 mov cl
, [esi
+ ebx
] // load cl with Prior(x)
1973 mov al
, [edx
+ ebx
] // load al with Raw(x-bpp)
1976 shr ax
, 1 // divide by 2
1977 add al
, [edi
+ebx
-1] // Add Avg(x); -1 to offset inc ebx
1978 cmp ebx
, diff
// Check if at alignment boundary
1979 mov
[edi
+ebx
-1], al
// Write back Raw(x);
1980 // mov does not affect flags; -1 to offset inc ebx
1981 jb davglp1
// Repeat until at alignment boundary
1985 sub eax
, ebx
// subtract alignment fix
1986 and eax
, 0x00000007 // calc bytes over mult of 8
1987 sub ecx
, eax
// drop over bytes from original length
1990 // Now do the math for the rest of the row
1995 ActiveMask
.use
= 0x0000000000ffffff;
1996 ShiftBpp
.use
= 24; // == 3 * 8
1997 ShiftRem
.use
= 40; // == 64 - 24
1999 // Re-init address pointers and offset
2000 movq mm7
, ActiveMask
2001 mov ebx
, diff
// ebx ==> x = offset to alignment boundary
2002 movq mm5
, LBCarryMask
2003 mov edi
, row
// edi ==> Avg(x)
2004 movq mm4
, HBClearMask
2005 mov esi
, prev_row
// esi ==> Prior(x)
2006 // PRIME the pump (load the first Raw(x-bpp) data set)
2007 movq mm2
, [edi
+ ebx
- 8] // Load previous aligned 8 bytes
2008 // (we correct position in loop below)
2010 movq mm0
, [edi
+ ebx
] // Load mm0 with Avg(x)
2011 // Add (Prev_row/2) to Average
2013 psrlq mm2
, ShiftRem
// Correct position Raw(x-bpp) data
2014 movq mm1
, [esi
+ ebx
] // Load mm1 with Prior(x)
2016 pand mm3
, mm1
// get lsb for each prev_row byte
2017 psrlq mm1
, 1 // divide prev_row bytes by 2
2018 pand mm1
, mm4
// clear invalid bit 7 of each byte
2019 paddb mm0
, mm1
// add (Prev_row/2) to Avg for each byte
2020 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2021 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2022 pand mm1
, mm2
// get LBCarrys for each byte where both
2023 // lsb's were == 1 (Only valid for active group)
2024 psrlq mm2
, 1 // divide raw bytes by 2
2025 pand mm2
, mm4
// clear invalid bit 7 of each byte
2026 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2027 pand mm2
, mm6
// Leave only Active Group 1 bytes to add to Avg
2028 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active
2030 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2031 psllq mm6
, ShiftBpp
// shift the mm6 mask to cover bytes 3-5
2032 movq mm2
, mm0
// mov updated Raws to mm2
2033 psllq mm2
, ShiftBpp
// shift data to position correctly
2034 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2035 pand mm1
, mm2
// get LBCarrys for each byte where both
2036 // lsb's were == 1 (Only valid for active group)
2037 psrlq mm2
, 1 // divide raw bytes by 2
2038 pand mm2
, mm4
// clear invalid bit 7 of each byte
2039 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2040 pand mm2
, mm6
// Leave only Active Group 2 bytes to add to Avg
2041 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active
2044 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2045 psllq mm6
, ShiftBpp
// shift the mm6 mask to cover the last two
2047 movq mm2
, mm0
// mov updated Raws to mm2
2048 psllq mm2
, ShiftBpp
// shift data to position correctly
2049 // Data only needs to be shifted once here to
2050 // get the correct x-bpp offset.
2051 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2052 pand mm1
, mm2
// get LBCarrys for each byte where both
2053 // lsb's were == 1 (Only valid for active group)
2054 psrlq mm2
, 1 // divide raw bytes by 2
2055 pand mm2
, mm4
// clear invalid bit 7 of each byte
2056 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2057 pand mm2
, mm6
// Leave only Active Group 3 bytes to add to Avg
2059 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active
2062 // Now ready to write back to memory
2063 movq
[edi
+ ebx
- 8], mm0
2064 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2066 movq mm2
, mm0
// mov updated Raw(x) to mm2
2077 ActiveMask
.use
= 0xffffffffffffffff; // use shift below to clear
2078 // appropriate inactive bytes
2079 ShiftBpp
.use
= bpp
<< 3;
2080 ShiftRem
.use
= 64 - ShiftBpp
.use
;
2082 movq mm4
, HBClearMask
2083 // Re-init address pointers and offset
2084 mov ebx
, diff
// ebx ==> x = offset to alignment boundary
2085 // Load ActiveMask and clear all bytes except for 1st active group
2086 movq mm7
, ActiveMask
2087 mov edi
, row
// edi ==> Avg(x)
2089 mov esi
, prev_row
// esi ==> Prior(x)
2091 movq mm5
, LBCarryMask
2092 psllq mm6
, ShiftBpp
// Create mask for 2nd active group
2093 // PRIME the pump (load the first Raw(x-bpp) data set
2094 movq mm2
, [edi
+ ebx
- 8] // Load previous aligned 8 bytes
2095 // (we correct position in loop below)
2097 movq mm0
, [edi
+ ebx
]
2098 psrlq mm2
, ShiftRem
// shift data to position correctly
2099 movq mm1
, [esi
+ ebx
]
2100 // Add (Prev_row/2) to Average
2102 pand mm3
, mm1
// get lsb for each prev_row byte
2103 psrlq mm1
, 1 // divide prev_row bytes by 2
2104 pand mm1
, mm4
// clear invalid bit 7 of each byte
2105 paddb mm0
, mm1
// add (Prev_row/2) to Avg for each byte
2106 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2107 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2108 pand mm1
, mm2
// get LBCarrys for each byte where both
2109 // lsb's were == 1 (Only valid for active group)
2110 psrlq mm2
, 1 // divide raw bytes by 2
2111 pand mm2
, mm4
// clear invalid bit 7 of each byte
2112 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2113 pand mm2
, mm7
// Leave only Active Group 1 bytes to add to Avg
2114 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active
2116 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2117 movq mm2
, mm0
// mov updated Raws to mm2
2118 psllq mm2
, ShiftBpp
// shift data to position correctly
2120 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2121 pand mm1
, mm2
// get LBCarrys for each byte where both
2122 // lsb's were == 1 (Only valid for active group)
2123 psrlq mm2
, 1 // divide raw bytes by 2
2124 pand mm2
, mm4
// clear invalid bit 7 of each byte
2125 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2126 pand mm2
, mm6
// Leave only Active Group 2 bytes to add to Avg
2127 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active
2130 // Now ready to write back to memory
2131 movq
[edi
+ ebx
- 8], mm0
2132 // Prep Raw(x-bpp) for next loop
2133 movq mm2
, mm0
// mov updated Raws to mm2
2140 ActiveMask
.use
= 0x000000000000ffff;
2141 ShiftBpp
.use
= 16; // == 2 * 8 [BUGFIX]
2142 ShiftRem
.use
= 48; // == 64 - 16 [BUGFIX]
2145 movq mm7
, ActiveMask
2146 // Re-init address pointers and offset
2147 mov ebx
, diff
// ebx ==> x = offset to alignment boundary
2148 movq mm5
, LBCarryMask
2149 mov edi
, row
// edi ==> Avg(x)
2150 movq mm4
, HBClearMask
2151 mov esi
, prev_row
// esi ==> Prior(x)
2152 // PRIME the pump (load the first Raw(x-bpp) data set
2153 movq mm2
, [edi
+ ebx
- 8] // Load previous aligned 8 bytes
2154 // (we correct position in loop below)
2156 movq mm0
, [edi
+ ebx
]
2157 psrlq mm2
, ShiftRem
// shift data to position correctly [BUGFIX]
2158 movq mm1
, [esi
+ ebx
]
2159 // Add (Prev_row/2) to Average
2161 pand mm3
, mm1
// get lsb for each prev_row byte
2162 psrlq mm1
, 1 // divide prev_row bytes by 2
2163 pand mm1
, mm4
// clear invalid bit 7 of each byte
2165 paddb mm0
, mm1
// add (Prev_row/2) to Avg for each byte
2166 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2167 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2168 pand mm1
, mm2
// get LBCarrys for each byte where both
2169 // lsb's were == 1 (Only valid for active group)
2170 psrlq mm2
, 1 // divide raw bytes by 2
2171 pand mm2
, mm4
// clear invalid bit 7 of each byte
2172 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2173 pand mm2
, mm6
// Leave only Active Group 1 bytes to add to Avg
2174 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active byte
2175 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2176 psllq mm6
, ShiftBpp
// shift the mm6 mask to cover bytes 2 & 3
2177 movq mm2
, mm0
// mov updated Raws to mm2
2178 psllq mm2
, ShiftBpp
// shift data to position correctly
2179 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2180 pand mm1
, mm2
// get LBCarrys for each byte where both
2181 // lsb's were == 1 (Only valid for active group)
2182 psrlq mm2
, 1 // divide raw bytes by 2
2183 pand mm2
, mm4
// clear invalid bit 7 of each byte
2184 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2185 pand mm2
, mm6
// Leave only Active Group 2 bytes to add to Avg
2186 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active byte
2188 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2189 psllq mm6
, ShiftBpp
// shift the mm6 mask to cover bytes 4 & 5
2190 movq mm2
, mm0
// mov updated Raws to mm2
2191 psllq mm2
, ShiftBpp
// shift data to position correctly
2192 // Data only needs to be shifted once here to
2193 // get the correct x-bpp offset.
2194 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2195 pand mm1
, mm2
// get LBCarrys for each byte where both
2196 // lsb's were == 1 (Only valid for active group)
2197 psrlq mm2
, 1 // divide raw bytes by 2
2198 pand mm2
, mm4
// clear invalid bit 7 of each byte
2199 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2200 pand mm2
, mm6
// Leave only Active Group 3 bytes to add to Avg
2201 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active byte
2203 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2204 psllq mm6
, ShiftBpp
// shift the mm6 mask to cover bytes 6 & 7
2205 movq mm2
, mm0
// mov updated Raws to mm2
2206 psllq mm2
, ShiftBpp
// shift data to position correctly
2207 // Data only needs to be shifted once here to
2208 // get the correct x-bpp offset.
2210 movq mm1
, mm3
// now use mm1 for getting LBCarrys
2211 pand mm1
, mm2
// get LBCarrys for each byte where both
2212 // lsb's were == 1 (Only valid for active group)
2213 psrlq mm2
, 1 // divide raw bytes by 2
2214 pand mm2
, mm4
// clear invalid bit 7 of each byte
2215 paddb mm2
, mm1
// add LBCarrys to (Raw(x-bpp)/2) for each byte
2216 pand mm2
, mm6
// Leave only Active Group 4 bytes to add to Avg
2217 paddb mm0
, mm2
// add (Raw/2) + LBCarrys to Avg for each Active byte
2220 // Now ready to write back to memory
2221 movq
[edi
+ ebx
- 8], mm0
2222 // Prep Raw(x-bpp) for next loop
2223 movq mm2
, mm0
// mov updated Raws to mm2
2232 // Re-init address pointers and offset
2233 mov ebx
, diff
// ebx ==> x = offset to alignment boundary
2234 mov edi
, row
// edi ==> Avg(x)
2235 cmp ebx
, FullLength
// Test if offset at end of array
2237 // Do Avg decode for remaining bytes
2238 mov esi
, prev_row
// esi ==> Prior(x)
2240 xor ecx
, ecx
// zero ecx before using cl & cx in loop below
2241 sub edx
, bpp
// edx ==> Raw(x-bpp)
2243 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2245 mov cl
, [esi
+ ebx
] // load cl with Prior(x)
2246 mov al
, [edx
+ ebx
] // load al with Raw(x-bpp)
2249 shr ax
, 1 // divide by 2
2250 add al
, [edi
+ebx
-1] // Add Avg(x); -1 to offset inc ebx
2251 cmp ebx
, FullLength
// Check if at end of array
2252 mov
[edi
+ebx
-1], al
// Write back Raw(x);
2253 // mov does not affect flags; -1 to offset inc ebx
2263 // Re-init address pointers and offset
2264 mov ebx
, diff
// ebx ==> x = offset to alignment boundary
2265 movq mm5
, LBCarryMask
2266 mov edi
, row
// edi ==> Avg(x)
2267 movq mm4
, HBClearMask
2268 mov esi
, prev_row
// esi ==> Prior(x)
2269 // PRIME the pump (load the first Raw(x-bpp) data set
2270 movq mm2
, [edi
+ ebx
- 8] // Load previous aligned 8 bytes
2271 // (NO NEED to correct position in loop below)
2273 movq mm0
, [edi
+ ebx
]
2275 movq mm1
, [esi
+ ebx
]
2277 pand mm3
, mm1
// get lsb for each prev_row byte
2278 psrlq mm1
, 1 // divide prev_row bytes by 2
2279 pand mm3
, mm2
// get LBCarrys for each byte where both
2281 psrlq mm2
, 1 // divide raw bytes by 2
2282 pand mm1
, mm4
// clear invalid bit 7 of each byte
2283 paddb mm0
, mm3
// add LBCarrys to Avg for each byte
2284 pand mm2
, mm4
// clear invalid bit 7 of each byte
2285 paddb mm0
, mm1
// add (Prev_row/2) to Avg for each byte
2286 paddb mm0
, mm2
// add (Raw/2) to Avg for each byte
2288 movq
[edi
+ ebx
- 8], mm0
2289 movq mm2
, mm0
// reuse as Raw(x-bpp)
2294 default: // bpp greater than 8
2297 movq mm5
, LBCarryMask
2298 // Re-init address pointers and offset
2299 mov ebx
, diff
// ebx ==> x = offset to alignment boundary
2300 mov edi
, row
// edi ==> Avg(x)
2301 movq mm4
, HBClearMask
2303 mov esi
, prev_row
// esi ==> Prior(x)
2304 sub edx
, bpp
// edx ==> Raw(x-bpp)
2306 movq mm0
, [edi
+ ebx
]
2308 movq mm1
, [esi
+ ebx
]
2309 pand mm3
, mm1
// get lsb for each prev_row byte
2310 movq mm2
, [edx
+ ebx
]
2311 psrlq mm1
, 1 // divide prev_row bytes by 2
2312 pand mm3
, mm2
// get LBCarrys for each byte where both
2314 psrlq mm2
, 1 // divide raw bytes by 2
2315 pand mm1
, mm4
// clear invalid bit 7 of each byte
2316 paddb mm0
, mm3
// add LBCarrys to Avg for each byte
2317 pand mm2
, mm4
// clear invalid bit 7 of each byte
2318 paddb mm0
, mm1
// add (Prev_row/2) to Avg for each byte
2320 paddb mm0
, mm2
// add (Raw/2) to Avg for each byte
2322 movq
[edi
+ ebx
- 8], mm0
2327 } // end switch ( bpp )
2330 // MMX acceleration complete now do clean-up
2331 // Check if any remaining bytes left to decode
2332 mov ebx
, MMXLength
// ebx ==> x = offset bytes remaining after MMX
2333 mov edi
, row
// edi ==> Avg(x)
2334 cmp ebx
, FullLength
// Test if offset at end of array
2336 // Do Avg decode for remaining bytes
2337 mov esi
, prev_row
// esi ==> Prior(x)
2339 xor ecx
, ecx
// zero ecx before using cl & cx in loop below
2340 sub edx
, bpp
// edx ==> Raw(x-bpp)
2342 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2344 mov cl
, [esi
+ ebx
] // load cl with Prior(x)
2345 mov al
, [edx
+ ebx
] // load al with Raw(x-bpp)
2348 shr ax
, 1 // divide by 2
2349 add al
, [edi
+ebx
-1] // Add Avg(x); -1 to offset inc ebx
2350 cmp ebx
, FullLength
// Check if at end of array
2351 mov
[edi
+ebx
-1], al
// Write back Raw(x);
2352 // mov does not affect flags; -1 to offset inc ebx
2355 emms
// End MMX instructions; prep for possible FP instrs.
2359 // Optimized code for PNG Paeth filter decoder
2361 png_read_filter_row_mmx_paeth(png_row_infop row_info
, png_bytep row
,
2364 png_uint_32 FullLength
;
2365 png_uint_32 MMXLength
;
2370 int patemp
, pbtemp
, pctemp
;
2372 bpp
= (row_info
->pixel_depth
+ 7) >> 3; // Get # bytes per pixel
2373 FullLength
= row_info
->rowbytes
; // # of bytes to filter
2376 xor ebx
, ebx
// ebx ==> x offset
2378 xor edx
, edx
// edx ==> x-bpp offset
2382 // Compute the Raw value for the first bpp bytes
2383 // Note: the formula works out to be always
2384 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2390 mov
[edi
+ ebx
- 1], al
2392 // get # of bytes to alignment
2393 mov diff
, edi
// take start of row
2394 add diff
, ebx
// add bpp
2396 add diff
, 0xf // add 7 + 8 to incr past alignment boundary
2397 and diff
, 0xfffffff8 // mask to alignment boundary
2398 sub diff
, edi
// subtract from start ==> value ebx at alignment
2403 // pav = p - a = (a + b - c) - a = b - c
2404 mov al
, [esi
+ ebx
] // load Prior(x) into al
2405 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
2406 sub eax
, ecx
// subtract Prior(x-bpp)
2407 mov patemp
, eax
// Save pav for later use
2409 // pbv = p - b = (a + b - c) - b = a - c
2410 mov al
, [edi
+ edx
] // load Raw(x-bpp) into al
2411 sub eax
, ecx
// subtract Prior(x-bpp)
2413 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2414 add eax
, patemp
// pcv = pav + pbv
2416 test eax
, 0x80000000
2418 neg eax
// reverse sign of neg values
2420 mov pctemp
, eax
// save pc for later use
2422 test ecx
, 0x80000000
2424 neg ecx
// reverse sign of neg values
2426 mov pbtemp
, ecx
// save pb for later use
2429 test eax
, 0x80000000
2431 neg eax
// reverse sign of neg values
2433 mov patemp
, eax
// save pa for later use
2437 // pa > pb; now test if pb <= pc
2440 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2441 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
2444 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2445 mov cl
, [esi
+ ebx
] // load Prior(x) into cl
2448 // pa <= pb; now test if pa <= pc
2451 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2452 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
2455 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2456 mov cl
, [edi
+ edx
] // load Raw(x-bpp) into cl
2460 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2461 add
[edi
+ ebx
- 1], cl
2467 sub eax
, ebx
// subtract alignment fix
2468 and eax
, 0x00000007 // calc bytes over mult of 8
2469 sub ecx
, eax
// drop over bytes from original length
2472 // Now do the math for the rest of the row
2477 ActiveMask
.use
= 0x0000000000ffffff;
2478 ActiveMaskEnd
.use
= 0xffff000000000000;
2479 ShiftBpp
.use
= 24; // == bpp(3) * 8
2480 ShiftRem
.use
= 40; // == 64 - 24
2487 // PRIME the pump (load the first Raw(x-bpp) data set
2488 movq mm1
, [edi
+ebx
-8]
2490 psrlq mm1
, ShiftRem
// shift last 3 bytes to 1st 3 bytes
2491 movq mm2
, [esi
+ ebx
] // load b=Prior(x)
2492 punpcklbw mm1
, mm0
// Unpack Low bytes of a
2493 movq mm3
, [esi
+ebx
-8] // Prep c=Prior(x-bpp) bytes
2494 punpcklbw mm2
, mm0
// Unpack Low bytes of b
2495 psrlq mm3
, ShiftRem
// shift last 3 bytes to 1st 3 bytes
2496 // pav = p - a = (a + b - c) - a = b - c
2498 punpcklbw mm3
, mm0
// Unpack Low bytes of c
2499 // pbv = p - b = (a + b - c) - b = a - c
2503 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2507 // pa = abs(p-a) = abs(pav)
2508 // pb = abs(p-b) = abs(pbv)
2509 // pc = abs(p-c) = abs(pcv)
2510 pcmpgtw mm0
, mm4
// Create mask pav bytes < 0
2512 pand mm0
, mm4
// Only pav bytes < 0 in mm0
2513 pcmpgtw mm7
, mm5
// Create mask pbv bytes < 0
2515 pand mm7
, mm5
// Only pbv bytes < 0 in mm7
2519 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
2520 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
2526 pcmpgtw mm7
, mm5
// pa > pb?
2528 // use mm7 mask to merge pa & pb
2530 // use mm0 mask copy to merge a & b
2536 // test ((pa <= pb)? pa:pb) <= pc
2537 pcmpgtw mm7
, mm6
// pab > pc?
2544 movq mm3
, [esi
+ ebx
] // load c=Prior(x-bpp)
2545 pand mm7
, ActiveMask
2546 movq mm2
, mm3
// load b=Prior(x) step 1
2547 paddb mm7
, [edi
+ ebx
] // add Paeth predictor with Raw(x)
2548 punpcklbw mm3
, mm0
// Unpack Low bytes of c
2549 movq
[edi
+ ebx
], mm7
// write back updated value
2550 movq mm1
, mm7
// Now mm1 will be used as Raw(x-bpp)
2551 // Now do Paeth for 2nd set of bytes (3-5)
2552 psrlq mm2
, ShiftBpp
// load b=Prior(x) step 2
2553 punpcklbw mm1
, mm0
// Unpack Low bytes of a
2555 punpcklbw mm2
, mm0
// Unpack Low bytes of b
2556 // pbv = p - b = (a + b - c) - b = a - c
2558 // pav = p - a = (a + b - c) - a = b - c
2562 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2563 // pav + pbv = pbv + pav
2567 // pa = abs(p-a) = abs(pav)
2568 // pb = abs(p-b) = abs(pbv)
2569 // pc = abs(p-c) = abs(pcv)
2570 pcmpgtw mm0
, mm5
// Create mask pbv bytes < 0
2571 pcmpgtw mm7
, mm4
// Create mask pav bytes < 0
2572 pand mm0
, mm5
// Only pbv bytes < 0 in mm0
2573 pand mm7
, mm4
// Only pav bytes < 0 in mm7
2579 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
2580 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
2585 pcmpgtw mm7
, mm5
// pa > pb?
2587 // use mm7 mask to merge pa & pb
2589 // use mm0 mask copy to merge a & b
2595 // test ((pa <= pb)? pa:pb) <= pc
2596 pcmpgtw mm7
, mm6
// pab > pc?
2597 movq mm2
, [esi
+ ebx
] // load b=Prior(x)
2604 movq mm3
, mm2
// load c=Prior(x-bpp) step 1
2605 pand mm7
, ActiveMask
2606 punpckhbw mm2
, mm0
// Unpack High bytes of b
2607 psllq mm7
, ShiftBpp
// Shift bytes to 2nd group of 3 bytes
2608 // pav = p - a = (a + b - c) - a = b - c
2610 paddb mm7
, [edi
+ ebx
] // add Paeth predictor with Raw(x)
2611 psllq mm3
, ShiftBpp
// load c=Prior(x-bpp) step 2
2612 movq
[edi
+ ebx
], mm7
// write back updated value
2614 punpckhbw mm3
, mm0
// Unpack High bytes of c
2615 psllq mm1
, ShiftBpp
// Shift bytes
2616 // Now mm1 will be used as Raw(x-bpp)
2617 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2619 punpckhbw mm1
, mm0
// Unpack High bytes of a
2621 // pbv = p - b = (a + b - c) - b = a - c
2623 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2629 // pa = abs(p-a) = abs(pav)
2630 // pb = abs(p-b) = abs(pbv)
2631 // pc = abs(p-c) = abs(pcv)
2632 pcmpgtw mm0
, mm4
// Create mask pav bytes < 0
2633 pcmpgtw mm7
, mm5
// Create mask pbv bytes < 0
2634 pand mm0
, mm4
// Only pav bytes < 0 in mm0
2635 pand mm7
, mm5
// Only pbv bytes < 0 in mm7
2641 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
2642 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
2647 pcmpgtw mm7
, mm5
// pa > pb?
2649 // use mm0 mask copy to merge a & b
2651 // use mm7 mask to merge pa & pb
2657 // test ((pa <= pb)? pa:pb) <= pc
2658 pcmpgtw mm7
, mm6
// pab > pc?
2664 // Step ebx to next set of 8 bytes and repeat loop til done
2666 pand mm1
, ActiveMaskEnd
2667 paddb mm1
, [edi
+ ebx
- 8] // add Paeth predictor with Raw(x)
2670 pxor mm0
, mm0
// pxor does not affect flags
2671 movq
[edi
+ ebx
- 8], mm1
// write back updated value
2672 // mm1 will be used as Raw(x-bpp) next loop
2673 // mm3 ready to be used as Prior(x-bpp) next loop
2683 ActiveMask
.use
= 0x00000000ffffffff;
2684 ActiveMask2
.use
= 0xffffffff00000000;
2685 ShiftBpp
.use
= bpp
<< 3; // == bpp * 8
2686 ShiftRem
.use
= 64 - ShiftBpp
.use
;
2692 // PRIME the pump (load the first Raw(x-bpp) data set
2693 movq mm1
, [edi
+ebx
-8]
2696 // Must shift to position Raw(x-bpp) data
2698 // Do first set of 4 bytes
2699 movq mm3
, [esi
+ebx
-8] // read c=Prior(x-bpp) bytes
2700 punpcklbw mm1
, mm0
// Unpack Low bytes of a
2701 movq mm2
, [esi
+ ebx
] // load b=Prior(x)
2702 punpcklbw mm2
, mm0
// Unpack Low bytes of b
2703 // Must shift to position Prior(x-bpp) data
2705 // pav = p - a = (a + b - c) - a = b - c
2707 punpcklbw mm3
, mm0
// Unpack Low bytes of c
2708 // pbv = p - b = (a + b - c) - b = a - c
2712 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2715 // pa = abs(p-a) = abs(pav)
2716 // pb = abs(p-b) = abs(pbv)
2717 // pc = abs(p-c) = abs(pcv)
2718 pcmpgtw mm0
, mm4
// Create mask pav bytes < 0
2720 pand mm0
, mm4
// Only pav bytes < 0 in mm0
2721 pcmpgtw mm7
, mm5
// Create mask pbv bytes < 0
2723 pand mm7
, mm5
// Only pbv bytes < 0 in mm7
2727 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
2728 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
2734 pcmpgtw mm7
, mm5
// pa > pb?
2736 // use mm7 mask to merge pa & pb
2738 // use mm0 mask copy to merge a & b
2744 // test ((pa <= pb)? pa:pb) <= pc
2745 pcmpgtw mm7
, mm6
// pab > pc?
2752 movq mm3
, [esi
+ ebx
- 8] // load c=Prior(x-bpp)
2753 pand mm7
, ActiveMask
2755 movq mm2
, [esi
+ ebx
] // load b=Prior(x) step 1
2756 paddb mm7
, [edi
+ ebx
] // add Paeth predictor with Raw(x)
2758 movq
[edi
+ ebx
], mm7
// write back updated value
2759 movq mm1
, [edi
+ebx
-8]
2765 punpckhbw mm3
, mm0
// Unpack High bytes of c
2767 // Do second set of 4 bytes
2768 punpckhbw mm2
, mm0
// Unpack High bytes of b
2769 punpckhbw mm1
, mm0
// Unpack High bytes of a
2770 // pav = p - a = (a + b - c) - a = b - c
2772 // pbv = p - b = (a + b - c) - b = a - c
2776 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2779 // pa = abs(p-a) = abs(pav)
2780 // pb = abs(p-b) = abs(pbv)
2781 // pc = abs(p-c) = abs(pcv)
2782 pcmpgtw mm0
, mm4
// Create mask pav bytes < 0
2784 pand mm0
, mm4
// Only pav bytes < 0 in mm0
2785 pcmpgtw mm7
, mm5
// Create mask pbv bytes < 0
2787 pand mm7
, mm5
// Only pbv bytes < 0 in mm7
2791 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
2792 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
2798 pcmpgtw mm7
, mm5
// pa > pb?
2800 // use mm7 mask to merge pa & pb
2802 // use mm0 mask copy to merge a & b
2808 // test ((pa <= pb)? pa:pb) <= pc
2809 pcmpgtw mm7
, mm6
// pab > pc?
2816 // Step ebx to next set of 8 bytes and repeat loop til done
2819 paddb mm1
, [edi
+ ebx
- 8] // add Paeth predictor with Raw(x)
2821 movq
[edi
+ ebx
- 8], mm1
// write back updated value
2822 // mm1 will be used as Raw(x-bpp) next loop
2830 ActiveMask
.use
= 0x00000000ffffffff;
2836 // PRIME the pump (load the first Raw(x-bpp) data set
2837 movq mm1
, [edi
+ebx
-8] // Only time should need to read
2838 // a=Raw(x-bpp) bytes
2840 // Do first set of 4 bytes
2841 movq mm3
, [esi
+ebx
-8] // read c=Prior(x-bpp) bytes
2842 punpckhbw mm1
, mm0
// Unpack High bytes of a
2843 movq mm2
, [esi
+ ebx
] // load b=Prior(x)
2844 punpcklbw mm2
, mm0
// Unpack Low bytes of b
2845 // pav = p - a = (a + b - c) - a = b - c
2847 punpckhbw mm3
, mm0
// Unpack High bytes of c
2848 // pbv = p - b = (a + b - c) - b = a - c
2852 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2855 // pa = abs(p-a) = abs(pav)
2856 // pb = abs(p-b) = abs(pbv)
2857 // pc = abs(p-c) = abs(pcv)
2858 pcmpgtw mm0
, mm4
// Create mask pav bytes < 0
2860 pand mm0
, mm4
// Only pav bytes < 0 in mm0
2861 pcmpgtw mm7
, mm5
// Create mask pbv bytes < 0
2863 pand mm7
, mm5
// Only pbv bytes < 0 in mm7
2867 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
2868 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
2874 pcmpgtw mm7
, mm5
// pa > pb?
2876 // use mm7 mask to merge pa & pb
2878 // use mm0 mask copy to merge a & b
2884 // test ((pa <= pb)? pa:pb) <= pc
2885 pcmpgtw mm7
, mm6
// pab > pc?
2892 movq mm3
, [esi
+ ebx
] // load c=Prior(x-bpp)
2893 pand mm7
, ActiveMask
2894 movq mm2
, mm3
// load b=Prior(x) step 1
2895 paddb mm7
, [edi
+ ebx
] // add Paeth predictor with Raw(x)
2896 punpcklbw mm3
, mm0
// Unpack Low bytes of c
2897 movq
[edi
+ ebx
], mm7
// write back updated value
2898 movq mm1
, mm7
// Now mm1 will be used as Raw(x-bpp)
2899 // Do second set of 4 bytes
2900 punpckhbw mm2
, mm0
// Unpack High bytes of b
2901 punpcklbw mm1
, mm0
// Unpack Low bytes of a
2902 // pav = p - a = (a + b - c) - a = b - c
2904 // pbv = p - b = (a + b - c) - b = a - c
2908 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2911 // pa = abs(p-a) = abs(pav)
2912 // pb = abs(p-b) = abs(pbv)
2913 // pc = abs(p-c) = abs(pcv)
2914 pcmpgtw mm0
, mm4
// Create mask pav bytes < 0
2916 pand mm0
, mm4
// Only pav bytes < 0 in mm0
2917 pcmpgtw mm7
, mm5
// Create mask pbv bytes < 0
2919 pand mm7
, mm5
// Only pbv bytes < 0 in mm7
2923 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
2924 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
2930 pcmpgtw mm7
, mm5
// pa > pb?
2932 // use mm7 mask to merge pa & pb
2934 // use mm0 mask copy to merge a & b
2940 // test ((pa <= pb)? pa:pb) <= pc
2941 pcmpgtw mm7
, mm6
// pab > pc?
2948 // Step ebx to next set of 8 bytes and repeat loop til done
2951 paddb mm1
, [edi
+ ebx
- 8] // add Paeth predictor with Raw(x)
2953 movq
[edi
+ ebx
- 8], mm1
// write back updated value
2954 // mm1 will be used as Raw(x-bpp) next loop
2961 ActiveMask
.use
= 0x00000000ffffffff;
2967 // PRIME the pump (load the first Raw(x-bpp) data set
2968 movq mm1
, [edi
+ebx
-8] // Only time should need to read
2969 // a=Raw(x-bpp) bytes
2971 // Do first set of 4 bytes
2972 movq mm3
, [esi
+ebx
-8] // read c=Prior(x-bpp) bytes
2973 punpcklbw mm1
, mm0
// Unpack Low bytes of a
2974 movq mm2
, [esi
+ ebx
] // load b=Prior(x)
2975 punpcklbw mm2
, mm0
// Unpack Low bytes of b
2976 // pav = p - a = (a + b - c) - a = b - c
2978 punpcklbw mm3
, mm0
// Unpack Low bytes of c
2979 // pbv = p - b = (a + b - c) - b = a - c
2983 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2986 // pa = abs(p-a) = abs(pav)
2987 // pb = abs(p-b) = abs(pbv)
2988 // pc = abs(p-c) = abs(pcv)
2989 pcmpgtw mm0
, mm4
// Create mask pav bytes < 0
2991 pand mm0
, mm4
// Only pav bytes < 0 in mm0
2992 pcmpgtw mm7
, mm5
// Create mask pbv bytes < 0
2994 pand mm7
, mm5
// Only pbv bytes < 0 in mm7
2998 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
2999 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
3005 pcmpgtw mm7
, mm5
// pa > pb?
3007 // use mm7 mask to merge pa & pb
3009 // use mm0 mask copy to merge a & b
3015 // test ((pa <= pb)? pa:pb) <= pc
3016 pcmpgtw mm7
, mm6
// pab > pc?
3023 movq mm3
, [esi
+ebx
-8] // read c=Prior(x-bpp) bytes
3024 pand mm7
, ActiveMask
3025 movq mm2
, [esi
+ ebx
] // load b=Prior(x)
3026 paddb mm7
, [edi
+ ebx
] // add Paeth predictor with Raw(x)
3027 punpckhbw mm3
, mm0
// Unpack High bytes of c
3028 movq
[edi
+ ebx
], mm7
// write back updated value
3029 movq mm1
, [edi
+ebx
-8] // read a=Raw(x-bpp) bytes
3031 // Do second set of 4 bytes
3032 punpckhbw mm2
, mm0
// Unpack High bytes of b
3033 punpckhbw mm1
, mm0
// Unpack High bytes of a
3034 // pav = p - a = (a + b - c) - a = b - c
3036 // pbv = p - b = (a + b - c) - b = a - c
3040 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3043 // pa = abs(p-a) = abs(pav)
3044 // pb = abs(p-b) = abs(pbv)
3045 // pc = abs(p-c) = abs(pcv)
3046 pcmpgtw mm0
, mm4
// Create mask pav bytes < 0
3048 pand mm0
, mm4
// Only pav bytes < 0 in mm0
3049 pcmpgtw mm7
, mm5
// Create mask pbv bytes < 0
3051 pand mm7
, mm5
// Only pbv bytes < 0 in mm7
3055 pcmpgtw mm0
, mm6
// Create mask pcv bytes < 0
3056 pand mm0
, mm6
// Only pcv bytes < 0 in mm0
3062 pcmpgtw mm7
, mm5
// pa > pb?
3064 // use mm7 mask to merge pa & pb
3066 // use mm0 mask copy to merge a & b
3072 // test ((pa <= pb)? pa:pb) <= pc
3073 pcmpgtw mm7
, mm6
// pab > pc?
3080 // Step ebx to next set of 8 bytes and repeat loop til done
3083 paddb mm1
, [edi
+ ebx
- 8] // add Paeth predictor with Raw(x)
3085 movq
[edi
+ ebx
- 8], mm1
// write back updated value
3086 // mm1 will be used as Raw(x-bpp) next loop
3102 // Do Paeth decode for remaining bytes
3104 xor ecx
, ecx
// zero ecx before using cl & cx in loop below
3105 sub edx
, bpp
// Set edx = ebx - bpp
3108 // pav = p - a = (a + b - c) - a = b - c
3109 mov al
, [esi
+ ebx
] // load Prior(x) into al
3110 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
3111 sub eax
, ecx
// subtract Prior(x-bpp)
3112 mov patemp
, eax
// Save pav for later use
3114 // pbv = p - b = (a + b - c) - b = a - c
3115 mov al
, [edi
+ edx
] // load Raw(x-bpp) into al
3116 sub eax
, ecx
// subtract Prior(x-bpp)
3118 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3119 add eax
, patemp
// pcv = pav + pbv
3121 test eax
, 0x80000000
3123 neg eax
// reverse sign of neg values
3125 mov pctemp
, eax
// save pc for later use
3127 test ecx
, 0x80000000
3129 neg ecx
// reverse sign of neg values
3131 mov pbtemp
, ecx
// save pb for later use
3134 test eax
, 0x80000000
3136 neg eax
// reverse sign of neg values
3138 mov patemp
, eax
// save pa for later use
3142 // pa > pb; now test if pb <= pc
3145 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3146 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
3149 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3150 mov cl
, [esi
+ ebx
] // load Prior(x) into cl
3153 // pa <= pb; now test if pa <= pc
3156 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3157 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
3160 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3161 mov cl
, [edi
+ edx
] // load Raw(x-bpp) into cl
3165 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3166 add
[edi
+ ebx
- 1], cl
3172 return; // No need to go further with this one
3173 } // end switch ( bpp )
3176 // MMX acceleration complete now do clean-up
3177 // Check if any remaining bytes left to decode
3183 // Do Paeth decode for remaining bytes
3185 xor ecx
, ecx
// zero ecx before using cl & cx in loop below
3186 sub edx
, bpp
// Set edx = ebx - bpp
3189 // pav = p - a = (a + b - c) - a = b - c
3190 mov al
, [esi
+ ebx
] // load Prior(x) into al
3191 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
3192 sub eax
, ecx
// subtract Prior(x-bpp)
3193 mov patemp
, eax
// Save pav for later use
3195 // pbv = p - b = (a + b - c) - b = a - c
3196 mov al
, [edi
+ edx
] // load Raw(x-bpp) into al
3197 sub eax
, ecx
// subtract Prior(x-bpp)
3199 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3200 add eax
, patemp
// pcv = pav + pbv
3202 test eax
, 0x80000000
3204 neg eax
// reverse sign of neg values
3206 mov pctemp
, eax
// save pc for later use
3208 test ecx
, 0x80000000
3210 neg ecx
// reverse sign of neg values
3212 mov pbtemp
, ecx
// save pb for later use
3215 test eax
, 0x80000000
3217 neg eax
// reverse sign of neg values
3219 mov patemp
, eax
// save pa for later use
3223 // pa > pb; now test if pb <= pc
3226 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3227 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
3230 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3231 mov cl
, [esi
+ ebx
] // load Prior(x) into cl
3234 // pa <= pb; now test if pa <= pc
3237 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3238 mov cl
, [esi
+ edx
] // load Prior(x-bpp) into cl
3241 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3242 mov cl
, [edi
+ edx
] // load Raw(x-bpp) into cl
3246 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3247 add
[edi
+ ebx
- 1], cl
3251 emms
// End MMX instructions; prep for possible FP instrs.
3255 // Optimized code for PNG Sub filter decoder
3257 png_read_filter_row_mmx_sub(png_row_infop row_info
, png_bytep row
)
3261 png_uint_32 FullLength
;
3262 png_uint_32 MMXLength
;
3265 bpp
= (row_info
->pixel_depth
+ 7) >> 3; // Get # bytes per pixel
3266 FullLength
= row_info
->rowbytes
- bpp
; // # of bytes to filter
3269 mov esi
, edi
// lp = row
3270 add edi
, bpp
// rp = row + bpp
3272 // get # of bytes to alignment
3273 mov diff
, edi
// take start of row
3274 add diff
, 0xf // add 7 + 8 to incr past
3275 // alignment boundary
3277 and diff
, 0xfffffff8 // mask to alignment boundary
3278 sub diff
, edi
// subtract from start ==> value
3291 sub edx
, ebx
// subtract alignment fix
3292 and edx
, 0x00000007 // calc bytes over mult of 8
3293 sub ecx
, edx
// drop over bytes from length
3297 // Now do the math for the rest of the row
3302 ActiveMask
.use
= 0x0000ffffff000000;
3303 ShiftBpp
.use
= 24; // == 3 * 8
3304 ShiftRem
.use
= 40; // == 64 - 24
3307 movq mm7
, ActiveMask
// Load ActiveMask for 2nd active byte group
3308 mov esi
, edi
// lp = row
3309 add edi
, bpp
// rp = row + bpp
3312 psllq mm6
, ShiftBpp
// Move mask in mm6 to cover 3rd active
3314 // PRIME the pump (load the first Raw(x-bpp) data set
3315 movq mm1
, [edi
+ebx
-8]
3317 psrlq mm1
, ShiftRem
// Shift data for adding 1st bpp bytes
3318 // no need for mask; shift clears inactive bytes
3319 // Add 1st active group
3322 // Add 2nd active group
3323 movq mm1
, mm0
// mov updated Raws to mm1
3324 psllq mm1
, ShiftBpp
// shift data to position correctly
3325 pand mm1
, mm7
// mask to use only 2nd active group
3327 // Add 3rd active group
3328 movq mm1
, mm0
// mov updated Raws to mm1
3329 psllq mm1
, ShiftBpp
// shift data to position correctly
3330 pand mm1
, mm6
// mask to use only 3rd active group
3334 movq
[edi
+ebx
-8], mm0
// Write updated Raws back to array
3335 // Prep for doing 1st add at top of loop
3344 // Placed here just in case this is a duplicate of the
3345 // non-MMX code for the SUB filter in png_read_filter_row below
3350 // bpp = (row_info->pixel_depth + 7) >> 3;
3351 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3352 // i < row_info->rowbytes; i++, rp++, lp++)
3354 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3361 mov esi
, edi
// lp = row
3363 add edi
, bpp
// rp = row + bpp
3380 ShiftBpp
.use
= bpp
<< 3;
3381 ShiftRem
.use
= 64 - ShiftBpp
.use
;
3385 mov esi
, edi
// lp = row
3386 add edi
, bpp
// rp = row + bpp
3387 // PRIME the pump (load the first Raw(x-bpp) data set)
3388 movq mm1
, [edi
+ebx
-8]
3390 psrlq mm1
, ShiftRem
// Shift data for adding 1st bpp bytes
3391 // no need for mask; shift clears inactive bytes
3394 // Add 2nd active group
3395 movq mm1
, mm0
// mov updated Raws to mm1
3396 psllq mm1
, ShiftBpp
// shift data to position correctly
3397 // there is no need for any mask
3398 // since shift clears inactive bits/bytes
3402 movq
[edi
+ebx
-8], mm0
3403 movq mm1
, mm0
// Prep for doing 1st add at top of loop
3411 ActiveMask
.use
= 0x00000000ffff0000;
3412 ShiftBpp
.use
= 16; // == 2 * 8
3413 ShiftRem
.use
= 48; // == 64 - 16
3415 movq mm7
, ActiveMask
// Load ActiveMask for 2nd active byte group
3419 psllq mm6
, ShiftBpp
// Move mask in mm6 to cover 3rd active
3421 mov esi
, edi
// lp = row
3423 add edi
, bpp
// rp = row + bpp
3424 psllq mm5
, ShiftBpp
// Move mask in mm5 to cover 4th active
3426 // PRIME the pump (load the first Raw(x-bpp) data set)
3427 movq mm1
, [edi
+ebx
-8]
3429 // Add 1st active group
3430 psrlq mm1
, ShiftRem
// Shift data for adding 1st bpp bytes
3431 // no need for mask; shift clears inactive
3435 // Add 2nd active group
3436 movq mm1
, mm0
// mov updated Raws to mm1
3437 psllq mm1
, ShiftBpp
// shift data to position correctly
3438 pand mm1
, mm7
// mask to use only 2nd active group
3440 // Add 3rd active group
3441 movq mm1
, mm0
// mov updated Raws to mm1
3442 psllq mm1
, ShiftBpp
// shift data to position correctly
3443 pand mm1
, mm6
// mask to use only 3rd active group
3445 // Add 4th active group
3446 movq mm1
, mm0
// mov updated Raws to mm1
3447 psllq mm1
, ShiftBpp
// shift data to position correctly
3448 pand mm1
, mm5
// mask to use only 4th active group
3452 movq
[edi
+ebx
-8], mm0
// Write updated Raws back to array
3453 movq mm1
, mm0
// Prep for doing 1st add at top of loop
3463 mov esi
, edi
// lp = row
3464 add edi
, bpp
// rp = row + bpp
3466 movq mm7
, [edi
+ebx
-8] // PRIME the pump (load the first
3467 // Raw(x-bpp) data set)
3468 and ecx
, 0x0000003f // calc bytes over mult of 64
3470 movq mm0
, [edi
+ebx
] // Load Sub(x) for 1st 8 bytes
3472 movq mm1
, [edi
+ebx
+8] // Load Sub(x) for 2nd 8 bytes
3473 movq
[edi
+ebx
], mm0
// Write Raw(x) for 1st 8 bytes
3474 // Now mm0 will be used as Raw(x-bpp) for
3475 // the 2nd group of 8 bytes. This will be
3476 // repeated for each group of 8 bytes with
3477 // the 8th group being used as the Raw(x-bpp)
3478 // for the 1st group of the next loop.
3480 movq mm2
, [edi
+ebx
+16] // Load Sub(x) for 3rd 8 bytes
3481 movq
[edi
+ebx
+8], mm1
// Write Raw(x) for 2nd 8 bytes
3483 movq mm3
, [edi
+ebx
+24] // Load Sub(x) for 4th 8 bytes
3484 movq
[edi
+ebx
+16], mm2
// Write Raw(x) for 3rd 8 bytes
3486 movq mm4
, [edi
+ebx
+32] // Load Sub(x) for 5th 8 bytes
3487 movq
[edi
+ebx
+24], mm3
// Write Raw(x) for 4th 8 bytes
3489 movq mm5
, [edi
+ebx
+40] // Load Sub(x) for 6th 8 bytes
3490 movq
[edi
+ebx
+32], mm4
// Write Raw(x) for 5th 8 bytes
3492 movq mm6
, [edi
+ebx
+48] // Load Sub(x) for 7th 8 bytes
3493 movq
[edi
+ebx
+40], mm5
// Write Raw(x) for 6th 8 bytes
3495 movq mm7
, [edi
+ebx
+56] // Load Sub(x) for 8th 8 bytes
3496 movq
[edi
+ebx
+48], mm6
// Write Raw(x) for 7th 8 bytes
3500 movq
[edi
+ebx
-8], mm7
// Write Raw(x) for 8th 8 bytes
3509 movq
[edi
+ebx
-8], mm0
// use -8 to offset early add to ebx
3510 movq mm7
, mm0
// Move calculated Raw(x) data to mm7 to
3511 // be the new Raw(x-bpp) for the next loop
3518 default: // bpp greater than 8 bytes
3523 mov esi
, edi
// lp = row
3524 add edi
, bpp
// rp = row + bpp
3531 movq
[edi
+ebx
-8], mm0
// mov does not affect flags; -8 to offset
3538 } // end switch ( bpp )
3545 mov esi
, edi
// lp = row
3547 add edi
, bpp
// rp = row + bpp
3555 emms
// End MMX instructions; prep for possible FP instrs.
3559 // Optimized code for PNG Up filter decoder
3561 png_read_filter_row_mmx_up(png_row_infop row_info
, png_bytep row
,
3565 len
= row_info
->rowbytes
; // # of bytes to filter
3568 // get # of bytes to alignment
3583 mov
[edi
+ ebx
-1], al
// mov does not affect flags; -1 to offset inc ebx
3588 sub edx
, ebx
// subtract alignment fix
3589 and edx
, 0x0000003f // calc bytes over mult of 64
3590 sub ecx
, edx
// drop over bytes from length
3591 // Unrolled loop - use all MMX registers and interleave to reduce
3592 // number of branch instructions (loops) and reduce partial stalls
3596 movq mm3
, [esi
+ebx
+8]
3598 movq mm2
, [edi
+ebx
+8]
3601 movq mm5
, [esi
+ebx
+16]
3602 movq
[edi
+ebx
+8], mm2
3603 movq mm4
, [edi
+ebx
+16]
3604 movq mm7
, [esi
+ebx
+24]
3606 movq mm6
, [edi
+ebx
+24]
3607 movq
[edi
+ebx
+16], mm4
3609 movq mm1
, [esi
+ebx
+32]
3610 movq
[edi
+ebx
+24], mm6
3611 movq mm0
, [edi
+ebx
+32]
3612 movq mm3
, [esi
+ebx
+40]
3614 movq mm2
, [edi
+ebx
+40]
3615 movq
[edi
+ebx
+32], mm0
3617 movq mm5
, [esi
+ebx
+48]
3618 movq
[edi
+ebx
+40], mm2
3619 movq mm4
, [edi
+ebx
+48]
3620 movq mm7
, [esi
+ebx
+56]
3622 movq mm6
, [edi
+ebx
+56]
3623 movq
[edi
+ebx
+48], mm4
3627 movq
[edi
+ebx
-8], mm6
// (+56)movq does not affect flags;
3628 // -8 to offset add ebx
3631 cmp edx
, 0 // Test for bytes over mult of 64
3635 // 2 lines added by lcreeve at netins.net
3636 // (mail 11 Jul 98 in png-implement list)
3637 cmp edx
, 8 //test for less than 8 bytes
3642 and edx
, 0x00000007 // calc bytes over mult of 8
3643 sub ecx
, edx
// drop over bytes from length
3645 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3652 movq
[edi
+ebx
-8], mm0
// movq does not affect flags; -8 to offset add ebx
3654 cmp edx
, 0 // Test for bytes over mult of 8
3658 add ecx
, edx
// move over byte count into counter
3659 // Loop using x86 registers to update remaining bytes
3665 mov
[edi
+ ebx
-1], al
// mov does not affect flags; -1 to offset inc ebx
3668 // Conversion of filtered row completed
3669 emms
// End MMX instructions; prep for possible FP instrs.
3674 // Optimized png_read_filter_row routines
3676 png_read_filter_row(png_structp png_ptr
, png_row_infop row_info
, png_bytep
3677 row
, png_bytep prev_row
, int filter
)
3683 if (mmx_supported
== 2) {
3684 #if !defined(PNG_1_0_X)
3685 /* this should have happened in png_init_mmx_flags() already */
3686 png_warning(png_ptr
, "asm_flags may not have been initialized");
3692 png_debug(1, "in png_read_filter_row\n");
3695 case 0: sprintf(filnm
, "none");
3697 #if !defined(PNG_1_0_X)
3698 case 1: sprintf(filnm
, "sub-%s",
3699 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
)? "MMX" : "x86");
3701 case 2: sprintf(filnm
, "up-%s",
3702 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
)? "MMX" : "x86");
3704 case 3: sprintf(filnm
, "avg-%s",
3705 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
)? "MMX" : "x86");
3707 case 4: sprintf(filnm
, "Paeth-%s",
3708 (png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
)? "MMX":"x86");
3711 case 1: sprintf(filnm
, "sub");
3713 case 2: sprintf(filnm
, "up");
3715 case 3: sprintf(filnm
, "avg");
3717 case 4: sprintf(filnm
, "Paeth");
3720 default: sprintf(filnm
, "unknw");
3723 png_debug2(0,"row=%5d, %s, ", png_ptr
->row_number
, filnm
);
3724 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info
->pixel_depth
,
3725 (int)((row_info
->pixel_depth
+ 7) >> 3));
3726 png_debug1(0,"len=%8d, ", row_info
->rowbytes
);
3727 #endif /* PNG_DEBUG */
3731 case PNG_FILTER_VALUE_NONE
:
3734 case PNG_FILTER_VALUE_SUB
:
3736 #if !defined(PNG_1_0_X)
3737 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_SUB
) &&
3738 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
3739 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
3744 png_read_filter_row_mmx_sub(row_info
, row
);
3749 png_uint_32 istop
= row_info
->rowbytes
;
3750 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
3751 png_bytep rp
= row
+ bpp
;
3754 for (i
= bpp
; i
< istop
; i
++)
3756 *rp
= (png_byte
)(((int)(*rp
) + (int)(*lp
++)) & 0xff);
3763 case PNG_FILTER_VALUE_UP
:
3765 #if !defined(PNG_1_0_X)
3766 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_UP
) &&
3767 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
3768 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
3773 png_read_filter_row_mmx_up(row_info
, row
, prev_row
);
3778 png_uint_32 istop
= row_info
->rowbytes
;
3780 png_bytep pp
= prev_row
;
3782 for (i
= 0; i
< istop
; ++i
)
3784 *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
3791 case PNG_FILTER_VALUE_AVG
:
3793 #if !defined(PNG_1_0_X)
3794 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_AVG
) &&
3795 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
3796 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
3801 png_read_filter_row_mmx_avg(row_info
, row
, prev_row
);
3807 png_bytep pp
= prev_row
;
3809 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
3810 png_uint_32 istop
= row_info
->rowbytes
- bpp
;
3812 for (i
= 0; i
< bpp
; i
++)
3814 *rp
= (png_byte
)(((int)(*rp
) +
3815 ((int)(*pp
++) >> 1)) & 0xff);
3819 for (i
= 0; i
< istop
; i
++)
3821 *rp
= (png_byte
)(((int)(*rp
) +
3822 ((int)(*pp
++ + *lp
++) >> 1)) & 0xff);
3829 case PNG_FILTER_VALUE_PAETH
:
3831 #if !defined(PNG_1_0_X)
3832 if ((png_ptr
->asm_flags
& PNG_ASM_FLAG_MMX_READ_FILTER_PAETH
) &&
3833 (row_info
->pixel_depth
>= png_ptr
->mmx_bitdepth_threshold
) &&
3834 (row_info
->rowbytes
>= png_ptr
->mmx_rowbytes_threshold
))
3839 png_read_filter_row_mmx_paeth(row_info
, row
, prev_row
);
3845 png_bytep pp
= prev_row
;
3847 png_bytep cp
= prev_row
;
3848 png_uint_32 bpp
= (row_info
->pixel_depth
+ 7) >> 3;
3849 png_uint_32 istop
=row_info
->rowbytes
- bpp
;
3851 for (i
= 0; i
< bpp
; i
++)
3853 *rp
= (png_byte
)(((int)(*rp
) + (int)(*pp
++)) & 0xff);
3857 for (i
= 0; i
< istop
; i
++) // use leftover rp,pp
3859 int a
, b
, c
, pa
, pb
, pc
, p
;
3873 pa
= p
< 0 ? -p
: p
;
3874 pb
= pc
< 0 ? -pc
: pc
;
3875 pc
= (p
+ pc
) < 0 ? -(p
+ pc
) : p
+ pc
;
3879 if (pa <= pb && pa <= pc)
3887 p
= (pa
<= pb
&& pa
<=pc
) ? a
: (pb
<= pc
) ? b
: c
;
3889 *rp
= (png_byte
)(((int)(*rp
) + p
) & 0xff);
3897 png_warning(png_ptr
, "Ignoring bad row filter type");
3903 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */