1 /* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
2 * version for AMD64 on Windows using Microsoft C compiler
4 * Copyright (C) 1995-2003 Mark Adler
5 * For conditions of distribution and use, see copyright notice in zlib.h
7 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
8 * Please use the copyright conditions above.
10 * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
12 * inffas8664.c calls the function inffas8664fnc in inffasx64.asm
13 * inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
15 * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
16 * slightly quicker on x86 systems because, instead of using rep movsb to copy
17 * data, it uses rep movsw, which moves data in 2-byte chunks instead of single
18 * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
19 * from http://fedora.linux.duke.edu/fc1_x86_64
20 * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
21 * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
22 * when decompressing mozilla-source-1.3.tar.gz.
24 * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
25 * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
26 * the moment. I have successfully compiled and tested this code with gcc2.96,
27 * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
28 * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
29 * enabled. I will attempt to merge the MMX code into this version. Newer
30 * versions of this and inffast.S can be found at
31 * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
41 /* Mark Adler's comments from inffast.c: */
44 Decode literal, length, and distance codes and write out the resulting
45 literal and match bytes until either not enough input or output is
46 available, an end-of-block is encountered, or a data error is encountered.
47 When large enough input and output buffers are supplied to inflate(), for
48 example, a 16K input buffer and a 64K output buffer, more than 95% of the
49 inflate execution time is spent in this routine.
55 strm->avail_out >= 258
56 start >= strm->avail_out
59 On return, state->mode is one of:
61 LEN -- not enough output space or not enough available input remains
62 TYPE -- reached end of block code, inflate() to interpret next block
63 BAD -- error in block data
67 - The maximum input bits used by a length/distance pair is 15 bits for the
68 length code, 5 bits for the length extra, 15 bits for the distance code,
69 and 13 bits for the distance extra. This totals 48 bits, or six bytes.
70 Therefore if strm->avail_in >= 6, then there is enough input to avoid
71 checking for available input while decoding.
73 - The maximum number of bytes that a single length/distance pair can output
74 is 258 bytes, which is the maximum length that can be coded. inflate_fast()
75 requires strm->avail_out >= 258 for each loop to avoid checking for output space.
81 typedef struct inffast_ar
{
82 /* 64 32 x86 x86_64 */
83 /* ar offset register */
84 /* 0 0 */ void *esp
; /* esp save */
85 /* 8 4 */ void *ebp
; /* ebp save */
86 /* 16 8 */ unsigned char FAR
*in
; /* esi rsi local strm->next_in */
87 /* 24 12 */ unsigned char FAR
*last
; /* r9 while in < last */
88 /* 32 16 */ unsigned char FAR
*out
; /* edi rdi local strm->next_out */
89 /* 40 20 */ unsigned char FAR
*beg
; /* inflate()'s init next_out */
90 /* 48 24 */ unsigned char FAR
*end
; /* r10 while out < end */
91 /* 56 28 */ unsigned char FAR
*window
;/* size of window, wsize!=0 */
92 /* 64 32 */ code
const FAR
*lcode
; /* ebp rbp local strm->lencode */
93 /* 72 36 */ code
const FAR
*dcode
; /* r11 local strm->distcode */
94 /* 80 40 */ size_t /*unsigned long */hold
; /* edx rdx local strm->hold */
95 /* 88 44 */ unsigned bits
; /* ebx rbx local strm->bits */
96 /* 92 48 */ unsigned wsize
; /* window size */
97 /* 96 52 */ unsigned write
; /* window write index */
98 /*100 56 */ unsigned lmask
; /* r12 mask for lcode */
99 /*104 60 */ unsigned dmask
; /* r13 mask for dcode */
100 /*108 64 */ unsigned len
; /* r14 match length */
101 /*112 68 */ unsigned dist
; /* r15 match distance */
102 /*116 72 */ unsigned status
; /* set when state chng*/
106 void inflate_fast(strm
, start
)
108 unsigned start
; /* inflate()'s starting value for strm->avail_out */
110 struct inflate_state FAR
*state
;
112 void inffas8664fnc(struct inffast_ar
* par
);
116 #if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
117 #define PAD_AVAIL_IN 6
118 #define PAD_AVAIL_OUT 258
120 #define PAD_AVAIL_IN 5
121 #define PAD_AVAIL_OUT 257
124 /* copy state to local variables */
125 state
= (struct inflate_state FAR
*)strm
->state
;
127 ar
.in
= strm
->next_in
;
128 ar
.last
= ar
.in
+ (strm
->avail_in
- PAD_AVAIL_IN
);
129 ar
.out
= strm
->next_out
;
130 ar
.beg
= ar
.out
- (start
- strm
->avail_out
);
131 ar
.end
= ar
.out
+ (strm
->avail_out
- PAD_AVAIL_OUT
);
132 ar
.wsize
= state
->wsize
;
133 ar
.write
= state
->wnext
;
134 ar
.window
= state
->window
;
135 ar
.hold
= state
->hold
;
136 ar
.bits
= state
->bits
;
137 ar
.lcode
= state
->lencode
;
138 ar
.dcode
= state
->distcode
;
139 ar
.lmask
= (1U << state
->lenbits
) - 1;
140 ar
.dmask
= (1U << state
->distbits
) - 1;
142 /* decode literals and length/distances until end-of-block or not enough
143 input data or output space */
145 /* align in on 1/2 hold size boundary */
146 while (((size_t)(void *)ar
.in
& (sizeof(ar
.hold
) / 2 - 1)) != 0) {
147 ar
.hold
+= (unsigned long)*ar
.in
++ << ar
.bits
;
155 strm
->msg
= "invalid literal/length code";
156 else if (ar
.status
== 3)
157 strm
->msg
= "invalid distance code";
159 strm
->msg
= "invalid distance too far back";
162 else if ( ar
.status
== 1 ) {
166 /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
167 ar
.len
= ar
.bits
>> 3;
169 ar
.bits
-= ar
.len
<< 3;
170 ar
.hold
&= (1U << ar
.bits
) - 1;
172 /* update state and return */
173 strm
->next_in
= ar
.in
;
174 strm
->next_out
= ar
.out
;
175 strm
->avail_in
= (unsigned)(ar
.in
< ar
.last
?
176 PAD_AVAIL_IN
+ (ar
.last
- ar
.in
) :
177 PAD_AVAIL_IN
- (ar
.in
- ar
.last
));
178 strm
->avail_out
= (unsigned)(ar
.out
< ar
.end
?
179 PAD_AVAIL_OUT
+ (ar
.end
- ar
.out
) :
180 PAD_AVAIL_OUT
- (ar
.out
- ar
.end
));
181 state
->hold
= (unsigned long)ar
.hold
;
182 state
->bits
= ar
.bits
;