1 /* $NetBSD: inffas8664.c,v 1.1.1.1 2006/01/14 20:10:55 christos Exp $ */
3 /* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
4 * version for AMD64 on Windows using Microsoft C compiler
6 * Copyright (C) 1995-2003 Mark Adler
7 * For conditions of distribution and use, see copyright notice in zlib.h
9 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
10 * Please use the copyright conditions above.
12 * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
14 * inffas8664.c calls the function inffas8664fnc in inffasx64.asm
15 * inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
17 * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
18 * slightly quicker on x86 systems because, instead of using rep movsb to copy
19 * data, it uses rep movsw, which moves data in 2-byte chunks instead of single
20 * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
21 * from http://fedora.linux.duke.edu/fc1_x86_64
22 * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
23 * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
24 * when decompressing mozilla-source-1.3.tar.gz.
26 * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
27 * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
28 * the moment. I have successfully compiled and tested this code with gcc2.96,
29 * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
30 * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
31 * enabled. I will attempt to merge the MMX code into this version. Newer
32 * versions of this and inffast.S can be found at
33 * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
43 /* Mark Adler's comments from inffast.c: */
46 Decode literal, length, and distance codes and write out the resulting
47 literal and match bytes until either not enough input or output is
48 available, an end-of-block is encountered, or a data error is encountered.
49 When large enough input and output buffers are supplied to inflate(), for
50 example, a 16K input buffer and a 64K output buffer, more than 95% of the
51 inflate execution time is spent in this routine.
57 strm->avail_out >= 258
58 start >= strm->avail_out
61 On return, state->mode is one of:
63 LEN -- ran out of enough output space or enough available input
64 TYPE -- reached end of block code, inflate() to interpret next block
65 BAD -- error in block data
69 - The maximum input bits used by a length/distance pair is 15 bits for the
70 length code, 5 bits for the length extra, 15 bits for the distance code,
71 and 13 bits for the distance extra. This totals 48 bits, or six bytes.
72 Therefore if strm->avail_in >= 6, then there is enough input to avoid
73 checking for available input while decoding.
75 - The maximum bytes that a single length/distance pair can output is 258
76 bytes, which is the maximum length that can be coded. inflate_fast()
77 requires strm->avail_out >= 258 for each loop to avoid checking for
/* Argument record filled in by inflate_fast() and handed by pointer to the
 * assembler routine inffas8664fnc().  In each field comment the two leading
 * numbers are the field's byte offset inside the record ("ar offset") for the
 * 64-bit and the 32-bit layout respectively; the trailing comment names the
 * x86 / x86_64 register associated with the field and what the field holds.
 * NOTE(review): the assembler side presumably hard-codes these offsets, so
 * field order and types should not be changed without updating it -- confirm
 * against inffasx64.asm. */
83 typedef struct inffast_ar
{
84 /* 64 32 x86 x86_64 */
85 /* ar offset register */
86 /* 0 0 */ void *esp
; /* esp save */
87 /* 8 4 */ void *ebp
; /* ebp save */
88 /* 16 8 */ unsigned char FAR
*in
; /* esi rsi local strm->next_in */
89 /* 24 12 */ unsigned char FAR
*last
; /* r9 while in < last; set to in + (avail_in - PAD_AVAIL_IN) */
90 /* 32 16 */ unsigned char FAR
*out
; /* edi rdi local strm->next_out */
91 /* 40 20 */ unsigned char FAR
*beg
; /* inflate()'s initial next_out: out - (start - avail_out) */
92 /* 48 24 */ unsigned char FAR
*end
; /* r10 while out < end; set to out + (avail_out - PAD_AVAIL_OUT) */
93 /* 56 28 */ unsigned char FAR
*window
;/* pointer copied from state->window (not a size); NOTE(review): presumably only meaningful when wsize != 0 */
94 /* 64 32 */ code
const FAR
*lcode
; /* ebp rbp local strm->lencode */
95 /* 72 36 */ code
const FAR
*dcode
; /* r11 local strm->distcode */
96 /* 80 40 */ size_t /*unsigned long */hold
; /* edx rdx local strm->hold */
97 /* 88 44 */ unsigned bits
; /* ebx rbx local strm->bits */
98 /* 92 48 */ unsigned wsize
; /* window size, copied from state->wsize */
99 /* 96 52 */ unsigned write
; /* window write index, copied from state->write */
100 /*100 56 */ unsigned lmask
; /* r12 mask for lcode: (1U << state->lenbits) - 1 */
101 /*104 60 */ unsigned dmask
; /* r13 mask for dcode: (1U << state->distbits) - 1 */
102 /*108 64 */ unsigned len
; /* r14 match length */
103 /*112 68 */ unsigned dist
; /* r15 match distance */
104 /*116 72 */ unsigned status
; /* set when state changes; inspected by inflate_fast() after the asm call */
108 void inflate_fast(strm
, start
)
110 unsigned start
; /* inflate()'s starting value for strm->avail_out */
112 struct inflate_state FAR
*state
;
114 void inffas8664fnc(struct inffast_ar
* par
);
118 #if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
119 #define PAD_AVAIL_IN 6
120 #define PAD_AVAIL_OUT 258
122 #define PAD_AVAIL_IN 5
123 #define PAD_AVAIL_OUT 257
126 /* copy state to local variables */
127 state
= (struct inflate_state FAR
*)strm
->state
;
129 ar
.in
= strm
->next_in
;
130 ar
.last
= ar
.in
+ (strm
->avail_in
- PAD_AVAIL_IN
);
131 ar
.out
= strm
->next_out
;
132 ar
.beg
= ar
.out
- (start
- strm
->avail_out
);
133 ar
.end
= ar
.out
+ (strm
->avail_out
- PAD_AVAIL_OUT
);
134 ar
.wsize
= state
->wsize
;
135 ar
.write
= state
->write
;
136 ar
.window
= state
->window
;
137 ar
.hold
= state
->hold
;
138 ar
.bits
= state
->bits
;
139 ar
.lcode
= state
->lencode
;
140 ar
.dcode
= state
->distcode
;
141 ar
.lmask
= (1U << state
->lenbits
) - 1;
142 ar
.dmask
= (1U << state
->distbits
) - 1;
144 /* decode literals and length/distances until end-of-block or not enough
145 input data or output space */
147 /* align in on 1/2 hold size boundary */
148 while (((size_t)(void *)ar
.in
& (sizeof(ar
.hold
) / 2 - 1)) != 0) {
149 ar
.hold
+= (unsigned long)*ar
.in
++ << ar
.bits
;
157 strm
->msg
= "invalid literal/length code";
158 else if (ar
.status
== 3)
159 strm
->msg
= "invalid distance code";
161 strm
->msg
= "invalid distance too far back";
164 else if ( ar
.status
== 1 ) {
168 /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
169 ar
.len
= ar
.bits
>> 3;
171 ar
.bits
-= ar
.len
<< 3;
172 ar
.hold
&= (1U << ar
.bits
) - 1;
174 /* update state and return */
175 strm
->next_in
= ar
.in
;
176 strm
->next_out
= ar
.out
;
177 strm
->avail_in
= (unsigned)(ar
.in
< ar
.last
?
178 PAD_AVAIL_IN
+ (ar
.last
- ar
.in
) :
179 PAD_AVAIL_IN
- (ar
.in
- ar
.last
));
180 strm
->avail_out
= (unsigned)(ar
.out
< ar
.end
?
181 PAD_AVAIL_OUT
+ (ar
.end
- ar
.out
) :
182 PAD_AVAIL_OUT
- (ar
.out
- ar
.end
));
183 state
->hold
= (unsigned long)ar
.hold
;
184 state
->bits
= ar
.bits
;