1 /* $NetBSD: compress.c,v 1.6 2003/08/07 11:17:24 agc Exp $ */
4 * Copyright (c) 1989, 1993
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
8 * Edward Wang at The University of California, Berkeley.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #include <sys/cdefs.h>
38 static char sccsid
[] = "@(#)compress.c 8.1 (Berkeley) 6/6/93";
40 __RCSID("$NetBSD: compress.c,v 1.6 2003/08/07 11:17:24 agc Exp $");
55 /* tunable parameters */
61 int cc_token_max
= 8; /* <= TOKEN_MAX */
62 int cc_token_min
= 2; /* > tt.tt_put_token_cost */
66 int cc_bufsize
= 1024 * 3; /* XXX, or 80 * 24 * 2 */
78 char string
[TOKEN_MAX
];
84 long time
; /* time last seen */
85 short bcount
; /* count in this buffer */
86 short ccount
; /* count in compression */
87 short places
; /* places in the buffer */
88 short code
; /* token code */
89 struct cc
*qforw
, *qback
;
90 struct cc
*hforw
, **hback
;
93 short cc_thresholds
[TOKEN_MAX
+ 1];
94 #define thresh(length) (cc_thresholds[length])
95 #define threshp(code, count, length) \
96 ((code) >= 0 || (short) (count) >= cc_thresholds[length])
99 short cc_wthresholds
[TOKEN_MAX
+ 1];
100 #define wthresh(length) (cc_wthresholds[length])
101 #define wthreshp(weight, length) ((short) (weight) >= cc_wthresholds[length])
103 #define wthreshp(weight, length) (0)
107 short cc_wlimits
[TOKEN_MAX
+ 1];
108 #define wlimit(length) (cc_wlimits[length])
111 #define put_token_score(length) ((length) - tt.tt_put_token_cost)
113 int cc_score_adjustments
[TOKEN_MAX
+ 1][8]; /* XXX, 8 > max of cc_thresholds */
114 #define score_adjust(score, p) \
116 int _length = (p)->length; \
117 int _ccount = (p)->ccount; \
118 if (threshp((p)->code, _ccount, _length) || \
119 wthreshp((p)->weight, _length)) /* XXX */ \
120 (score) -= _length - tt.tt_put_token_cost; \
122 (score) += cc_score_adjustments[_length][_ccount]; \
125 int cc_initial_scores
[TOKEN_MAX
+ 1][8]; /* XXX, 8 > max of cc_thresholds */
127 struct cc cc_q0a
, cc_q0b
, cc_q1a
, cc_q1b
;
129 #define qinsert(p1, p2) \
131 struct cc *forw = (p1)->qforw; \
132 struct cc *back = (p1)->qback; \
133 back->qforw = forw; \
134 forw->qback = back; \
135 forw = (p2)->qforw; \
136 (p1)->qforw = forw; \
137 forw->qback = (p1); \
138 (p2)->qforw = (p1); \
139 (p1)->qback = (p2); \
142 #define qinsertq(q, p) \
143 ((q)->qforw == (q) ? 0 : \
144 ((q)->qback->qforw = (p)->qforw, \
145 (p)->qforw->qback = (q)->qback, \
146 (q)->qforw->qback = (p), \
147 (p)->qforw = (q)->qforw, \
152 #define HSIZE (1 << H)
153 #define hash(h, c) (((((h) >> (H - 8)) | (h) << 8) ^ (c)) & (HSIZE - 1))
156 struct cc
**cc_output
; /* the output array */
157 short *cc_places
[TOKEN_MAX
+ 1];
158 short *cc_hashcodes
; /* for computing hashcodes */
159 struct cc
**cc_htab
; /* the hash table */
160 struct cc
**cc_tokens
; /* holds all the active tokens */
166 long cc_time
, cc_time0
;
168 char *cc_tt_ob
, *cc_tt_obe
;
170 int cc_compress(struct cc
**, struct cc
**, char);
171 void cc_compress_cleanup(struct cc
**, int);
172 void cc_compress_phase(struct cc
**, int, struct cc
**, int);
173 void cc_compress_phase1(struct cc
**, struct cc
**, int, int);
174 void cc_output_phase(char *, struct cc
**, int);
175 int cc_sweep(char *, int, struct cc
**, int);
176 void cc_sweep0(char *, int, int);
177 int cc_sweep_phase(char *, int, struct cc
**);
178 void cc_sweep_reverse(struct cc
**, short *);
179 int cc_token_compare(const void *, const void *);
187 if (tt
.tt_token_max
> cc_token_max
)
188 tt
.tt_token_max
= cc_token_max
;
189 if (tt
.tt_token_min
< cc_token_min
)
190 tt
.tt_token_min
= cc_token_min
;
191 if (tt
.tt_token_min
> tt
.tt_token_max
) {
195 if (tt
.tt_ntoken
> cc_ntoken
/ 2) /* not likely */
196 tt
.tt_ntoken
= cc_ntoken
/ 2;
197 #define C(x) (sizeof (x) / sizeof *(x))
198 for (i
= 0; i
< (int)C(cc_thresholds
); i
++) {
199 int h
= i
- tt
.tt_put_token_cost
;
202 (tt
.tt_set_token_cost
+ 1 + h
- 1) / h
+ 1;
204 cc_thresholds
[i
] = 0;
206 for (i
= 0; i
< (int)C(cc_score_adjustments
); i
++) {
207 int t
= cc_thresholds
[i
];
208 for (j
= 0; j
< (int)C(*cc_score_adjustments
); j
++) {
210 cc_score_adjustments
[i
][j
] =
211 - (i
- tt
.tt_put_token_cost
);
213 cc_score_adjustments
[i
][j
] = 0;
217 * length * (ccount + 1) a
219 * set-token-cost + length +
220 * ccount * put-token-cost b
221 * the score adjustment is (b - a)
223 cc_score_adjustments
[i
][j
] =
224 tt
.tt_set_token_cost
+ i
+
225 j
* tt
.tt_put_token_cost
-
228 cc_initial_scores
[i
][j
] = 0;
231 * - (set-token-cost +
232 * (length - put-token-cost) -
233 * (length - put-token-cost) * ccount)
235 cc_initial_scores
[i
][j
] =
236 - (tt
.tt_set_token_cost
+
237 (i
- tt
.tt_put_token_cost
) -
238 (i
- tt
.tt_put_token_cost
) * j
);
242 for (i
= 1; i
< C(cc_wthresholds
); i
++) {
244 ((tt
.tt_set_token_cost
+ tt
.tt_put_token_cost
) / i
+
247 cc_wlimits
[i
] = cc_wthresholds
[i
] + cc_weight
;
251 if ((cc_output
= (struct cc
**)
252 malloc((unsigned) cc_bufsize
* sizeof *cc_output
)) == 0)
254 if ((cc_hashcodes
= (short *)
255 malloc((unsigned) cc_bufsize
* sizeof *cc_hashcodes
)) == 0)
257 if ((cc_htab
= (struct cc
**) malloc(HSIZE
* sizeof *cc_htab
)) == 0)
259 if ((cc_tokens
= (struct cc
**)
261 (cc_ntoken
+ tt
.tt_token_max
- tt
.tt_token_min
+ 1) *
262 sizeof *cc_tokens
)) == 0)
264 if ((cc_undo
= (struct cc_undo
*)
265 malloc((unsigned) cc_bufsize
* sizeof *cc_undo
)) == 0)
267 for (i
= tt
.tt_token_min
; i
<= tt
.tt_token_max
; i
++)
268 if ((cc_places
[i
] = (short *)
269 malloc((unsigned) cc_bufsize
* sizeof **cc_places
)) == 0)
271 cc_q0a
.qforw
= cc_q0a
.qback
= &cc_q0a
;
272 cc_q0b
.qforw
= cc_q0b
.qback
= &cc_q0b
;
273 cc_q1a
.qforw
= cc_q1a
.qback
= &cc_q1a
;
274 cc_q1b
.qforw
= cc_q1b
.qback
= &cc_q1b
;
275 if ((p
= (struct cc
*) malloc((unsigned) cc_ntoken
* sizeof *p
)) == 0)
277 for (i
= 0; i
< tt
.tt_ntoken
; i
++) {
280 p
->qback
= cc_q0a
.qback
;
286 for (; i
< cc_ntoken
; i
++) {
289 p
->qback
= cc_q1a
.qback
;
297 if ((cc_buffer
= malloc((unsigned) cc_bufsize
)) == 0)
309 tt_obp
= tt_ob
= cc_buffer
;
310 tt_obe
= tt_ob
+ cc_bufsize
;
311 tt
.tt_flush
= ccflush
;
313 cc_trace_fp
= fopen("window-trace", "a");
314 (void) fcntl(fileno(cc_trace_fp
), F_SETFD
, 1);
324 memset((char *) cc_htab
, 0, HSIZE
* sizeof *cc_htab
);
325 for (p
= cc_q0a
.qforw
; p
!= &cc_q0a
; p
= p
->qforw
)
327 for (p
= cc_q1a
.qforw
; p
!= &cc_q1a
; p
= p
->qforw
)
336 tt_obp
= tt_ob
= cc_tt_ob
;
339 if (cc_trace_fp
!= NULL
) {
340 (void) fclose(cc_trace_fp
);
348 int bufsize
= tt_obp
- tt_ob
;
351 if (tt_ob
!= cc_buffer
)
353 if (cc_trace_fp
!= NULL
) {
354 (void) fwrite(tt_ob
, 1, bufsize
, cc_trace_fp
);
355 (void) putc(-1, cc_trace_fp
);
358 (*tt
.tt_compress
)(1);
359 if (bufsize
< tt
.tt_token_min
) {
363 tt_obp
= tt_ob
= cc_tt_ob
;
367 n
= cc_sweep_phase(cc_buffer
, bufsize
, cc_tokens
);
368 cc_compress_phase(cc_output
, bufsize
, cc_tokens
, n
);
369 cc_output_phase(cc_buffer
, cc_output
, bufsize
);
371 tt_obp
= tt_ob
= cc_buffer
;
372 tt_obe
= cc_buffer
+ cc_bufsize
;
374 (*tt
.tt_compress
)(0);
375 tt
.tt_flush
= ccflush
;
379 cc_sweep_phase(char *buffer
, int bufsize
, struct cc
**tokens
)
381 struct cc
**pp
= tokens
;
393 cc_sweep0(buffer
, bufsize
, tt
.tt_token_min
- 1);
399 for (i
= tt
.tt_token_min
; i
<= tt
.tt_token_max
; i
++) {
408 (void) fflush(stdout
);
411 n
= cc_sweep(buffer
, bufsize
, pp
, i
);
423 qinsertq(&cc_q1b
, &cc_q1a
);
426 printf("\n %d tokens, %d candidates\n",
435 cc_sweep0(char *buffer
, int n
, int length
)
441 short pc
= tt
.tt_padc
;
443 /* n and length are at least 1 */
448 if ((*hc
++ = *p
++) == pc
)
454 for (i
= n
--; --i
;) {
455 if ((c
= *p
++) == pc
|| *hc
< 0)
465 cc_sweep(char *buffer
, int bufsize
, struct cc
**tokens
, int length
)
471 short *places
= cc_places
[length
];
472 struct cc
**pp
= tokens
;
473 short threshold
= thresh(length
);
475 short wthreshold
= wthresh(length
);
476 short limit
= wlimit(length
);
479 short pc
= tt
.tt_padc
;
486 for (i
= 0; i
< bufsize
; i
++, time0
++) {
493 if ((hh
= *hc1
) < 0 || c
== pc
) {
498 h
= cc_htab
+ (*hc1
++ = hash(hh
, c
));
501 for (p
= *h
; p
!= 0; p
= p
->hforw
)
502 if (p
->length
== (char) length
) {
503 char *p1
= p
->string
;
504 char *p2
= cp
- length
;
516 (p
->time
>= cc_time0
&& p
->length
== (char) length
))
519 if ((*p
->hback
= p
->hforw
) != 0)
520 p
->hforw
->hback
= p
->hback
;
522 char *p1
= p
->string
;
523 char *p2
= cp
- length
;
531 p
->weight
= cc_weight
;
537 if ((p
->hforw
= *h
) != 0)
538 p
->hforw
->hback
= &p
->hforw
;
547 } else if (p
->time
< cc_time0
) {
549 if ((p
->weight
+= p
->time
- time0
) < 0)
550 p
->weight
= cc_weight
;
551 else if ((p
->weight
+= cc_weight
) > limit
)
562 if (p
->weight
>= wthreshold
) {
577 } else if (p
->time
+ length
> time0
) {
579 * overlapping token, don't count as two and
580 * don't update time0, but do adjust weight to offset
584 if (cc_weight
!= 0) { /* XXX */
585 p
->weight
+= time0
- p
->time
;
586 if (!p
->flag
&& p
->weight
>= wthreshold
) {
593 places
[i
] = p
->places
;
597 if ((p
->weight
+= p
->time
- time0
) < 0)
598 p
->weight
= cc_weight
;
599 else if ((p
->weight
+= cc_weight
) > limit
)
605 /* code must be < 0 if flag false here */
606 (p
->bcount
>= threshold
608 || p
->weight
>= wthreshold
615 places
[i
] = p
->places
;
619 if ((i
= pp
- tokens
) > 0) {
622 cc_sweep_reverse(tokens
, places
);
623 if (cc_sort
&& i
> 1) {
624 qsort((char *) tokens
, i
, sizeof *tokens
,
628 if ((i
= i
* cc_chop
/ 100) == 0)
638 cc_sweep_reverse(struct cc
**pp
, short int *places
)
643 while ((p
= *pp
++) != 0) {
646 /* the list is never empty */
651 } while ((t
= frnt
) >= 0);
657 cc_compress_phase(struct cc
**output
, int bufsize
, struct cc
**tokens
, int ntoken
)
661 memset((char *) output
, 0, bufsize
* sizeof *output
);
662 for (i
= 0; i
< cc_npass0
; i
++)
663 cc_compress_phase1(output
, tokens
, ntoken
, 0);
664 for (i
= 0; i
< cc_npass1
; i
++)
665 cc_compress_phase1(output
, tokens
, ntoken
, 1);
666 cc_compress_cleanup(output
, bufsize
);
670 cc_compress_phase1(struct cc
**output
, struct cc
**tokens
, int ntoken
, int flag
)
675 int nt
= 0, cc
= 0, nc
= 0;
685 while (pp
< tokens
+ ntoken
) {
696 printf(" (%d", (*pp
)->length
);
697 (void) fflush(stdout
);
700 pp
+= cc_compress(output
, pp
, flag
);
703 printf(" %dt %du %dc)", ntoken_stat
, ccount_stat
,
713 printf("\n total: (%dt %du %dc)\n", nt
, cc
, nc
);
720 cc_compress_cleanup(struct cc
**output
, int bufsize
)
724 /* the previous output phase may have been interrupted */
725 qinsertq(&cc_q0b
, &cc_q0a
);
726 for (end
= output
+ bufsize
; output
< end
;) {
729 if ((p
= *output
) == 0) {
735 } else if (p
->code
>= 0) {
738 } else if (p
->ccount
== 0) {
740 } else if (p
->ccount
>= thresh(length
)
742 || wthreshp(p
->weight
, length
)
755 cc_compress(struct cc
**output
, struct cc
**tokens
, char flag
)
757 struct cc
**pp
= tokens
;
758 struct cc
*p
= *pp
++;
759 int length
= p
->length
;
760 int threshold
= thresh(length
);
762 short wthreshold
= wthresh(length
);
764 short *places
= cc_places
[length
];
765 int *initial_scores
= cc_initial_scores
[length
];
766 int initial_score0
= put_token_score(length
);
770 struct cc_undo
*undop
;
778 if ((short) ccount
>= p
->bcount
)
780 if (p
->code
>= 0 || ccount
>= threshold
)
783 else if (p
->weight
>= wthreshold
)
784 /* allow one fewer match than normal */
785 /* XXX, should adjust for ccount */
786 score
= - tt
.tt_set_token_cost
;
789 score
= initial_scores
[ccount
];
794 for (i
= p
->places
; i
>= 0; i
= places
[i
]) {
797 struct cc
**ip
= output
+ i
;
798 int score0
= initial_score0
;
799 struct cc
**iip
= ip
+ length
;
800 struct cc_undo
*undop1
= undop
;
802 if ((x
= *(jp
= ip
)) != 0)
804 while (--jp
>= output
)
805 if ((x
= *jp
) != 0) {
806 if (jp
+ x
->length
> ip
)
812 if ((x
= *jp
) == 0) {
827 score_adjust(score0
, x
);
828 if (score0
< 0 && flag
)
840 while (--undop
>= undop1
)
841 if ((*undop
->pos
= x
= undop
->val
))
847 ccount_stat
+= ccount
- p
->ccount
;
849 ncover_stat
+= ncover
;
853 struct cc_undo
*u
= cc_undo
;
854 while (--undop
>= u
) {
856 if ((*undop
->pos
= x
= undop
->val
))
860 } while ((p
= *pp
++) != 0);
865 cc_output_phase(char *buffer
, struct cc
**output
, int bufsize
)
870 for (i
= 0; i
< bufsize
;) {
871 if ((p
= output
[i
]) == 0) {
874 } else if (p
->code
>= 0) {
875 if (--p
->ccount
== 0)
877 (*tt
.tt_put_token
)(p
->code
, p
->string
, p
->length
);
879 wwntoksave
+= put_token_score(p
->length
);
881 } else if ((p1
= cc_q0a
.qback
) != &cc_q0a
) {
884 qinsert(p1
, &cc_q1a
);
885 if (--p
->ccount
== 0)
889 (*tt
.tt_set_token
)(p
->code
, p
->string
, p
->length
);
891 wwntoksave
-= tt
.tt_set_token_cost
;
895 ttwrite(p
->string
, p
->length
);
904 cc_token_compare(const void *p1
, const void *p2
)
906 const struct cc
* const * vp1
= p1
;
907 const struct cc
* const * vp2
= p2
;
908 return (*vp2
)->bcount
- (*vp1
)->bcount
;