1 /* $NetBSD: locate.bigram.c,v 1.11 2008/07/21 14:19:23 lukem Exp $ */
4 * Copyright (c) 1989, 1993
5 * The Regents of the University of California. All rights reserved.
7 * This code is derived from software contributed to Berkeley by
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #include <sys/cdefs.h>
37 __COPYRIGHT("@(#) Copyright (c) 1989, 1993\
38 The Regents of the University of California. All rights reserved.");
43 static char sccsid
[] = "@(#)locate.bigram.c 8.2 (Berkeley) 4/28/95";
45 __RCSID("$NetBSD: locate.bigram.c,v 1.11 2008/07/21 14:19:23 lukem Exp $");
49 * bigram < text > bigrams
51 * List bigrams for 'updatedb' script.
52 * Use 'code' to encode a file using this output.
58 #include <sys/param.h> /* for MAXPATHLEN */
60 int main(int, char **);
61 static int compare_bigrams(const void *, const void *);
62 static void add_bigram(u_char
, u_char
);
/*
 * Two path-sized line buffers.  main() reads each input line into one of
 * them while the other still holds the previous line, then swaps roles.
 * buf1 is the initial "previous line" and starts out as a single space.
 */
static char	buf1[MAXPATHLEN] = " ";
static char	buf2[MAXPATHLEN];
69 u_char b1
, b2
; /* needed for final sorting */
72 struct bigram bigrams
[256 * 256];
75 add_bigram(u_char i1
, u_char i2
)
77 if (i1
!= '\n' && i2
!= '\n')
78 bigrams
[(i1
<<8)+i2
].count
++;
82 compare_bigrams(const void *item1
, const void *item2
)
84 const struct bigram
*it1
=item1
, *it2
=item2
;
86 if (it1
->count
!= it2
->count
)
87 return it2
->count
- it1
->count
;
88 else if (it1
->b1
!= it2
->b1
)
89 return it2
->b1
- it1
->b1
;
91 return it2
->b2
- it2
->b2
;
95 main(int argc
, char *argv
[])
98 char *oldpath
= buf1
, *path
= buf2
;
102 /* initialize bigram array */
103 memset(bigrams
, 0, sizeof(bigrams
));
104 for(i
=0; i
< 65536; i
++) {
105 bigrams
[i
].b1
= i
/ 256;
106 bigrams
[i
].b2
= i
% 256;
109 while ( fgets ( path
, sizeof(buf2
), stdin
) != NULL
) {
111 /* skip longest common prefix */
112 for ( cp
= path
; *cp
== *oldpath
; cp
++, oldpath
++ )
113 if ( *oldpath
== '\0' )
117 * output post-residue bigrams only
119 for(; cp
[0] != '\0' && cp
[1] != '\0'; cp
+= 2)
120 add_bigram((u_char
)cp
[0], (u_char
)cp
[1]);
122 if (path
== buf1
) /* swap pointers */
123 path
= buf2
, oldpath
= buf1
;
125 path
= buf1
, oldpath
= buf2
;
128 /* sort the bigrams by how many times they appeared, then by their value */
129 heapsort((void *)bigrams
, 256 * 256, sizeof(struct bigram
),
132 /* write 128 most frequent bigrams out */
134 for (i
= 0; i
< 128 && bg
->count
> 0; i
++, bg
++) {
136 fputc(bg
->b1
, stdout
);
138 fputc(bg
->b2
, stdout
);