Sync usage with man page.
[netbsd-mini2440.git] / usr.bin / sort / msort.c
blobe0572808c9a86256eea1165b7fe27d4514666a03
1 /* $NetBSD: msort.c,v 1.28 2009/10/09 20:29:43 dsl Exp $ */
3 /*-
4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Ben Harris and Jaromir Dolecek.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
32 /*-
33 * Copyright (c) 1993
34 * The Regents of the University of California. All rights reserved.
36 * This code is derived from software contributed to Berkeley by
37 * Peter McIlroy.
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
64 #include "sort.h"
65 #include "fsort.h"
67 __RCSID("$NetBSD: msort.c,v 1.28 2009/10/09 20:29:43 dsl Exp $");
69 #include <stdlib.h>
70 #include <string.h>
71 #include <unistd.h>
72 #include <util.h>
74 /* Subroutines using comparisons: merge sort and check order */
75 #define DELETE (1)
77 typedef struct mfile {
78 FILE *fp;
79 get_func_t get;
80 RECHEADER *rec;
81 u_char *end;
82 } MFILE;
84 static int cmp(RECHEADER *, RECHEADER *);
85 static int insert(struct mfile **, struct mfile *, int, int);
86 static void merge_sort_fstack(FILE *, put_func_t, struct field *);
89 * Number of files merge() can merge in one pass.
91 #define MERGE_FNUM 16
93 static struct mfile fstack[MERGE_FNUM];
94 static struct mfile fstack_1[MERGE_FNUM];
95 static struct mfile fstack_2[MERGE_FNUM];
96 static int fstack_count, fstack_1_count, fstack_2_count;
98 void
99 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
101 FILE *mfp, *mfp1, *mfp2;
103 if (fstack_count == MERGE_FNUM) {
104 /* Must reduce the number of temporary files */
105 mfp = ftmp();
106 merge_sort_fstack(mfp, putrec, ftbl);
107 /* Save output in next layer */
108 if (fstack_1_count == MERGE_FNUM) {
109 mfp1 = ftmp();
110 memcpy(fstack, fstack_1, sizeof fstack);
111 merge_sort_fstack(mfp1, putrec, ftbl);
112 if (fstack_2_count == MERGE_FNUM) {
113 /* More than 4096 files! */
114 mfp2 = ftmp();
115 memcpy(fstack, fstack_2, sizeof fstack);
116 merge_sort_fstack(mfp2, putrec, ftbl);
117 fstack_2[0].fp = mfp2;
118 fstack_2_count = 1;
120 fstack_2[fstack_2_count].fp = mfp1;
121 fstack_2[fstack_2_count].get = geteasy;
122 fstack_2_count++;
123 fstack_1_count = 0;
125 fstack_1[fstack_1_count].fp = mfp;
126 fstack_1[fstack_1_count].get = geteasy;
127 fstack_1_count++;
128 fstack_count = 0;
131 fstack[fstack_count].fp = fp;
132 fstack[fstack_count++].get = get;
135 void
136 fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
138 get_func_t get = SINGL_FLD ? makeline : makekey;
139 FILE *fp;
140 int i;
142 for (i = 0; i < nfiles; i++) {
143 fp = fopen(filelist->names[i], "r");
144 if (fp == NULL)
145 err(2, "%s", filelist->names[i]);
146 save_for_merge(fp, get, ftbl);
149 merge_sort(outfp, putline, ftbl);
152 void
153 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
155 int count = fstack_1_count + fstack_2_count;
156 FILE *mfp;
157 int i;
159 if (count == 0) {
160 /* All files in initial array */
161 merge_sort_fstack(outfp, put, ftbl);
162 return;
165 count += fstack_count;
167 /* Too many files for one merge sort */
168 for (;;) {
169 /* Sort latest 16 files */
170 i = count;
171 if (i > MERGE_FNUM)
172 i = MERGE_FNUM;
173 while (fstack_count > 0)
174 fstack[--i] = fstack[--fstack_count];
175 while (i > 0 && fstack_1_count > 0)
176 fstack[--i] = fstack_1[--fstack_1_count];
177 while (i > 0)
178 fstack[--i] = fstack_2[--fstack_2_count];
179 if (count <= MERGE_FNUM) {
180 /* Got all the data */
181 fstack_count = count;
182 merge_sort_fstack(outfp, put, ftbl);
183 return;
185 mfp = ftmp();
186 fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
187 merge_sort_fstack(mfp, putrec, ftbl);
188 fstack[0].fp = mfp;
189 fstack[0].get = geteasy;
190 fstack_count = 1;
191 count -= MERGE_FNUM - 1;
195 static void
196 merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
198 struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
199 RECHEADER *new_rec;
200 u_char *new_end;
201 void *tmp;
202 int c, i, nfiles;
203 size_t sz;
205 /* Read one record from each file (read again if a duplicate) */
206 for (nfiles = i = 0; i < fstack_count; i++) {
207 cfile = &fstack[i];
208 if (cfile->rec == NULL) {
209 cfile->rec = emalloc(DEFLLEN);
210 cfile->end = (u_char *)cfile->rec + DEFLLEN;
212 rewind(cfile->fp);
214 for (;;) {
215 c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
216 if (c == EOF)
217 break;
219 if (c == BUFFEND) {
220 /* Double buffer size */
221 sz = (cfile->end - (u_char *)cfile->rec) * 2;
222 cfile->rec = erealloc(cfile->rec, sz);
223 cfile->end = (u_char *)cfile->rec + sz;
224 continue;
227 if (nfiles != 0) {
228 if (insert(flist, cfile, nfiles, !DELETE))
229 /* Duplicate removed */
230 continue;
231 } else
232 flist[0] = cfile;
233 nfiles++;
234 break;
238 if (nfiles == 0)
239 return;
242 * We now loop reading a new record from the file with the
243 * 'sorted first' existing record.
244 * As each record is added, the 'first' record is written to the
245 * output file - maintaining one record from each file in the sorted
246 * list.
248 new_rec = emalloc(DEFLLEN);
249 new_end = (u_char *)new_rec + DEFLLEN;
250 for (;;) {
251 cfile = flist[0];
252 c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
253 if (c == EOF) {
254 /* Write out last record from now-empty input */
255 put(cfile->rec, outfp);
256 if (--nfiles == 0)
257 break;
258 /* Replace from file with now-first sorted record. */
259 /* (Moving base 'flist' saves copying everything!) */
260 flist++;
261 continue;
263 if (c == BUFFEND) {
264 /* Buffer not large enough - double in size */
265 sz = (new_end - (u_char *)new_rec) * 2;
266 new_rec = erealloc(new_rec, sz);
267 new_end = (u_char *)new_rec +sz;
268 continue;
271 /* Swap in new buffer, saving old */
272 tmp = cfile->rec;
273 cfile->rec = new_rec;
274 new_rec = tmp;
275 tmp = cfile->end;
276 cfile->end = new_end;
277 new_end = tmp;
279 /* Add into sort, removing the original first entry */
280 c = insert(flist, cfile, nfiles, DELETE);
281 if (c != 0 || (UNIQUE && cfile == flist[0]
282 && cmp(new_rec, cfile->rec) == 0)) {
283 /* Was an unwanted duplicate, restore buffer */
284 tmp = cfile->rec;
285 cfile->rec = new_rec;
286 new_rec = tmp;
287 tmp = cfile->end;
288 cfile->end = new_end;
289 new_end = tmp;
290 continue;
293 /* Write out 'old' record */
294 put(new_rec, outfp);
297 free(new_rec);
301 * if delete: inserts rec in flist, deletes flist[0];
302 * otherwise just inserts *rec in flist.
303 * Returns 1 if record is a duplicate to be ignored.
305 static int
306 insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
308 int mid, top = ttop, bot = 0, cmpv = 1;
310 for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
311 cmpv = cmp(rec->rec, flist[mid]->rec);
312 if (cmpv == 0 ) {
313 if (UNIQUE)
314 /* Duplicate key, read another record */
315 /* NB: This doesn't guarantee to keep any
316 * particular record. */
317 return 1;
319 * Apply sort by input file order.
320 * We could truncate the sort is the fileno are
321 * adjacent - but that is all too hard!
322 * The fileno cannot be equal, since we only have one
323 * record from each file (+ flist[0] which never
324 * comes here).
326 cmpv = rec < flist[mid] ? -1 : 1;
327 if (REVERSE)
328 cmpv = -cmpv;
330 if (cmpv < 0)
331 top = mid;
332 else
333 bot = mid;
336 /* At this point we haven't yet compared against flist[0] */
338 if (delete) {
339 /* flist[0] is ourselves, only the caller knows the old data */
340 if (bot != 0) {
341 memmove(flist, flist + 1, bot * sizeof(MFILE *));
342 flist[bot] = rec;
344 return 0;
347 /* Inserting original set of records */
349 if (bot == 0 && cmpv != 0) {
350 /* Doesn't match flist[1], must compare with flist[0] */
351 cmpv = cmp(rec->rec, flist[0]->rec);
352 if (cmpv == 0 && UNIQUE)
353 return 1;
354 /* Add matching keys in file order (ie new is later) */
355 if (cmpv < 0)
356 bot = -1;
358 bot++;
359 memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
360 flist[bot] = rec;
361 return 0;
365 * check order on one file
367 void
368 order(struct filelist *filelist, struct field *ftbl)
370 get_func_t get = SINGL_FLD ? makeline : makekey;
371 RECHEADER *crec, *prec, *trec;
372 u_char *crec_end, *prec_end, *trec_end;
373 FILE *fp;
374 int c;
376 fp = fopen(filelist->names[0], "r");
377 if (fp == NULL)
378 err(2, "%s", filelist->names[0]);
380 crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
381 crec_end = crec->data + DEFLLEN;
382 prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
383 prec_end = prec->data + DEFLLEN;
385 /* XXX this does exit(0) for overlong lines */
386 if (get(fp, prec, prec_end, ftbl) != 0)
387 exit(0);
388 while (get(fp, crec, crec_end, ftbl) == 0) {
389 if (0 < (c = cmp(prec, crec))) {
390 crec->data[crec->length-1] = 0;
391 errx(1, "found disorder: %s", crec->data+crec->offset);
393 if (UNIQUE && !c) {
394 crec->data[crec->length-1] = 0;
395 errx(1, "found non-uniqueness: %s",
396 crec->data+crec->offset);
399 * Swap pointers so that this record is on place pointed
400 * to by prec and new record is read to place pointed to by
401 * crec.
403 trec = prec;
404 prec = crec;
405 crec = trec;
406 trec_end = prec_end;
407 prec_end = crec_end;
408 crec_end = trec_end;
410 exit(0);
413 static int
414 cmp(RECHEADER *rec1, RECHEADER *rec2)
416 int len;
417 int r;
419 /* key is weights */
420 len = min(rec1->keylen, rec2->keylen);
421 r = memcmp(rec1->data, rec2->data, len);
422 if (r == 0)
423 r = rec1->keylen - rec2->keylen;
424 if (REVERSE)
425 r = -r;
426 return r;