etc/protocols - sync with NetBSD-8
[minix.git] / usr.bin / sort / msort.c
blob4383c7d3d970268a8170be8559018ce49b9b451c
1 /* $NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $ */
3 /*-
4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Ben Harris and Jaromir Dolecek.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
32 /*-
33 * Copyright (c) 1993
34 * The Regents of the University of California. All rights reserved.
36 * This code is derived from software contributed to Berkeley by
37 * Peter McIlroy.
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
64 #include "sort.h"
65 #include "fsort.h"
67 __RCSID("$NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $");
69 #include <stdlib.h>
70 #include <string.h>
71 #include <unistd.h>
72 #include <util.h>
74 /* Subroutines using comparisons: merge sort and check order */
75 #define DELETE (1)
77 typedef struct mfile {
78 FILE *fp;
79 get_func_t get;
80 RECHEADER *rec;
81 u_char *end;
82 } MFILE;
84 static int cmp(RECHEADER *, RECHEADER *);
85 static int insert(struct mfile **, struct mfile *, int, int);
86 static void merge_sort_fstack(FILE *, put_func_t, struct field *);
89 * Number of files merge() can merge in one pass.
91 #define MERGE_FNUM 16
93 static struct mfile fstack[MERGE_FNUM];
94 static struct mfile fstack_1[MERGE_FNUM];
95 static struct mfile fstack_2[MERGE_FNUM];
96 static int fstack_count, fstack_1_count, fstack_2_count;
98 void
99 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
101 FILE *mfp, *mfp1, *mfp2;
103 if (fstack_count == MERGE_FNUM) {
104 /* Must reduce the number of temporary files */
105 mfp = ftmp();
106 merge_sort_fstack(mfp, putrec, ftbl);
107 /* Save output in next layer */
108 if (fstack_1_count == MERGE_FNUM) {
109 mfp1 = ftmp();
110 memcpy(fstack, fstack_1, sizeof fstack);
111 merge_sort_fstack(mfp1, putrec, ftbl);
112 if (fstack_2_count == MERGE_FNUM) {
113 /* More than 4096 files! */
114 mfp2 = ftmp();
115 memcpy(fstack, fstack_2, sizeof fstack);
116 merge_sort_fstack(mfp2, putrec, ftbl);
117 fstack_2[0].fp = mfp2;
118 fstack_2_count = 1;
120 fstack_2[fstack_2_count].fp = mfp1;
121 fstack_2[fstack_2_count].get = geteasy;
122 fstack_2_count++;
123 fstack_1_count = 0;
125 fstack_1[fstack_1_count].fp = mfp;
126 fstack_1[fstack_1_count].get = geteasy;
127 fstack_1_count++;
128 fstack_count = 0;
131 fstack[fstack_count].fp = fp;
132 fstack[fstack_count++].get = get;
135 void
136 fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
138 get_func_t get = SINGL_FLD ? makeline : makekey;
139 FILE *fp;
140 int i;
142 for (i = 0; i < nfiles; i++) {
143 #if defined(__minix)
144 /* LSC FIXME: Not very pretty, but reduce the diff */
145 #include "pathnames.h"
146 if (!strcmp(filelist->names[0], _PATH_STDIN))
147 fp = stdin;
148 else
149 #endif /* defined(__minix) */
150 fp = fopen(filelist->names[i], "r");
151 if (fp == NULL)
152 err(2, "%s", filelist->names[i]);
153 save_for_merge(fp, get, ftbl);
156 merge_sort(outfp, putline, ftbl);
159 void
160 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
162 int count = fstack_1_count + fstack_2_count;
163 FILE *mfp;
164 int i;
166 if (count == 0) {
167 /* All files in initial array */
168 merge_sort_fstack(outfp, put, ftbl);
169 return;
172 count += fstack_count;
174 /* Too many files for one merge sort */
175 for (;;) {
176 /* Sort latest 16 files */
177 i = count;
178 if (i > MERGE_FNUM)
179 i = MERGE_FNUM;
180 while (fstack_count > 0)
181 fstack[--i] = fstack[--fstack_count];
182 while (i > 0 && fstack_1_count > 0)
183 fstack[--i] = fstack_1[--fstack_1_count];
184 while (i > 0)
185 fstack[--i] = fstack_2[--fstack_2_count];
186 if (count <= MERGE_FNUM) {
187 /* Got all the data */
188 fstack_count = count;
189 merge_sort_fstack(outfp, put, ftbl);
190 return;
192 mfp = ftmp();
193 fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
194 merge_sort_fstack(mfp, putrec, ftbl);
195 fstack[0].fp = mfp;
196 fstack[0].get = geteasy;
197 fstack_count = 1;
198 count -= MERGE_FNUM - 1;
202 static void
203 merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
205 struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
206 RECHEADER *new_rec;
207 u_char *new_end;
208 void *tmp;
209 int c, i, nfiles;
210 size_t sz;
212 /* Read one record from each file (read again if a duplicate) */
213 for (nfiles = i = 0; i < fstack_count; i++) {
214 cfile = &fstack[i];
215 if (cfile->rec == NULL) {
216 cfile->rec = allocrec(NULL, DEFLLEN);
217 cfile->end = (u_char *)cfile->rec + DEFLLEN;
219 rewind(cfile->fp);
221 for (;;) {
222 c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
223 if (c == EOF)
224 break;
226 if (c == BUFFEND) {
227 /* Double buffer size */
228 sz = (cfile->end - (u_char *)cfile->rec) * 2;
229 cfile->rec = allocrec(cfile->rec, sz);
230 cfile->end = (u_char *)cfile->rec + sz;
231 continue;
234 if (nfiles != 0) {
235 if (insert(flist, cfile, nfiles, !DELETE))
236 /* Duplicate removed */
237 continue;
238 } else
239 flist[0] = cfile;
240 nfiles++;
241 break;
245 if (nfiles == 0)
246 return;
249 * We now loop reading a new record from the file with the
250 * 'sorted first' existing record.
251 * As each record is added, the 'first' record is written to the
252 * output file - maintaining one record from each file in the sorted
253 * list.
255 new_rec = allocrec(NULL, DEFLLEN);
256 new_end = (u_char *)new_rec + DEFLLEN;
257 for (;;) {
258 cfile = flist[0];
259 c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
260 if (c == EOF) {
261 /* Write out last record from now-empty input */
262 put(cfile->rec, outfp);
263 if (--nfiles == 0)
264 break;
265 /* Replace from file with now-first sorted record. */
266 /* (Moving base 'flist' saves copying everything!) */
267 flist++;
268 continue;
270 if (c == BUFFEND) {
271 /* Buffer not large enough - double in size */
272 sz = (new_end - (u_char *)new_rec) * 2;
273 new_rec = allocrec(new_rec, sz);
274 new_end = (u_char *)new_rec +sz;
275 continue;
278 /* Swap in new buffer, saving old */
279 tmp = cfile->rec;
280 cfile->rec = new_rec;
281 new_rec = tmp;
282 tmp = cfile->end;
283 cfile->end = new_end;
284 new_end = tmp;
286 /* Add into sort, removing the original first entry */
287 c = insert(flist, cfile, nfiles, DELETE);
288 if (c != 0 || (UNIQUE && cfile == flist[0]
289 && cmp(new_rec, cfile->rec) == 0)) {
290 /* Was an unwanted duplicate, restore buffer */
291 tmp = cfile->rec;
292 cfile->rec = new_rec;
293 new_rec = tmp;
294 tmp = cfile->end;
295 cfile->end = new_end;
296 new_end = tmp;
297 continue;
300 /* Write out 'old' record */
301 put(new_rec, outfp);
304 free(new_rec);
308 * if delete: inserts rec in flist, deletes flist[0];
309 * otherwise just inserts *rec in flist.
310 * Returns 1 if record is a duplicate to be ignored.
312 static int
313 insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
315 int mid, top = ttop, bot = 0, cmpv = 1;
317 for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
318 cmpv = cmp(rec->rec, flist[mid]->rec);
319 if (cmpv == 0 ) {
320 if (UNIQUE)
321 /* Duplicate key, read another record */
322 /* NB: This doesn't guarantee to keep any
323 * particular record. */
324 return 1;
326 * Apply sort by input file order.
327 * We could truncate the sort is the fileno are
328 * adjacent - but that is all too hard!
329 * The fileno cannot be equal, since we only have one
330 * record from each file (+ flist[0] which never
331 * comes here).
333 cmpv = rec < flist[mid] ? -1 : 1;
334 if (REVERSE)
335 cmpv = -cmpv;
337 if (cmpv < 0)
338 top = mid;
339 else
340 bot = mid;
343 /* At this point we haven't yet compared against flist[0] */
345 if (delete) {
346 /* flist[0] is ourselves, only the caller knows the old data */
347 if (bot != 0) {
348 memmove(flist, flist + 1, bot * sizeof(MFILE *));
349 flist[bot] = rec;
351 return 0;
354 /* Inserting original set of records */
356 if (bot == 0 && cmpv != 0) {
357 /* Doesn't match flist[1], must compare with flist[0] */
358 cmpv = cmp(rec->rec, flist[0]->rec);
359 if (cmpv == 0 && UNIQUE)
360 return 1;
361 /* Add matching keys in file order (ie new is later) */
362 if (cmpv < 0)
363 bot = -1;
365 bot++;
366 memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
367 flist[bot] = rec;
368 return 0;
372 * check order on one file
374 void
375 order(struct filelist *filelist, struct field *ftbl)
377 get_func_t get = SINGL_FLD ? makeline : makekey;
378 RECHEADER *crec, *prec, *trec;
379 u_char *crec_end, *prec_end, *trec_end;
380 FILE *fp;
381 int c;
383 #if defined(__minix)
384 if (!strcmp(filelist->names[0], _PATH_STDIN))
385 fp = stdin;
386 else
387 #endif /* defined(__minix) */
388 fp = fopen(filelist->names[0], "r");
389 if (fp == NULL)
390 err(2, "%s", filelist->names[0]);
392 crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
393 crec_end = crec->data + DEFLLEN;
394 prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
395 prec_end = prec->data + DEFLLEN;
397 /* XXX this does exit(0) for overlong lines */
398 if (get(fp, prec, prec_end, ftbl) != 0)
399 exit(0);
400 while (get(fp, crec, crec_end, ftbl) == 0) {
401 if (0 < (c = cmp(prec, crec))) {
402 crec->data[crec->length-1] = 0;
403 errx(1, "found disorder: %s", crec->data+crec->offset);
405 if (UNIQUE && !c) {
406 crec->data[crec->length-1] = 0;
407 errx(1, "found non-uniqueness: %s",
408 crec->data+crec->offset);
411 * Swap pointers so that this record is on place pointed
412 * to by prec and new record is read to place pointed to by
413 * crec.
415 trec = prec;
416 prec = crec;
417 crec = trec;
418 trec_end = prec_end;
419 prec_end = crec_end;
420 crec_end = trec_end;
422 exit(0);
425 static int
426 cmp(RECHEADER *rec1, RECHEADER *rec2)
428 int len;
429 int r;
431 /* key is weights */
432 len = min(rec1->keylen, rec2->keylen);
433 r = memcmp(rec1->data, rec2->data, len);
434 if (r == 0)
435 r = rec1->keylen - rec2->keylen;
436 if (REVERSE)
437 r = -r;
438 return r;