1 /* $NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $ */
4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Ben Harris and Jaromir Dolecek.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
34 * The Regents of the University of California. All rights reserved.
36 * This code is derived from software contributed to Berkeley by
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67 __RCSID("$NetBSD: msort.c,v 1.30 2010/02/05 21:58:42 enami Exp $");
74 /* Subroutines using comparisons: merge sort and check order */
77 typedef struct mfile
{
84 static int cmp(RECHEADER
*, RECHEADER
*);
85 static int insert(struct mfile
**, struct mfile
*, int, int);
86 static void merge_sort_fstack(FILE *, put_func_t
, struct field
*);
89 * Number of files merge() can merge in one pass.
93 static struct mfile fstack
[MERGE_FNUM
];
94 static struct mfile fstack_1
[MERGE_FNUM
];
95 static struct mfile fstack_2
[MERGE_FNUM
];
96 static int fstack_count
, fstack_1_count
, fstack_2_count
;
99 save_for_merge(FILE *fp
, get_func_t get
, struct field
*ftbl
)
101 FILE *mfp
, *mfp1
, *mfp2
;
103 if (fstack_count
== MERGE_FNUM
) {
104 /* Must reduce the number of temporary files */
106 merge_sort_fstack(mfp
, putrec
, ftbl
);
107 /* Save output in next layer */
108 if (fstack_1_count
== MERGE_FNUM
) {
110 memcpy(fstack
, fstack_1
, sizeof fstack
);
111 merge_sort_fstack(mfp1
, putrec
, ftbl
);
112 if (fstack_2_count
== MERGE_FNUM
) {
113 /* More than 4096 files! */
115 memcpy(fstack
, fstack_2
, sizeof fstack
);
116 merge_sort_fstack(mfp2
, putrec
, ftbl
);
117 fstack_2
[0].fp
= mfp2
;
120 fstack_2
[fstack_2_count
].fp
= mfp1
;
121 fstack_2
[fstack_2_count
].get
= geteasy
;
125 fstack_1
[fstack_1_count
].fp
= mfp
;
126 fstack_1
[fstack_1_count
].get
= geteasy
;
131 fstack
[fstack_count
].fp
= fp
;
132 fstack
[fstack_count
++].get
= get
;
136 fmerge(struct filelist
*filelist
, int nfiles
, FILE *outfp
, struct field
*ftbl
)
138 get_func_t get
= SINGL_FLD
? makeline
: makekey
;
142 for (i
= 0; i
< nfiles
; i
++) {
144 /* LSC FIXME: Not very pretty, but reduce the diff */
145 #include "pathnames.h"
146 if (!strcmp(filelist
->names
[0], _PATH_STDIN
))
149 #endif /* defined(__minix) */
150 fp
= fopen(filelist
->names
[i
], "r");
152 err(2, "%s", filelist
->names
[i
]);
153 save_for_merge(fp
, get
, ftbl
);
156 merge_sort(outfp
, putline
, ftbl
);
160 merge_sort(FILE *outfp
, put_func_t put
, struct field
*ftbl
)
162 int count
= fstack_1_count
+ fstack_2_count
;
167 /* All files in initial array */
168 merge_sort_fstack(outfp
, put
, ftbl
);
172 count
+= fstack_count
;
174 /* Too many files for one merge sort */
176 /* Sort latest 16 files */
180 while (fstack_count
> 0)
181 fstack
[--i
] = fstack
[--fstack_count
];
182 while (i
> 0 && fstack_1_count
> 0)
183 fstack
[--i
] = fstack_1
[--fstack_1_count
];
185 fstack
[--i
] = fstack_2
[--fstack_2_count
];
186 if (count
<= MERGE_FNUM
) {
187 /* Got all the data */
188 fstack_count
= count
;
189 merge_sort_fstack(outfp
, put
, ftbl
);
193 fstack_count
= count
> MERGE_FNUM
? MERGE_FNUM
: count
;
194 merge_sort_fstack(mfp
, putrec
, ftbl
);
196 fstack
[0].get
= geteasy
;
198 count
-= MERGE_FNUM
- 1;
203 merge_sort_fstack(FILE *outfp
, put_func_t put
, struct field
*ftbl
)
205 struct mfile
*flistb
[MERGE_FNUM
], **flist
= flistb
, *cfile
;
212 /* Read one record from each file (read again if a duplicate) */
213 for (nfiles
= i
= 0; i
< fstack_count
; i
++) {
215 if (cfile
->rec
== NULL
) {
216 cfile
->rec
= allocrec(NULL
, DEFLLEN
);
217 cfile
->end
= (u_char
*)cfile
->rec
+ DEFLLEN
;
222 c
= cfile
->get(cfile
->fp
, cfile
->rec
, cfile
->end
, ftbl
);
227 /* Double buffer size */
228 sz
= (cfile
->end
- (u_char
*)cfile
->rec
) * 2;
229 cfile
->rec
= allocrec(cfile
->rec
, sz
);
230 cfile
->end
= (u_char
*)cfile
->rec
+ sz
;
235 if (insert(flist
, cfile
, nfiles
, !DELETE
))
236 /* Duplicate removed */
249 * We now loop reading a new record from the file with the
250 * 'sorted first' existing record.
251 * As each record is added, the 'first' record is written to the
252 * output file - maintaining one record from each file in the sorted
255 new_rec
= allocrec(NULL
, DEFLLEN
);
256 new_end
= (u_char
*)new_rec
+ DEFLLEN
;
259 c
= cfile
->get(cfile
->fp
, new_rec
, new_end
, ftbl
);
261 /* Write out last record from now-empty input */
262 put(cfile
->rec
, outfp
);
265 /* Replace from file with now-first sorted record. */
266 /* (Moving base 'flist' saves copying everything!) */
271 /* Buffer not large enough - double in size */
272 sz
= (new_end
- (u_char
*)new_rec
) * 2;
273 new_rec
= allocrec(new_rec
, sz
);
274 new_end
= (u_char
*)new_rec
+sz
;
278 /* Swap in new buffer, saving old */
280 cfile
->rec
= new_rec
;
283 cfile
->end
= new_end
;
286 /* Add into sort, removing the original first entry */
287 c
= insert(flist
, cfile
, nfiles
, DELETE
);
288 if (c
!= 0 || (UNIQUE
&& cfile
== flist
[0]
289 && cmp(new_rec
, cfile
->rec
) == 0)) {
290 /* Was an unwanted duplicate, restore buffer */
292 cfile
->rec
= new_rec
;
295 cfile
->end
= new_end
;
300 /* Write out 'old' record */
308 * if delete: inserts rec in flist, deletes flist[0];
309 * otherwise just inserts *rec in flist.
310 * Returns 1 if record is a duplicate to be ignored.
313 insert(struct mfile
**flist
, struct mfile
*rec
, int ttop
, int delete)
315 int mid
, top
= ttop
, bot
= 0, cmpv
= 1;
317 for (mid
= top
/ 2; bot
+ 1 != top
; mid
= (bot
+ top
) / 2) {
318 cmpv
= cmp(rec
->rec
, flist
[mid
]->rec
);
321 /* Duplicate key, read another record */
322 /* NB: This doesn't guarantee to keep any
323 * particular record. */
326 * Apply sort by input file order.
327 * We could truncate the sort is the fileno are
328 * adjacent - but that is all too hard!
329 * The fileno cannot be equal, since we only have one
330 * record from each file (+ flist[0] which never
333 cmpv
= rec
< flist
[mid
] ? -1 : 1;
343 /* At this point we haven't yet compared against flist[0] */
346 /* flist[0] is ourselves, only the caller knows the old data */
348 memmove(flist
, flist
+ 1, bot
* sizeof(MFILE
*));
354 /* Inserting original set of records */
356 if (bot
== 0 && cmpv
!= 0) {
357 /* Doesn't match flist[1], must compare with flist[0] */
358 cmpv
= cmp(rec
->rec
, flist
[0]->rec
);
359 if (cmpv
== 0 && UNIQUE
)
361 /* Add matching keys in file order (ie new is later) */
366 memmove(flist
+ bot
+ 1, flist
+ bot
, (ttop
- bot
) * sizeof(MFILE
*));
372 * check order on one file
375 order(struct filelist
*filelist
, struct field
*ftbl
)
377 get_func_t get
= SINGL_FLD
? makeline
: makekey
;
378 RECHEADER
*crec
, *prec
, *trec
;
379 u_char
*crec_end
, *prec_end
, *trec_end
;
384 if (!strcmp(filelist
->names
[0], _PATH_STDIN
))
387 #endif /* defined(__minix) */
388 fp
= fopen(filelist
->names
[0], "r");
390 err(2, "%s", filelist
->names
[0]);
392 crec
= malloc(offsetof(RECHEADER
, data
[DEFLLEN
]));
393 crec_end
= crec
->data
+ DEFLLEN
;
394 prec
= malloc(offsetof(RECHEADER
, data
[DEFLLEN
]));
395 prec_end
= prec
->data
+ DEFLLEN
;
397 /* XXX this does exit(0) for overlong lines */
398 if (get(fp
, prec
, prec_end
, ftbl
) != 0)
400 while (get(fp
, crec
, crec_end
, ftbl
) == 0) {
401 if (0 < (c
= cmp(prec
, crec
))) {
402 crec
->data
[crec
->length
-1] = 0;
403 errx(1, "found disorder: %s", crec
->data
+crec
->offset
);
406 crec
->data
[crec
->length
-1] = 0;
407 errx(1, "found non-uniqueness: %s",
408 crec
->data
+crec
->offset
);
411 * Swap pointers so that this record is on place pointed
412 * to by prec and new record is read to place pointed to by
426 cmp(RECHEADER
*rec1
, RECHEADER
*rec2
)
432 len
= min(rec1
->keylen
, rec2
->keylen
);
433 r
= memcmp(rec1
->data
, rec2
->data
, len
);
435 r
= rec1
->keylen
- rec2
->keylen
;