1 /***********************************************************************
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
13 * Information and Software Systems Research *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
20 ***********************************************************************/
30 static const char usage
[] =
31 "[-?\n@(#)$Id: join (AT&T Research) 2009-12-10 $\n]"
33 "[+NAME?join - relational database operator]"
34 "[+DESCRIPTION?\bjoin\b performs an \aequality join\a on the files \afile1\a "
35 "and \afile2\a and writes the resulting joined files to standard "
36 "output. By default, a field is delimited by one or more spaces "
37 "and tabs with leading spaces and/or tabs ignored. The \b-t\b option "
38 "can be used to change the field delimiter.]"
39 "[+?The \ajoin field\a is a field in each file on which files are compared. "
40 "By default \bjoin\b writes one line in the output for each pair "
41 "of lines in \afiles1\a and \afiles2\a that have identical join "
42 "fields. The default output line consists of the join field, "
43 "then the remaining fields from \afile1\a, then the remaining "
44 "fields from \afile2\a, but this can be changed with the \b-o\b "
45 "option. The \b-a\b option can be used to add unmatched lines "
46 "to the output. The \b-v\b option can be used to output only "
48 "[+?The files \afile1\a and \afile2\a must be ordered in the collating "
49 "sequence of \bsort -b\b on the fields on which they are to be "
50 "joined otherwise the results are unspecified.]"
51 "[+?If either \afile1\a or \afile2\a is \b-\b, \bjoin\b "
52 "uses standard input starting at the current location.]"
54 "[e:empty]:[string?Replace empty output fields in the list selected with"
55 " \b-o\b with \astring\a.]"
56 "[o:output]:[list?Construct the output line to comprise the fields specified "
57 "in a blank or comma separated list \alist\a. Each element in "
58 "\alist\a consists of a file number (either 1 or 2), a period, "
59 "and a field number or \b0\b representing the join field. "
60 "As an obsolete feature multiple occurrences of \b-o\b can "
62 "[t:separator|tabs]:[delim?Use \adelim\a as the field separator for both input"
64 "[1:j1]#[field?Join on field \afield\a of \afile1\a. Fields start at 1.]"
65 "[2:j2]#[field?Join on field \afield\a of \afile2\a. Fields start at 1.]"
66 "[j:join]#[field?Equivalent to \b-1\b \afield\a \b-2\b \afield\a.]"
67 "[a:unpairable]#[fileno?Write a line for each unpairable line in file"
68 " \afileno\a, where \afileno\a is either 1 or 2, in addition to the"
69 " normal output. If \b-a\b options appear for both 1 and 2, then "
70 "all unpairable lines will be output.]"
71 "[v:suppress]#[fileno?Write a line for each unpairable line in file"
72 " \afileno\a, where \afileno\a is either 1 or 2, instead of the normal "
73 "output. If \b-v\b options appear for both 1 and 2, then "
74 "all unpairable lines will be output.] ]"
75 "[i:ignorecase?Ignore case in field comparisons.]"
76 "[B!:mmap?Enable memory mapped reads instead of buffered.]"
78 "[+?The following obsolete option forms are also recognized: \b-j\b \afield\a"
79 " is equivalent to \b-1\b \afield\a \b-2\b \afield\a, \b-j1\b \afield\a"
80 " is equivalent to \b-1\b \afield\a, and \b-j2\b \afield\a is"
81 " equivalent to \b-2\b \afield\a.]"
87 "[+0?Both files processed successfully.]"
88 "[+>0?An error occurred.]"
90 "[+SEE ALSO?\bcut\b(1), \bcomm\b(1), \bpaste\b(1), \bsort\b(1), \buniq\b(1)]"
96 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
106 #define iswspace(x) isspace(x)
114 #define C_ALL (C_FILE1|C_FILE2|C_COMMON)
124 typedef struct Field_s
130 typedef struct File_s
146 typedef struct Join_s
148 unsigned char state
[1<<CHAR_BIT
];
167 done(register Join_t
* jp
)
169 if (jp
->file
[0].iop
&& jp
->file
[0].iop
!= sfstdin
)
170 sfclose(jp
->file
[0].iop
);
171 if (jp
->file
[1].iop
&& jp
->file
[1].iop
!= sfstdin
)
172 sfclose(jp
->file
[1].iop
);
175 if (jp
->file
[0].fields
)
176 free(jp
->file
[0].fields
);
177 if (jp
->file
[1].fields
)
178 free(jp
->file
[1].fields
);
190 setlocale(LC_ALL
, "");
191 if (jp
= newof(0, Join_t
, 1, 0))
193 if (jp
->mb
= mbwide())
194 for (i
= 0x80; i
<= 0xff; i
++)
195 jp
->state
[i
] = S_WIDE
;
196 jp
->state
[' '] = jp
->state
['\t'] = S_SPACE
;
197 jp
->state
['\n'] = S_NL
;
200 if (!(jp
->file
[0].fields
= newof(0, Field_t
, NFIELD
+ 1, 0)) ||
201 !(jp
->file
[1].fields
= newof(0, Field_t
, NFIELD
+ 1, 0)))
206 jp
->file
[0].maxfields
= NFIELD
;
207 jp
->file
[1].maxfields
= NFIELD
;
208 jp
->outmode
= C_COMMON
;
214 getolist(Join_t
* jp
, const char* first
, char** arglist
)
216 register const char* cp
= first
;
217 char** argv
= arglist
;
224 outptr
= jp
->outlist
= newof(0, int, NFIELD
+ 1, 0);
225 outmax
= outptr
+ NFIELD
;
228 if (c
==' ' || c
=='\t' || c
==',')
231 if (*cp
=='0' && ((c
=cp
[1])==0 || c
==' ' || c
=='\t' || c
==','))
237 if (cp
[1]!='.' || (*cp
!='1' && *cp
!='2') || (c
=strtol(cp
+2,&str
,10)) <=0)
239 error(2,"%s: invalid field list",first
);
247 if (outptr
>= outmax
)
249 jp
->outlist
= newof(jp
->outlist
, int, 2 * nfield
+ 1, 0);
250 outptr
= jp
->outlist
+ nfield
;
252 outmax
= jp
->outlist
+ nfield
;
257 /* need to accept obsolescent command syntax */
260 if (!(cp
= *argv
) || cp
[1]!='.' || (*cp
!='1' && *cp
!='2'))
262 if (*cp
=='0' && cp
[1]==0)
270 c
= strtol(cp
+2, &str
,10);
278 if (outptr
>= outmax
)
280 jp
->outlist
= newof(jp
->outlist
, int, 2 * nfield
+ 1, 0);
281 outptr
= jp
->outlist
+ nfield
;
283 outmax
= jp
->outlist
+ nfield
;
292 * read in a record from file <index> and split into fields
294 static unsigned char*
295 getrec(Join_t
* jp
, int index
, int discard
)
297 register unsigned char* sp
= jp
->state
;
298 register File_t
* fp
= &jp
->file
[index
];
299 register Field_t
* field
= fp
->fields
;
300 register Field_t
* fieldmax
= field
+ fp
->maxfields
;
305 if (sh_checksig(jp
->context
))
307 if (discard
&& fp
->discard
)
308 sfraise(fp
->iop
, SFSK_DISCARD
, NiL
);
311 if (!(cp
= sfgetr(fp
->iop
, '\n', 0)))
313 jp
->outmode
&= ~(1<<index
);
317 fp
->reclen
= sfvalue(fp
->iop
);
318 if (jp
->delim
== '\n') /* handle new-line delimiter specially */
326 do /* separate into fields */
328 if (field
>= fieldmax
)
330 n
= 2 * fp
->maxfields
;
331 fp
->fields
= newof(fp
->fields
, Field_t
, n
+ 1, 0);
332 field
= fp
->fields
+ fp
->maxfields
;
334 fieldmax
= fp
->fields
+ n
;
339 switch (sp
[*(unsigned char*)cp
])
346 if (iswspace(mbchar(tp
)))
359 switch (sp
[*(unsigned char*)cp
++])
365 if (iswspace(mbchar(tp
)))
375 while (sp
[*(unsigned char*)cp
++]==S_SPACE
);
384 switch (n
= sp
[*(unsigned char*)cp
++])
396 if (jp
->delim
== -1 && iswspace(n
))
409 while (!(n
= sp
[*(unsigned char*)cp
++]));
414 fp
->nfields
= field
- fp
->fields
;
415 if ((n
= fp
->field
) < fp
->nfields
)
417 cp
= fp
->fields
[n
].beg
;
418 /* eliminate leading spaces */
424 switch (sp
[*(unsigned char*)cp
++])
430 if (iswspace(mbchar(tp
)))
440 while (sp
[*(unsigned char*)cp
++]==S_SPACE
);
443 fp
->fieldlen
= fp
->fields
[n
].end
- cp
;
444 return (unsigned char*)cp
;
447 return (unsigned char*)"";
450 static unsigned char*
451 _trace_getrec(Join_t
* jp
, int index
, int discard
)
455 r
= getrec(jp
, index
, discard
);
458 #define getrec _trace_getrec
461 static unsigned char* u1
,u2
,u3
;
462 #define getrec(p,n,d) (u1 = getrec(p, n, d), sfprintf(sfstdout, "[G%d#%d@%I*d:%-.8s]", __LINE__, n, sizeof(Sfoff_t), sftell(p->file[n].iop), u1), u1)
466 * print field <n> from file <index>
469 outfield(Join_t
* jp
, int index
, register int n
, int last
)
471 register File_t
* fp
= &jp
->file
[index
];
473 register char* cpmax
;
475 register Sfio_t
* iop
= jp
->outfile
;
480 cp
= fp
->fields
[n
].beg
;
481 cpmax
= fp
->fields
[n
].end
+ 1;
485 if ((n
= jp
->delim
) == -1)
487 if (cp
&& fp
->spaces
)
489 register unsigned char* sp
= jp
->state
;
491 /*eliminate leading spaces */
495 switch (sp
[*(unsigned char*)cp
++])
501 if (iswspace(mbchar(tp
)))
511 while (sp
[*(unsigned char*)cp
++]==S_SPACE
);
516 else if (jp
->delimstr
)
528 if (jp
->nullfield
&& sfputr(iop
, jp
->nullfield
, -1) < 0)
531 else if (sfwrite(iop
, cp
, size
) < 0)
533 if (sfwrite(iop
, jp
->delimstr
, jp
->delimlen
) < 0)
540 else if (sfputr(iop
, jp
->nullfield
, n
) < 0)
547 if (sfwrite(iop
, cp
, size
) < 0)
556 #define outfield(p,i,n,f) (sfprintf(sfstdout, "[F%d#%d:%d,%d]", __LINE__, i1=i, i2=n, i3=f), outfield(p, i1, i2, i3))
560 outrec(register Join_t
* jp
, int mode
)
569 if (mode
< 0 && jp
->file
[0].hit
++)
571 if (mode
> 0 && jp
->file
[1].hit
++)
573 if (out
= jp
->outlist
)
575 while ((n
= *out
++) >= 0)
580 j
= jp
->file
[i
].field
;
585 j
= (mode
<0 && i
|| mode
>0 && !i
) ?
586 jp
->file
[i
].nfields
:
589 if (outfield(jp
, i
, j
, *out
< 0) < 0)
594 k
= jp
->file
[0].nfields
;
596 k
+= jp
->file
[1].nfields
- 1;
602 k
-= (fp
->nfields
- 1);
608 /* output join field first */
609 if (outfield(jp
,i
,n
,!--k
) < 0)
615 if (outfield(jp
,i
,j
,!--k
) < 0)
624 for (;j
<fp
->nfields
; j
++)
626 if (j
!=n
&& outfield(jp
,i
,j
,!--k
) < 0)
636 #define outrec(p,n) (sfprintf(sfstdout, "[R#%d,%d,%lld,%lld:%-.*s{%d}:%-.*s{%d}]", __LINE__, i1=n, lo, hi, jp->file[0].fieldlen, cp1, jp->file[0].hit, jp->file[1].fieldlen, cp2, jp->file[1].hit), outrec(p, i1))
642 register unsigned char* cp1
;
643 register unsigned char* cp2
;
653 if ((cp1
= getrec(jp
, 0, 0)) && (cp2
= getrec(jp
, 1, 0)) || (cp2
= 0))
655 n1
= jp
->file
[0].fieldlen
;
656 n2
= jp
->file
[1].fieldlen
;
660 n
= n1
< n2
? n1
: n2
;
662 if (!n
&& !(cmp
= n1
< n2
? -1 : (n1
> n2
)) || n
&& !(cmp
= (int)*cp1
- (int)*cp2
) && !(cmp
= jp
->ignorecase
? strncasecmp((char*)cp1
, (char*)cp2
, n
) : memcmp(cp1
, cp2
, n
)))
664 sfprintf(sfstdout
, "[C#%d:%d(%c-%c),%d,%lld,%lld%s]", __LINE__
, cmp
, *cp1
, *cp2
, same
, lo
, hi
, (jp
->outmode
& C_COMMON
) ? ",COMMON" : "");
667 if (!n
&& !(cmp
= n1
< n2
? -1 : (n1
> n2
)) || n
&& !(cmp
= (int)*cp1
- (int)*cp2
) && !(cmp
= jp
->ignorecase
? strncasecmp((char*)cp1
, (char*)cp2
, n
) : memcmp(cp1
, cp2
, n
)) && !(cmp
= n1
- n2
))
670 if (!(jp
->outmode
& C_COMMON
))
672 if (cp1
= getrec(jp
, 0, 1))
674 n1
= jp
->file
[0].fieldlen
;
678 if ((jp
->ooutmode
& (C_FILE1
|C_FILE2
)) != C_FILE2
)
680 if (sfseek(jp
->file
[0].iop
, (Sfoff_t
)-jp
->file
[0].reclen
, SEEK_CUR
) < 0 || !(cp1
= getrec(jp
, 0, 0)))
682 error(ERROR_SYSTEM
|2, "%s: seek error", jp
->file
[0].name
);
686 else if (outrec(jp
, 0) < 0)
688 else if (lo
< 0 && (jp
->outmode
& C_COMMON
))
690 if ((lo
= sfseek(jp
->file
[1].iop
, (Sfoff_t
)0, SEEK_CUR
)) < 0)
692 error(ERROR_SYSTEM
|2, "%s: seek error", jp
->file
[1].name
);
695 lo
-= jp
->file
[1].reclen
;
697 if (cp2
= getrec(jp
, 1, lo
< 0))
699 n2
= jp
->file
[1].fieldlen
;
703 sfprintf(sfstdout
, "[2#%d:0,%lld,%lld]", __LINE__
, lo
, hi
);
712 if (n2
> jp
->samesize
)
714 jp
->samesize
= roundof(n2
, 16);
715 if (!(jp
->same
= newof(jp
->same
, char, jp
->samesize
, 0)))
717 error(ERROR_SYSTEM
|2, "out of space");
721 memcpy(jp
->same
, cp2
, o2
= n2
);
722 if (!(cp2
= getrec(jp
, 1, 0)))
724 n2
= jp
->file
[1].fieldlen
;
725 if (n2
== o2
&& *cp2
== *jp
->same
&& !memcmp(cp2
, jp
->same
, n2
))
731 if (sfseek(jp
->file
[1].iop
, hi
, SEEK_SET
) != hi
)
733 error(ERROR_SYSTEM
|2, "%s: seek error", jp
->file
[1].name
);
738 else if ((jp
->outmode
& C_FILE2
) && outrec(jp
, 1) < 0)
741 if (cp2
= getrec(jp
, 1, 1))
743 n2
= jp
->file
[1].fieldlen
;
747 sfprintf(sfstdout
, "[2#%d:0,%lld,%lld]", __LINE__
, lo
, hi
);
753 if (!(cp1
= getrec(jp
, 0, 0)))
755 n1
= jp
->file
[0].fieldlen
;
760 if ((hi
= sfseek(jp
->file
[1].iop
, (Sfoff_t
)0, SEEK_CUR
)) < 0 ||
761 (hi
-= jp
->file
[1].reclen
) < 0 ||
762 sfseek(jp
->file
[1].iop
, lo
, SEEK_SET
) != lo
||
763 !(cp2
= getrec(jp
, 1, 0)))
765 error(ERROR_SYSTEM
|2, "%s: seek error", jp
->file
[1].name
);
768 n2
= jp
->file
[1].fieldlen
;
770 if (jp
->file
[1].discard
)
771 sfseek(jp
->file
[1].iop
, (Sfoff_t
)-1, SEEK_SET
);
775 else if ((jp
->outmode
& C_FILE1
) && outrec(jp
, -1) < 0)
777 if (!(cp1
= getrec(jp
, 0, 1)))
779 n1
= jp
->file
[0].fieldlen
;
783 sfprintf(sfstdout
, "[X#%d:?,%p,%p,%d%,%d,%d%s]", __LINE__
, cp1
, cp2
, cmp
, lo
, hi
, (jp
->outmode
& C_COMMON
) ? ",COMMON" : "");
788 sfseek(jp
->file
[1].iop
, (Sfoff_t
)0, SEEK_CUR
) < hi
&&
789 sfseek(jp
->file
[1].iop
, hi
, SEEK_SET
) != hi
)
791 error(ERROR_SYSTEM
|2, "%s: seek error", jp
->file
[1].name
);
795 sfprintf(sfstdout
, "[O#%d:%02o:%02o]", __LINE__
, jp
->ooutmode
, jp
->outmode
);
797 cp1
= (!cp1
&& cmp
&& hi
< 0 && !jp
->file
[1].hit
&& ((jp
->ooutmode
^ C_ALL
) <= 1 || jp
->outmode
== 2)) ? cp2
: getrec(jp
, 1, 0);
807 sfprintf(sfstdout
, "[X#%d:%d,%p,%p,%d,%02o,%02o%s]", __LINE__
, n
, cp1
, cp2
, cmp
, jp
->ooutmode
, jp
->outmode
, (jp
->outmode
& C_COMMON
) ? ",COMMON" : "");
809 if (!cp1
|| !(jp
->outmode
& (1<<n
)))
811 if (cp1
&& jp
->file
[n
].iop
== sfstdin
)
812 sfseek(sfstdin
, (Sfoff_t
)0, SEEK_END
);
815 if (outrec(jp
, cmp
) < 0)
819 if (!getrec(jp
, n
, 1))
821 } while (outrec(jp
, cmp
) >= 0);
826 b_join(int argc
, char** argv
, void* context
)
834 cmdinit(argc
, argv
, context
, ERROR_CATALOG
, ERROR_NOTIFY
);
837 error(ERROR_system(1),"out of space");
838 jp
->context
= context
;
841 switch (n
= optget(argv
, usage
))
847 * check for obsolete "-j1 field" and "-j2 field"
850 if (opt_info
.offset
== 0)
852 cp
= argv
[opt_info
.index
- 1];
853 for (n
= strlen(cp
) - 1; n
> 0 && cp
[n
] != 'j'; n
--);
860 if (opt_info
.num
!=1 && opt_info
.num
!=2)
861 error(2,"-jfileno field: fileno must be 1 or 2");
862 n
= '0' + opt_info
.num
;
863 if (!(cp
= argv
[opt_info
.index
]))
868 opt_info
.num
= strtol(cp
, &e
, 10);
878 jp
->file
[0].field
= (int)(opt_info
.num
-1);
884 if (opt_info
.num
<=0)
885 error(2,"field number must positive");
886 jp
->file
[n
-'1'].field
= (int)(opt_info
.num
-1);
889 jp
->outmode
&= ~C_COMMON
;
892 if (opt_info
.num
!=1 && opt_info
.num
!=2)
893 error(2,"%s: file number must be 1 or 2", opt_info
.name
);
894 jp
->outmode
|= 1<<(opt_info
.num
-1);
897 jp
->nullfield
= opt_info
.arg
;
900 /* need to accept obsolescent command syntax */
901 n
= getolist(jp
, opt_info
.arg
, argv
+opt_info
.index
);
905 jp
->state
[' '] = jp
->state
['\t'] = 0;
909 jp
->delim
= mbchar(cp
);
910 if ((n
= cp
- opt_info
.arg
) > 1)
913 jp
->delimstr
= opt_info
.arg
;
917 n
= *(unsigned char*)opt_info
.arg
;
918 jp
->state
[n
] = S_DELIM
;
922 jp
->ignorecase
= !opt_info
.num
;
925 jp
->buffered
= !opt_info
.num
;
928 error(2, "%s", opt_info
.arg
);
932 error(ERROR_usage(2), "%s", opt_info
.arg
);
937 argv
+= opt_info
.index
;
938 argc
-= opt_info
.index
;
939 if (error_info
.errors
|| argc
!=2)
942 error(ERROR_usage(2),"%s", optusage(NiL
));
944 jp
->ooutmode
= jp
->outmode
;
945 jp
->file
[0].name
= cp
= *argv
++;
948 if (sfseek(sfstdin
,(Sfoff_t
)0,SEEK_CUR
) < 0)
950 if (sfdcseekable(sfstdin
))
951 error(ERROR_warn(0),"%s: seek may fail",cp
);
953 jp
->file
[0].discard
= 1;
955 jp
->file
[0].iop
= sfstdin
;
957 else if (!(jp
->file
[0].iop
= sfopen(NiL
, cp
, "r")))
960 error(ERROR_system(1),"%s: cannot open",cp
);
962 jp
->file
[1].name
= cp
= *argv
;
965 if (sfseek(sfstdin
,(Sfoff_t
)0,SEEK_CUR
) < 0)
967 if (sfdcseekable(sfstdin
))
968 error(ERROR_warn(0),"%s: seek may fail",cp
);
970 jp
->file
[1].discard
= 1;
972 jp
->file
[1].iop
= sfstdin
;
974 else if (!(jp
->file
[1].iop
= sfopen(NiL
, cp
, "r")))
977 error(ERROR_system(1),"%s: cannot open",cp
);
981 sfsetbuf(jp
->file
[0].iop
, jp
->file
[0].iop
, SF_UNBOUND
);
982 sfsetbuf(jp
->file
[1].iop
, jp
->file
[1].iop
, SF_UNBOUND
);
984 jp
->outfile
= sfstdout
;
990 error(ERROR_system(1),"write error");
992 else if (jp
->file
[0].iop
==sfstdin
|| jp
->file
[1].iop
==sfstdin
)
993 sfseek(sfstdin
,(Sfoff_t
)0,SEEK_END
);
995 return error_info
.errors
;