1 /* $Id: read.c,v 1.150.2.5 2017/01/09 02:25:53 schwarze Exp $ */
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2010-2017 Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
21 #include <sys/types.h>
42 #include "mandoc_aux.h"
47 #include "libmandoc.h"
50 #define REPARSE_LIMIT 1000
53 struct roff_man
*man
; /* man parser */
54 struct roff
*roff
; /* roff parser (!NULL) */
55 char *sodest
; /* filename pointed to by .so */
56 const char *file
; /* filename of current input file */
57 struct buf
*primary
; /* buffer currently being parsed */
58 struct buf
*secondary
; /* preprocessed copy of input */
59 const char *defos
; /* default operating system */
60 mandocmsg mmsg
; /* warning/error message handler */
61 enum mandoclevel file_status
; /* status of current parse */
62 enum mandoclevel wlevel
; /* ignore messages below this */
63 int options
; /* parser options */
64 int gzip
; /* current input file is gzipped */
65 int filenc
; /* encoding of the current file */
66 int reparse_count
; /* finite interp. stack */
67 int line
; /* line number in the file */
/* Forward declarations of the file-local helpers defined below. */
static void	 choose_parser(struct mparse *curp);
static void	 resize_buf(struct buf *buf, size_t initial);
static void	 mparse_buf_r(struct mparse *curp, struct buf blk,
			size_t i, int start);
static int	 read_whole_file(struct mparse *curp, const char *file,
			int fd, struct buf *fb, int *with_mmap);
static void	 mparse_end(struct mparse *curp);
static void	 mparse_parse_buffer(struct mparse *curp, struct buf blk,
			const char *file);
/*
 * Table of MANDOCLEVEL_MAX error codes, indexed by message level.
 * mandoc_msg() starts at MANDOCLEVEL_UNSUPP and loops while the error
 * code is smaller than mandoclimits[level] in order to find the level
 * of a message.
 * NOTE(review): the initialiser entries are not visible in this
 * excerpt.
 */
79 static const enum mandocerr mandoclimits
[MANDOCLEVEL_MAX
] = {
/*
 * Message text for each error code.  The table has MANDOCERR_MAX
 * slots and mparse_strerror() indexes it directly with an
 * enum mandocerr value, so the order of the strings presumably has
 * to match the order of the enum constants -- verify against the
 * enum declaration.
 * NOTE(review): some entries are missing from this excerpt.
 */
89 static const char * const mandocerrs
[MANDOCERR_MAX
] = {
94 /* related to the prologue */
95 "missing manual title, using UNTITLED",
96 "missing manual title, using \"\"",
97 "lower case character in document title",
98 "missing manual section, using \"\"",
99 "unknown manual section",
100 "missing date, using today's date",
101 "cannot parse date, using it verbatim",
102 "missing Os macro, using \"\"",
103 "duplicate prologue macro",
104 "late prologue macro",
105 "skipping late title macro",
106 "prologue macros out of order",
108 /* related to document structure */
109 ".so is fragile, better use ln(1)",
111 "content before first section header",
112 "first section is not \"NAME\"",
113 "NAME section without Nm before Nd",
114 "NAME section without description",
115 "description not at the end of NAME",
116 "bad NAME section content",
117 "missing comma before name",
118 "missing description line, using \"\"",
119 "sections out of conventional order",
120 "duplicate section title",
121 "unexpected section",
123 "unusual Xr punctuation",
124 "AUTHORS section without An macro",
126 /* related to macros and nesting */
128 "macro neither callable nor escaped",
129 "skipping paragraph macro",
130 "moving paragraph macro out of list",
131 "skipping no-space macro",
132 "blocks badly nested",
133 "nested displays are not portable",
134 "moving content out of list",
135 "fill mode already enabled, skipping",
136 "fill mode already disabled, skipping",
139 /* related to missing macro arguments */
140 "skipping empty request",
141 "conditional request controls empty scope",
142 "skipping empty macro",
144 "empty argument, using 0n",
145 "missing display type, using -ragged",
146 "list type is not the first argument",
147 "missing -width in -tag list, using 6n",
148 "missing utility name, using \"\"",
149 "missing function name, using \"\"",
150 "empty head in list item",
152 "missing font type, using \\fR",
153 "unknown font type, using \\fR",
154 "nothing follows prefix",
155 "empty reference block",
156 "missing section argument",
157 "missing -std argument, adding it",
158 "missing option string, using \"\"",
159 "missing resource identifier, using \"\"",
160 "missing eqn box, using \"\"",
162 /* related to bad macro arguments */
163 "unterminated quoted argument",
164 "duplicate argument",
165 "skipping duplicate argument",
166 "skipping duplicate display type",
167 "skipping duplicate list type",
168 "skipping -width argument",
169 "wrong number of cells",
170 "unknown AT&T UNIX version",
171 "comma in function argument",
172 "parenthesis in function name",
173 "invalid content in Rs block",
174 "invalid Boolean argument",
175 "unknown font, skipping request",
176 "odd number of characters in request",
178 /* related to plain text */
179 "blank line in fill mode, using .sp",
180 "tab in filled text",
181 "whitespace at end of input line",
183 "invalid escape sequence",
184 "undefined string, using \"\"",
186 /* related to tables */
187 "tbl line starts with span",
188 "tbl column starts with span",
189 "skipping vertical bar in tbl layout",
193 /* related to tables */
194 "non-alphabetic character in tbl options",
195 "skipping unknown tbl option",
196 "missing tbl option argument",
197 "wrong tbl option argument size",
199 "invalid character in tbl layout",
200 "unmatched parenthesis in tbl layout",
201 "tbl without any data cells",
202 "ignoring data in spanned tbl cell",
203 "ignoring extra tbl data cells",
204 "data block open at end of tbl",
206 /* related to document structure and macros */
208 "input stack limit exceeded, infinite loop?",
209 "skipping bad character",
210 "skipping unknown macro",
211 "skipping insecure request",
212 "skipping item outside list",
213 "skipping column outside column list",
214 "skipping end of block that is not open",
215 "fewer RS blocks open, skipping",
216 "inserting missing end of block",
217 "appending missing end of block",
219 /* related to request and macro arguments */
220 "escaped character not allowed in a name",
221 "NOT IMPLEMENTED: Bd -file",
222 "skipping display without arguments",
223 "missing list type, using -item",
224 "missing manual name, using \"\"",
225 "uname(3) system call failed, using UNKNOWN",
226 "unknown standard specifier",
227 "skipping request without numeric argument",
228 "NOT IMPLEMENTED: .so with absolute path or \"..\"",
229 ".so request failed",
230 "skipping all arguments",
231 "skipping excess arguments",
234 "unsupported feature",
236 "unsupported control character",
237 "unsupported roff request",
238 "eqn delim option in tbl",
239 "unsupported tbl layout modifier",
240 "ignoring macro in table",
/*
 * Name of each message level.  The table has MANDOCLEVEL_MAX slots
 * and mparse_strlevel() indexes it directly with an enum mandoclevel
 * value.
 * NOTE(review): the initialiser entries are not visible in this
 * excerpt.
 */
243 static const char * const mandoclevels
[MANDOCLEVEL_MAX
] = {
255 resize_buf(struct buf
*buf
, size_t initial
)
258 buf
->sz
= buf
->sz
> initial
/2 ? 2 * buf
->sz
: initial
;
259 buf
->buf
= mandoc_realloc(buf
->buf
, buf
->sz
);
/*
 * Decide whether the input is parsed with the mdoc or the man macro
 * set and record the choice in curp->man->macroset.
 *
 * The format is first requested from roff_getformat().  If that
 * returns 0, the primary input buffer is scanned: on a line starting
 * with a dot or an apostrophe, the two following bytes are compared
 * with "Dd" (selecting MPARSE_MDOC) and with "TH"; memchr() is used
 * to advance to the next newline.  Finally the macro set is stored
 * and the token of the first node is set to TOKEN_NONE.
 *
 * NOTE(review): several lines of this function are missing from this
 * excerpt (local declarations, the scan loop header, the increments
 * of cp, the action taken for "TH", and the else between the two
 * macroset assignments).  Do not restructure without the full text.
 */
263 choose_parser(struct mparse
*curp
)
269 * If neither command line arguments -mdoc or -man select
270 * a parser nor the roff parser found a .Dd or .TH macro
271 * yet, look ahead in the main input buffer.
274 if ((format
= roff_getformat(curp
->roff
)) == 0) {
275 cp
= curp
->primary
->buf
;
276 ep
= cp
+ curp
->primary
->sz
;
278 if (*cp
== '.' || *cp
== '\'') {
280 if (cp
[0] == 'D' && cp
[1] == 'd') {
281 format
= MPARSE_MDOC
;
284 if (cp
[0] == 'T' && cp
[1] == 'H') {
289 cp
= memchr(cp
, '\n', ep
- cp
);
296 if (format
== MPARSE_MDOC
) {
298 curp
->man
->macroset
= MACROSET_MDOC
;
299 curp
->man
->first
->tok
= TOKEN_NONE
;
302 curp
->man
->macroset
= MACROSET_MAN
;
303 curp
->man
->first
->tok
= TOKEN_NONE
;
/*
 * mparse_buf_r(curp, blk, i, start): parse the buffer blk starting at
 * byte offset i.
 *
 * Visible steps, in order:
 *  - Bytes of blk are copied into the line buffer ln (grown in steps
 *    by resize_buf(&ln, 256)) until an unescaped newline is found;
 *    a carriage return directly before the newline is recognised.
 *  - 8-bit input is passed to preconv_encode(); when no encoding is
 *    active or the call fails, MANDOCERR_CHAR_BAD is reported.
 *  - 0x7f and bytes below 0x20 other than tab are reported as
 *    MANDOCERR_CHAR_BAD or MANDOCERR_CHAR_UNSUPP.
 *  - A backslash followed by newline is skipped; a backslash followed
 *    by a double quote or a hash starts a comment that runs to the
 *    end of the line; other escapes are copied as two bytes.
 *  - The finished line is appended to curp->secondary when that
 *    buffer exists, then handed to roff_parseln().
 *  - Depending on the roff result the function recurses on ln
 *    (bounded by REPARSE_LIMIT, else MANDOCERR_ROFFLOOP), handles
 *    a .so request via mparse_open() and mparse_readfd(), adds table
 *    spans or equations, or calls mdoc_parseln() / man_parseln().
 *
 * "start" is nonzero for the call made by mparse_parse_buffer() and
 * zero for the recursive calls made here; with start == 0 the loop
 * also stops at a NUL byte.
 *
 * NOTE(review): many lines of this function are missing from this
 * excerpt (declarations, loop labels, switch/case lines, increments,
 * break/continue/return statements).  The code below is unchanged.
 */
308 * Main parse routine for a buffer.
309 * It assumes encoding and line numbering are already set up.
310 * It can recurse directly (for invocations of user-defined
311 * macros, inline equations, and input line traps)
312 * and indirectly (for .so file inclusion).
315 mparse_buf_r(struct mparse
*curp
, struct buf blk
, size_t i
, int start
)
317 const struct tbl_span
*span
;
319 const char *save_file
;
321 size_t pos
; /* byte number in the ln buffer */
322 size_t j
; /* auxiliary byte number in the blk buffer */
325 int lnn
; /* line number in the real file */
329 memset(&ln
, 0, sizeof(ln
));
335 if (0 == pos
&& '\0' == blk
.buf
[i
])
340 curp
->reparse_count
= 0;
/*
 * Both UTF-8 and Latin-1 are still enabled for this file:
 * replace filenc with the value returned by preconv_cue().
 */
343 curp
->filenc
& MPARSE_UTF8
&&
344 curp
->filenc
& MPARSE_LATIN1
)
345 curp
->filenc
= preconv_cue(&blk
, i
);
/*
 * Character loop: runs to the end of blk; when start is zero
 * it also stops at a NUL byte.
 */
348 while (i
< blk
.sz
&& (start
|| blk
.buf
[i
] != '\0')) {
351 * When finding an unescaped newline character,
352 * leave the character loop to process the line.
353 * Skip a preceding carriage return, if any.
356 if ('\r' == blk
.buf
[i
] && i
+ 1 < blk
.sz
&&
357 '\n' == blk
.buf
[i
+ 1])
359 if ('\n' == blk
.buf
[i
]) {
366 * Make sure we have space for the worst
367 * case of 11 bytes: "\\[u10ffff]\0"
370 if (pos
+ 11 > ln
.sz
)
371 resize_buf(&ln
, 256);
374 * Encode 8-bit input.
379 if ( ! (curp
->filenc
&& preconv_encode(
380 &blk
, &i
, &ln
, &pos
, &curp
->filenc
))) {
381 mandoc_vmsg(MANDOCERR_CHAR_BAD
, curp
,
382 curp
->line
, pos
, "0x%x", c
);
390 * Exclude control characters.
393 if (c
== 0x7f || (c
< 0x20 && c
!= 0x09)) {
394 mandoc_vmsg(c
== 0x00 || c
== 0x04 ||
395 c
> 0x0a ? MANDOCERR_CHAR_BAD
:
396 MANDOCERR_CHAR_UNSUPP
,
397 curp
, curp
->line
, pos
, "0x%x", c
);
404 /* Trailing backslash = a plain char. */
406 if (blk
.buf
[i
] != '\\' || i
+ 1 == blk
.sz
) {
407 ln
.buf
[pos
++] = blk
.buf
[i
++];
412 * Found escape and at least one other character.
413 * When it's a newline character, skip it.
414 * When there is a carriage return in between,
415 * skip that one as well.
418 if ('\r' == blk
.buf
[i
+ 1] && i
+ 2 < blk
.sz
&&
419 '\n' == blk
.buf
[i
+ 2])
421 if ('\n' == blk
.buf
[i
+ 1]) {
427 if ('"' == blk
.buf
[i
+ 1] || '#' == blk
.buf
[i
+ 1]) {
430 /* Comment, skip to end of line */
431 for (; i
< blk
.sz
; ++i
) {
432 if (blk
.buf
[i
] != '\n')
434 if (blk
.buf
[i
- 1] == ' ' ||
435 blk
.buf
[i
- 1] == '\t')
439 pos
+ i
-1 - j
, NULL
);
445 /* Backout trailing whitespaces */
446 for (; pos
> 0; --pos
) {
447 if (ln
.buf
[pos
- 1] != ' ')
449 if (pos
> 2 && ln
.buf
[pos
- 2] == '\\')
455 /* Catch escaped bogus characters. */
457 c
= (unsigned char) blk
.buf
[i
+1];
459 if ( ! (isascii(c
) &&
460 (isgraph(c
) || isblank(c
)))) {
461 mandoc_vmsg(MANDOCERR_CHAR_BAD
, curp
,
462 curp
->line
, pos
, "0x%x", c
);
468 /* Some other escape sequence, copy & cont. */
470 ln
.buf
[pos
++] = blk
.buf
[i
++];
471 ln
.buf
[pos
++] = blk
.buf
[i
++];
475 resize_buf(&ln
, 256);
480 * A significant amount of complexity is contained by
481 * the roff preprocessor. It's line-oriented but can be
482 * expressed on one line, so we need at times to
483 * readjust our starting point and re-run it. The roff
484 * preprocessor can also readjust the buffers with new
485 * data, so we pass them in wholesale.
491 * Maintain a lookaside buffer of all parsed lines. We
492 * only do this if mparse_keep() has been invoked (the
493 * buffer may be accessed with mparse_getkeep()).
496 if (curp
->secondary
) {
497 curp
->secondary
->buf
= mandoc_realloc(
498 curp
->secondary
->buf
,
499 curp
->secondary
->sz
+ pos
+ 2);
500 memcpy(curp
->secondary
->buf
+
503 curp
->secondary
->sz
+= pos
;
505 [curp
->secondary
->sz
] = '\n';
506 curp
->secondary
->sz
++;
508 [curp
->secondary
->sz
] = '\0';
511 rr
= roff_parseln(curp
->roff
, curp
->line
, &ln
, &of
);
515 if (REPARSE_LIMIT
>= ++curp
->reparse_count
)
516 mparse_buf_r(curp
, ln
, of
, 0);
518 mandoc_msg(MANDOCERR_ROFFLOOP
, curp
,
519 curp
->line
, pos
, NULL
);
523 pos
= strlen(ln
.buf
);
531 if ( ! (curp
->options
& MPARSE_SO
) &&
532 (i
>= blk
.sz
|| blk
.buf
[i
] == '\0')) {
533 curp
->sodest
= mandoc_strdup(ln
.buf
+ of
);
538 * We remove `so' clauses from our lookaside
539 * buffer because we're going to descend into
540 * the file recursively.
543 curp
->secondary
->sz
-= pos
+ 1;
544 save_file
= curp
->file
;
545 if ((fd
= mparse_open(curp
, ln
.buf
+ of
)) != -1) {
546 mparse_readfd(curp
, fd
, ln
.buf
+ of
);
548 curp
->file
= save_file
;
550 curp
->file
= save_file
;
551 mandoc_vmsg(MANDOCERR_SO_FAIL
,
552 curp
, curp
->line
, pos
,
553 ".so %s", ln
.buf
+ of
);
554 ln
.sz
= mandoc_asprintf(&cp
,
555 ".sp\nSee the file %s.\n.sp",
560 mparse_buf_r(curp
, ln
, of
, 0);
568 if (curp
->man
->macroset
== MACROSET_NONE
)
572 * Lastly, push down into the parsers themselves.
573 * If libroff returns ROFF_TBL, then add it to the
574 * currently open parse. Since we only get here if
575 * there does exist data (see tbl_data.c), we're
576 * guaranteed that something's been allocated.
577 * Do the same for ROFF_EQN.
581 while ((span
= roff_span(curp
->roff
)) != NULL
)
582 roff_addtbl(curp
->man
, span
);
583 else if (rr
== ROFF_EQN
)
584 roff_addeqn(curp
->man
, roff_eqn(curp
->roff
));
585 else if ((curp
->man
->macroset
== MACROSET_MDOC
?
586 mdoc_parseln(curp
->man
, curp
->line
, ln
.buf
, of
) :
587 man_parseln(curp
->man
, curp
->line
, ln
.buf
, of
)) == 2)
590 /* Temporary buffers typically are not full. */
592 if (0 == start
&& '\0' == blk
.buf
[i
])
595 /* Start the next input line. */
/*
 * Load the complete content of the open descriptor fd into *fb.
 *
 * A failing fstat() terminates the program through err().  For a
 * regular file that is not gzipped, a size above 0x7fffffff is
 * reported as MANDOCERR_TOOLARGE; otherwise the file is mapped
 * read-only with mmap().  For gzipped input the descriptor is wrapped
 * with gzdopen().  In the remaining cases the data is read piecewise
 * with gzread() or read() into a buffer grown by
 * resize_buf(fb, 65536); a buffer size of 1U << 31 is reported as
 * MANDOCERR_TOOLARGE.
 *
 * "file" is only used in the err() messages here.  The caller
 * mparse_readfd() treats a nonzero return value as success.
 * NOTE(review): *with_mmap presumably tells the caller whether the
 * buffer must be released with munmap() -- the assignments are not
 * visible in this excerpt, nor are the return statements.
 */
604 read_whole_file(struct mparse
*curp
, const char *file
, int fd
,
605 struct buf
*fb
, int *with_mmap
)
614 if (fstat(fd
, &st
) == -1)
615 err((int)MANDOCLEVEL_SYSERR
, "%s", file
);
618 * If we're a regular file, try just reading in the whole entry
619 * via mmap(). This is faster than reading it into blocks, and
620 * since each file is only a few bytes to begin with, I'm not
621 * concerned that this is going to tank any machines.
624 if (curp
->gzip
== 0 && S_ISREG(st
.st_mode
)) {
625 if (st
.st_size
> 0x7fffffff) {
626 mandoc_msg(MANDOCERR_TOOLARGE
, curp
, 0, 0, NULL
);
630 fb
->sz
= (size_t)st
.st_size
;
631 fb
->buf
= mmap(NULL
, fb
->sz
, PROT_READ
, MAP_SHARED
, fd
, 0);
632 if (fb
->buf
!= MAP_FAILED
)
638 if ((gz
= gzdopen(fd
, "rb")) == NULL
)
639 err((int)MANDOCLEVEL_SYSERR
, "%s", file
);
644 * If this isn't a regular file (like, say, stdin), then we must
645 * go the old way and just read things in bit by bit.
654 if (fb
->sz
== (1U << 31)) {
655 mandoc_msg(MANDOCERR_TOOLARGE
, curp
,
659 resize_buf(fb
, 65536);
662 gzread(gz
, fb
->buf
+ (int)off
, fb
->sz
- off
) :
663 read(fd
, fb
->buf
+ (int)off
, fb
->sz
- off
);
669 err((int)MANDOCLEVEL_SYSERR
, "%s", file
);
679 mparse_end(struct mparse
*curp
)
681 if (curp
->man
->macroset
== MACROSET_NONE
)
682 curp
->man
->macroset
= MACROSET_MAN
;
683 if (curp
->man
->macroset
== MACROSET_MDOC
)
684 mdoc_endparse(curp
->man
);
686 man_endparse(curp
->man
);
687 roff_endparse(curp
->roff
);
/*
 * Parse one complete input buffer.
 *
 * A function-static counter limits the nesting of calls: once it
 * exceeds 64, MANDOCERR_ROFFLOOP is reported.  The previous
 * curp->primary is saved, replaced by the address of blk for the
 * duration of the parse, and restored afterwards.  When UTF-8 input
 * is enabled and the buffer begins with the bytes 0xef 0xbb 0xbf,
 * the Latin-1 flag is cleared from curp->filenc.  The buffer is then
 * handed to mparse_buf_r() with start set to 1.
 *
 * NOTE(review): several lines are missing from this excerpt,
 * including the increment of the counter, the assignment of offset,
 * and the statement executed when the counter drops back to zero.
 */
691 mparse_parse_buffer(struct mparse
*curp
, struct buf blk
, const char *file
)
693 struct buf
*svprimary
;
696 static int recursion_depth
;
698 if (64 < recursion_depth
) {
699 mandoc_msg(MANDOCERR_ROFFLOOP
, curp
, curp
->line
, 0, NULL
);
703 /* Line number is per-file. */
706 svprimary
= curp
->primary
;
707 curp
->primary
= &blk
;
711 /* Skip an UTF-8 byte order mark. */
712 if (curp
->filenc
& MPARSE_UTF8
&& blk
.sz
> 2 &&
713 (unsigned char)blk
.buf
[0] == 0xef &&
714 (unsigned char)blk
.buf
[1] == 0xbb &&
715 (unsigned char)blk
.buf
[2] == 0xbf) {
717 curp
->filenc
&= ~MPARSE_LATIN1
;
721 mparse_buf_r(curp
, blk
, offset
, 1);
723 if (--recursion_depth
== 0)
726 curp
->primary
= svprimary
;
/*
 * Parse input held in memory: a buffer blk is passed to
 * mparse_parse_buffer() together with the file name, and the
 * resulting curp->file_status is returned.
 * NOTE(review): the end of the parameter list and the lines that
 * fill in blk from buf and len are missing from this excerpt.
 */
731 mparse_readmem(struct mparse
*curp
, void *buf
, size_t len
,
739 mparse_parse_buffer(curp
, blk
, file
);
740 return curp
->file_status
;
/*
 * mparse_readfd(curp, fd, file): load the content of fd with
 * read_whole_file() and, if that succeeds, parse it with
 * mparse_parse_buffer().  For the duration of the parse,
 * curp->filenc is set to the UTF-8 and Latin-1 bits of
 * curp->options; the previous value is restored afterwards.
 * A buffer is released with munmap().  Returns curp->file_status.
 * NOTE(review): declarations and the condition guarding munmap()
 * are missing from this excerpt.
 */
744 * Read the whole file into memory and call the parsers.
745 * Called recursively when an .so request is encountered.
748 mparse_readfd(struct mparse
*curp
, int fd
, const char *file
)
754 if (read_whole_file(curp
, file
, fd
, &blk
, &with_mmap
)) {
755 save_filenc
= curp
->filenc
;
756 curp
->filenc
= curp
->options
&
757 (MPARSE_UTF8
| MPARSE_LATIN1
);
758 mparse_parse_buffer(curp
, blk
, file
);
759 curp
->filenc
= save_filenc
;
762 munmap(blk
.buf
, blk
.sz
);
767 return curp
->file_status
;
/*
 * Open an input file for reading.
 *
 * curp->gzip is set to nonzero when the text after the last dot in
 * the file name equals "gz".  The name is first opened as given with
 * O_RDONLY; failing that, the name with ".gz" appended is tried.
 * If neither attempt works, MANDOCERR_FILE is reported with the
 * strerror() text for errno.
 *
 * The caller in mparse_buf_r() compares the result with -1 to detect
 * failure.
 * NOTE(review): the return statements and the lines between the two
 * open() attempts are missing from this excerpt.
 */
771 mparse_open(struct mparse
*curp
, const char *file
)
777 cp
= strrchr(file
, '.');
778 curp
->gzip
= (cp
!= NULL
&& ! strcmp(cp
+ 1, "gz"));
780 /* First try to use the filename as it is. */
782 if ((fd
= open(file
, O_RDONLY
)) != -1)
786 * If that doesn't work and the filename doesn't
787 * already end in .gz, try appending .gz.
791 mandoc_asprintf(&cp
, "%s.gz", file
);
792 fd
= open(cp
, O_RDONLY
);
800 /* Neither worked, give up. */
802 mandoc_msg(MANDOCERR_FILE
, curp
, 0, 0, strerror(errno
));
/*
 * Allocate and initialise a parse session.
 *
 * The structure is obtained zero-filled from mandoc_calloc().  The
 * options and the minimum message level are stored, the roff parser
 * is created with roff_alloc(), and the man/mdoc parser with
 * roff_man_alloc(), whose last argument is 1 when MPARSE_QUICK is
 * set in the options and 0 otherwise.  If the options contain
 * MPARSE_MDOC or MPARSE_MAN, the corresponding macro set is
 * selected right away.  The token of the first node is set to
 * TOKEN_NONE.
 *
 * NOTE(review): the end of the parameter list, the assignments of
 * the remaining fields and the return statement are missing from
 * this excerpt.
 */
807 mparse_alloc(int options
, enum mandoclevel wlevel
, mandocmsg mmsg
,
812 curp
= mandoc_calloc(1, sizeof(struct mparse
));
814 curp
->options
= options
;
815 curp
->wlevel
= wlevel
;
819 curp
->roff
= roff_alloc(curp
, options
);
820 curp
->man
= roff_man_alloc( curp
->roff
, curp
, curp
->defos
,
821 curp
->options
& MPARSE_QUICK
? 1 : 0);
822 if (curp
->options
& MPARSE_MDOC
) {
824 curp
->man
->macroset
= MACROSET_MDOC
;
825 } else if (curp
->options
& MPARSE_MAN
) {
827 curp
->man
->macroset
= MACROSET_MAN
;
829 curp
->man
->first
->tok
= TOKEN_NONE
;
/*
 * Return a parse session to its initial state: the roff and
 * man/mdoc parsers are reset, the length of the secondary buffer
 * is set to zero, and the file status goes back to MANDOCLEVEL_OK.
 * NOTE(review): lines are missing from this excerpt; in particular
 * any check that curp->secondary is non-NULL before it is
 * dereferenced is not visible -- confirm against the full source.
 */
834 mparse_reset(struct mparse
*curp
)
836 roff_reset(curp
->roff
);
837 roff_man_reset(curp
->man
);
839 curp
->secondary
->sz
= 0;
841 curp
->file_status
= MANDOCLEVEL_OK
;
/*
 * Release a parse session: the man/mdoc parser, the roff parser,
 * and the secondary buffer together with its storage are freed.
 * NOTE(review): lines are missing from this excerpt; any guard
 * around the use of curp->secondary and the release of the
 * remaining members are not visible.
 */
848 mparse_free(struct mparse
*curp
)
851 roff_man_free(curp
->man
);
853 roff_free(curp
->roff
);
855 free(curp
->secondary
->buf
);
857 free(curp
->secondary
);
/*
 * Hand the results of a parse to the caller.
 * When the sodest argument is non-NULL, curp->sodest is stored
 * through it, and the branch below is taken if that value is
 * itself non-NULL.
 * NOTE(review): the end of the parameter list and most of the body
 * are missing from this excerpt.
 */
863 mparse_result(struct mparse
*curp
, struct roff_man
**man
,
867 if (sodest
&& NULL
!= (*sodest
= curp
->sodest
)) {
876 mparse_updaterc(struct mparse
*curp
, enum mandoclevel
*rc
)
878 if (curp
->file_status
> *rc
)
879 *rc
= curp
->file_status
;
/*
 * printf-style front end to mandoc_msg(): the format and its
 * arguments are rendered into a local buffer with vsnprintf(), and
 * the resulting string is passed on with the same error code,
 * parser, line and position.  The return value of vsnprintf() is
 * deliberately discarded, so an over-long message is truncated to
 * the buffer size without notice.
 * NOTE(review): the buffer declaration and the va_list handling are
 * missing from this excerpt.
 */
883 mandoc_vmsg(enum mandocerr t
, struct mparse
*m
,
884 int ln
, int pos
, const char *fmt
, ...)
890 (void)vsnprintf(buf
, sizeof(buf
), fmt
, ap
);
893 mandoc_msg(t
, m
, ln
, pos
, buf
);
/*
 * Report one message.
 *
 * The level of the message is looked up by starting at
 * MANDOCLEVEL_UNSUPP and looping while the error code is below
 * mandoclimits[level].  A message whose level is below m->wlevel is
 * filtered, except for MANDOCERR_FILE, which is never filtered on
 * that ground.  The handler m->mmsg is called with the error code,
 * level, current file name, line, column and message text.  Finally
 * m->file_status is raised to the level if it was lower.
 *
 * NOTE(review): the loop body, the action taken for filtered
 * messages and any guard around the handler call are missing from
 * this excerpt.
 */
897 mandoc_msg(enum mandocerr er
, struct mparse
*m
,
898 int ln
, int col
, const char *msg
)
900 enum mandoclevel level
;
902 level
= MANDOCLEVEL_UNSUPP
;
903 while (er
< mandoclimits
[level
])
906 if (level
< m
->wlevel
&& er
!= MANDOCERR_FILE
)
910 (*m
->mmsg
)(er
, level
, m
->file
, ln
, col
, msg
);
912 if (m
->file_status
< level
)
913 m
->file_status
= level
;
917 mparse_strerror(enum mandocerr er
)
920 return mandocerrs
[er
];
924 mparse_strlevel(enum mandoclevel lvl
)
926 return mandoclevels
[lvl
];
930 mparse_keep(struct mparse
*p
)
933 assert(NULL
== p
->secondary
);
934 p
->secondary
= mandoc_calloc(1, sizeof(struct buf
));
938 mparse_getkeep(const struct mparse
*p
)
941 assert(p
->secondary
);
942 return p
->secondary
->sz
? p
->secondary
->buf
: NULL
;