1 /* GNU fmt -- simple text formatter.
2 Copyright (C) 1994, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 /* Written by Ross Paterson <rap@doc.ic.ac.uk>. */
23 #include <sys/types.h>
26 /* Redefine. Otherwise, systems (Unicos for one) with headers that define
27 it to be a type get syntax errors for the variable declaration below. */
28 #define word unused_word_type
34 /* The following parameters represent the program's idea of what is
35 "best". Adjust to taste, subject to the caveats given. */
37 /* Default longest permitted line length (max_width). */
40 /* Prefer lines to be LEEWAY % shorter than the maximum width, giving
41 room for optimization. */
44 /* The default secondary indent of tagged paragraph used for unindented
45 one-line paragraphs not preceded by any multi-line paragraphs. */
48 /* Costs and bonuses are expressed as the equivalent departure from the
49 optimal line length, multiplied by 10. e.g. assigning something a
50 cost of 50 means that it is as bad as a line 5 characters too short
51 or too long. The definition of SHORT_COST(n) should not be changed.
52 However, EQUIV(n) may need tuning. */
56 #define MAXCOST (~(((COST) 1) << (8 * sizeof (COST) -1)))
58 #define SQR(n) ((n) * (n))
59 #define EQUIV(n) SQR ((COST) (n))
61 /* Cost of a filled line n chars longer or shorter than best_width. */
62 #define SHORT_COST(n) EQUIV ((n) * 10)
64 /* Cost of the difference between adjacent filled lines. */
65 #define RAGGED_COST(n) (SHORT_COST (n) / 2)
67 /* Basic cost per line. */
68 #define LINE_COST EQUIV (70)
70 /* Cost of breaking a line after the first word of a sentence, where
71 the length of the word is N. */
72 #define WIDOW_COST(n) (EQUIV (200) / ((n) + 2))
74 /* Cost of breaking a line before the last word of a sentence, where
75 the length of the word is N. */
76 #define ORPHAN_COST(n) (EQUIV (150) / ((n) + 2))
78 /* Bonus for breaking a line at the end of a sentence. */
79 #define SENTENCE_BONUS EQUIV (50)
81 /* Cost of breaking a line after a period not marking end of a sentence.
82 With the definition of sentence we are using (borrowed from emacs, see
83 get_line()) such a break would then look like a sentence break. Hence
84 we assign a very high cost -- it should be avoided unless things are
86 #define NOBREAK_COST EQUIV (600)
88 /* Bonus for breaking a line before open parenthesis. */
89 #define PAREN_BONUS EQUIV (40)
91 /* Bonus for breaking a line after other punctuation. */
92 #define PUNCT_BONUS EQUIV(40)
94 /* Credit for breaking a long paragraph one line later. */
95 #define LINE_CREDIT EQUIV(3)
97 /* Size of paragraph buffer, in words and characters. Longer paragraphs
98 are handled neatly (cf. flush_paragraph()), so there's little to gain
99 by making these larger. */
100 #define MAXWORDS 1000
101 #define MAXCHARS 5000
103 /* Extra ctype(3)-style macros. */
105 #define isopen(c) (strchr ("([`'\"", c) != NULL)
106 #define isclose(c) (strchr (")]'\"", c) != NULL)
107 #define isperiod(c) (strchr (".?!", c) != NULL)
109 /* Size of a tab stop, for expansion on input and re-introduction on
113 /* Miscellaneous definitions. */
115 typedef unsigned int bool;
119 /* Word descriptor structure. */
121 typedef struct Word WORD
;
126 /* Static attributes determined during input. */
128 const char *text
; /* the text of the word */
129 short length
; /* length of this word */
130 short space
; /* the size of the following space */
131 bool paren
:1; /* starts with open paren */
132 bool period
:1; /* ends in [.?!])* */
133 bool punct
:1; /* ends in punctuation */
134 bool final
:1; /* end of sentence */
136 /* The remaining fields are computed during the optimization. */
138 short line_length
; /* length of the best line starting here */
139 COST best_cost
; /* cost of best paragraph starting here */
140 WORD
*next_break
; /* break which achieves best_cost */
143 /* Forward declarations. */
145 static void set_prefix
__P ((char *p
));
146 static void fmt
__P ((FILE *f
));
147 static bool get_paragraph
__P ((FILE *f
));
148 static int get_line
__P ((FILE *f
, int c
));
149 static int get_prefix
__P ((FILE *f
));
150 static int get_space
__P ((FILE *f
, int c
));
151 static int copy_rest
__P ((FILE *f
, int c
));
152 static bool same_para
__P ((int c
));
153 static void flush_paragraph
__P ((void));
154 static void fmt_paragraph
__P ((void));
155 static void check_punctuation
__P ((WORD
*w
));
156 static COST base_cost
__P ((WORD
*this));
157 static COST line_cost
__P ((WORD
*next
, int len
));
158 static void put_paragraph
__P ((WORD
*finish
));
159 static void put_line
__P ((WORD
*w
, int indent
));
160 static void put_word
__P ((WORD
*w
));
161 static void put_space
__P ((int space
));
163 /* The name this program was run with. */
164 const char *program_name
;
166 /* If nonzero, display usage information and exit. */
167 static int show_help
= 0;
169 /* If nonzero, print the version on standard output and exit. */
170 static int show_version
= 0;
174 /* If TRUE, first 2 lines may have different indent (default FALSE). */
177 /* If TRUE, first 2 lines _must_ have different indent (default FALSE). */
180 /* If TRUE, each line is a paragraph on its own (default FALSE). */
183 /* If TRUE, don't preserve inter-word spacing (default FALSE). */
186 /* Prefix minus leading and trailing spaces (default ""). */
187 static const char *prefix
;
189 /* User-supplied maximum line width (default WIDTH). The only output
191 longer than this will each comprise a single word. */
192 static int max_width
;
194 /* Values derived from the option values. */
196 /* The length of prefix minus leading space. */
197 static int prefix_full_length
;
199 /* The length of the leading space trimmed from the prefix. */
200 static int prefix_lead_space
;
202 /* The length of prefix minus leading and trailing space. */
203 static int prefix_length
;
205 /* The preferred width of text lines, set to LEEWAY % less than max_width. */
206 static int best_width
;
208 /* Dynamic variables. */
210 /* Start column of the character most recently read from the input file. */
211 static int in_column
;
213 /* Start column of the next character to be written to stdout. */
214 static int out_column
;
216 /* Space for the paragraph text -- longer paragraphs are handled neatly
217 (cf. flush_paragraph()). */
218 static char parabuf
[MAXCHARS
];
220 /* A pointer into parabuf, indicating the first unused character position. */
223 /* The words of a paragraph -- longer paragraphs are handled neatly
224 (cf. flush_paragraph()). */
225 static WORD word
[MAXWORDS
];
227 /* A pointer into the above word array, indicating the first position
228 after the last complete word. Sometimes it will point at an incomplete
230 static WORD
*word_limit
;
232 /* If TRUE, current input file contains tab characters, and so tabs can be
233 used for white space on output. */
236 /* Space before trimmed prefix on each line of the current paragraph. */
237 static int prefix_indent
;
239 /* Indentation of the first line of the current paragraph. */
240 static int first_indent
;
242 /* Indentation of other lines of the current paragraph */
243 static int other_indent
;
245 /* To detect the end of a paragraph, we need to look ahead to the first
246 non-blank character after the prefix on the next line, or the first
247 character on the following line that failed to match the prefix.
248 We can reconstruct the lookahead from that character (next_char), its
249 position on the line (in_column) and the amount of space before the
250 prefix (next_prefix_indent). See get_paragraph() and copy_rest(). */
252 /* The last character read from the input file. */
253 static int next_char
;
255 /* The space before the trimmed prefix (or part of it) on the next line
256 after the current paragraph. */
257 static int next_prefix_indent
;
259 /* If nonzero, the length of the last line output in the current
260 paragraph, used to charge for raggedness at the split point for long
261 paragraphs chosen by fmt_paragraph(). */
262 static int last_line_length
;
268 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
272 printf (_("Usage: %s [-DIGITS] [OPTION]... [FILE]...\n"), program_name
);
274 Reformat each paragraph in the FILE(s), writing to standard output.\n\
275 If no FILE or if FILE is `-', read standard input.\n\
277 Mandatory arguments to long options are mandatory for short options too.\n\
278 -c, --crown-margin preserve indentation of first two lines\n\
279 -s, --split-only split long lines, but do not refill\n\
280 -t, --tagged-paragraph indentation of first line different from second\n\
281 -u, --uniform-spacing one space between words, two after sentences\n\
282 -w, --width=NUMBER maximum line width (default of 75 columns)\n\
283 -p, --prefix=STRING combine only lines having STRING as prefix\n\
284 --help display this help and exit\n\
285 --version output version information and exit\n\
287 In -wNUMBER, the letter `w' may be omitted.\n"),
293 /* Decode options and launch execution. */
295 static const struct option long_options
[] =
297 {"crown-margin", no_argument
, NULL
, 'c'},
298 {"help", no_argument
, &show_help
, 1},
299 {"prefix", required_argument
, NULL
, 'p'},
300 {"split-only", no_argument
, NULL
, 's'},
301 {"tagged-paragraph", no_argument
, NULL
, 't'},
302 {"uniform-spacing", no_argument
, NULL
, 'u'},
303 {"version", no_argument
, &show_version
, 1},
304 {"width", required_argument
, NULL
, 'w'},
309 main (register int argc
, register char **argv
)
314 program_name
= argv
[0];
316 crown
= tagged
= split
= uniform
= FALSE
;
319 prefix_length
= prefix_lead_space
= prefix_full_length
= 0;
321 if (argc
> 1 && argv
[1][0] == '-' && ISDIGIT (argv
[1][1]))
324 /* Old option syntax; a dash followed by one or more digits.
325 Move past the number. */
326 for (++argv
[1]; ISDIGIT (*argv
[1]); ++argv
[1])
328 /* FIXME: use strtol to detect overflow. */
329 max_width
= max_width
* 10 + *argv
[1] - '0';
331 /* Make the options we just parsed invisible to getopt. */
337 while ((optchar
= getopt_long (argc
, argv
, "0123456789cstuw:p:",
365 /* FIXME: use strtol. */
366 max_width
= atoi (optarg
);
377 printf ("fmt - %s\n", version_string
);
384 best_width
= max_width
* (2 * (100 - LEEWAY
) + 1) / 200;
389 for (; optind
< argc
; optind
++)
390 if (strcmp (argv
[optind
], "-") == 0)
394 infile
= fopen (argv
[optind
], "r");
401 error (0, errno
, argv
[optind
]);
407 /* Trim space from the front and back of the string P, yielding the prefix,
408 and record the lengths of the prefix and the space trimmed. */
411 set_prefix (register char *p
)
415 prefix_lead_space
= 0;
422 prefix_full_length
= strlen (p
);
423 s
= p
+ prefix_full_length
;
424 while (s
> p
&& s
[-1] == ' ')
427 prefix_length
= s
- p
;
430 /* read file F and send formatted output to stdout. */
437 next_char
= get_prefix (f
);
438 while (get_paragraph (f
))
441 put_paragraph (word_limit
);
445 /* Read a paragraph from input file F. A paragraph consists of a
446 maximal number of non-blank (excluding any prefix) lines subject to:
447 * In split mode, a paragraph is a single non-blank line.
448 * In crown mode, the second and subsequent lines must have the
449 same indentation, but possibly different from the indent of the
451 * Tagged mode is similar, but the first and second lines must have
452 different indentations.
453 * Otherwise, all lines of a paragraph must have the same indent.
454 If a prefix is in effect, it must be present at the same indent for
455 each line in the paragraph.
457 Return FALSE if end-of-file was encountered before the start of a
458 paragraph, else TRUE. */
461 get_paragraph (FILE *f
)
465 last_line_length
= 0;
468 /* Scan (and copy) blank lines, and lines not introduced by the prefix. */
470 while (c
== '\n' || c
== EOF
471 || next_prefix_indent
< prefix_lead_space
472 || in_column
< next_prefix_indent
+ prefix_full_length
)
474 c
= copy_rest (f
, c
);
484 /* Got a suitable first line for a paragraph. */
486 prefix_indent
= next_prefix_indent
;
487 first_indent
= in_column
;
492 /* Read rest of paragraph (unless split is specified). */
495 other_indent
= first_indent
;
500 other_indent
= in_column
;
502 { /* for each line till the end of the para */
505 while (same_para (c
) && in_column
== other_indent
);
508 other_indent
= first_indent
;
512 if (same_para (c
) && in_column
!= first_indent
)
514 other_indent
= in_column
;
516 { /* for each line till the end of the para */
519 while (same_para (c
) && in_column
== other_indent
);
522 /* Only one line: use the secondary indent from last time if it
523 splits, or 0 if there have been no multi-line paragraphs in the
524 input so far. But if these rules make the two indents the same,
525 pick a new secondary indent. */
527 else if (other_indent
== first_indent
)
528 other_indent
= first_indent
== 0 ? DEF_INDENT
: 0;
532 other_indent
= first_indent
;
533 while (same_para (c
) && in_column
== other_indent
)
536 (word_limit
- 1)->period
= (word_limit
- 1)->final
= TRUE
;
541 /* Copy to the output a line that failed to match the prefix, or that
542 was blank after the prefix. In the former case, C is the character
543 that failed to match the prefix. In the latter, C is \n or EOF.
544 Return the character (\n or EOF) ending the line. */
547 copy_rest (FILE *f
, register int c
)
549 register const char *s
;
552 if (in_column
> next_prefix_indent
&& c
!= '\n' && c
!= EOF
)
554 put_space (next_prefix_indent
);
555 for (s
= prefix
; out_column
!= in_column
; out_column
++)
558 while (c
!= '\n' && c
!= EOF
)
566 /* Return TRUE if a line whose first non-blank character after the
567 prefix (if any) is C could belong to the current paragraph,
571 same_para (register int c
)
573 return (next_prefix_indent
== prefix_indent
574 && in_column
>= next_prefix_indent
+ prefix_full_length
575 && c
!= '\n' && c
!= EOF
);
578 /* Read a line from input file F, given first non-blank character C
579 after the prefix, and the following indent, and break it into words.
580 A word is a maximal non-empty string of non-white characters. A word
581 ending in [.?!]["')\]]* and followed by end-of-line or at least two
582 spaces ends a sentence, as in emacs.
584 Return the first non-blank character of the next line. */
587 get_line (FILE *f
, register int c
)
590 register char *end_of_parabuf
;
591 register WORD
*end_of_word
;
593 end_of_parabuf
= ¶buf
[MAXCHARS
];
594 end_of_word
= &word
[MAXWORDS
- 2];
597 { /* for each word in a line */
601 word_limit
->text
= wptr
;
604 if (wptr
== end_of_parabuf
)
609 while (c
!= EOF
&& !isspace (c
));
610 in_column
+= word_limit
->length
= wptr
- word_limit
->text
;
611 check_punctuation (word_limit
);
613 /* Scan inter-word space. */
616 c
= get_space (f
, c
);
617 word_limit
->space
= in_column
- start
;
618 word_limit
->final
= (c
== EOF
619 || (word_limit
->period
620 && (c
== '\n' || word_limit
->space
> 1)));
621 if (c
== '\n' || c
== EOF
|| uniform
)
622 word_limit
->space
= word_limit
->final
? 2 : 1;
623 if (word_limit
== end_of_word
)
630 return get_prefix (f
);
633 /* Read a prefix from input file F. Return either first non-matching
634 character, or first non-blank character after the prefix. */
640 register const char *p
;
643 c
= get_space (f
, getc (f
));
644 if (prefix_length
== 0)
645 next_prefix_indent
= prefix_lead_space
< in_column
?
646 prefix_lead_space
: in_column
;
649 next_prefix_indent
= in_column
;
650 for (p
= prefix
; *p
!= '\0'; p
++)
657 c
= get_space (f
, c
);
662 /* Read blank characters from input file F, starting with C, and keeping
663 in_column up-to-date. Return first non-blank character. */
666 get_space (FILE *f
, register int c
)
675 in_column
= (in_column
/ TABWIDTH
+ 1) * TABWIDTH
;
683 /* Set extra fields in word W describing any attached punctuation. */
686 check_punctuation (register WORD
*w
)
688 register const char *start
, *finish
;
691 finish
= start
+ (w
->length
- 1);
692 w
->paren
= isopen (*start
);
693 w
->punct
= ispunct (*finish
);
694 while (isclose (*finish
) && finish
> start
)
696 w
->period
= isperiod (*finish
);
699 /* Flush part of the paragraph to make room. This function is called on
700 hitting the limit on the number of words or characters. */
703 flush_paragraph (void)
710 /* In the special case where it's all one word, just flush it. */
712 if (word_limit
== word
)
714 printf ("%*s", wptr
- parabuf
, parabuf
);
720 - format what you have so far as a paragraph,
721 - find a low-cost line break near the end,
723 - make that the start of the paragraph. */
727 /* Choose a good split point. */
729 split_point
= word_limit
;
730 best_break
= MAXCOST
;
731 for (w
= word
->next_break
; w
!= word_limit
; w
= w
->next_break
)
733 if (w
->best_cost
- w
->next_break
->best_cost
< best_break
)
736 best_break
= w
->best_cost
- w
->next_break
->best_cost
;
738 if (best_break
<= MAXCOST
- LINE_CREDIT
)
739 best_break
+= LINE_CREDIT
;
741 put_paragraph (split_point
);
743 /* Copy text of words down to start of parabuf -- we use memmove because
744 the source and target may overlap. */
746 memmove (parabuf
, split_point
->text
, (size_t) (wptr
- split_point
->text
));
747 shift
= split_point
->text
- parabuf
;
750 /* Adjust text pointers. */
752 for (w
= split_point
; w
<= word_limit
; w
++)
755 /* Copy words from split_point down to word -- we use memmove because
756 the source and target may overlap. */
758 memmove ((char *) word
, (char *) split_point
,
759 (word_limit
- split_point
+ 1) * sizeof (WORD
));
760 word_limit
-= split_point
- word
;
763 /* Compute the optimal formatting for the whole paragraph by computing
764 and remembering the optimal formatting for each suffix from the empty
765 one to the whole paragraph. */
770 register WORD
*start
, *w
;
772 register COST wcost
, best
;
775 word_limit
->best_cost
= 0;
776 saved_length
= word_limit
->length
;
777 word_limit
->length
= max_width
; /* sentinel */
779 for (start
= word_limit
- 1; start
>= word
; start
--)
782 len
= start
== word
? first_indent
: other_indent
;
784 /* At least one word, however long, in the line. */
792 /* Consider breaking before w. */
794 wcost
= line_cost (w
, len
) + w
->best_cost
;
795 if (start
== word
&& last_line_length
> 0)
796 wcost
+= RAGGED_COST (len
- last_line_length
);
800 start
->next_break
= w
;
801 start
->line_length
= len
;
803 len
+= (w
- 1)->space
+ w
->length
; /* w > start >= word */
805 while (len
< max_width
);
806 start
->best_cost
= best
+ base_cost (start
);
809 word_limit
->length
= saved_length
;
812 /* Return the constant component of the cost of breaking before the
816 base_cost (register WORD
*this)
824 if ((this - 1)->period
)
826 if ((this - 1)->final
)
827 cost
-= SENTENCE_BONUS
;
829 cost
+= NOBREAK_COST
;
831 else if ((this - 1)->punct
)
833 else if (this > word
+ 1 && (this - 2)->final
)
834 cost
+= WIDOW_COST ((this - 1)->length
);
839 else if (this->final
)
840 cost
+= ORPHAN_COST (this->length
);
845 /* Return the component of the cost of breaking before word NEXT that
846 depends on LEN, the length of the line beginning there. */
849 line_cost (register WORD
*next
, register int len
)
854 if (next
== word_limit
)
856 n
= best_width
- len
;
857 cost
= SHORT_COST (n
);
858 if (next
->next_break
!= word_limit
)
860 n
= len
- next
->line_length
;
861 cost
+= RAGGED_COST (n
);
866 /* Output to stdout a paragraph from word up to (but not including)
867 FINISH, which must be in the next_break chain from word. */
870 put_paragraph (register WORD
*finish
)
874 put_line (word
, first_indent
);
875 for (w
= word
->next_break
; w
!= finish
; w
= w
->next_break
)
876 put_line (w
, other_indent
);
879 /* Output to stdout the line beginning with word W, beginning in column
880 INDENT, including the prefix (if any). */
883 put_line (register WORD
*w
, int indent
)
885 register WORD
*endline
;
888 put_space (prefix_indent
);
889 fputs (prefix
, stdout
);
890 out_column
+= prefix_length
;
891 put_space (indent
- out_column
);
893 endline
= w
->next_break
- 1;
894 for (; w
!= endline
; w
++)
897 put_space (w
->space
);
900 last_line_length
= out_column
;
904 /* Output to stdout the word W. */
907 put_word (register WORD
*w
)
909 register const char *s
;
913 for (n
= w
->length
; n
!= 0; n
--)
915 out_column
+= w
->length
;
918 /* Output to stdout SPACE spaces, or equivalent tabs. */
921 put_space (int space
)
923 register int space_target
, tab_target
;
925 space_target
= out_column
+ space
;
928 tab_target
= space_target
/ TABWIDTH
* TABWIDTH
;
929 if (out_column
+ 1 < tab_target
)
930 while (out_column
< tab_target
)
933 out_column
= (out_column
/ TABWIDTH
+ 1) * TABWIDTH
;
936 while (out_column
< space_target
)