1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 1985-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
23 #include <stdckdint.h>
26 #include <sys/types.h>
30 #include <argv-iter.h>
33 #include <readtokens0.h>
34 #include <stat-size.h>
35 #include <xbinary-io.h>
40 /* The official name of this program (e.g., no 'g' prefix). */
41 #define PROGRAM_NAME "wc"
44 proper_name ("Paul Rubin"), \
45 proper_name ("David MacKenzie")
47 /* Size of atomic reads: the byte count passed to each read call.
Buffers filled by those reads are declared with BUFFER_SIZE + 1 bytes. */
48 #define BUFFER_SIZE (16 * 1024)
/* Per-byte lookup tables indexed by unsigned char value.  main fills
   wc_isprint from isprint (only when line lengths are requested) and
   wc_isspace from isspace combined with iswnbspace (btoc32 (i)).  */
50 static bool wc_isprint
[UCHAR_MAX
+ 1];
51 static bool wc_isspace
[UCHAR_MAX
+ 1];
55 /* Cumulative number of lines, words, chars and bytes in all files so far.
56 max_line_length is the maximum over all files processed so far. */
57 static uintmax_t total_lines
;
58 static uintmax_t total_words
;
59 static uintmax_t total_chars
;
60 static uintmax_t total_bytes
;
/* Each flag accumulates the result of the ckd_add call in wc that adds a
   per-file count to the matching total.  When a flag is set, main
   replaces that total with UINTMAX_MAX and emits an EOVERFLOW
   diagnostic.  */
61 static bool total_lines_overflow
;
62 static bool total_words_overflow
;
63 static bool total_chars_overflow
;
64 static bool total_bytes_overflow
;
65 static intmax_t max_line_length
;
67 /* Which counts to print. main enables lines, words and bytes when no
count was selected on the command line. */
68 static bool print_lines
, print_words
, print_chars
, print_bytes
;
69 static bool print_linelength
;
71 /* The print width of each count. main sets it to 1 for --total=only and
to the result of compute_number_width otherwise. */
72 static int number_width
;
74 /* True if we have ever read the standard input. Set by wc_file; main
closes STDIN_FILENO before exiting when it is set. */
75 static bool have_read_stdin
;
77 /* Used to determine if file size can be determined without reading.
Initialized from getpagesize in main. */
78 static idx_t page_size
;
80 /* Enable to _not_ treat non breaking space as a word separator.
True when POSIXLY_CORRECT is present in the environment. */
81 static bool posixly_correct
;
83 /* The result of calling fstat or stat on a file descriptor or file. */
86 /* If positive, fstat or stat has not been called yet. Otherwise,
87 this is the value returned from fstat or stat. */
90 /* If FAILED is zero, this is the file's status. */
94 /* For long options that have no equivalent short option, use a
95 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
98 DEBUG_PROGRAM_OPTION
= CHAR_MAX
+ 1,
/* Long-option table handed to getopt_long in main.  Each entry gives the
   option name, whether it takes an argument, and the value returned for
   it: the matching short option letter or one of the pseudo option
   values.  The all-null entry terminates the table.  */
103 static struct option
const longopts
[] =
105 {"bytes", no_argument
, nullptr, 'c'},
106 {"chars", no_argument
, nullptr, 'm'},
107 {"lines", no_argument
, nullptr, 'l'},
108 {"words", no_argument
, nullptr, 'w'},
109 {"debug", no_argument
, nullptr, DEBUG_PROGRAM_OPTION
},
110 {"files0-from", required_argument
, nullptr, FILES0_FROM_OPTION
},
111 {"max-line-length", no_argument
, nullptr, 'L'},
112 {"total", required_argument
, nullptr, TOTAL_OPTION
},
113 {GETOPT_HELP_OPTION_DECL
},
114 {GETOPT_VERSION_OPTION_DECL
},
115 {nullptr, 0, nullptr, 0}
/* Values describing when the line with total counts is printed.  */
120 total_auto
, /* 0: default or --total=auto */
121 total_always
, /* 1: --total=always */
122 total_only
, /* 2: --total=only */
123 total_never
/* 3: --total=never */
/* Argument strings accepted by --total, listed in the same order as
   total_types below; main passes both arrays to XARGMATCH to turn the
   option argument into a total_type value.  */
125 static char const *const total_args
[] =
127 "auto", "always", "only", "never", nullptr
129 static enum total_type
const total_types
[] =
131 total_auto
, total_always
, total_only
, total_never
/* NOTE(review): presumably a compile-time check that the two arrays
   above correspond -- confirm against the argmatch documentation.  */
133 ARGMATCH_VERIFY (total_args
, total_types
);
/* The selected total mode; total_auto unless main overwrites it while
   parsing --total.  */
134 static enum total_type total_mode
= total_auto
;
136 #ifdef USE_AVX2_WC_LINECOUNT
/* Probe for AVX2 with __builtin_cpu_supports ("avx2"); AVX_ENABLED is
   true when that builtin returns a positive value.  The error call
   emits a diagnostic saying whether AVX2 support was detected.  */
138 avx2_supported (void)
140 bool avx_enabled
= 0 < __builtin_cpu_supports ("avx2");
143 error (0, 0, (avx_enabled
144 ? _("using avx2 hardware support")
145 : _("avx2 support not detected")));
/* Usage output.  STATUS is compared against EXIT_SUCCESS.  The help text
   gives the two synopsis forms (program_name is substituted into both),
   describes what is counted and each option, and is followed by the
   standard --help and --version descriptions written to stdout and by
   emit_ancillary_info.  */
154 if (status
!= EXIT_SUCCESS
)
159 Usage: %s [OPTION]... [FILE]...\n\
160 or: %s [OPTION]... --files0-from=F\n\
162 program_name
, program_name
);
164 Print newline, word, and byte counts for each FILE, and a total line if\n\
165 more than one FILE is specified. A word is a nonempty sequence of non white\n\
166 space delimited by white space characters or by start or end of input.\n\
173 The options below may be used to select which counts are printed, always in\n\
174 the following order: newline, word, character, byte, maximum line length.\n\
175 -c, --bytes print the byte counts\n\
176 -m, --chars print the character counts\n\
177 -l, --lines print the newline counts\n\
180 --files0-from=F read input from the files specified by\n\
181 NUL-terminated names in file F;\n\
182 If F is - then read names from standard input\n\
183 -L, --max-line-length print the maximum display width\n\
184 -w, --words print the word counts\n\
187 --total=WHEN when to print a line with total counts;\n\
188 WHEN can be: auto, always, only, never\n\
190 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
191 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
192 emit_ancillary_info (PROGRAM_NAME
);
197 /* Return nonzero if WC is a non-breaking space, that is, one of the
code points U+00A0, U+2007, U+202F or U+2060.  Always return zero
when posixly_correct is set. */
200 iswnbspace (wint_t wc
)
202 return ! posixly_correct
203 && (wc
== 0x00A0 || wc
== 0x2007
204 || wc
== 0x202F || wc
== 0x2060);
207 /* Print counts to standard output, each formatted in a field of
number_width columns, followed by the file name.
FILE is the name of the file (or null for standard input)
208 associated with the specified counters. */
210 write_counts (uintmax_t lines
,
217 static char const format_sp_int
[] = " %*s";
/* format_int starts one character into format_sp_int, skipping its
   leading space, so the first count printed has no separator.  After
   each count it is reset to format_sp_int so that every later count is
   preceded by a space.  */
218 char const *format_int
= format_sp_int
+ 1;
219 char buf
[MAX (INT_BUFSIZE_BOUND (intmax_t),
220 INT_BUFSIZE_BOUND (uintmax_t))];
224 printf (format_int
, number_width
, umaxtostr (lines
, buf
));
225 format_int
= format_sp_int
;
229 printf (format_int
, number_width
, umaxtostr (words
, buf
));
230 format_int
= format_sp_int
;
234 printf (format_int
, number_width
, umaxtostr (chars
, buf
));
235 format_int
= format_sp_int
;
239 printf (format_int
, number_width
, umaxtostr (bytes
, buf
));
240 format_int
= format_sp_int
;
/* linelength is the only signed count, hence imaxtostr.  */
242 if (print_linelength
)
243 printf (format_int
, number_width
, imaxtostr (linelength
, buf
));
/* A file name containing a newline is printed through quotef; any
   other name is printed as is.  */
245 printf (" %s", strchr (file
, '\n') ? quotef (file
) : file
);
249 /* Read FD and return a summary: a struct wc_lines holding an error
number followed by the line and byte counts.  The error number is 0
when read returned 0 and the value of errno otherwise. */
250 static struct wc_lines
253 #ifdef USE_AVX2_WC_LINECOUNT
/* Holds 1 or -1 once avx2_supported has been consulted.
   NOTE(review): presumably 0 means "not probed yet" -- confirm.  */
254 static signed char use_avx2
;
256 use_avx2
= avx2_supported () ? 1 : -1;
258 return wc_lines_avx2 (fd
);
261 intmax_t lines
= 0, bytes
= 0;
262 bool long_lines
= false;
266 char buf
[BUFFER_SIZE
+ 1];
267 ssize_t bytes_read
= read (fd
, buf
, BUFFER_SIZE
);
269 return (struct wc_lines
) { bytes_read
== 0 ? 0 : errno
, lines
, bytes
};
/* END points just past the bytes obtained by this read.  */
272 char *end
= buf
+ bytes_read
;
277 /* Avoid function call overhead for shorter lines. */
278 for (char *p
= buf
; p
< end
; p
++)
279 buflines
+= *p
== '\n';
283 /* rawmemchr is more efficient with longer lines. */
285 for (char *p
= buf
; (p
= rawmemchr (p
, '\n')) < end
; p
++)
289 /* If the average line length in the block is >= 15, then use
290 memchr for the next block, where system specific optimizations
291 may outweigh function call overhead.
292 FIXME: This line length was determined in 2015, on both
293 x86_64 and ppc64, but it's worth re-evaluating in future with
294 newer compilers, CPUs, or memchr() implementations etc. */
295 long_lines
= 15 * buflines
<= bytes_read
;
300 /* Count the input open on descriptor FD: lines, words, characters,
bytes and maximum line length, as needed for the selected counts.
Unless only the total was requested, print the counts with
write_counts; the counts are also added to the running totals.
FILE_X is the name of the file (or null for standard
301 input) that is open on descriptor FD. *FSTATUS is its status.
302 CURRENT_POS is the current file offset if known, negative if unknown.
303 Return true if successful. */
305 wc (int fd
, char const *file_x
, struct fstatus
*fstatus
, off_t current_pos
)
308 char buf
[BUFFER_SIZE
+ 1];
309 intmax_t lines
, words
, chars
, bytes
, linelength
;
310 bool count_bytes
, count_chars
, count_complicated
;
/* FILE is the name used in diagnostics; it falls back to a translated
   "standard input" when FILE_X is null.  */
311 char const *file
= file_x
? file_x
: _("standard input");
313 lines
= words
= chars
= bytes
= linelength
= 0;
315 /* If in the current locale, chars are equivalent to bytes, we prefer
316 counting bytes, because that's easier. */
319 count_bytes
= print_bytes
;
320 count_chars
= print_chars
;
324 count_bytes
= print_bytes
|| print_chars
;
/* Words and line lengths need the character-by-character loops.  */
327 count_complicated
= print_words
|| print_linelength
;
329 /* Advise the kernel of our access pattern only if we will read(). */
330 if (!count_bytes
|| count_chars
|| print_lines
|| count_complicated
)
331 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
333 /* When counting only bytes, save some line- and word-counting
334 overhead. If FD is a 'regular' Unix file, using lseek is enough
335 to get its 'size' in bytes. Otherwise, read blocks of BUFFER_SIZE
336 bytes at a time until EOF. Note that the 'size' (number of bytes)
337 that wc reports is smaller than stats.st_size when the file is not
338 positioned at its beginning. That's why the lseek calls below are
339 necessary. For example the command
340 '(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
341 should make wc report '0' bytes. */
343 if (count_bytes
&& !count_chars
&& !print_lines
&& !count_complicated
)
345 bool skip_read
= false;
/* A positive FAILED means the status has not been obtained yet, so
   obtain it now.  */
347 if (0 < fstatus
->failed
)
348 fstatus
->failed
= fstat (fd
, &fstatus
->st
);
350 /* For sized files, seek to one st_blksize before EOF rather than to EOF.
351 This works better for files in proc-like file systems where
352 the size is only approximate. */
353 if (! fstatus
->failed
&& usable_st_size (&fstatus
->st
)
354 && 0 <= fstatus
->st
.st_size
)
356 off_t end_pos
= fstatus
->st
.st_size
;
358 current_pos
= lseek (fd
, 0, SEEK_CUR
);
360 if (end_pos
% page_size
)
362 /* We only need special handling of /proc and /sys files etc.
363 when they're a multiple of PAGE_SIZE. In the common case
364 for files with st_size not a multiple of PAGE_SIZE,
365 it's more efficient and accurate to use st_size.
367 Be careful here. The current position may actually be
368 beyond the end of the file. As in the example above. */
370 bytes
= end_pos
< current_pos
? 0 : end_pos
- current_pos
;
371 if (bytes
&& 0 <= lseek (fd
, bytes
, SEEK_CUR
))
378 off_t hi_pos
= (end_pos
379 - end_pos
% (STP_BLKSIZE (&fstatus
->st
) + 1));
380 if (0 <= current_pos
&& current_pos
< hi_pos
381 && 0 <= lseek (fd
, hi_pos
, SEEK_CUR
))
382 bytes
= hi_pos
- current_pos
;
388 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
389 for (ssize_t bytes_read
;
390 (bytes_read
= read (fd
, buf
, BUFFER_SIZE
));
399 else if (!count_chars
&& !count_complicated
)
401 /* Use a separate loop when counting only lines or lines and bytes --
402 but not chars or words. */
403 struct wc_lines w
= wc_lines (fd
);
/* Multibyte locale: decode the input with mbrtoc32.  */
408 else if (MB_CUR_MAX
> 1)
410 bool in_word
= false;
411 intmax_t linepos
= 0;
412 mbstate_t state
; mbszero (&state
);
413 bool in_shift
= false;
414 idx_t prev
= 0; /* Number of bytes carried over from previous round. */
416 for (ssize_t bytes_read
;
417 ((bytes_read
= read (fd
, buf
+ prev
, BUFFER_SIZE
- prev
))
429 char const *plim
= p
+ prev
+ bytes_read
;
436 if (!in_shift
&& 0 <= *p
&& *p
< 0x80)
438 /* Handle most ASCII characters quickly, without calling
446 idx_t scanbytes
= plim
- (p
+ prev
);
447 size_t n
= mbrtoc32 (&wide_char
, p
+ prev
, scanbytes
, &state
);
452 if (n
== (size_t) -2 && plim
- p
< BUFFER_SIZE
455 /* An incomplete character that is not ridiculously
456 long and there may be more input. Move the bytes
457 to buffer start and prepare to read more data. */
459 memmove (buf
, p
, prev
);
464 /* Remember that we read a byte, but don't complain
465 about the error. Because of the decoding error,
466 this is considered to be a byte but not a
467 character (that is, chars is not incremented). */
472 /* Treat encoding errors as non white space.
473 POSIX says a word is "a non-zero-length string of
474 characters delimited by white space". This is
475 wrong in some sense, as the string can be delimited
476 by start or end of input, and it is unclear what it
477 means when the input contains encoding errors.
478 Since encoding errors are not white space,
479 treat them that way here. */
486 single_byte
= charbytes
== !in_shift
;
487 in_shift
= !mbsinit (&state
);
497 if (linepos
> linelength
)
498 linelength
= linepos
;
/* Advance to the next multiple of 8 columns.  */
504 linepos
+= 8 - (linepos
% 8);
519 linepos
+= wc_isprint
[wide_char
];
520 in_word2
= !wc_isspace
[wide_char
];
524 /* c32width can be expensive on macOS for example,
525 so avoid if not needed. */
526 if (print_linelength
)
528 int width
= c32width (wide_char
);
532 in_word2
= !iswnbspace (wide_char
);
535 /* Count words by counting word starts, i.e., each
536 white space character (or the start of input)
537 followed by non white space. */
538 words
+= !in_word
& in_word2
;
548 if (linepos
> linelength
)
549 linelength
= linepos
;
/* Remaining case: classify each byte with the wc_isprint and
   wc_isspace tables.  */
553 bool in_word
= false;
554 intmax_t linepos
= 0;
556 for (ssize_t bytes_read
; (bytes_read
= read (fd
, buf
, BUFFER_SIZE
)); )
568 unsigned char c
= *p
++;
576 if (linepos
> linelength
)
577 linelength
= linepos
;
/* Advance to the next multiple of 8 columns.  */
583 linepos
+= 8 - (linepos
% 8);
595 linepos
+= wc_isprint
[c
];
596 bool in_word2
= !wc_isspace
[c
];
/* A word starts wherever a non-space byte follows a non-word state.  */
597 words
+= !in_word
& in_word2
;
602 while (--bytes_read
);
604 if (linepos
> linelength
)
605 linelength
= linepos
;
/* This comparison of two bools holds only when print_chars is true and
   count_chars is false, i.e. characters were requested but were not
   counted separately from bytes.  */
608 if (count_chars
< print_chars
)
611 if (total_mode
!= total_only
)
612 write_counts (lines
, words
, chars
, bytes
, linelength
, file_x
);
/* Add to the running totals, remembering any overflow so that main can
   report it.  */
614 total_lines_overflow
|= ckd_add (&total_lines
, total_lines
, lines
);
615 total_words_overflow
|= ckd_add (&total_words
, total_words
, words
);
616 total_chars_overflow
|= ckd_add (&total_chars
, total_chars
, chars
);
617 total_bytes_overflow
|= ckd_add (&total_bytes
, total_bytes
, bytes
);
619 if (linelength
> max_line_length
)
620 max_line_length
= linelength
;
623 error (0, err
, "%s", quotef (file
));
/* Count the file named FILE by calling wc, using *FSTATUS as its status
   record, and return the result of wc.  A null FILE or "-" selects
   standard input: have_read_stdin is set, standard input is switched to
   binary mode and wc is called with an unknown offset (-1).  Any other
   FILE is opened read-only in binary mode and wc is called with offset
   0.  The error calls report errno together with the quoted name.  */
628 wc_file (char const *file
, struct fstatus
*fstatus
)
630 if (! file
|| STREQ (file
, "-"))
632 have_read_stdin
= true;
633 xset_binary_mode (STDIN_FILENO
, O_BINARY
);
634 return wc (STDIN_FILENO
, file
, fstatus
, -1);
638 int fd
= open (file
, O_RDONLY
| O_BINARY
);
641 error (0, errno
, "%s", quotef (file
));
646 bool ok
= wc (fd
, file
, fstatus
, 0);
649 error (0, errno
, "%s", quotef (file
));
657 /* Return the file status for the NFILES files addressed by FILE.
658 Optimize the case where only one number is printed, for just one
659 file; in that case we can use a print width of 1, so we don't need
660 to stat the file. Handle the case of (nfiles == 0) in the same way;
661 that happens when we don't know how long the list of file names will be.
The returned array comes from xnmalloc and has NFILES entries, or one
entry when NFILES is 0. */
663 static struct fstatus
*
664 get_input_fstatus (idx_t nfiles
, char *const *file
)
666 struct fstatus
*fstatus
= xnmalloc (nfiles
? nfiles
: 1, sizeof *fstatus
);
670 && ((print_lines
+ print_words
+ print_chars
671 + print_bytes
+ print_linelength
)
/* A positive FAILED marks the entry as not yet examined; wc calls fstat
   itself when it finds 0 < failed.  */
673 fstatus
[0].failed
= 1;
/* Record the fstat or stat result for each file; a null name or "-"
   refers to standard input.  */
676 for (idx_t i
= 0; i
< nfiles
; i
++)
677 fstatus
[i
].failed
= (! file
[i
] || STREQ (file
[i
], "-")
678 ? fstat (STDIN_FILENO
, &fstatus
[i
].st
)
679 : stat (file
[i
], &fstatus
[i
].st
));
685 /* Return a print width suitable for the NFILES files whose status is
686 recorded in FSTATUS. Optimize the same special case that
687 get_input_fstatus optimizes. */
691 compute_number_width (idx_t nfiles
, struct fstatus
const *fstatus
)
/* Only look at the sizes when there are files and the first entry has
   actually been examined (FAILED is not positive).  */
695 if (0 < nfiles
&& fstatus
[0].failed
<= 0)
697 int minimum_width
= 1;
698 uintmax_t regular_total
= 0;
/* Add up st_size of the successfully examined regular files; if the
   sum overflows it is pinned at UINTMAX_MAX.  */
700 for (idx_t i
= 0; i
< nfiles
; i
++)
701 if (! fstatus
[i
].failed
)
703 if (!S_ISREG (fstatus
[i
].st
.st_mode
))
/* NOTE(review): "®ular_total" on the next line looks like a
   mis-encoded "&regular_total" (HTML entity damage) -- confirm
   against the upstream source.  */
705 else if (ckd_add (®ular_total
, regular_total
,
706 fstatus
[i
].st
.st_size
))
708 regular_total
= UINTMAX_MAX
;
/* Iterate once for each decimal digit of the total beyond the first.  */
713 for (; 10 <= regular_total
; regular_total
/= 10)
/* Never go below the minimum width chosen above.  */
715 if (width
< minimum_width
)
716 width
= minimum_width
;
/* Program entry point.  Set up the locale and output buffering, parse
   the options, build the list of input names (from the command line or
   from --files0-from), count each input with wc_file, and print the
   line with totals when total_mode calls for it.  Return EXIT_SUCCESS
   if OK is still true at the end, EXIT_FAILURE otherwise.  */
724 main (int argc
, char **argv
)
729 char *files_from
= nullptr;
730 struct fstatus
*fstatus
;
733 initialize_main (&argc
, &argv
);
734 set_program_name (argv
[0]);
735 setlocale (LC_ALL
, "");
736 bindtextdomain (PACKAGE
, LOCALEDIR
);
737 textdomain (PACKAGE
);
739 atexit (close_stdout
);
741 page_size
= getpagesize ();
742 /* Line buffer stdout to ensure lines are written atomically and immediately
743 so that processes running in parallel do not intersperse their output. */
744 setvbuf (stdout
, nullptr, _IOLBF
, 0);
746 posixly_correct
= (getenv ("POSIXLY_CORRECT") != nullptr);
748 print_lines
= print_words
= print_chars
= print_bytes
= false;
749 print_linelength
= false;
750 total_lines
= total_words
= total_chars
= total_bytes
= max_line_length
= 0;
752 while ((optc
= getopt_long (argc
, argv
, "clLmw", longopts
, nullptr)) != -1)
772 print_linelength
= true;
775 case DEBUG_PROGRAM_OPTION
:
779 case FILES0_FROM_OPTION
:
784 total_mode
= XARGMATCH ("--total", optarg
, total_args
, total_types
);
787 case_GETOPT_HELP_CHAR
;
789 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
792 usage (EXIT_FAILURE
);
/* With no count selected, default to lines, words and bytes.  */
795 if (! (print_lines
|| print_words
|| print_chars
|| print_bytes
796 || print_linelength
))
797 print_lines
= print_words
= print_bytes
= true;
/* Fill the per-byte lookup tables; wc_isprint is only needed for line
   lengths.  */
799 if (print_linelength
)
800 for (int i
= 0; i
<= UCHAR_MAX
; i
++)
801 wc_isprint
[i
] = !!isprint (i
);
803 for (int i
= 0; i
<= UCHAR_MAX
; i
++)
804 wc_isspace
[i
] = isspace (i
) || iswnbspace (btoc32 (i
));
806 bool read_tokens
= false;
807 struct argv_iterator
*ai
;
812 /* When using --files0-from=F, you may not specify any files
813 on the command-line. */
816 error (0, 0, _("extra operand %s"), quoteaf (argv
[optind
]));
817 fprintf (stderr
, "%s\n",
818 _("file operands cannot be combined with --files0-from"));
819 usage (EXIT_FAILURE
);
822 if (STREQ (files_from
, "-"))
826 stream
= fopen (files_from
, "r");
827 if (stream
== nullptr)
828 error (EXIT_FAILURE
, errno
, _("cannot open %s for reading"),
829 quoteaf (files_from
));
832 /* Read the file list into RAM if we can detect its size and that
833 size is reasonable. Otherwise, we'll read a name at a time. */
835 if (fstat (fileno (stream
), &st
) == 0
836 && S_ISREG (st
.st_mode
)
837 && st
.st_size
<= MIN (10 * 1024 * 1024, physmem_available () / 2))
840 readtokens0_init (&tok
);
841 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
842 error (EXIT_FAILURE
, 0, _("cannot read file names from %s"),
843 quoteaf (files_from
));
846 ai
= argv_iter_init_argv (files
);
852 ai
= argv_iter_init_stream (stream
);
/* Without file operands, use a one-element list holding a null name.  */
857 static char *stdin_only
[] = { nullptr };
858 files
= (optind
< argc
? argv
+ optind
: stdin_only
);
859 nfiles
= (optind
< argc
? argc
- optind
: 1);
860 ai
= argv_iter_init_argv (files
);
866 fstatus
= get_input_fstatus (nfiles
, files
);
867 if (total_mode
== total_only
)
868 number_width
= 1; /* No extra padding, since no alignment requirement. */
870 number_width
= compute_number_width (nfiles
, fstatus
);
873 enum argv_iter_err ai_err
;
875 for (int i
= 0; (file_name
= argv_iter (ai
, &ai_err
)); i
++)
877 bool skip_file
= false;
878 if (files_from
&& STREQ (files_from
, "-") && STREQ (file_name
, "-"))
880 /* Give a better diagnostic in an unusual case:
881 printf - | wc --files0-from=- */
882 error (0, 0, _("when reading file names from stdin, "
883 "no file name of %s allowed"),
884 quoteaf (file_name
));
890 /* Diagnose a zero-length file name. When it's one
891 among many, knowing the record number may help.
892 FIXME: currently print the record number only with
893 --files0-from=FILE. Maybe do it for argv, too? */
894 if (files_from
== nullptr)
895 error (0, 0, "%s", _("invalid zero-length file name"));
898 /* Using the standard 'filename:line-number:' prefix here is
899 not totally appropriate, since NUL is the separator, not NL,
900 but it might be better than nothing. */
901 error (0, 0, "%s:%zu: %s", quotef (files_from
),
902 argv_iter_n_args (ai
), _("invalid zero-length file name"));
/* When NFILES is 0 every file shares the single entry fstatus[0].  */
910 ok
&= wc_file (file_name
, &fstatus
[nfiles
? i
: 0]);
/* A positive FAILED marks the entry as not yet examined.  */
913 fstatus
[0].failed
= 1;
921 error (0, errno
, _("%s: read error"), quotef (files_from
));
932 /* No arguments on the command line is fine. That means read from stdin.
933 However, no arguments on the --files0-from input stream
934 means don't read anything. */
935 if (ok
&& !files_from
&& argv_iter_n_args (ai
) == 0)
936 ok
&= wc_file (nullptr, &fstatus
[0]);
939 readtokens0_free (&tok
);
/* Print the line with totals unless --total=never was given; in auto
   mode only when more than one file name was processed.  */
941 if (total_mode
!= total_never
942 && (total_mode
!= total_auto
|| 1 < argv_iter_n_args (ai
)))
/* A total that overflowed is shown as UINTMAX_MAX and diagnosed.  */
944 if (total_lines_overflow
)
946 total_lines
= UINTMAX_MAX
;
947 error (0, EOVERFLOW
, _("total lines"));
950 if (total_words_overflow
)
952 total_words
= UINTMAX_MAX
;
953 error (0, EOVERFLOW
, _("total words"));
956 if (total_chars_overflow
)
958 total_chars
= UINTMAX_MAX
;
959 error (0, EOVERFLOW
, _("total characters"));
962 if (total_bytes_overflow
)
964 total_bytes
= UINTMAX_MAX
;
965 error (0, EOVERFLOW
, _("total bytes"));
/* With --total=only the line carries no "total" label.  */
969 write_counts (total_lines
, total_words
, total_chars
, total_bytes
,
971 total_mode
!= total_only
? _("total") : nullptr);
978 if (have_read_stdin
&& close (STDIN_FILENO
) != 0)
979 error (EXIT_FAILURE
, errno
, "-");
981 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;