1 /* wc - print the number of lines, words, and bytes in files
2 Copyright (C) 1985-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Paul Rubin, phr@ocf.berkeley.edu
18 and David MacKenzie, djm@gnu.ai.mit.edu. */
25 #include <sys/types.h>
29 #include <argv-iter.h>
32 #include <readtokens0.h>
33 #include <stat-size.h>
34 #include <xbinary-io.h>
37 #include "ioblksize.h"
40 /* The official name of this program (e.g., no 'g' prefix). */
41 #define PROGRAM_NAME "wc"
44 proper_name ("Paul Rubin"), \
45 proper_name ("David MacKenzie")
47 static bool wc_isprint
[UCHAR_MAX
+ 1];
48 static bool wc_isspace
[UCHAR_MAX
+ 1];
52 /* Cumulative number of lines, words, chars and bytes in all files so far.
53 max_line_length is the maximum over all files processed so far. */
54 static uintmax_t total_lines
;
55 static uintmax_t total_words
;
56 static uintmax_t total_chars
;
57 static uintmax_t total_bytes
;
58 static bool total_lines_overflow
;
59 static bool total_words_overflow
;
60 static bool total_chars_overflow
;
61 static bool total_bytes_overflow
;
62 static intmax_t max_line_length
;
64 /* Which counts to print. */
65 static bool print_lines
, print_words
, print_chars
, print_bytes
;
66 static bool print_linelength
;
68 /* The print width of each count. */
69 static int number_width
;
71 /* True if we have ever read the standard input. */
72 static bool have_read_stdin
;
74 /* Used to determine if file size can be determined without reading. */
75 static idx_t page_size
;
77 /* When true, do _not_ treat non-breaking spaces as word separators. */
78 static bool posixly_correct
;
80 /* The result of calling fstat or stat on a file descriptor or file. */
83 /* If positive, fstat or stat has not been called yet. Otherwise,
84 this is the value returned from fstat or stat. */
87 /* If FAILED is zero, this is the file's status. */
91 /* For long options that have no equivalent short option, use a
92 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
95 DEBUG_PROGRAM_OPTION
= CHAR_MAX
+ 1,
100 static struct option
const longopts
[] =
102 {"bytes", no_argument
, nullptr, 'c'},
103 {"chars", no_argument
, nullptr, 'm'},
104 {"lines", no_argument
, nullptr, 'l'},
105 {"words", no_argument
, nullptr, 'w'},
106 {"debug", no_argument
, nullptr, DEBUG_PROGRAM_OPTION
},
107 {"files0-from", required_argument
, nullptr, FILES0_FROM_OPTION
},
108 {"max-line-length", no_argument
, nullptr, 'L'},
109 {"total", required_argument
, nullptr, TOTAL_OPTION
},
110 {GETOPT_HELP_OPTION_DECL
},
111 {GETOPT_VERSION_OPTION_DECL
},
112 {nullptr, 0, nullptr, 0}
117 total_auto
, /* 0: default or --total=auto */
118 total_always
, /* 1: --total=always */
119 total_only
, /* 2: --total=only */
120 total_never
/* 3: --total=never */
122 static char const *const total_args
[] =
124 "auto", "always", "only", "never", nullptr
126 static enum total_type
const total_types
[] =
128 total_auto
, total_always
, total_only
, total_never
130 ARGMATCH_VERIFY (total_args
, total_types
);
131 static enum total_type total_mode
= total_auto
;
133 #ifdef USE_AVX2_WC_LINECOUNT
135 avx2_supported (void)
137 bool avx_enabled
= 0 < __builtin_cpu_supports ("avx2");
140 error (0, 0, (avx_enabled
141 ? _("using avx2 hardware support")
142 : _("avx2 support not detected")));
151 if (status
!= EXIT_SUCCESS
)
156 Usage: %s [OPTION]... [FILE]...\n\
157 or: %s [OPTION]... --files0-from=F\n\
159 program_name
, program_name
);
161 Print newline, word, and byte counts for each FILE, and a total line if\n\
162 more than one FILE is specified. A word is a nonempty sequence of non white\n\
163 space delimited by white space characters or by start or end of input.\n\
170 The options below may be used to select which counts are printed, always in\n\
171 the following order: newline, word, character, byte, maximum line length.\n\
172 -c, --bytes print the byte counts\n\
173 -m, --chars print the character counts\n\
174 -l, --lines print the newline counts\n\
177 --files0-from=F read input from the files specified by\n\
178 NUL-terminated names in file F;\n\
179 If F is - then read names from standard input\n\
180 -L, --max-line-length print the maximum display width\n\
181 -w, --words print the word counts\n\
184 --total=WHEN when to print a line with total counts;\n\
185 WHEN can be: auto, always, only, never\n\
187 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
188 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
189 emit_ancillary_info (PROGRAM_NAME
);
194 /* Return nonzero if WC is a non-breaking space and posixly_correct is false. */
197 iswnbspace (wint_t wc
)
199 return ! posixly_correct
200 && (wc
== 0x00A0 || wc
== 0x2007
201 || wc
== 0x202F || wc
== 0x2060);
204 /* FILE is the name of the file (or null for standard input)
205 associated with the specified counters. */
207 write_counts (uintmax_t lines
,
214 static char const format_sp_int
[] = " %*s";
215 char const *format_int
= format_sp_int
+ 1;
216 char buf
[MAX (INT_BUFSIZE_BOUND (intmax_t),
217 INT_BUFSIZE_BOUND (uintmax_t))];
221 printf (format_int
, number_width
, umaxtostr (lines
, buf
));
222 format_int
= format_sp_int
;
226 printf (format_int
, number_width
, umaxtostr (words
, buf
));
227 format_int
= format_sp_int
;
231 printf (format_int
, number_width
, umaxtostr (chars
, buf
));
232 format_int
= format_sp_int
;
236 printf (format_int
, number_width
, umaxtostr (bytes
, buf
));
237 format_int
= format_sp_int
;
239 if (print_linelength
)
240 printf (format_int
, number_width
, imaxtostr (linelength
, buf
));
242 printf (" %s", strchr (file
, '\n') ? quotef (file
) : file
);
246 /* Read FD and return a summary: an error number (0 if reading ended with a zero-length read), then the line and byte counts. */
247 static struct wc_lines
250 #ifdef USE_AVX2_WC_LINECOUNT
251 static signed char use_avx2
;
253 use_avx2
= avx2_supported () ? 1 : -1;
255 return wc_lines_avx2 (fd
);
258 intmax_t lines
= 0, bytes
= 0;
259 bool long_lines
= false;
263 char buf
[IO_BUFSIZE
+ 1];
264 ssize_t bytes_read
= read (fd
, buf
, IO_BUFSIZE
);
266 return (struct wc_lines
) { bytes_read
== 0 ? 0 : errno
, lines
, bytes
};
269 char *end
= buf
+ bytes_read
;
274 /* Avoid function call overhead for shorter lines. */
275 for (char *p
= buf
; p
< end
; p
++)
276 buflines
+= *p
== '\n';
280 /* rawmemchr is more efficient with longer lines. */
282 for (char *p
= buf
; (p
= rawmemchr (p
, '\n')) < end
; p
++)
286 /* If the average line length in the block is >= 15, then use
287 rawmemchr for the next block, where system specific optimizations
288 may outweigh function call overhead.
289 FIXME: This line length was determined in 2015, on both
290 x86_64 and ppc64, but it's worth re-evaluating in future with
291 newer compilers, CPUs, or memchr() implementations etc. */
292 long_lines
= 15 * buflines
<= bytes_read
;
297 /* Count words. FILE_X is the name of the file (or null for standard
298 input) that is open on descriptor FD. *FSTATUS is its status.
299 CURRENT_POS is the current file offset if known, negative if unknown.
300 Return true if successful. */
302 wc (int fd
, char const *file_x
, struct fstatus
*fstatus
, off_t current_pos
)
305 char buf
[IO_BUFSIZE
+ 1];
306 intmax_t lines
, words
, chars
, bytes
, linelength
;
307 bool count_bytes
, count_chars
, count_complicated
;
308 char const *file
= file_x
? file_x
: _("standard input");
310 lines
= words
= chars
= bytes
= linelength
= 0;
312 /* If in the current locale, chars are equivalent to bytes, we prefer
313 counting bytes, because that's easier. */
316 count_bytes
= print_bytes
;
317 count_chars
= print_chars
;
321 count_bytes
= print_bytes
|| print_chars
;
324 count_complicated
= print_words
|| print_linelength
;
326 /* Advise the kernel of our access pattern only if we will read(). */
327 if (!count_bytes
|| count_chars
|| print_lines
|| count_complicated
)
328 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
330 /* When counting only bytes, save some line- and word-counting
331 overhead. If FD is a 'regular' Unix file, using lseek is enough
332 to get its 'size' in bytes. Otherwise, read blocks of IO_BUFSIZE
333 bytes at a time until EOF. Note that the 'size' (number of bytes)
334 that wc reports is smaller than stats.st_size when the file is not
335 positioned at its beginning. That's why the lseek calls below are
336 necessary. For example the command
337 '(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group'
338 should make wc report '0' bytes. */
340 if (count_bytes
&& !count_chars
&& !print_lines
&& !count_complicated
)
342 bool skip_read
= false;
344 if (0 < fstatus
->failed
)
345 fstatus
->failed
= fstat (fd
, &fstatus
->st
);
347 /* For sized files, seek to one st_blksize before EOF rather than to EOF.
348 This works better for files in proc-like file systems where
349 the size is only approximate. */
350 if (! fstatus
->failed
&& usable_st_size (&fstatus
->st
)
351 && 0 <= fstatus
->st
.st_size
)
353 off_t end_pos
= fstatus
->st
.st_size
;
355 current_pos
= lseek (fd
, 0, SEEK_CUR
);
357 if (end_pos
% page_size
)
359 /* We only need special handling of /proc and /sys files etc.
360 when they're a multiple of PAGE_SIZE. In the common case
361 for files with st_size not a multiple of PAGE_SIZE,
362 it's more efficient and accurate to use st_size.
364 Be careful here. The current position may actually be
365 beyond the end of the file, as in the example above. */
367 bytes
= end_pos
< current_pos
? 0 : end_pos
- current_pos
;
368 if (bytes
&& 0 <= lseek (fd
, bytes
, SEEK_CUR
))
375 off_t hi_pos
= (end_pos
376 - end_pos
% (STP_BLKSIZE (&fstatus
->st
) + 1));
377 if (0 <= current_pos
&& current_pos
< hi_pos
378 && 0 <= lseek (fd
, hi_pos
, SEEK_CUR
))
379 bytes
= hi_pos
- current_pos
;
385 fdadvise (fd
, 0, 0, FADVISE_SEQUENTIAL
);
386 for (ssize_t bytes_read
;
387 (bytes_read
= read (fd
, buf
, IO_BUFSIZE
));
396 else if (!count_chars
&& !count_complicated
)
398 /* Use a separate loop when counting only lines or lines and bytes --
399 but not chars or words. */
400 struct wc_lines w
= wc_lines (fd
);
405 else if (MB_CUR_MAX
> 1)
407 bool in_word
= false;
408 intmax_t linepos
= 0;
409 mbstate_t state
; mbszero (&state
);
410 bool in_shift
= false;
411 idx_t prev
= 0; /* Number of bytes carried over from previous round. */
413 for (ssize_t bytes_read
;
414 ((bytes_read
= read (fd
, buf
+ prev
, IO_BUFSIZE
- prev
))
426 char const *plim
= p
+ prev
+ bytes_read
;
433 if (!in_shift
&& 0 <= *p
&& *p
< 0x80)
435 /* Handle most ASCII characters quickly, without calling
443 idx_t scanbytes
= plim
- (p
+ prev
);
444 size_t n
= mbrtoc32 (&wide_char
, p
+ prev
, scanbytes
, &state
);
449 if (n
== (size_t) -2 && plim
- p
< IO_BUFSIZE
452 /* An incomplete character that is not ridiculously
453 long and there may be more input. Move the bytes
454 to buffer start and prepare to read more data. */
456 memmove (buf
, p
, prev
);
461 /* Remember that we read a byte, but don't complain
462 about the error. Because of the decoding error,
463 this is considered to be a byte but not a
464 character (that is, chars is not incremented). */
469 /* Treat encoding errors as non white space.
470 POSIX says a word is "a non-zero-length string of
471 characters delimited by white space". This is
472 wrong in some sense, as the string can be delimited
473 by start or end of input, and it is unclear what it
474 means when the input contains encoding errors.
475 Since encoding errors are not white space,
476 treat them that way here. */
483 single_byte
= charbytes
== !in_shift
;
484 in_shift
= !mbsinit (&state
);
494 if (linepos
> linelength
)
495 linelength
= linepos
;
501 linepos
+= 8 - (linepos
% 8);
516 linepos
+= wc_isprint
[wide_char
];
517 in_word2
= !wc_isspace
[wide_char
];
521 /* c32width can be expensive on macOS for example,
522 so avoid if not needed. */
523 if (print_linelength
)
525 int width
= c32width (wide_char
);
529 in_word2
= ! iswspace (wide_char
)
530 && ! iswnbspace (wide_char
);
533 /* Count words by counting word starts, i.e., each
534 white space character (or the start of input)
535 followed by non white space. */
536 words
+= !in_word
& in_word2
;
546 if (linepos
> linelength
)
547 linelength
= linepos
;
551 bool in_word
= false;
552 intmax_t linepos
= 0;
554 for (ssize_t bytes_read
; (bytes_read
= read (fd
, buf
, IO_BUFSIZE
)); )
566 unsigned char c
= *p
++;
574 if (linepos
> linelength
)
575 linelength
= linepos
;
581 linepos
+= 8 - (linepos
% 8);
593 linepos
+= wc_isprint
[c
];
594 bool in_word2
= !wc_isspace
[c
];
595 words
+= !in_word
& in_word2
;
600 while (--bytes_read
);
602 if (linepos
> linelength
)
603 linelength
= linepos
;
606 if (count_chars
< print_chars
)
609 if (total_mode
!= total_only
)
610 write_counts (lines
, words
, chars
, bytes
, linelength
, file_x
);
612 total_lines_overflow
|= ckd_add (&total_lines
, total_lines
, lines
);
613 total_words_overflow
|= ckd_add (&total_words
, total_words
, words
);
614 total_chars_overflow
|= ckd_add (&total_chars
, total_chars
, chars
);
615 total_bytes_overflow
|= ckd_add (&total_bytes
, total_bytes
, bytes
);
617 if (linelength
> max_line_length
)
618 max_line_length
= linelength
;
621 error (0, err
, "%s", quotef (file
));
626 wc_file (char const *file
, struct fstatus
*fstatus
)
628 if (! file
|| STREQ (file
, "-"))
630 have_read_stdin
= true;
631 xset_binary_mode (STDIN_FILENO
, O_BINARY
);
632 return wc (STDIN_FILENO
, file
, fstatus
, -1);
636 int fd
= open (file
, O_RDONLY
| O_BINARY
);
639 error (0, errno
, "%s", quotef (file
));
644 bool ok
= wc (fd
, file
, fstatus
, 0);
647 error (0, errno
, "%s", quotef (file
));
655 /* Return the file status for the NFILES files addressed by FILE.
656 Optimize the case where only one number is printed, for just one
657 file; in that case we can use a print width of 1, so we don't need
658 to stat the file. Handle the case of (nfiles == 0) in the same way;
659 that happens when we don't know how long the list of file names will be. */
661 static struct fstatus
*
662 get_input_fstatus (idx_t nfiles
, char *const *file
)
664 struct fstatus
*fstatus
= xnmalloc (nfiles
? nfiles
: 1, sizeof *fstatus
);
668 && ((print_lines
+ print_words
+ print_chars
669 + print_bytes
+ print_linelength
)
671 fstatus
[0].failed
= 1;
674 for (idx_t i
= 0; i
< nfiles
; i
++)
675 fstatus
[i
].failed
= (! file
[i
] || STREQ (file
[i
], "-")
676 ? fstat (STDIN_FILENO
, &fstatus
[i
].st
)
677 : stat (file
[i
], &fstatus
[i
].st
));
683 /* Return a print width suitable for the NFILES files whose status is
684 recorded in FSTATUS. Optimize the same special case that
685 get_input_fstatus optimizes. */
689 compute_number_width (idx_t nfiles
, struct fstatus
const *fstatus
)
693 if (0 < nfiles
&& fstatus
[0].failed
<= 0)
695 int minimum_width
= 1;
696 uintmax_t regular_total
= 0;
698 for (idx_t i
= 0; i
< nfiles
; i
++)
699 if (! fstatus
[i
].failed
)
701 if (!S_ISREG (fstatus
[i
].st
.st_mode
))
703 else if (ckd_add (®ular_total
, regular_total
,
704 fstatus
[i
].st
.st_size
))
706 regular_total
= UINTMAX_MAX
;
711 for (; 10 <= regular_total
; regular_total
/= 10)
713 if (width
< minimum_width
)
714 width
= minimum_width
;
722 main (int argc
, char **argv
)
727 char *files_from
= nullptr;
728 struct fstatus
*fstatus
;
731 initialize_main (&argc
, &argv
);
732 set_program_name (argv
[0]);
733 setlocale (LC_ALL
, "");
734 bindtextdomain (PACKAGE
, LOCALEDIR
);
735 textdomain (PACKAGE
);
737 atexit (close_stdout
);
739 page_size
= getpagesize ();
740 /* Line buffer stdout to ensure lines are written atomically and immediately
741 so that processes running in parallel do not intersperse their output. */
742 setvbuf (stdout
, nullptr, _IOLBF
, 0);
744 posixly_correct
= (getenv ("POSIXLY_CORRECT") != nullptr);
746 print_lines
= print_words
= print_chars
= print_bytes
= false;
747 print_linelength
= false;
748 total_lines
= total_words
= total_chars
= total_bytes
= max_line_length
= 0;
750 while ((optc
= getopt_long (argc
, argv
, "clLmw", longopts
, nullptr)) != -1)
770 print_linelength
= true;
773 case DEBUG_PROGRAM_OPTION
:
777 case FILES0_FROM_OPTION
:
782 total_mode
= XARGMATCH ("--total", optarg
, total_args
, total_types
);
785 case_GETOPT_HELP_CHAR
;
787 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
790 usage (EXIT_FAILURE
);
793 if (! (print_lines
|| print_words
|| print_chars
|| print_bytes
794 || print_linelength
))
795 print_lines
= print_words
= print_bytes
= true;
797 if (print_linelength
)
798 for (int i
= 0; i
<= UCHAR_MAX
; i
++)
799 wc_isprint
[i
] = !!isprint (i
);
801 for (int i
= 0; i
<= UCHAR_MAX
; i
++)
802 wc_isspace
[i
] = isspace (i
) || iswnbspace (btoc32 (i
));
804 bool read_tokens
= false;
805 struct argv_iterator
*ai
;
810 /* When using --files0-from=F, you may not specify any files
811 on the command-line. */
814 error (0, 0, _("extra operand %s"), quoteaf (argv
[optind
]));
815 fprintf (stderr
, "%s\n",
816 _("file operands cannot be combined with --files0-from"));
817 usage (EXIT_FAILURE
);
820 if (STREQ (files_from
, "-"))
824 stream
= fopen (files_from
, "r");
825 if (stream
== nullptr)
826 error (EXIT_FAILURE
, errno
, _("cannot open %s for reading"),
827 quoteaf (files_from
));
830 /* Read the file list into RAM if we can detect its size and that
831 size is reasonable. Otherwise, we'll read a name at a time. */
833 if (fstat (fileno (stream
), &st
) == 0
834 && S_ISREG (st
.st_mode
)
835 && st
.st_size
<= MIN (10 * 1024 * 1024, physmem_available () / 2))
838 readtokens0_init (&tok
);
839 if (! readtokens0 (stream
, &tok
) || fclose (stream
) != 0)
840 error (EXIT_FAILURE
, 0, _("cannot read file names from %s"),
841 quoteaf (files_from
));
844 ai
= argv_iter_init_argv (files
);
850 ai
= argv_iter_init_stream (stream
);
855 static char *stdin_only
[] = { nullptr };
856 files
= (optind
< argc
? argv
+ optind
: stdin_only
);
857 nfiles
= (optind
< argc
? argc
- optind
: 1);
858 ai
= argv_iter_init_argv (files
);
864 fstatus
= get_input_fstatus (nfiles
, files
);
865 if (total_mode
== total_only
)
866 number_width
= 1; /* No extra padding, since no alignment requirement. */
868 number_width
= compute_number_width (nfiles
, fstatus
);
871 enum argv_iter_err ai_err
;
873 for (int i
= 0; (file_name
= argv_iter (ai
, &ai_err
)); i
++)
875 bool skip_file
= false;
876 if (files_from
&& STREQ (files_from
, "-") && STREQ (file_name
, "-"))
878 /* Give a better diagnostic in an unusual case:
879 printf - | wc --files0-from=- */
880 error (0, 0, _("when reading file names from stdin, "
881 "no file name of %s allowed"),
882 quoteaf (file_name
));
888 /* Diagnose a zero-length file name. When it's one
889 among many, knowing the record number may help.
890 FIXME: currently print the record number only with
891 --files0-from=FILE. Maybe do it for argv, too? */
892 if (files_from
== nullptr)
893 error (0, 0, "%s", _("invalid zero-length file name"));
896 /* Using the standard 'filename:line-number:' prefix here is
897 not totally appropriate, since NUL is the separator, not NL,
898 but it might be better than nothing. */
899 error (0, 0, "%s:%zu: %s", quotef (files_from
),
900 argv_iter_n_args (ai
), _("invalid zero-length file name"));
908 ok
&= wc_file (file_name
, &fstatus
[nfiles
? i
: 0]);
911 fstatus
[0].failed
= 1;
919 error (0, errno
, _("%s: read error"), quotef (files_from
));
930 /* No arguments on the command line is fine. That means read from stdin.
931 However, no arguments on the --files0-from input stream
932 means don't read anything. */
933 if (ok
&& !files_from
&& argv_iter_n_args (ai
) == 0)
934 ok
&= wc_file (nullptr, &fstatus
[0]);
937 readtokens0_free (&tok
);
939 if (total_mode
!= total_never
940 && (total_mode
!= total_auto
|| 1 < argv_iter_n_args (ai
)))
942 if (total_lines_overflow
)
944 total_lines
= UINTMAX_MAX
;
945 error (0, EOVERFLOW
, _("total lines"));
948 if (total_words_overflow
)
950 total_words
= UINTMAX_MAX
;
951 error (0, EOVERFLOW
, _("total words"));
954 if (total_chars_overflow
)
956 total_chars
= UINTMAX_MAX
;
957 error (0, EOVERFLOW
, _("total characters"));
960 if (total_bytes_overflow
)
962 total_bytes
= UINTMAX_MAX
;
963 error (0, EOVERFLOW
, _("total bytes"));
967 write_counts (total_lines
, total_words
, total_chars
, total_bytes
,
969 total_mode
!= total_only
? _("total") : nullptr);
976 if (have_read_stdin
&& close (STDIN_FILENO
) != 0)
977 error (EXIT_FAILURE
, errno
, "-");
979 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;