struct / union in initializer, RFE #901.
[sdcc.git] / sdcc / support / sdbinutils / binutils / strings.c
blob5f4e0ebfc27a54aefd98a3ae3b31646a9f3a924d
1 /* strings -- print the strings of printable characters in files
2 Copyright (C) 1993-2022 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
19 /* Usage: strings [options] file...
21 Options:
22 --all
24 - Scan each file in its entirety.
26 --data
27 -d Scan only the initialized data section(s) of object files.
29 --print-file-name
30 -f Print the name of the file before each string.
32 --bytes=min-len
33 -n min-len
34 -min-len Print graphic char sequences, MIN-LEN or more bytes long,
35 that are followed by a NUL or a non-displayable character.
36 Default is 4.
38 --radix={o,x,d}
39 -t {o,x,d} Print the offset within the file before each string,
40 in octal/hex/decimal.
42 --include-all-whitespace
43 -w By default tab and space are the only whitespace included in graphic
44 char sequences. This option considers all of isspace() valid.
46 -o Like -to. (Some other implementations have -o like -to,
47 others like -td. We chose one arbitrarily.)
49 --encoding={s,S,b,l,B,L}
50 -e {s,S,b,l,B,L}
51 Select character encoding: 7-bit-character, 8-bit-character,
52 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
53 littleendian 32-bit.
55 --target=BFDNAME
56 -T {bfdname}
57 Specify a non-default object file format.
59 --unicode={default|locale|invalid|hex|escape|highlight}
60 -U {d|l|i|x|e|h}
61 Determine how to handle UTF-8 unicode characters. The default
62 is no special treatment. All other versions of this option
63 only apply if the encoding is valid and enabling the option
64 implies --encoding=S.
65 The 'locale' option displays the characters according to the
66 current locale. The 'invalid' option treats them as
67 non-string characters. The 'hex' option displays them as hex
68 byte sequences. The 'escape' option displays them as escape
69 sequences and the 'highlight' option displays them as
70 coloured escape sequences.
72 --output-separator=sep_string
73 -s sep_string String used to separate parsed strings in output.
74 Default is newline.
76 --help
77 -h Print the usage message on the standard output.
79 --version
81 -v Print the program version number.
83 Written by Richard Stallman <rms@gnu.ai.mit.edu>
84 and David MacKenzie <djm@gnu.ai.mit.edu>. */
86 #include "sysdep.h"
87 #include "bfd.h"
88 #include "getopt.h"
89 #include "libiberty.h"
90 #include "safe-ctype.h"
91 #include "bucomm.h"
/* Convenience predicate: non-zero when the NUL-terminated strings A and B
   compare equal.  Only supplied here if nothing else defined it first.  */
#if !defined (streq)
# define streq(a, b) (strcmp ((a), (b)) == 0)
#endif
/* The ways in which UTF-8 encoded characters can be treated; chosen with
   the -U/--unicode option.  */
enum unicode_display_type
{
  unicode_default = 0,	/* No special treatment.  */
  unicode_locale,
  unicode_escape,
  unicode_hex,
  unicode_highlight,
  unicode_invalid
};

typedef enum unicode_display_type unicode_display_type;

/* The treatment currently selected.  */
static unicode_display_type unicode_display = unicode_default;
/* Non-zero if C may be part of a string.  C must be a byte value (0..255)
   and be one of: a tab, a printable character, any value above 127 when
   the 'S' encoding is selected, or - when include_all_whitespace is set -
   any whitespace character.  */
#define STRING_ISGRAPHIC(c) \
  ((c) >= 0 \
   && (c) <= 255 \
   && ((c) == '\t' \
       || ISPRINT (c) \
       || (encoding == 'S' && (c) > 127) \
       || (include_all_whitespace && ISSPACE (c))))
/* Declare errno ourselves when no header provided it as a macro.  */
#if !defined (errno)
extern int errno;
#endif

/* A section is treated as initialized data only when all of these BFD
   section flags are set.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* --- Settings controlled by the command line.  --- */

/* Base used when printing string addresses: 8, 10 or 16.  */
static int address_radix;

/* A run of graphic characters must be at least this long to be printed.  */
static unsigned int string_min;

/* When set, every whitespace character counts as a graphic character.  */
static bool include_all_whitespace;

/* When set, each string is preceded by its address within the file.  */
static bool print_addresses;

/* When set, each string is preceded by the name of its file.  */
static bool print_filenames;

/* When set, only the data sections of object files are scanned.  */
static bool datasection_only;

/* Name of the BFD object file format to use.  */
static char *target;

/* Character encoding selector and the number of bytes per character
   that it implies.  */
static char encoding;
static int encoding_bytes;

/* Text written after each string in the output.  */
static char *output_separator;
/* Long option table handed to getopt_long.  Each entry maps a long name
   onto the short option letter that main's switch handles; the table is
   terminated by an all-zero entry.  */
static struct option long_options[] =
{
  { "all",                    no_argument,       NULL, 'a' },
  { "bytes",                  required_argument, NULL, 'n' },
  { "data",                   no_argument,       NULL, 'd' },
  { "encoding",               required_argument, NULL, 'e' },
  { "help",                   no_argument,       NULL, 'h' },
  { "include-all-whitespace", no_argument,       NULL, 'w' },
  { "output-separator",       required_argument, NULL, 's' },
  { "print-file-name",        no_argument,       NULL, 'f' },
  { "radix",                  required_argument, NULL, 't' },
  { "target",                 required_argument, NULL, 'T' },
  { "unicode",                required_argument, NULL, 'U' },
  { "version",                no_argument,       NULL, 'v' },
  { NULL,                     0,                 NULL, 0   }
};
168 static bool strings_file (char *);
169 static void print_strings (const char *, FILE *, file_ptr, int, char *);
170 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
172 int main (int, char **);
175 main (int argc, char **argv)
177 int optc;
178 int exit_status = 0;
179 bool files_given = false;
180 char *s;
181 int numeric_opt = 0;
183 setlocale (LC_ALL, "");
184 bindtextdomain (PACKAGE, LOCALEDIR);
185 textdomain (PACKAGE);
187 program_name = argv[0];
188 xmalloc_set_program_name (program_name);
189 bfd_set_error_program_name (program_name);
191 expandargv (&argc, &argv);
193 string_min = 4;
194 include_all_whitespace = false;
195 print_addresses = false;
196 print_filenames = false;
197 if (DEFAULT_STRINGS_ALL)
198 datasection_only = false;
199 else
200 datasection_only = true;
201 target = NULL;
202 encoding = 's';
203 output_separator = NULL;
205 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
206 long_options, (int *) 0)) != EOF)
208 switch (optc)
210 case 'a':
211 datasection_only = false;
212 break;
214 case 'd':
215 datasection_only = true;
216 break;
218 case 'f':
219 print_filenames = true;
220 break;
222 case 'H':
223 case 'h':
224 usage (stdout, 0);
226 case 'n':
227 string_min = (int) strtoul (optarg, &s, 0);
228 if (s != NULL && *s != 0)
229 fatal (_("invalid integer argument %s"), optarg);
230 break;
232 case 'w':
233 include_all_whitespace = true;
234 break;
236 case 'o':
237 print_addresses = true;
238 address_radix = 8;
239 break;
241 case 't':
242 print_addresses = true;
243 if (optarg[1] != '\0')
244 usage (stderr, 1);
245 switch (optarg[0])
247 case 'o':
248 address_radix = 8;
249 break;
251 case 'd':
252 address_radix = 10;
253 break;
255 case 'x':
256 address_radix = 16;
257 break;
259 default:
260 usage (stderr, 1);
262 break;
264 case 'T':
265 target = optarg;
266 break;
268 case 'e':
269 if (optarg[1] != '\0')
270 usage (stderr, 1);
271 encoding = optarg[0];
272 break;
274 case 's':
275 output_separator = optarg;
276 break;
278 case 'U':
279 if (streq (optarg, "default") || streq (optarg, "d"))
280 unicode_display = unicode_default;
281 else if (streq (optarg, "locale") || streq (optarg, "l"))
282 unicode_display = unicode_locale;
283 else if (streq (optarg, "escape") || streq (optarg, "e"))
284 unicode_display = unicode_escape;
285 else if (streq (optarg, "invalid") || streq (optarg, "i"))
286 unicode_display = unicode_invalid;
287 else if (streq (optarg, "hex") || streq (optarg, "x"))
288 unicode_display = unicode_hex;
289 else if (streq (optarg, "highlight") || streq (optarg, "h"))
290 unicode_display = unicode_highlight;
291 else
292 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
293 break;
295 case 'V':
296 case 'v':
297 print_version ("strings");
298 break;
300 case '?':
301 usage (stderr, 1);
303 default:
304 numeric_opt = optind;
305 break;
309 if (unicode_display != unicode_default)
310 encoding = 'S';
312 if (numeric_opt != 0)
314 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
315 if (s != NULL && *s != 0)
316 fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
318 if (string_min < 1)
319 fatal (_("invalid minimum string length %d"), string_min);
321 switch (encoding)
323 case 'S':
324 case 's':
325 encoding_bytes = 1;
326 break;
327 case 'b':
328 case 'l':
329 encoding_bytes = 2;
330 break;
331 case 'B':
332 case 'L':
333 encoding_bytes = 4;
334 break;
335 default:
336 usage (stderr, 1);
339 if (bfd_init () != BFD_INIT_MAGIC)
340 fatal (_("fatal error: libbfd ABI mismatch"));
341 set_default_bfd_target ();
343 if (optind >= argc)
345 datasection_only = false;
346 SET_BINARY (fileno (stdin));
347 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
348 files_given = true;
350 else
352 for (; optind < argc; ++optind)
354 if (streq (argv[optind], "-"))
355 datasection_only = false;
356 else
358 files_given = true;
359 exit_status |= !strings_file (argv[optind]);
364 if (!files_given)
365 usage (stderr, 1);
367 return (exit_status);
370 /* Scan section SECT of the file ABFD, whose printable name is
371 FILENAME. If it contains initialized data set GOT_A_SECTION and
372 print the strings in it. */
374 static void
375 strings_a_section (bfd *abfd, asection *sect, const char *filename,
376 bool *got_a_section)
378 bfd_size_type sectsize;
379 bfd_byte *mem;
381 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
382 return;
384 sectsize = bfd_section_size (sect);
385 if (sectsize == 0)
386 return;
388 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
390 non_fatal (_("%s: Reading section %s failed: %s"),
391 filename, sect->name, bfd_errmsg (bfd_get_error ()));
392 return;
395 *got_a_section = true;
396 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
397 free (mem);
400 /* Scan all of the sections in FILE, and print the strings
401 in the initialized data section(s).
403 Return TRUE if successful,
404 FALSE if not (such as if FILE is not an object file). */
406 static bool
407 strings_object_file (const char *file)
409 bfd *abfd;
410 asection *s;
411 bool got_a_section;
413 abfd = bfd_openr (file, target);
415 if (abfd == NULL)
416 /* Treat the file as a non-object file. */
417 return false;
419 /* This call is mainly for its side effect of reading in the sections.
420 We follow the traditional behavior of `strings' in that we don't
421 complain if we don't recognize a file to be an object file. */
422 if (!bfd_check_format (abfd, bfd_object))
424 bfd_close (abfd);
425 return false;
428 got_a_section = false;
429 for (s = abfd->sections; s != NULL; s = s->next)
430 strings_a_section (abfd, s, file, &got_a_section);
432 if (!bfd_close (abfd))
434 bfd_nonfatal (file);
435 return false;
438 return got_a_section;
441 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
443 static bool
444 strings_file (char *file)
446 struct stat st;
448 /* get_file_size does not support non-S_ISREG files. */
450 if (stat (file, &st) < 0)
452 if (errno == ENOENT)
453 non_fatal (_("'%s': No such file"), file);
454 else
455 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
456 file, strerror (errno));
457 return false;
459 else if (S_ISDIR (st.st_mode))
461 non_fatal (_("Warning: '%s' is a directory"), file);
462 return false;
465 /* If we weren't told to scan the whole file,
466 try to open it as an object file and only look at
467 initialized data sections. If that fails, fall back to the
468 whole file. */
469 if (!datasection_only || !strings_object_file (file))
471 FILE *stream;
473 stream = fopen (file, FOPEN_RB);
474 if (stream == NULL)
476 fprintf (stderr, "%s: ", program_name);
477 perror (file);
478 return false;
481 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
483 if (fclose (stream) == EOF)
485 fprintf (stderr, "%s: ", program_name);
486 perror (file);
487 return false;
491 return true;
494 /* Read the next character, return EOF if none available.
495 Assume that STREAM is positioned so that the next byte read
496 is at address ADDRESS in the file.
498 If STREAM is NULL, do not read from it.
499 The caller can supply a buffer of characters
500 to be processed before the data in STREAM.
501 MAGIC is the address of the buffer and
502 MAGICCOUNT is how many characters are in it. */
504 static long
505 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
507 int c, i;
508 long r = 0;
510 for (i = 0; i < encoding_bytes; i++)
512 if (*magiccount)
514 (*magiccount)--;
515 c = *(*magic)++;
517 else
519 if (stream == NULL)
520 return EOF;
522 /* Only use getc_unlocked if we found a declaration for it.
523 Otherwise, libc is not thread safe by default, and we
524 should not use it. */
526 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
527 c = getc_unlocked (stream);
528 #else
529 c = getc (stream);
530 #endif
531 if (c == EOF)
532 return EOF;
535 (*address)++;
536 r = (r << 8) | (c & 0xff);
539 switch (encoding)
541 default:
542 break;
543 case 'l':
544 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
545 break;
546 case 'L':
547 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
548 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
549 break;
552 return r;
555 /* Throw away one byte of a (possibly) multi-byte char C, updating
556 address and buffer to suit. */
558 static void
559 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
561 static char tmp[4];
563 if (encoding_bytes > 1)
565 *address -= encoding_bytes - 1;
567 if (*magiccount == 0)
569 /* If no magic buffer exists, use temp buffer. */
570 switch (encoding)
572 default:
573 break;
574 case 'b':
575 tmp[0] = c & 0xff;
576 *magiccount = 1;
577 break;
578 case 'l':
579 tmp[0] = (c >> 8) & 0xff;
580 *magiccount = 1;
581 break;
582 case 'B':
583 tmp[0] = (c >> 16) & 0xff;
584 tmp[1] = (c >> 8) & 0xff;
585 tmp[2] = c & 0xff;
586 *magiccount = 3;
587 break;
588 case 'L':
589 tmp[0] = (c >> 8) & 0xff;
590 tmp[1] = (c >> 16) & 0xff;
591 tmp[2] = (c >> 24) & 0xff;
592 *magiccount = 3;
593 break;
595 *magic = tmp;
597 else
599 /* If magic buffer exists, rewind. */
600 *magic -= encoding_bytes - 1;
601 *magiccount += encoding_bytes - 1;
606 static void
607 print_filename_and_address (const char * filename, file_ptr address)
609 if (print_filenames)
610 printf ("%s: ", filename);
612 if (! print_addresses)
613 return;
615 switch (address_radix)
617 case 8:
618 if (sizeof (address) > sizeof (long))
620 #ifndef __MSVCRT__
621 printf ("%7llo ", (unsigned long long) address);
622 #else
623 printf ("%7I64o ", (unsigned long long) address);
624 #endif
626 else
627 printf ("%7lo ", (unsigned long) address);
628 break;
630 case 10:
631 if (sizeof (address) > sizeof (long))
633 #ifndef __MSVCRT__
634 printf ("%7llu ", (unsigned long long) address);
635 #else
636 printf ("%7I64d ", (unsigned long long) address);
637 #endif
639 else
640 printf ("%7ld ", (long) address);
641 break;
643 case 16:
644 if (sizeof (address) > sizeof (long))
646 #ifndef __MSVCRT__
647 printf ("%7llx ", (unsigned long long) address);
648 #else
649 printf ("%7I64x ", (unsigned long long) address);
650 #endif
652 else
653 printf ("%7lx ", (unsigned long) address);
654 break;
/* Return non-zero if the bytes starting at BUFFER form a structurally
   valid multi-byte UTF-8 sequence, in which case the value returned is
   the number of bytes (2, 3 or 4) that the sequence occupies.  BUFLEN is
   the number of bytes available at BUFFER.

   Zero is returned for ASCII and continuation bytes (below 0xc0), for
   the bytes 0xf8..0xff which can never start a UTF-8 sequence, for
   sequences that would run past BUFLEN and for sequences with a bad
   continuation byte.  Overlong forms and surrogates are not checked.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  unsigned int len;
  unsigned int k;

  if (buffer == NULL || buflen == 0)
    return 0;

  if (buffer[0] < 0xc0 || buffer[0] > 0xf7)
    return 0;

  /* The lead byte announces the length: 110xxxxx, 1110xxxx, 11110xxx.  */
  if ((buffer[0] & 0x20) == 0)
    len = 2;
  else if ((buffer[0] & 0x10) == 0)
    len = 3;
  else
    len = 4;

  if (buflen < len)
    return 0;

  /* Every following byte must look like 10xxxxxx.  */
  for (k = 1; k < len; k++)
    if ((buffer[k] & 0xc0) != 0x80)
      return 0;

  return len;
}
694 /* Display a UTF-8 encoded character in BUFFER according to the setting
695 of unicode_display. The character is known to be valid.
696 Returns the number of bytes consumed. */
698 static unsigned int
699 display_utf8_char (const unsigned char * buffer)
701 unsigned int j;
702 unsigned int utf8_len;
704 switch (buffer[0] & 0x30)
706 case 0x00:
707 case 0x10:
708 utf8_len = 2;
709 break;
710 case 0x20:
711 utf8_len = 3;
712 break;
713 default:
714 utf8_len = 4;
717 switch (unicode_display)
719 default:
720 fprintf (stderr, "ICE: unexpected unicode display type\n");
721 break;
723 case unicode_escape:
724 case unicode_highlight:
725 if (unicode_display == unicode_highlight && isatty (1))
726 printf ("\x1B[31;47m"); /* Red. */
728 switch (utf8_len)
730 case 2:
731 printf ("\\u%02x%02x",
732 ((buffer[0] & 0x1c) >> 2),
733 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
734 break;
736 case 3:
737 printf ("\\u%02x%02x",
738 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
739 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
740 break;
742 case 4:
743 printf ("\\u%02x%02x%02x",
744 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
745 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
746 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
747 break;
748 default:
749 /* URG. */
750 break;
753 if (unicode_display == unicode_highlight && isatty (1))
754 printf ("\033[0m"); /* Default colour. */
755 break;
757 case unicode_hex:
758 putchar ('<');
759 printf ("0x");
760 for (j = 0; j < utf8_len; j++)
761 printf ("%02x", buffer [j]);
762 putchar ('>');
763 break;
765 case unicode_locale:
766 printf ("%.1s", buffer);
767 break;
770 return utf8_len;
773 /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
774 according to the setting of the unicode_display variable. The buffer
775 contains BUFLEN bytes.
777 Display the characters as if they started at ADDRESS and are contained in
778 FILENAME. */
780 static void
781 print_unicode_buffer (const char * filename,
782 file_ptr address,
783 const unsigned char * buffer,
784 unsigned long buflen)
786 /* Paranoia checks... */
787 if (filename == NULL
788 || buffer == NULL
789 || unicode_display == unicode_default
790 || encoding != 'S'
791 || encoding_bytes != 1)
793 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
794 return;
797 if (buflen == 0)
798 return;
800 /* We must only display strings that are at least string_min *characters*
801 long. So we scan the buffer in two stages. First we locate the start
802 of a potential string. Then we walk along it until we have found
803 string_min characters. Then we go back to the start point and start
804 displaying characters according to the unicode_display setting. */
806 unsigned long start_point = 0;
807 unsigned long i = 0;
808 unsigned int char_len = 1;
809 unsigned int num_found = 0;
811 for (i = 0; i < buflen; i += char_len)
813 int c = buffer[i];
815 char_len = 1;
817 /* Find the first potential character of a string. */
818 if (! STRING_ISGRAPHIC (c))
820 num_found = 0;
821 continue;
824 if (c > 126)
826 if (c < 0xc0)
828 num_found = 0;
829 continue;
832 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
834 char_len = 1;
835 num_found = 0;
836 continue;
839 if (unicode_display == unicode_invalid)
841 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
842 num_found = 0;
843 continue;
847 if (num_found == 0)
848 /* We have found a potential starting point for a string. */
849 start_point = i;
851 ++ num_found;
853 if (num_found >= string_min)
854 break;
857 if (num_found < string_min)
858 return;
860 print_filename_and_address (filename, address + start_point);
862 /* We have found string_min characters. Display them and any
863 more that follow. */
864 for (i = start_point; i < buflen; i += char_len)
866 int c = buffer[i];
868 char_len = 1;
870 if (! STRING_ISGRAPHIC (c))
871 break;
872 else if (c < 127)
873 putchar (c);
874 else if (! is_valid_utf8 (buffer + i, buflen - i))
875 break;
876 else if (unicode_display == unicode_invalid)
877 break;
878 else
879 char_len = display_utf8_char (buffer + i);
882 if (output_separator)
883 fputs (output_separator, stdout);
884 else
885 putchar ('\n');
887 /* FIXME: Using tail recursion here is lazy programming... */
888 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
/* Return the next input byte for the unicode stream scanner.

   Bytes that were pushed back are delivered first: PUTBACK is used as a
   stack holding *NUM_PUTBACK bytes and the most recently pushed byte is
   returned.  Once it is empty a byte is read from STREAM and *NUM_READ is
   incremented (this happens even when the read yields EOF).  */

static int
get_unicode_byte (FILE *          stream,
		  unsigned char * putback,
		  unsigned int *  num_putback,
		  unsigned int *  num_read)
{
  if (*num_putback != 0)
    return putback[--*num_putback];

  ++*num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
  return getc_unlocked (stream);
#else
  return getc (stream);
#endif
}
912 /* Helper function for print_unicode_stream. */
914 static void
915 print_unicode_stream_body (const char * filename,
916 file_ptr address,
917 FILE * stream,
918 unsigned char * putback_buf,
919 unsigned int num_putback,
920 unsigned char * print_buf)
922 /* It would be nice if we could just read the stream into a buffer
923 and then process if with print_unicode_buffer. But the input
924 might be huge or it might time-locked (eg stdin). So instead
925 we go one byte at a time... */
927 file_ptr start_point = 0;
928 unsigned int num_read = 0;
929 unsigned int num_chars = 0;
930 unsigned int num_print = 0;
931 int c = 0;
933 /* Find a series of string_min characters. Put them into print_buf. */
936 if (num_chars >= string_min)
937 break;
939 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
940 if (c == EOF)
941 break;
943 if (! STRING_ISGRAPHIC (c))
945 num_chars = num_print = 0;
946 continue;
949 if (num_chars == 0)
950 start_point = num_read - 1;
952 if (c < 127)
954 print_buf[num_print] = c;
955 num_chars ++;
956 num_print ++;
957 continue;
960 if (c < 0xc0)
962 num_chars = num_print = 0;
963 continue;
966 /* We *might* have a UTF-8 sequence. Time to start peeking. */
967 char utf8[4];
969 utf8[0] = c;
970 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
971 if (c == EOF)
972 break;
973 utf8[1] = c;
975 if ((utf8[1] & 0xc0) != 0x80)
977 /* Invalid UTF-8. */
978 putback_buf[num_putback++] = utf8[1];
979 num_chars = num_print = 0;
980 continue;
982 else if ((utf8[0] & 0x20) == 0)
984 /* A valid 2-byte UTF-8 encoding. */
985 if (unicode_display == unicode_invalid)
987 putback_buf[num_putback++] = utf8[1];
988 num_chars = num_print = 0;
990 else
992 print_buf[num_print ++] = utf8[0];
993 print_buf[num_print ++] = utf8[1];
994 num_chars ++;
996 continue;
999 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1000 if (c == EOF)
1001 break;
1002 utf8[2] = c;
1004 if ((utf8[2] & 0xc0) != 0x80)
1006 /* Invalid UTF-8. */
1007 putback_buf[num_putback++] = utf8[2];
1008 putback_buf[num_putback++] = utf8[1];
1009 num_chars = num_print = 0;
1010 continue;
1012 else if ((utf8[0] & 0x10) == 0)
1014 /* A valid 3-byte UTF-8 encoding. */
1015 if (unicode_display == unicode_invalid)
1017 putback_buf[num_putback++] = utf8[2];
1018 putback_buf[num_putback++] = utf8[1];
1019 num_chars = num_print = 0;
1021 else
1023 print_buf[num_print ++] = utf8[0];
1024 print_buf[num_print ++] = utf8[1];
1025 print_buf[num_print ++] = utf8[2];
1026 num_chars ++;
1028 continue;
1031 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1032 if (c == EOF)
1033 break;
1034 utf8[3] = c;
1036 if ((utf8[3] & 0xc0) != 0x80)
1038 /* Invalid UTF-8. */
1039 putback_buf[num_putback++] = utf8[3];
1040 putback_buf[num_putback++] = utf8[2];
1041 putback_buf[num_putback++] = utf8[1];
1042 num_chars = num_print = 0;
1044 /* We have a valid 4-byte UTF-8 encoding. */
1045 else if (unicode_display == unicode_invalid)
1047 putback_buf[num_putback++] = utf8[3];
1048 putback_buf[num_putback++] = utf8[1];
1049 putback_buf[num_putback++] = utf8[2];
1050 num_chars = num_print = 0;
1052 else
1054 print_buf[num_print ++] = utf8[0];
1055 print_buf[num_print ++] = utf8[1];
1056 print_buf[num_print ++] = utf8[2];
1057 print_buf[num_print ++] = utf8[3];
1058 num_chars ++;
1061 while (1);
1063 if (num_chars >= string_min)
1065 /* We know that we have string_min valid characters in print_buf,
1066 and there may be more to come in the stream. Start displaying
1067 them. */
1069 print_filename_and_address (filename, address + start_point);
1071 unsigned int i;
1072 for (i = 0; i < num_print;)
1074 if (print_buf[i] < 127)
1075 putchar (print_buf[i++]);
1076 else
1077 i += display_utf8_char (print_buf + i);
1080 /* OK so now we have to start read unchecked bytes. */
1082 /* Find a series of string_min characters. Put them into print_buf. */
1085 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1086 if (c == EOF)
1087 break;
1089 if (! STRING_ISGRAPHIC (c))
1090 break;
1092 if (c < 127)
1094 putchar (c);
1095 continue;
1098 if (c < 0xc0)
1099 break;
1101 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1102 unsigned char utf8[4];
1104 utf8[0] = c;
1105 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1106 if (c == EOF)
1107 break;
1108 utf8[1] = c;
1110 if ((utf8[1] & 0xc0) != 0x80)
1112 /* Invalid UTF-8. */
1113 putback_buf[num_putback++] = utf8[1];
1114 break;
1116 else if ((utf8[0] & 0x20) == 0)
1118 /* Valid 2-byte UTF-8. */
1119 if (unicode_display == unicode_invalid)
1121 putback_buf[num_putback++] = utf8[1];
1122 break;
1124 else
1126 (void) display_utf8_char (utf8);
1127 continue;
1131 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1132 if (c == EOF)
1133 break;
1134 utf8[2] = c;
1136 if ((utf8[2] & 0xc0) != 0x80)
1138 /* Invalid UTF-8. */
1139 putback_buf[num_putback++] = utf8[2];
1140 putback_buf[num_putback++] = utf8[1];
1141 break;
1143 else if ((utf8[0] & 0x10) == 0)
1145 /* Valid 3-byte UTF-8. */
1146 if (unicode_display == unicode_invalid)
1148 putback_buf[num_putback++] = utf8[2];
1149 putback_buf[num_putback++] = utf8[1];
1150 break;
1152 else
1154 (void) display_utf8_char (utf8);
1155 continue;
1159 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1160 if (c == EOF)
1161 break;
1162 utf8[3] = c;
1164 if ((utf8[3] & 0xc0) != 0x80)
1166 /* Invalid UTF-8. */
1167 putback_buf[num_putback++] = utf8[3];
1168 putback_buf[num_putback++] = utf8[2];
1169 putback_buf[num_putback++] = utf8[1];
1170 break;
1172 else if (unicode_display == unicode_invalid)
1174 putback_buf[num_putback++] = utf8[3];
1175 putback_buf[num_putback++] = utf8[2];
1176 putback_buf[num_putback++] = utf8[1];
1177 break;
1179 else
1180 /* A valid 4-byte UTF-8 encoding. */
1181 (void) display_utf8_char (utf8);
1183 while (1);
1185 if (output_separator)
1186 fputs (output_separator, stdout);
1187 else
1188 putchar ('\n');
1191 if (c != EOF)
1192 /* FIXME: Using tail recursion here is lazy, but it works. */
1193 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1196 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1197 encountered according to the setting of the unicode_display variable.
1198 The stream is positioned at ADDRESS and is attached to FILENAME. */
1200 static void
1201 print_unicode_stream (const char * filename,
1202 file_ptr address,
1203 FILE * stream)
1205 /* Paranoia checks... */
1206 if (filename == NULL
1207 || stream == NULL
1208 || unicode_display == unicode_default
1209 || encoding != 'S'
1210 || encoding_bytes != 1)
1212 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1213 return;
1216 /* Allocate space for string_min 4-byte utf-8 characters. */
1217 unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1218 /* We should never have to put back more than 4 bytes. */
1219 unsigned char putback_buf[5];
1220 unsigned int num_putback = 0;
1222 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1223 free (print_buf);
1226 /* Find the strings in file FILENAME, read from STREAM.
1227 Assume that STREAM is positioned so that the next byte read
1228 is at address ADDRESS in the file.
1230 If STREAM is NULL, do not read from it.
1231 The caller can supply a buffer of characters
1232 to be processed before the data in STREAM.
1233 MAGIC is the address of the buffer and
1234 MAGICCOUNT is how many characters are in it.
1235 Those characters come at address ADDRESS and the data in STREAM follow. */
1237 static void
1238 print_strings (const char *filename, FILE *stream, file_ptr address,
1239 int magiccount, char *magic)
1241 if (unicode_display != unicode_default)
1243 if (magic != NULL)
1244 print_unicode_buffer (filename, address,
1245 (const unsigned char *) magic, magiccount);
1247 if (stream != NULL)
1248 print_unicode_stream (filename, address, stream);
1249 return;
1252 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1254 while (1)
1256 file_ptr start;
1257 unsigned int i;
1258 long c;
1260 /* See if the next `string_min' chars are all graphic chars. */
1261 tryline:
1262 start = address;
1263 for (i = 0; i < string_min; i++)
1265 c = get_char (stream, &address, &magiccount, &magic);
1266 if (c == EOF)
1268 free (buf);
1269 return;
1272 if (! STRING_ISGRAPHIC (c))
1274 /* Found a non-graphic. Try again starting with next byte. */
1275 unget_part_char (c, &address, &magiccount, &magic);
1276 goto tryline;
1278 buf[i] = c;
1281 /* We found a run of `string_min' graphic characters. Print up
1282 to the next non-graphic character. */
1283 print_filename_and_address (filename, start);
1285 buf[i] = '\0';
1286 fputs (buf, stdout);
1288 while (1)
1290 c = get_char (stream, &address, &magiccount, &magic);
1291 if (c == EOF)
1292 break;
1293 if (! STRING_ISGRAPHIC (c))
1295 unget_part_char (c, &address, &magiccount, &magic);
1296 break;
1298 putchar (c);
1301 if (output_separator)
1302 fputs (output_separator, stdout);
1303 else
1304 putchar ('\n');
1306 free (buf);
1309 static void
1310 usage (FILE *stream, int status)
1312 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1313 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1314 fprintf (stream, _(" The options are:\n"));
1316 if (DEFAULT_STRINGS_ALL)
1317 fprintf (stream, _("\
1318 -a - --all Scan the entire file, not just the data section [default]\n\
1319 -d --data Only scan the data sections in the file\n"));
1320 else
1321 fprintf (stream, _("\
1322 -a - --all Scan the entire file, not just the data section\n\
1323 -d --data Only scan the data sections in the file [default]\n"));
1325 fprintf (stream, _("\
1326 -f --print-file-name Print the name of the file before each string\n\
1327 -n <number> Locate & print any sequence of at least <number>\n\
1328 --bytes=<number> displayable characters. (The default is 4).\n\
1329 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
1330 -w --include-all-whitespace Include all whitespace as valid string characters\n\
1331 -o An alias for --radix=o\n\
1332 -T --target=<BFDNAME> Specify the binary file format\n\
1333 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1334 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1335 --unicode={default|show|invalid|hex|escape|highlight}\n\
1336 -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1337 -s --output-separator=<string> String used to separate strings in output.\n\
1338 @<file> Read options from <file>\n\
1339 -h --help Display this information\n\
1340 -v -V --version Print the program's version number\n"));
1341 list_supported_targets (program_name, stream);
1342 if (REPORT_BUGS_TO[0] && status == 0)
1343 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1344 exit (status);