sdcc/support/sdbinutils/binutils/strings.c

   1 /* strings -- print the strings of printable characters in files
   2    Copyright (C) 1993-2022 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  17    02110-1301, USA.  */
  18 \f
  19 /* Usage: strings [options] file...
  20
  21    Options:
  22    --all
  23    -a
  24    -            Scan each file in its entirety.
  25
  26    --data
  27    -d           Scan only the initialized data section(s) of object files.
  28
  29    --print-file-name
  30    -f           Print the name of the file before each string.
  31
  32    --bytes=min-len
  33    -n min-len
  34    -min-len     Print graphic char sequences, MIN-LEN or more bytes long,
  35                 that are followed by a NUL or a non-displayable character.
  36                 Default is 4.
  37
  38    --radix={o,x,d}
  39    -t {o,x,d}   Print the offset within the file before each string,
  40                 in octal/hex/decimal.
  41
  42   --include-all-whitespace
   43   -w            By default tab and space are the only whitespace included in graphic
  44                 char sequences.  This option considers all of isspace() valid.
  45
  46    -o           Like -to.  (Some other implementations have -o like -to,
  47                 others like -td.  We chose one arbitrarily.)
  48
  49    --encoding={s,S,b,l,B,L}
  50    -e {s,S,b,l,B,L}
  51                 Select character encoding: 7-bit-character, 8-bit-character,
  52                 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
  53                 littleendian 32-bit.
  54
  55    --target=BFDNAME
  56    -T {bfdname}
  57                 Specify a non-default object file format.
  58
  59   --unicode={default|locale|invalid|hex|escape|highlight}
  60   -U {d|l|i|x|e|h}
  61                 Determine how to handle UTF-8 unicode characters.  The default
  62                 is no special treatment.  All other versions of this option
  63                 only apply if the encoding is valid and enabling the option
  64                 implies --encoding=S.
  65                 The 'locale' option displays the characters according to the
  66                 current locale.  The 'invalid' option treats them as
  67                 non-string characters.  The 'hex' option displays them as hex
  68                 byte sequences.  The 'escape' option displays them as escape
  69                 sequences and the 'highlight' option displays them as
  70                 coloured escape sequences.
  71
  72   --output-separator=sep_string
  73   -s sep_string String used to separate parsed strings in output.
  74                 Default is newline.
  75
  76    --help
  77    -h           Print the usage message on the standard output.
  78
  79    --version
  80    -V
  81    -v           Print the program version number.
  82
  83    Written by Richard Stallman <rms@gnu.ai.mit.edu>
  84    and David MacKenzie <djm@gnu.ai.mit.edu>.  */
  85
  86 #include "sysdep.h"
  87 #include "bfd.h"
  88 #include "getopt.h"
  89 #include "libiberty.h"
  90 #include "safe-ctype.h"
  91 #include "bucomm.h"
  92
  93 #ifndef streq
  94 #define streq(a,b) (strcmp ((a),(b)) == 0)
  95 #endif
  96
  97 typedef enum unicode_display_type
  98 {
  99   unicode_default = 0,
 100   unicode_locale,
 101   unicode_escape,
 102   unicode_hex,
 103   unicode_highlight,
 104   unicode_invalid
 105 } unicode_display_type;
 106
 107 static unicode_display_type unicode_display = unicode_default;
 108
 109 #define STRING_ISGRAPHIC(c) \
 110       (   (c) >= 0 \
 111        && (c) <= 255 \
 112        && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \
 113            || (include_all_whitespace && ISSPACE (c))) \
 114       )
 115
 116 #ifndef errno
 117 extern int errno;
 118 #endif
 119
 120 /* The BFD section flags that identify an initialized data section.  */
 121 #define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)
 122
 123 /* Radix for printing addresses (must be 8, 10 or 16).  */
 124 static int address_radix;
 125
 126 /* Minimum length of sequence of graphic chars to trigger output.  */
 127 static unsigned int string_min;
 128
 129 /* Whether or not we include all whitespace as a graphic char.   */
 130 static bool include_all_whitespace;
 131
 132 /* TRUE means print address within file for each string.  */
 133 static bool print_addresses;
 134
 135 /* TRUE means print filename for each string.  */
 136 static bool print_filenames;
 137
 138 /* TRUE means for object files scan only the data section.  */
 139 static bool datasection_only;
 140
 141 /* The BFD object file format.  */
 142 static char *target;
 143
 144 /* The character encoding format.  */
 145 static char encoding;
 146 static int encoding_bytes;
 147
 148 /* Output string used to separate parsed strings  */
 149 static char *output_separator;
 150
 151 static struct option long_options[] =
 152 {
 153   {"all", no_argument, NULL, 'a'},
 154   {"bytes", required_argument, NULL, 'n'},
 155   {"data", no_argument, NULL, 'd'},
 156   {"encoding", required_argument, NULL, 'e'},
 157   {"help", no_argument, NULL, 'h'},
 158   {"include-all-whitespace", no_argument, NULL, 'w'},
 159   {"output-separator", required_argument, NULL, 's'},
 160   {"print-file-name", no_argument, NULL, 'f'},
 161   {"radix", required_argument, NULL, 't'},
 162   {"target", required_argument, NULL, 'T'},
 163   {"unicode", required_argument, NULL, 'U'},
 164   {"version", no_argument, NULL, 'v'},
 165   {NULL, 0, NULL, 0}
 166 };
 167
 168 static bool strings_file (char *);
 169 static void print_strings (const char *, FILE *, file_ptr, int, char *);
 170 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
 171 \f
 172 int main (int, char **);
 173
 174 int
 175 main (int argc, char **argv)
 176 {
 177   int optc;
 178   int exit_status = 0;
 179   bool files_given = false;
 180   char *s;
 181   int numeric_opt = 0;
 182
 183   setlocale (LC_ALL, "");
 184   bindtextdomain (PACKAGE, LOCALEDIR);
 185   textdomain (PACKAGE);
 186
 187   program_name = argv[0];
 188   xmalloc_set_program_name (program_name);
 189   bfd_set_error_program_name (program_name);
 190
 191   expandargv (&argc, &argv);
 192
 193   string_min = 4;
 194   include_all_whitespace = false;
 195   print_addresses = false;
 196   print_filenames = false;
 197   if (DEFAULT_STRINGS_ALL)
 198     datasection_only = false;
 199   else
 200     datasection_only = true;
 201   target = NULL;
 202   encoding = 's';
 203   output_separator = NULL;
 204
 205   while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
 206                               long_options, (int *) 0)) != EOF)
 207     {
 208       switch (optc)
 209         {
 210         case 'a':
 211           datasection_only = false;
 212           break;
 213
 214         case 'd':
 215           datasection_only = true;
 216           break;
 217
 218         case 'f':
 219           print_filenames = true;
 220           break;
 221
 222         case 'H':
 223         case 'h':
 224           usage (stdout, 0);
 225
 226         case 'n':
 227           string_min = (int) strtoul (optarg, &s, 0);
 228           if (s != NULL && *s != 0)
 229             fatal (_("invalid integer argument %s"), optarg);
 230           break;
 231
 232         case 'w':
 233           include_all_whitespace = true;
 234           break;
 235
 236         case 'o':
 237           print_addresses = true;
 238           address_radix = 8;
 239           break;
 240
 241         case 't':
 242           print_addresses = true;
 243           if (optarg[1] != '\0')
 244             usage (stderr, 1);
 245           switch (optarg[0])
 246             {
 247             case 'o':
 248               address_radix = 8;
 249               break;
 250
 251             case 'd':
 252               address_radix = 10;
 253               break;
 254
 255             case 'x':
 256               address_radix = 16;
 257               break;
 258
 259             default:
 260               usage (stderr, 1);
 261             }
 262           break;
 263
 264         case 'T':
 265           target = optarg;
 266           break;
 267
 268         case 'e':
 269           if (optarg[1] != '\0')
 270             usage (stderr, 1);
 271           encoding = optarg[0];
 272           break;
 273
 274         case 's':
 275           output_separator = optarg;
 276           break;
 277
 278         case 'U':
 279           if (streq (optarg, "default") || streq (optarg, "d"))
 280             unicode_display = unicode_default;
 281           else if (streq (optarg, "locale") || streq (optarg, "l"))
 282             unicode_display = unicode_locale;
 283           else if (streq (optarg, "escape") || streq (optarg, "e"))
 284             unicode_display = unicode_escape;
 285           else if (streq (optarg, "invalid") || streq (optarg, "i"))
 286             unicode_display = unicode_invalid;
 287           else if (streq (optarg, "hex") || streq (optarg, "x"))
 288             unicode_display = unicode_hex;
 289           else if (streq (optarg, "highlight") || streq (optarg, "h"))
 290             unicode_display = unicode_highlight;
 291           else
 292             fatal (_("invalid argument to -U/--unicode: %s"), optarg);
 293           break;
 294
 295         case 'V':
 296         case 'v':
 297           print_version ("strings");
 298           break;
 299
 300         case '?':
 301           usage (stderr, 1);
 302
 303         default:
 304           numeric_opt = optind;
 305           break;
 306         }
 307     }
 308
 309   if (unicode_display != unicode_default)
 310     encoding = 'S';
 311
 312   if (numeric_opt != 0)
 313     {
 314       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
 315       if (s != NULL && *s != 0)
 316         fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
 317     }
 318   if (string_min < 1)
 319     fatal (_("invalid minimum string length %d"), string_min);
 320
 321   switch (encoding)
 322     {
 323     case 'S':
 324     case 's':
 325       encoding_bytes = 1;
 326       break;
 327     case 'b':
 328     case 'l':
 329       encoding_bytes = 2;
 330       break;
 331     case 'B':
 332     case 'L':
 333       encoding_bytes = 4;
 334       break;
 335     default:
 336       usage (stderr, 1);
 337     }
 338
 339   if (bfd_init () != BFD_INIT_MAGIC)
 340     fatal (_("fatal error: libbfd ABI mismatch"));
 341   set_default_bfd_target ();
 342
 343   if (optind >= argc)
 344     {
 345       datasection_only = false;
 346       SET_BINARY (fileno (stdin));
 347       print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
 348       files_given = true;
 349     }
 350   else
 351     {
 352       for (; optind < argc; ++optind)
 353         {
 354           if (streq (argv[optind], "-"))
 355             datasection_only = false;
 356           else
 357             {
 358               files_given = true;
 359               exit_status |= !strings_file (argv[optind]);
 360             }
 361         }
 362     }
 363
 364   if (!files_given)
 365     usage (stderr, 1);
 366
 367   return (exit_status);
 368 }
 369 \f
 370 /* Scan section SECT of the file ABFD, whose printable name is
 371    FILENAME.  If it contains initialized data set GOT_A_SECTION and
 372    print the strings in it.  */
 373
 374 static void
 375 strings_a_section (bfd *abfd, asection *sect, const char *filename,
 376                    bool *got_a_section)
 377 {
 378   bfd_size_type sectsize;
 379   bfd_byte *mem;
 380
 381   if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
 382     return;
 383
 384   sectsize = bfd_section_size (sect);
 385   if (sectsize == 0)
 386     return;
 387
 388   if (!bfd_malloc_and_get_section (abfd, sect, &mem))
 389     {
 390       non_fatal (_("%s: Reading section %s failed: %s"),
 391                  filename, sect->name, bfd_errmsg (bfd_get_error ()));
 392       return;
 393     }
 394
 395   *got_a_section = true;
 396   print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
 397   free (mem);
 398 }
 399
 400 /* Scan all of the sections in FILE, and print the strings
 401    in the initialized data section(s).
 402
 403    Return TRUE if successful,
 404    FALSE if not (such as if FILE is not an object file).  */
 405
 406 static bool
 407 strings_object_file (const char *file)
 408 {
 409   bfd *abfd;
 410   asection *s;
 411   bool got_a_section;
 412
 413   abfd = bfd_openr (file, target);
 414
 415   if (abfd == NULL)
 416     /* Treat the file as a non-object file.  */
 417     return false;
 418
 419   /* This call is mainly for its side effect of reading in the sections.
 420      We follow the traditional behavior of `strings' in that we don't
 421      complain if we don't recognize a file to be an object file.  */
 422   if (!bfd_check_format (abfd, bfd_object))
 423     {
 424       bfd_close (abfd);
 425       return false;
 426     }
 427
 428   got_a_section = false;
 429   for (s = abfd->sections; s != NULL; s = s->next)
 430     strings_a_section (abfd, s, file, &got_a_section);
 431
 432   if (!bfd_close (abfd))
 433     {
 434       bfd_nonfatal (file);
 435       return false;
 436     }
 437
 438   return got_a_section;
 439 }
 440
 441 /* Print the strings in FILE.  Return TRUE if ok, FALSE if an error occurs.  */
 442
 443 static bool
 444 strings_file (char *file)
 445 {
 446   struct stat st;
 447
 448   /* get_file_size does not support non-S_ISREG files.  */
 449
 450   if (stat (file, &st) < 0)
 451     {
 452       if (errno == ENOENT)
 453         non_fatal (_("'%s': No such file"), file);
 454       else
 455         non_fatal (_("Warning: could not locate '%s'.  reason: %s"),
 456                    file, strerror (errno));
 457       return false;
 458     }
 459   else if (S_ISDIR (st.st_mode))
 460     {
 461       non_fatal (_("Warning: '%s' is a directory"), file);
 462       return false;
 463     }
 464
 465   /* If we weren't told to scan the whole file,
 466      try to open it as an object file and only look at
 467      initialized data sections.  If that fails, fall back to the
 468      whole file.  */
 469   if (!datasection_only || !strings_object_file (file))
 470     {
 471       FILE *stream;
 472
 473       stream = fopen (file, FOPEN_RB);
 474       if (stream == NULL)
 475         {
 476           fprintf (stderr, "%s: ", program_name);
 477           perror (file);
 478           return false;
 479         }
 480
 481       print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
 482
 483       if (fclose (stream) == EOF)
 484         {
 485           fprintf (stderr, "%s: ", program_name);
 486           perror (file);
 487           return false;
 488         }
 489     }
 490
 491   return true;
 492 }
 493 \f
 494 /* Read the next character, return EOF if none available.
 495    Assume that STREAM is positioned so that the next byte read
 496    is at address ADDRESS in the file.
 497
 498    If STREAM is NULL, do not read from it.
 499    The caller can supply a buffer of characters
 500    to be processed before the data in STREAM.
 501    MAGIC is the address of the buffer and
 502    MAGICCOUNT is how many characters are in it.  */
 503
 504 static long
 505 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
 506 {
 507   int c, i;
 508   long r = 0;
 509
 510   for (i = 0; i < encoding_bytes; i++)
 511     {
 512       if (*magiccount)
 513         {
 514           (*magiccount)--;
 515           c = *(*magic)++;
 516         }
 517       else
 518         {
 519           if (stream == NULL)
 520             return EOF;
 521
 522           /* Only use getc_unlocked if we found a declaration for it.
 523              Otherwise, libc is not thread safe by default, and we
 524              should not use it.  */
 525
 526 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 527           c = getc_unlocked (stream);
 528 #else
 529           c = getc (stream);
 530 #endif
 531           if (c == EOF)
 532             return EOF;
 533         }
 534
 535       (*address)++;
 536       r = (r << 8) | (c & 0xff);
 537     }
 538
 539   switch (encoding)
 540     {
 541     default:
 542       break;
 543     case 'l':
 544       r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
 545       break;
 546     case 'L':
 547       r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
 548            | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
 549       break;
 550     }
 551
 552   return r;
 553 }
 554
 555 /* Throw away one byte of a (possibly) multi-byte char C, updating
 556    address and buffer to suit.  */
 557
 558 static void
 559 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
 560 {
 561   static char tmp[4];
 562
 563   if (encoding_bytes > 1)
 564     {
 565       *address -= encoding_bytes - 1;
 566
 567       if (*magiccount == 0)
 568         {
 569           /* If no magic buffer exists, use temp buffer.  */
 570           switch (encoding)
 571             {
 572             default:
 573               break;
 574             case 'b':
 575               tmp[0] = c & 0xff;
 576               *magiccount = 1;
 577               break;
 578             case 'l':
 579               tmp[0] = (c >> 8) & 0xff;
 580               *magiccount = 1;
 581               break;
 582             case 'B':
 583               tmp[0] = (c >> 16) & 0xff;
 584               tmp[1] = (c >> 8) & 0xff;
 585               tmp[2] = c & 0xff;
 586               *magiccount = 3;
 587               break;
 588             case 'L':
 589               tmp[0] = (c >> 8) & 0xff;
 590               tmp[1] = (c >> 16) & 0xff;
 591               tmp[2] = (c >> 24) & 0xff;
 592               *magiccount = 3;
 593               break;
 594             }
 595           *magic = tmp;
 596         }
 597       else
 598         {
 599           /* If magic buffer exists, rewind.  */
 600           *magic -= encoding_bytes - 1;
 601           *magiccount += encoding_bytes - 1;
 602         }
 603     }
 604 }
 605
 606 static void
 607 print_filename_and_address (const char * filename, file_ptr address)
 608 {
 609   if (print_filenames)
 610     printf ("%s: ", filename);
 611
 612   if (! print_addresses)
 613     return;
 614
 615   switch (address_radix)
 616     {
 617     case 8:
 618       if (sizeof (address) > sizeof (long))
 619         {
 620 #ifndef __MSVCRT__
 621           printf ("%7llo ", (unsigned long long) address);
 622 #else
 623           printf ("%7I64o ", (unsigned long long) address);
 624 #endif
 625         }
 626       else
 627         printf ("%7lo ", (unsigned long) address);
 628       break;
 629
 630     case 10:
 631       if (sizeof (address) > sizeof (long))
 632         {
 633 #ifndef __MSVCRT__
 634           printf ("%7llu ", (unsigned long long) address);
 635 #else
 636           printf ("%7I64d ", (unsigned long long) address);
 637 #endif
 638         }
 639       else
 640         printf ("%7ld ", (long) address);
 641       break;
 642
 643     case 16:
 644       if (sizeof (address) > sizeof (long))
 645         {
 646 #ifndef __MSVCRT__
 647           printf ("%7llx ", (unsigned long long) address);
 648 #else
 649           printf ("%7I64x ", (unsigned long long) address);
 650 #endif
 651         }
 652       else
 653         printf ("%7lx ", (unsigned long) address);
 654       break;
 655     }
 656 }
 657
 658 /* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
 659    If the encoding is valid then returns the number of bytes it uses.  */
 660
 661 static unsigned int
 662 is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
 663 {
 664   if (buffer[0] < 0xc0)
 665     return 0;
 666
 667   if (buflen < 2)
 668     return 0;
 669
 670   if ((buffer[1] & 0xc0) != 0x80)
 671     return 0;
 672
 673   if ((buffer[0] & 0x20) == 0)
 674     return 2;
 675
 676   if (buflen < 3)
 677     return 0;
 678
 679   if ((buffer[2] & 0xc0) != 0x80)
 680     return 0;
 681
 682   if ((buffer[0] & 0x10) == 0)
 683     return 3;
 684
 685   if (buflen < 4)
 686     return 0;
 687
 688   if ((buffer[3] & 0xc0) != 0x80)
 689     return 0;
 690
 691   return 4;
 692 }
 693
 694 /* Display a UTF-8 encoded character in BUFFER according to the setting
 695    of unicode_display.  The character is known to be valid.
 696    Returns the number of bytes consumed.  */
 697
 698 static unsigned int
 699 display_utf8_char (const unsigned char * buffer)
 700 {
 701   unsigned int j;
 702   unsigned int utf8_len;
 703
 704   switch (buffer[0] & 0x30)
 705     {
 706     case 0x00:
 707     case 0x10:
 708       utf8_len = 2;
 709       break;
 710     case 0x20:
 711       utf8_len = 3;
 712       break;
 713     default:
 714       utf8_len = 4;
 715     }
 716
 717   switch (unicode_display)
 718     {
 719     default:
 720       fprintf (stderr, "ICE: unexpected unicode display type\n");
 721       break;
 722
 723     case unicode_escape:
 724     case unicode_highlight:
 725       if (unicode_display == unicode_highlight && isatty (1))
 726         printf ("\x1B[31;47m"); /* Red.  */
 727
 728       switch (utf8_len)
 729         {
 730         case 2:
 731           printf ("\\u%02x%02x",
 732                   ((buffer[0] & 0x1c) >> 2),
 733                   ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
 734           break;
 735
 736         case 3:
 737           printf ("\\u%02x%02x",
 738                   ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
 739                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
 740           break;
 741
 742         case 4:
 743           printf ("\\u%02x%02x%02x",
 744                   ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
 745                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
 746                   ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
 747           break;
 748         default:
 749           /* URG.  */
 750           break;
 751         }
 752
 753       if (unicode_display == unicode_highlight && isatty (1))
 754         printf ("\033[0m"); /* Default colour.  */
 755       break;
 756
 757     case unicode_hex:
 758       putchar ('<');
 759       printf ("0x");
 760       for (j = 0; j < utf8_len; j++)
 761         printf ("%02x", buffer [j]);
 762       putchar ('>');
 763       break;
 764
 765     case unicode_locale:
 766       printf ("%.1s", buffer);
 767       break;
 768     }
 769
 770   return utf8_len;
 771 }
 772
 773 /* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
 774    according to the setting of the unicode_display variable.  The buffer
 775    contains BUFLEN bytes.
 776
 777    Display the characters as if they started at ADDRESS and are contained in
 778    FILENAME.  */
 779
 780 static void
 781 print_unicode_buffer (const char *            filename,
 782                       file_ptr                address,
 783                       const unsigned char *   buffer,
 784                       unsigned long           buflen)
 785 {
 786   /* Paranoia checks...  */
 787   if (filename == NULL
 788       || buffer == NULL
 789       || unicode_display == unicode_default
 790       || encoding != 'S'
 791       || encoding_bytes != 1)
 792     {
 793       fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
 794       return;
 795     }
 796
 797   if (buflen == 0)
 798     return;
 799
 800   /* We must only display strings that are at least string_min *characters*
 801      long.  So we scan the buffer in two stages.  First we locate the start
 802      of a potential string.  Then we walk along it until we have found
 803      string_min characters.  Then we go back to the start point and start
 804      displaying characters according to the unicode_display setting.  */
 805
 806   unsigned long start_point = 0;
 807   unsigned long i = 0;
 808   unsigned int char_len = 1;
 809   unsigned int num_found = 0;
 810
 811   for (i = 0; i < buflen; i += char_len)
 812     {
 813       int c = buffer[i];
 814
 815       char_len = 1;
 816
 817       /* Find the first potential character of a string.  */
 818       if (! STRING_ISGRAPHIC (c))
 819         {
 820           num_found = 0;
 821           continue;
 822         }
 823
 824       if (c > 126)
 825         {
 826           if (c < 0xc0)
 827             {
 828               num_found = 0;
 829               continue;
 830             }
 831
 832           if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
 833             {
 834               char_len = 1;
 835               num_found = 0;
 836               continue;
 837             }
 838
 839           if (unicode_display == unicode_invalid)
 840             {
 841               /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
 842               num_found = 0;
 843               continue;
 844             }
 845         }
 846
 847       if (num_found == 0)
 848         /* We have found a potential starting point for a string.  */
 849         start_point = i;
 850
 851       ++ num_found;
 852
 853       if (num_found >= string_min)
 854         break;
 855     }
 856
 857   if (num_found < string_min)
 858     return;
 859
 860   print_filename_and_address (filename, address + start_point);
 861
 862   /* We have found string_min characters.  Display them and any
 863      more that follow.  */
 864   for (i = start_point; i < buflen; i += char_len)
 865     {
 866       int c = buffer[i];
 867
 868       char_len = 1;
 869
 870       if (! STRING_ISGRAPHIC (c))
 871         break;
 872       else if (c < 127)
 873         putchar (c);
 874       else if (! is_valid_utf8 (buffer + i, buflen - i))
 875         break;
 876       else if (unicode_display == unicode_invalid)
 877         break;
 878       else
 879         char_len = display_utf8_char (buffer + i);
 880     }
 881
 882   if (output_separator)
 883     fputs (output_separator, stdout);
 884   else
 885     putchar ('\n');
 886
 887   /* FIXME: Using tail recursion here is lazy programming...  */
 888   print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
 889 }
 890
 891 static int
 892 get_unicode_byte (FILE *          stream,
 893                   unsigned char * putback,
 894                   unsigned int *  num_putback,
 895                   unsigned int *  num_read)
 896 {
 897   if (* num_putback > 0)
 898     {
 899       * num_putback = * num_putback - 1;
 900       return putback [* num_putback];
 901     }
 902
 903   * num_read = * num_read + 1;
 904
 905 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 906   return getc_unlocked (stream);
 907 #else
 908   return getc (stream);
 909 #endif
 910 }
 911
 912 /* Helper function for print_unicode_stream.  */
 913
 914 static void
 915 print_unicode_stream_body (const char *     filename,
 916                            file_ptr         address,
 917                            FILE *           stream,
 918                            unsigned char *  putback_buf,
 919                            unsigned int     num_putback,
 920                            unsigned char *  print_buf)
 921 {
 922   /* It would be nice if we could just read the stream into a buffer
 923      and then process if with print_unicode_buffer.  But the input
 924      might be huge or it might time-locked (eg stdin).  So instead
 925      we go one byte at a time...  */
 926
 927   file_ptr start_point = 0;
 928   unsigned int num_read = 0;
 929   unsigned int num_chars = 0;
 930   unsigned int num_print = 0;
 931   int c = 0;
 932
 933   /* Find a series of string_min characters.  Put them into print_buf.  */
 934   do
 935     {
 936       if (num_chars >= string_min)
 937         break;
 938
 939       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 940       if (c == EOF)
 941         break;
 942
 943       if (! STRING_ISGRAPHIC (c))
 944         {
 945           num_chars = num_print = 0;
 946           continue;
 947         }
 948
 949       if (num_chars == 0)
 950         start_point = num_read - 1;
 951
 952       if (c < 127)
 953         {
 954           print_buf[num_print] = c;
 955           num_chars ++;
 956           num_print ++;
 957           continue;
 958         }
 959
 960       if (c < 0xc0)
 961         {
 962           num_chars = num_print = 0;
 963           continue;
 964         }
 965
 966       /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
 967       char utf8[4];
 968
 969       utf8[0] = c;
 970       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 971       if (c == EOF)
 972         break;
 973       utf8[1] = c;
 974
 975       if ((utf8[1] & 0xc0) != 0x80)
 976         {
 977           /* Invalid UTF-8.  */
 978           putback_buf[num_putback++] = utf8[1];
 979           num_chars = num_print = 0;
 980           continue;
 981         }
 982       else if ((utf8[0] & 0x20) == 0)
 983         {
 984           /* A valid 2-byte UTF-8 encoding.  */
 985           if (unicode_display == unicode_invalid)
 986             {
 987               putback_buf[num_putback++] = utf8[1];
 988               num_chars = num_print = 0;
 989             }
 990           else
 991             {
 992               print_buf[num_print ++] = utf8[0];
 993               print_buf[num_print ++] = utf8[1];
 994               num_chars ++;
 995             }
 996           continue;
 997         }
 998
 999       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1000       if (c == EOF)
1001         break;
1002       utf8[2] = c;
1003
1004       if ((utf8[2] & 0xc0) != 0x80)
1005         {
1006           /* Invalid UTF-8.  */
1007           putback_buf[num_putback++] = utf8[2];
1008           putback_buf[num_putback++] = utf8[1];
1009           num_chars = num_print = 0;
1010           continue;
1011         }
1012       else if ((utf8[0] & 0x10) == 0)
1013         {
1014           /* A valid 3-byte UTF-8 encoding.  */
1015           if (unicode_display == unicode_invalid)
1016             {
1017               putback_buf[num_putback++] = utf8[2];
1018               putback_buf[num_putback++] = utf8[1];
1019               num_chars = num_print = 0;
1020             }
1021           else
1022             {
1023               print_buf[num_print ++] = utf8[0];
1024               print_buf[num_print ++] = utf8[1];
1025               print_buf[num_print ++] = utf8[2];
1026               num_chars ++;
1027             }
1028           continue;
1029         }
1030
1031       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1032       if (c == EOF)
1033         break;
1034       utf8[3] = c;
1035
1036       if ((utf8[3] & 0xc0) != 0x80)
1037         {
1038           /* Invalid UTF-8.  */
1039           putback_buf[num_putback++] = utf8[3];
1040           putback_buf[num_putback++] = utf8[2];
1041           putback_buf[num_putback++] = utf8[1];
1042           num_chars = num_print = 0;
1043         }
1044       /* We have a valid 4-byte UTF-8 encoding.  */
1045       else if (unicode_display == unicode_invalid)
1046         {
1047           putback_buf[num_putback++] = utf8[3];
1048           putback_buf[num_putback++] = utf8[1];
1049           putback_buf[num_putback++] = utf8[2];
1050           num_chars = num_print = 0;
1051         }
1052       else
1053         {
1054           print_buf[num_print ++] = utf8[0];
1055           print_buf[num_print ++] = utf8[1];
1056           print_buf[num_print ++] = utf8[2];
1057           print_buf[num_print ++] = utf8[3];
1058           num_chars ++;
1059         }
1060     }
1061   while (1);
1062
1063   if (num_chars >= string_min)
1064     {
1065       /* We know that we have string_min valid characters in print_buf,
1066          and there may be more to come in the stream.  Start displaying
1067          them.  */
1068
1069       print_filename_and_address (filename, address + start_point);
1070
1071       unsigned int i;
1072       for (i = 0; i < num_print;)
1073         {
1074           if (print_buf[i] < 127)
1075             putchar (print_buf[i++]);
1076           else
1077             i += display_utf8_char (print_buf + i);
1078         }
1079
1080       /* OK so now we have to start read unchecked bytes.  */
1081
1082       /* Find a series of string_min characters.  Put them into print_buf.  */
1083       do
1084         {
1085           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1086           if (c == EOF)
1087             break;
1088
1089           if (! STRING_ISGRAPHIC (c))
1090             break;
1091
1092           if (c < 127)
1093             {
1094               putchar (c);
1095               continue;
1096             }
1097
1098           if (c < 0xc0)
1099             break;
1100
1101           /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
1102           unsigned char utf8[4];
1103
1104           utf8[0] = c;
1105           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1106           if (c == EOF)
1107             break;
1108           utf8[1] = c;
1109
1110           if ((utf8[1] & 0xc0) != 0x80)
1111             {
1112               /* Invalid UTF-8.  */
1113               putback_buf[num_putback++] = utf8[1];
1114               break;
1115             }
1116           else if ((utf8[0] & 0x20) == 0)
1117             {
1118               /* Valid 2-byte UTF-8.  */
1119               if (unicode_display == unicode_invalid)
1120                 {
1121                   putback_buf[num_putback++] = utf8[1];
1122                   break;
1123                 }
1124               else
1125                 {
1126                   (void) display_utf8_char (utf8);
1127                   continue;
1128                 }
1129             }
1130
1131           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1132           if (c == EOF)
1133             break;
1134           utf8[2] = c;
1135
1136           if ((utf8[2] & 0xc0) != 0x80)
1137             {
1138               /* Invalid UTF-8.  */
1139               putback_buf[num_putback++] = utf8[2];
1140               putback_buf[num_putback++] = utf8[1];
1141               break;
1142             }
1143           else if ((utf8[0] & 0x10) == 0)
1144             {
1145               /* Valid 3-byte UTF-8.  */
1146               if (unicode_display == unicode_invalid)
1147                 {
1148                   putback_buf[num_putback++] = utf8[2];
1149                   putback_buf[num_putback++] = utf8[1];
1150                   break;
1151                 }
1152               else
1153                 {
1154                   (void) display_utf8_char (utf8);
1155                   continue;
1156                 }
1157             }
1158
1159           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1160           if (c == EOF)
1161             break;
1162           utf8[3] = c;
1163
1164           if ((utf8[3] & 0xc0) != 0x80)
1165             {
1166               /* Invalid UTF-8.  */
1167               putback_buf[num_putback++] = utf8[3];
1168               putback_buf[num_putback++] = utf8[2];
1169               putback_buf[num_putback++] = utf8[1];
1170               break;
1171             }
1172           else if (unicode_display == unicode_invalid)
1173             {
1174               putback_buf[num_putback++] = utf8[3];
1175               putback_buf[num_putback++] = utf8[2];
1176               putback_buf[num_putback++] = utf8[1];
1177               break;
1178             }
1179           else
1180             /* A valid 4-byte UTF-8 encoding.  */
1181             (void) display_utf8_char (utf8);
1182         }
1183       while (1);
1184
1185       if (output_separator)
1186         fputs (output_separator, stdout);
1187       else
1188         putchar ('\n');
1189     }
1190
1191   if (c != EOF)
1192     /* FIXME: Using tail recursion here is lazy, but it works.  */
1193     print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1194 }
1195
1196 /* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
1197    encountered according to the setting of the unicode_display variable.
1198    The stream is positioned at ADDRESS and is attached to FILENAME.  */
1199
1200 static void
1201 print_unicode_stream (const char * filename,
1202                       file_ptr     address,
1203                       FILE *       stream)
1204 {
1205   /* Paranoia checks...  */
1206   if (filename == NULL
1207       || stream == NULL
1208       || unicode_display == unicode_default
1209       || encoding != 'S'
1210       || encoding_bytes != 1)
1211     {
1212       fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1213       return;
1214     }
1215
1216   /* Allocate space for string_min 4-byte utf-8 characters.  */
1217   unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1218   /* We should never have to put back more than 4 bytes.  */
1219   unsigned char putback_buf[5];
1220   unsigned int num_putback = 0;
1221
1222   print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1223   free (print_buf);
1224 }
1225 \f
1226 /* Find the strings in file FILENAME, read from STREAM.
1227    Assume that STREAM is positioned so that the next byte read
1228    is at address ADDRESS in the file.
1229
1230    If STREAM is NULL, do not read from it.
1231    The caller can supply a buffer of characters
1232    to be processed before the data in STREAM.
1233    MAGIC is the address of the buffer and
1234    MAGICCOUNT is how many characters are in it.
1235    Those characters come at address ADDRESS and the data in STREAM follow.  */
1236
1237 static void
1238 print_strings (const char *filename, FILE *stream, file_ptr address,
1239                int magiccount, char *magic)
1240 {
1241   if (unicode_display != unicode_default)
1242     {
1243       if (magic != NULL)
1244         print_unicode_buffer (filename, address,
1245                               (const unsigned char *) magic, magiccount);
1246
1247       if (stream != NULL)
1248         print_unicode_stream (filename, address, stream);
1249       return;
1250     }
1251
1252   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1253
1254   while (1)
1255     {
1256       file_ptr start;
1257       unsigned int i;
1258       long c;
1259
1260       /* See if the next `string_min' chars are all graphic chars.  */
1261     tryline:
1262       start = address;
1263       for (i = 0; i < string_min; i++)
1264         {
1265           c = get_char (stream, &address, &magiccount, &magic);
1266           if (c == EOF)
1267             {
1268               free (buf);
1269               return;
1270             }
1271
1272           if (! STRING_ISGRAPHIC (c))
1273             {
1274               /* Found a non-graphic.  Try again starting with next byte.  */
1275               unget_part_char (c, &address, &magiccount, &magic);
1276               goto tryline;
1277             }
1278           buf[i] = c;
1279         }
1280
1281       /* We found a run of `string_min' graphic characters.  Print up
1282          to the next non-graphic character.  */
1283       print_filename_and_address (filename, start);
1284
1285       buf[i] = '\0';
1286       fputs (buf, stdout);
1287
1288       while (1)
1289         {
1290           c = get_char (stream, &address, &magiccount, &magic);
1291           if (c == EOF)
1292             break;
1293           if (! STRING_ISGRAPHIC (c))
1294             {
1295               unget_part_char (c, &address, &magiccount, &magic);
1296               break;
1297             }
1298           putchar (c);
1299         }
1300
1301       if (output_separator)
1302         fputs (output_separator, stdout);
1303       else
1304         putchar ('\n');
1305     }
1306   free (buf);
1307 }
1308 \f
1309 static void
1310 usage (FILE *stream, int status)
1311 {
1312   fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1313   fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1314   fprintf (stream, _(" The options are:\n"));
1315
1316   if (DEFAULT_STRINGS_ALL)
1317     fprintf (stream, _("\
1318   -a - --all                Scan the entire file, not just the data section [default]\n\
1319   -d --data                 Only scan the data sections in the file\n"));
1320   else
1321     fprintf (stream, _("\
1322   -a - --all                Scan the entire file, not just the data section\n\
1323   -d --data                 Only scan the data sections in the file [default]\n"));
1324
1325   fprintf (stream, _("\
1326   -f --print-file-name      Print the name of the file before each string\n\
1327   -n <number>               Locate & print any sequence of at least <number>\n\
1328     --bytes=<number>         displayable characters.  (The default is 4).\n\
1329   -t --radix={o,d,x}        Print the location of the string in base 8, 10 or 16\n\
1330   -w --include-all-whitespace Include all whitespace as valid string characters\n\
1331   -o                        An alias for --radix=o\n\
1332   -T --target=<BFDNAME>     Specify the binary file format\n\
1333   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1334                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1335   --unicode={default|show|invalid|hex|escape|highlight}\n\
1336   -U {d|s|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
1337   -s --output-separator=<string> String used to separate strings in output.\n\
1338   @<file>                   Read options from <file>\n\
1339   -h --help                 Display this information\n\
1340   -v -V --version           Print the program's version number\n"));
1341   list_supported_targets (program_name, stream);
1342   if (REPORT_BUGS_TO[0] && status == 0)
1343     fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1344   exit (status);
1345 }