.
[glibc/history.git] / iconv / iconv_prog.c
bloba1ca05f153caee4f463f122d9769687f811a5e2a
1 /* Convert text in given files from the specified from-set to the to-set.
2 Copyright (C) 1998-2008, 2009 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published
8 by the Free Software Foundation; version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
20 #include <argp.h>
21 #include <assert.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <error.h>
25 #include <fcntl.h>
26 #include <iconv.h>
27 #include <langinfo.h>
28 #include <locale.h>
29 #include <search.h>
30 #include <stdbool.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <unistd.h>
35 #include <libintl.h>
36 #ifdef _POSIX_MAPPED_FILES
37 # include <sys/mman.h>
38 #endif
39 #include <charmap.h>
40 #include <gconv_int.h>
41 #include "iconv_prog.h"
42 #include "iconvconfig.h"
44 /* Get libc version number. */
45 #include "../version.h"
47 #define PACKAGE _libc_intl_domainname
50 /* Name and version of program. */
51 static void print_version (FILE *stream, struct argp_state *state);
52 void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
54 #define OPT_VERBOSE 1000
55 #define OPT_LIST 'l'
57 /* Definitions of arguments for argp functions. */
58 static const struct argp_option options[] =
60 { NULL, 0, NULL, 0, N_("Input/Output format specification:") },
61 { "from-code", 'f', "NAME", 0, N_("encoding of original text") },
62 { "to-code", 't', "NAME", 0, N_("encoding for output") },
63 { NULL, 0, NULL, 0, N_("Information:") },
64 { "list", 'l', NULL, 0, N_("list all known coded character sets") },
65 { NULL, 0, NULL, 0, N_("Output control:") },
66 { NULL, 'c', NULL, 0, N_("omit invalid characters from output") },
67 { "output", 'o', "FILE", 0, N_("output file") },
68 { "silent", 's', NULL, 0, N_("suppress warnings") },
69 { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") },
70 { NULL, 0, NULL, 0, NULL }
73 /* Short description of program. */
74 static const char doc[] = N_("\
75 Convert encoding of given files from one encoding to another.");
77 /* Strings for arguments in help texts. */
78 static const char args_doc[] = N_("[FILE...]");
80 /* Prototype for option handler. */
81 static error_t parse_opt (int key, char *arg, struct argp_state *state);
83 /* Function to print some extra text in the help message. */
84 static char *more_help (int key, const char *text, void *input);
86 /* Data structure to communicate with argp functions. */
87 static struct argp argp =
89 options, parse_opt, args_doc, doc, NULL, more_help
92 /* Code sets to convert from and to respectively. An empty string as the
93 default causes the 'iconv_open' function to look up the charset of the
94 currently selected locale and use it. */
95 static const char *from_code = "";
96 static const char *to_code = "";
98 /* File to write output to. If NULL write to stdout. */
99 static const char *output_file;
101 /* Nonzero if verbose output is wanted. */
102 int verbose;
104 /* Nonzero if list of all coded character sets is wanted. */
105 static int list;
107 /* If nonzero omit invalid character from output. */
108 int omit_invalid;
110 /* Prototypes for the functions doing the actual work. */
111 static int process_block (iconv_t cd, char *addr, size_t len, FILE **output,
112 const char *output_file);
113 static int process_fd (iconv_t cd, int fd, FILE **output,
114 const char *output_file);
115 static int process_file (iconv_t cd, FILE *input, FILE **output,
116 const char *output_file);
117 static void print_known_names (void) internal_function;
121 main (int argc, char *argv[])
123 int status = EXIT_SUCCESS;
124 int remaining;
125 iconv_t cd;
126 const char *orig_to_code;
127 struct charmap_t *from_charmap = NULL;
128 struct charmap_t *to_charmap = NULL;
130 /* Set locale via LC_ALL. */
131 setlocale (LC_ALL, "");
133 /* Set the text message domain. */
134 textdomain (_libc_intl_domainname);
136 /* Parse and process arguments. */
137 argp_parse (&argp, argc, argv, 0, &remaining, NULL);
139 /* List all coded character sets if wanted. */
140 if (list)
142 print_known_names ();
143 exit (EXIT_SUCCESS);
146 /* If we have to ignore errors make sure we use the appropriate name for
147 the to-character-set. */
148 orig_to_code = to_code;
149 if (omit_invalid)
151 const char *errhand = strchrnul (to_code, '/');
152 int nslash = 2;
153 char *newp;
154 char *cp;
156 if (*errhand == '/')
158 --nslash;
159 errhand = strchrnul (errhand, '/');
161 if (*errhand == '/')
163 --nslash;
164 errhand = strchr (errhand, '\0');
168 newp = (char *) alloca (errhand - to_code + nslash + 7 + 1);
169 cp = mempcpy (newp, to_code, errhand - to_code);
170 while (nslash-- > 0)
171 *cp++ = '/';
172 if (cp[-1] != '/')
173 *cp++ = ',';
174 memcpy (cp, "IGNORE", sizeof ("IGNORE"));
176 to_code = newp;
179 /* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
180 can be file names of charmaps. In this case iconv will have to read
181 those charmaps and use them to do the conversion. But there are
182 holes in the specification. There is nothing said that if -f is a
183 charmap filename that -t must be, too. And vice versa. There is
184 also no word about the symbolic names used. What if they don't
185 match? */
186 if (strchr (from_code, '/') != NULL)
187 /* The from-name might be a charmap file name. Try reading the
188 file. */
189 from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
191 if (strchr (orig_to_code, '/') != NULL)
192 /* The to-name might be a charmap file name. Try reading the
193 file. */
194 to_charmap = charmap_read (orig_to_code, /*0, 1,*/1, 0, 0, 0);
197 /* At this point we have to handle two cases. The first one is
198 where a charmap is used for the from- or to-charset, or both. We
199 handle this special since it is very different from the sane way of
200 doing things. The other case allows converting using the iconv()
201 function. */
202 if (from_charmap != NULL || to_charmap != NULL)
203 /* Construct the conversion table and do the conversion. */
204 status = charmap_conversion (from_code, from_charmap, to_code, to_charmap,
205 argc, remaining, argv, output_file);
206 else
208 /* Let's see whether we have these coded character sets. */
209 cd = iconv_open (to_code, from_code);
210 if (cd == (iconv_t) -1)
212 if (errno == EINVAL)
214 /* Try to be nice with the user and tell her which of the
215 two encoding names is wrong. This is possible because
216 all supported encodings can be converted from/to Unicode,
217 in other words, because the graph of encodings is
218 connected. */
219 bool from_wrong =
220 (iconv_open ("UTF-8", from_code) == (iconv_t) -1
221 && errno == EINVAL);
222 bool to_wrong =
223 (iconv_open (to_code, "UTF-8") == (iconv_t) -1
224 && errno == EINVAL);
225 const char *from_pretty =
226 (from_code[0] ? from_code : nl_langinfo (CODESET));
227 const char *to_pretty =
228 (orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET));
230 if (from_wrong)
232 if (to_wrong)
233 error (0, 0,
234 _("\
235 conversions from `%s' and to `%s' are not supported"),
236 from_pretty, to_pretty);
237 else
238 error (0, 0,
239 _("conversion from `%s' is not supported"),
240 from_pretty);
242 else
244 if (to_wrong)
245 error (0, 0,
246 _("conversion to `%s' is not supported"),
247 to_pretty);
248 else
249 error (0, 0,
250 _("conversion from `%s' to `%s' is not supported"),
251 from_pretty, to_pretty);
254 argp_help (&argp, stderr, ARGP_HELP_SEE,
255 program_invocation_short_name);
256 exit (1);
258 else
259 error (EXIT_FAILURE, errno,
260 _("failed to start conversion processing"));
263 /* The output file. Will be opened when we are ready to produce
264 output. */
265 FILE *output = NULL;
267 /* Now process the remaining files. Write them to stdout or the file
268 specified with the `-o' parameter. If we have no file given as
269 the parameter process all from stdin. */
270 if (remaining == argc)
272 if (process_file (cd, stdin, &output, output_file) != 0)
273 status = EXIT_FAILURE;
275 else
278 #ifdef _POSIX_MAPPED_FILES
279 struct stat st;
280 char *addr;
281 #endif
282 int fd, ret;
284 if (verbose)
285 fprintf (stderr, "%s:\n", argv[remaining]);
286 if (strcmp (argv[remaining], "-") == 0)
287 fd = 0;
288 else
290 fd = open (argv[remaining], O_RDONLY);
292 if (fd == -1)
294 error (0, errno, _("cannot open input file `%s'"),
295 argv[remaining]);
296 status = EXIT_FAILURE;
297 continue;
301 #ifdef _POSIX_MAPPED_FILES
302 /* We have possibilities for reading the input file. First try
303 to mmap() it since this will provide the fastest solution. */
304 if (fstat (fd, &st) == 0
305 && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
306 fd, 0)) != MAP_FAILED))
308 /* Yes, we can use mmap(). The descriptor is not needed
309 anymore. */
310 if (close (fd) != 0)
311 error (EXIT_FAILURE, errno,
312 _("error while closing input `%s'"),
313 argv[remaining]);
315 ret = process_block (cd, addr, st.st_size, &output,
316 output_file);
318 /* We don't need the input data anymore. */
319 munmap ((void *) addr, st.st_size);
321 if (ret != 0)
323 status = EXIT_FAILURE;
325 if (ret < 0)
326 /* We cannot go on with producing output since it might
327 lead to problem because the last output might leave
328 the output stream in an undefined state. */
329 break;
332 else
333 #endif /* _POSIX_MAPPED_FILES */
335 /* Read the file in pieces. */
336 ret = process_fd (cd, fd, &output, output_file);
338 /* Now close the file. */
339 close (fd);
341 if (ret != 0)
343 /* Something went wrong. */
344 status = EXIT_FAILURE;
346 if (ret < 0)
347 /* We cannot go on with producing output since it might
348 lead to problem because the last output might leave
349 the output stream in an undefined state. */
350 break;
354 while (++remaining < argc);
356 /* Close the output file now. */
357 if (output != NULL && fclose (output))
358 error (EXIT_FAILURE, errno, _("error while closing output file"));
361 return status;
365 /* Handle program arguments. */
366 static error_t
367 parse_opt (int key, char *arg, struct argp_state *state)
369 switch (key)
371 case 'f':
372 from_code = arg;
373 break;
374 case 't':
375 to_code = arg;
376 break;
377 case 'o':
378 output_file = arg;
379 break;
380 case 's':
381 /* Nothing, for now at least. We are not giving out any information
382 about missing character or so. */
383 break;
384 case 'c':
385 /* Omit invalid characters from output. */
386 omit_invalid = 1;
387 break;
388 case OPT_VERBOSE:
389 verbose = 1;
390 break;
391 case OPT_LIST:
392 list = 1;
393 break;
394 default:
395 return ARGP_ERR_UNKNOWN;
397 return 0;
/* argp help filter.  For ARGP_KEY_HELP_EXTRA a freshly allocated copy
   of the (translated) bug reporting note is returned; for every other
   KEY the TEXT passed in is handed back unchanged.  INPUT is not
   used.  */
static char *
more_help (int key, const char *text, void *input)
{
  if (key != ARGP_KEY_HELP_EXTRA)
    return (char *) text;

  /* We print some extra information.  */
  return strdup (gettext ("For bug reporting instructions, please see:\n"
                          "<http://www.gnu.org/software/libc/bugs.html>.\n"));
}
418 /* Print the version information. */
419 static void
420 print_version (FILE *stream, struct argp_state *state)
422 fprintf (stream, "iconv (GNU %s) %s\n", PACKAGE, VERSION);
423 fprintf (stream, gettext ("\
424 Copyright (C) %s Free Software Foundation, Inc.\n\
425 This is free software; see the source for copying conditions. There is NO\n\
426 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
427 "), "2009");
428 fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
/* Write the converted bytes in the range [OUTBUF, OUTPTR) to the output
   stream.

   *OUTPUT is opened on first use: OUTPUT_FILE is created/truncated
   unless it is NULL or "-", in which case stdout is used.  If the file
   cannot be opened the program terminates with an error message.

   Returns 0 on success and -1 (after printing a diagnostic) when the
   data could not be written.  On success errno has the value it had on
   entry, so callers can still examine the result of a preceding
   iconv() call.  */
static int
write_output (const char *outbuf, const char *outptr, FILE **output,
              const char *output_file)
{
  /* We have something to write out.  */
  int errno_save = errno;

  if (*output == NULL)
    {
      /* Determine output file.  */
      if (output_file != NULL && strcmp (output_file, "-") != 0)
        {
          *output = fopen (output_file, "w");
          /* Check the stream itself; OUTPUT, the address of the caller's
             variable, is never NULL.  */
          if (*output == NULL)
            error (EXIT_FAILURE, errno, _("cannot open output file"));
        }
      else
        *output = stdout;
    }

  if (fwrite (outbuf, 1, outptr - outbuf, *output) < (size_t) (outptr - outbuf)
      || ferror (*output))
    {
      /* Error occurred while printing the result.  */
      error (0, 0, _("conversion stopped due to problem in writing the output"));
      return -1;
    }

  errno = errno_save;

  return 0;
}
467 static int
468 process_block (iconv_t cd, char *addr, size_t len, FILE **output,
469 const char *output_file)
471 #define OUTBUF_SIZE 32768
472 const char *start = addr;
473 char outbuf[OUTBUF_SIZE];
474 char *outptr;
475 size_t outlen;
476 size_t n;
477 int ret = 0;
479 while (len > 0)
481 outptr = outbuf;
482 outlen = OUTBUF_SIZE;
483 n = iconv (cd, &addr, &len, &outptr, &outlen);
485 if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
487 ret = 1;
488 if (len == 0)
489 n = 0;
490 else
491 errno = E2BIG;
494 if (outptr != outbuf)
496 ret = write_output (outbuf, outptr, output, output_file);
497 if (ret != 0)
498 break;
501 if (n != (size_t) -1)
503 /* All the input test is processed. For state-dependent
504 character sets we have to flush the state now. */
505 outptr = outbuf;
506 outlen = OUTBUF_SIZE;
507 n = iconv (cd, NULL, NULL, &outptr, &outlen);
509 if (outptr != outbuf)
511 ret = write_output (outbuf, outptr, output, output_file);
512 if (ret != 0)
513 break;
516 if (n != (size_t) -1)
517 break;
519 if (omit_invalid && errno == EILSEQ)
521 ret = 1;
522 break;
526 if (errno != E2BIG)
528 /* iconv() ran into a problem. */
529 switch (errno)
531 case EILSEQ:
532 if (! omit_invalid)
533 error (0, 0, _("illegal input sequence at position %ld"),
534 (long int) (addr - start));
535 break;
536 case EINVAL:
537 error (0, 0, _("\
538 incomplete character or shift sequence at end of buffer"));
539 break;
540 case EBADF:
541 error (0, 0, _("internal error (illegal descriptor)"));
542 break;
543 default:
544 error (0, 0, _("unknown iconv() error %d"), errno);
545 break;
548 return -1;
552 return ret;
/* Read everything available from the descriptor FD and convert it with
   CD.

   We have a problem with reading from a descriptor since we must not
   provide the iconv() function an incomplete character or shift
   sequence at the end of the buffer.  Since we have to deal with
   arbitrary encodings we must read the whole text in a buffer and
   process it in one step.

   The buffer is static and is reused (and only ever grown) across
   calls.  Returns -1 after a read or allocation failure, which is
   reported here; otherwise the result of process_block.  */
static int
process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
{
  /* Amount by which the input buffer grows whenever it is full.  */
  enum { INBUF_STEP = 32768 };
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  while (1)
    {
      ssize_t n;

      if (actlen == maxlen)
        {
          /* The buffer is full (or was never allocated).  Increase it
             before reading on; only a read that returns zero tells us
             the input is exhausted.  */
          char *new_inbuf = (char *) realloc (inbuf, maxlen + INBUF_STEP);
          if (new_inbuf == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }
          inbuf = new_inbuf;
          maxlen += INBUF_STEP;
        }

      /* Always address the buffer through INBUF: on a second call the
         buffer from the previous call is still there and must be
         filled from its start.  */
      n = read (fd, inbuf + actlen, maxlen - actlen);

      if (n == 0)
        /* No more text to read.  */
        break;

      if (n == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += n;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (cd, inbuf, actlen, output, output_file);
}
/* Convert the contents of the stream INPUT by reading directly from
   its underlying descriptor.  Returns whatever process_fd returns.  */
static int
process_file (iconv_t cd, FILE *input, FILE **output, const char *output_file)
{
  /* Bypassing the stream buffer should be safe since we use this
     function only for `stdin' and we haven't read anything so far.  */
  int fd = fileno (input);

  return process_fd (cd, fd, output, output_file);
}
644 /* Print all known character sets/encodings. */
645 static void *printlist;
646 static size_t column;
647 static int not_first;
649 static void
650 insert_print_list (const void *nodep, VISIT value, int level)
652 if (value == leaf || value == postorder)
654 const struct gconv_alias *s = *(const struct gconv_alias **) nodep;
655 tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp);
659 static void
660 do_print_human (const void *nodep, VISIT value, int level)
662 if (value == leaf || value == postorder)
664 const char *s = *(const char **) nodep;
665 size_t len = strlen (s);
666 size_t cnt;
668 while (len > 0 && s[len - 1] == '/')
669 --len;
671 for (cnt = 0; cnt < len; ++cnt)
672 if (isalnum (s[cnt]))
673 break;
674 if (cnt == len)
675 return;
677 if (not_first)
679 putchar (',');
680 ++column;
682 if (column > 2 && column + len > 77)
684 fputs ("\n ", stdout);
685 column = 2;
687 else
689 putchar (' ');
690 ++column;
693 else
694 not_first = 1;
696 fwrite (s, len, 1, stdout);
697 column += len;
/* twalk callback that prints the name stored at NODEP on a line of its
   own.  Each node is printed once, on its `leaf' or `postorder' visit.
   LEVEL is not used.  */
static void
do_print (const void *nodep, VISIT value, int level)
{
  if (value != leaf && value != postorder)
    return;

  puts (*(const char **) nodep);
}
712 static void
713 internal_function
714 add_known_names (struct gconv_module *node)
716 if (node->left != NULL)
717 add_known_names (node->left);
718 if (node->right != NULL)
719 add_known_names (node->right);
722 if (strcmp (node->from_string, "INTERNAL"))
723 tsearch (node->from_string, &printlist,
724 (__compar_fn_t) strverscmp);
725 if (strcmp (node->to_string, "INTERNAL") != 0)
726 tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
728 node = node->same;
730 while (node != NULL);
734 static void
735 insert_cache (void)
737 const struct gconvcache_header *header;
738 const char *strtab;
739 const struct hash_entry *hashtab;
740 size_t cnt;
742 header = (const struct gconvcache_header *) __gconv_get_cache ();
743 strtab = (char *) header + header->string_offset;
744 hashtab = (struct hash_entry *) ((char *) header + header->hash_offset);
746 for (cnt = 0; cnt < header->hash_size; ++cnt)
747 if (hashtab[cnt].string_offset != 0)
749 const char *str = strtab + hashtab[cnt].string_offset;
751 if (strcmp (str, "INTERNAL") != 0)
752 tsearch (str, &printlist, (__compar_fn_t) strverscmp);
757 static void
758 internal_function
759 print_known_names (void)
761 iconv_t h;
762 void *cache;
764 /* We must initialize the internal databases first. */
765 h = iconv_open ("L1", "L1");
766 iconv_close (h);
768 /* See whether we have a cache. */
769 cache = __gconv_get_cache ();
770 if (cache != NULL)
771 /* Yep, use only this information. */
772 insert_cache ();
773 else
775 struct gconv_module *modules;
777 /* No, then use the information read from the gconv-modules file.
778 First add the aliases. */
779 twalk (__gconv_get_alias_db (), insert_print_list);
781 /* Add the from- and to-names from the known modules. */
782 modules = __gconv_get_modules_db ();
783 if (modules != NULL)
784 add_known_names (modules);
787 bool human_readable = isatty (fileno (stdout));
789 if (human_readable)
790 fputs (_("\
791 The following list contain all the coded character sets known. This does\n\
792 not necessarily mean that all combinations of these names can be used for\n\
793 the FROM and TO command line parameters. One coded character set can be\n\
794 listed with several different names (aliases).\n\n "), stdout);
796 /* Now print the collected names. */
797 column = 2;
798 twalk (printlist, human_readable ? do_print_human : do_print);
800 if (human_readable && column != 0)
801 puts ("");