1 /* Message list concatenation and duplicate handling.
2 Copyright (C) 2001-2003 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
36 #include "po-charset.h"
37 #include "msgl-ascii.h"
38 #include "msgl-equal.h"
39 #include "msgl-iconv.h"
47 #define _(str) gettext (str)
50 /* These variables control which messages are selected. */
54 /* If true, use the first available translation.
55 If false, merge all available translations into one and fuzzy it. */
58 /* If true, merge like msgcomm.
59 If false, merge like msgcat and msguniq.
   This flag is tested by catenate_msgdomain_list while it fills the
   resulting messages. */
60 bool msgcomm_mode
= false;
62 /* If true, omit the header entry.
63 If false, keep the header entry present in the input.
   is_message_selected returns !omit_header for the entry whose msgid is
   the empty string. */
64 bool omit_header
= false;
/* Decide whether the merged message TMP is selected.
   The entry whose msgid is the empty string (the header entry) is selected
   exactly when omit_header is false.  Any other message is selected when
   the magnitude of TMP->used lies strictly between more_than and
   less_than.
   TMP->used may be negative; comments elsewhere in this file say weak
   translations are counted as negative values, so only its absolute value
   is compared here.
   NOTE(review): the return-type line and the braces of this definition
   are not visible in this view of the file.  */
68 is_message_selected (const message_ty
*tmp
)
/* Absolute value of the use count. */
70 int used
= (tmp
->used
>= 0 ? tmp
->used
: - tmp
->used
);
72 return (tmp
->msgid
[0] == '\0'
73 ? !omit_header
/* keep the header entry */
/* Both bounds are exclusive. */
74 : (used
> more_than
&& used
< less_than
));
/* Decide whether the input message MP is needed.
   The visible part of the condition classifies MP as a weak translation
   when it is a fuzzy message with a non-empty msgid, or when its msgstr is
   empty.  A weak translation is needed only if MP->tmp->used is negative
   and MP->tmp is selected.  Otherwise MP is needed whenever MP->tmp is
   selected.
   NOTE(review): the first operand of the `&&' condition, the `else', and
   the return-type line and braces are not visible in this view of the
   file; confirm against the complete source.  */
79 is_message_needed (const message_ty
*mp
)
82 && ((mp
->msgid
[0] != '\0' && mp
->is_fuzzy
) || mp
->msgstr
[0] == '\0'))
83 /* Weak translation. Needed if there are only weak translations. */
84 return mp
->tmp
->used
< 0 && is_message_selected (mp
->tmp
);
86 /* Good translation. */
87 return is_message_selected (mp
->tmp
);
91 /* The use_first logic. */
/* Offered to message_list_remove_if_not as the alternative to
   is_message_needed.
   When MP->tmp is still marked obsolete and MP is needed, the obsolete flag
   of MP->tmp is cleared.  Because the flag is cleared here, a later message
   sharing the same MP->tmp no longer satisfies the condition.
   NOTE(review): the return statements, the return-type line and the braces
   are not visible in this view of the file; presumably the result is true
   exactly when the flag was cleared -- confirm against the complete
   source.  */
93 is_message_first_needed (const message_ty
*mp
)
95 if (mp
->tmp
->obsolete
&& is_message_needed (mp
))
97 mp
->tmp
->obsolete
= false;
106 catenate_msgdomain_list (string_list_ty
*file_list
, const char *to_code
)
108 const char * const *files
= file_list
->item
;
109 size_t nfiles
= file_list
->nitems
;
110 msgdomain_list_ty
**mdlps
;
111 const char ***canon_charsets
;
112 const char ***identifications
;
113 msgdomain_list_ty
*total_mdlp
;
114 const char *canon_to_code
;
117 /* Read input files. */
119 (msgdomain_list_ty
**) xmalloc (nfiles
* sizeof (msgdomain_list_ty
*));
120 for (n
= 0; n
< nfiles
; n
++)
121 mdlps
[n
] = read_po_file (files
[n
]);
123 /* Determine the canonical name of each input file's encoding. */
124 canon_charsets
= (const char ***) xmalloc (nfiles
* sizeof (const char **));
125 for (n
= 0; n
< nfiles
; n
++)
127 msgdomain_list_ty
*mdlp
= mdlps
[n
];
131 (const char **) xmalloc (mdlp
->nitems
* sizeof (const char *));
132 for (k
= 0; k
< mdlp
->nitems
; k
++)
134 message_list_ty
*mlp
= mdlp
->item
[k
]->messages
;
135 const char *canon_from_code
= NULL
;
139 for (j
= 0; j
< mlp
->nitems
; j
++)
140 if (mlp
->item
[j
]->msgid
[0] == '\0' && !mlp
->item
[j
]->obsolete
)
142 const char *header
= mlp
->item
[j
]->msgstr
;
146 const char *charsetstr
= strstr (header
, "charset=");
148 if (charsetstr
!= NULL
)
152 const char *canon_charset
;
154 charsetstr
+= strlen ("charset=");
155 len
= strcspn (charsetstr
, " \t\n");
156 charset
= (char *) xallocsa (len
+ 1);
157 memcpy (charset
, charsetstr
, len
);
160 canon_charset
= po_charset_canonicalize (charset
);
161 if (canon_charset
== NULL
)
163 /* Don't give an error for POT files, because
164 POT files usually contain only ASCII msgids. */
166 const char *filename
= files
[n
];
167 size_t filenamelen
= strlen (filename
);
170 && memcmp (filename
+ filenamelen
- 4,
172 && strcmp (charset
, "CHARSET") == 0)
173 canon_charset
= po_charset_ascii
;
175 error (EXIT_FAILURE
, 0,
177 present charset \"%s\" is not a portable encoding name"),
183 if (canon_from_code
== NULL
)
184 canon_from_code
= canon_charset
;
185 else if (canon_from_code
!= canon_charset
)
186 error (EXIT_FAILURE
, 0,
188 two different charsets \"%s\" and \"%s\" in input file"),
189 canon_from_code
, canon_charset
);
193 if (canon_from_code
== NULL
)
195 if (is_ascii_message_list (mlp
))
196 canon_from_code
= po_charset_ascii
;
197 else if (mdlp
->encoding
!= NULL
)
198 canon_from_code
= mdlp
->encoding
;
202 error (EXIT_FAILURE
, 0, _("\
203 input file `%s' doesn't contain a header entry with a charset specification"),
206 error (EXIT_FAILURE
, 0, _("\
207 domain \"%s\" in input file `%s' doesn't contain a header entry with a charset specification"),
208 mdlp
->item
[k
]->domain
, files
[n
]);
212 canon_charsets
[n
][k
] = canon_from_code
;
216 /* Determine textual identifications of each file/domain combination. */
217 identifications
= (const char ***) xmalloc (nfiles
* sizeof (const char **));
218 for (n
= 0; n
< nfiles
; n
++)
220 const char *filename
= basename (files
[n
]);
221 msgdomain_list_ty
*mdlp
= mdlps
[n
];
225 (const char **) xmalloc (mdlp
->nitems
* sizeof (const char *));
226 for (k
= 0; k
< mdlp
->nitems
; k
++)
228 const char *domain
= mdlp
->item
[k
]->domain
;
229 message_list_ty
*mlp
= mdlp
->item
[k
]->messages
;
230 char *project_id
= NULL
;
232 for (j
= 0; j
< mlp
->nitems
; j
++)
233 if (mlp
->item
[j
]->msgid
[0] == '\0' && !mlp
->item
[j
]->obsolete
)
235 const char *header
= mlp
->item
[j
]->msgstr
;
239 const char *cp
= strstr (header
, "Project-Id-Version:");
245 cp
+= sizeof ("Project-Id-Version:") - 1;
247 endp
= strchr (cp
, '\n');
249 endp
= cp
+ strlen (cp
);
251 while (cp
< endp
&& *cp
== ' ')
256 size_t len
= endp
- cp
;
257 project_id
= (char *) xmalloc (len
+ 1);
258 memcpy (project_id
, cp
, len
);
259 project_id
[len
] = '\0';
266 identifications
[n
][k
] =
268 ? (k
> 0 ? xasprintf ("%s:%s (%s)", filename
, domain
, project_id
)
269 : xasprintf ("%s (%s)", filename
, project_id
))
270 : (k
> 0 ? xasprintf ("%s:%s", filename
, domain
)
271 : xasprintf ("%s", filename
)));
275 /* Create list of resulting messages, but don't fill it. Only count
276 the number of translations for each message.
277 If for a message, there is at least one non-fuzzy, non-empty translation,
278 use only the non-fuzzy, non-empty translations. Otherwise use the
279 fuzzy or empty translations as well. */
280 total_mdlp
= msgdomain_list_alloc (true);
281 for (n
= 0; n
< nfiles
; n
++)
283 msgdomain_list_ty
*mdlp
= mdlps
[n
];
285 for (k
= 0; k
< mdlp
->nitems
; k
++)
287 const char *domain
= mdlp
->item
[k
]->domain
;
288 message_list_ty
*mlp
= mdlp
->item
[k
]->messages
;
289 message_list_ty
*total_mlp
;
291 total_mlp
= msgdomain_list_sublist (total_mdlp
, domain
, true);
293 for (j
= 0; j
< mlp
->nitems
; j
++)
295 message_ty
*mp
= mlp
->item
[j
];
299 tmp
= message_list_search (total_mlp
, mp
->msgid
);
302 tmp
= message_alloc (mp
->msgid
, mp
->msgid_plural
, NULL
, 0,
304 tmp
->is_fuzzy
= true; /* may be set to false later */
305 for (i
= 0; i
< NFORMATS
; i
++)
306 tmp
->is_format
[i
] = undecided
; /* may be set to yes/no later */
307 tmp
->do_wrap
= yes
; /* may be set to no later */
308 tmp
->obsolete
= true; /* may be set to false later */
309 tmp
->alternative_count
= 0;
310 tmp
->alternative
= NULL
;
311 message_list_append (total_mlp
, tmp
);
315 && ((mp
->msgid
[0] != '\0' && mp
->is_fuzzy
)
316 || mp
->msgstr
[0] == '\0'))
317 /* Weak translation. Counted as negative tmp->used. */
323 /* Good translation. Counted as positive tmp->used. */
334 /* Remove messages that are not used and need not be converted. */
335 for (n
= 0; n
< nfiles
; n
++)
337 msgdomain_list_ty
*mdlp
= mdlps
[n
];
339 for (k
= 0; k
< mdlp
->nitems
; k
++)
341 message_list_ty
*mlp
= mdlp
->item
[k
]->messages
;
343 message_list_remove_if_not (mlp
,
345 ? is_message_first_needed
346 : is_message_needed
);
348 /* If no messages are remaining, drop the charset. */
349 if (mlp
->nitems
== 0)
350 canon_charsets
[n
][k
] = NULL
;
353 for (k
= 0; k
< total_mdlp
->nitems
; k
++)
355 message_list_ty
*mlp
= total_mdlp
->item
[k
]->messages
;
357 message_list_remove_if_not (mlp
, is_message_selected
);
360 /* Determine the common known a-priori encoding, if any. */
363 bool all_same_encoding
= true;
365 for (n
= 1; n
< nfiles
; n
++)
366 if (mdlps
[n
]->encoding
!= mdlps
[0]->encoding
)
368 all_same_encoding
= false;
372 if (all_same_encoding
)
373 total_mdlp
->encoding
= mdlps
[0]->encoding
;
376 /* Determine the target encoding for the remaining messages. */
379 /* Canonicalize target encoding. */
380 canon_to_code
= po_charset_canonicalize (to_code
);
381 if (canon_to_code
== NULL
)
382 error (EXIT_FAILURE
, 0,
383 _("target charset \"%s\" is not a portable encoding name."),
388 /* No target encoding was specified. Test whether the messages are
389 all in a single encoding. If so, conversion is not needed. */
390 const char *first
= NULL
;
391 const char *second
= NULL
;
392 bool with_ASCII
= false;
393 bool with_UTF8
= false;
394 bool all_ASCII_compatible
= true;
396 for (n
= 0; n
< nfiles
; n
++)
398 msgdomain_list_ty
*mdlp
= mdlps
[n
];
400 for (k
= 0; k
< mdlp
->nitems
; k
++)
401 if (canon_charsets
[n
][k
] != NULL
)
403 if (canon_charsets
[n
][k
] == po_charset_ascii
)
408 first
= canon_charsets
[n
][k
];
409 else if (canon_charsets
[n
][k
] != first
&& second
== NULL
)
410 second
= canon_charsets
[n
][k
];
412 if (strcmp (canon_charsets
[n
][k
], "UTF-8") == 0)
415 if (!po_charset_ascii_compatible (canon_charsets
[n
][k
]))
416 all_ASCII_compatible
= false;
421 if (with_ASCII
&& !all_ASCII_compatible
)
423 /* assert (first != NULL); */
425 second
= po_charset_ascii
;
430 /* A conversion is needed. Warn the user since he hasn't asked
431 for it and might be surprised. */
433 multiline_warning (xasprintf (_("warning: ")),
435 Input files contain messages in different encodings, UTF-8 among others.\n\
436 Converting the output to UTF-8.\n\
439 multiline_warning (xasprintf (_("warning: ")),
441 Input files contain messages in different encodings, %s and %s among others.\n\
442 Converting the output to UTF-8.\n\
443 To select a different output encoding, use the --to-code option.\n\
445 canon_to_code
= po_charset_utf8
;
447 else if (first
!= NULL
&& with_ASCII
&& all_ASCII_compatible
)
449 /* The conversion is a no-op conversion. Don't warn the user,
450 but still perform the conversion, in order to check that the
451 input was really ASCII. */
452 canon_to_code
= first
;
456 /* No conversion needed. */
457 canon_to_code
= NULL
;
461 /* Now convert the remaining messages to to_code. */
462 if (canon_to_code
!= NULL
)
463 for (n
= 0; n
< nfiles
; n
++)
465 msgdomain_list_ty
*mdlp
= mdlps
[n
];
467 for (k
= 0; k
< mdlp
->nitems
; k
++)
468 if (canon_charsets
[n
][k
] != NULL
)
469 /* If the user hasn't given a to_code, don't bother doing a noop
470 conversion that would only replace the charset name in the
471 header entry with its canonical equivalent. */
472 if (!(to_code
== NULL
&& canon_charsets
[n
][k
] == canon_to_code
))
473 iconv_message_list (mdlp
->item
[k
]->messages
, canon_charsets
[n
][k
],
474 canon_to_code
, files
[n
]);
477 /* Fill the resulting messages. */
478 for (n
= 0; n
< nfiles
; n
++)
480 msgdomain_list_ty
*mdlp
= mdlps
[n
];
482 for (k
= 0; k
< mdlp
->nitems
; k
++)
484 message_list_ty
*mlp
= mdlp
->item
[k
]->messages
;
486 for (j
= 0; j
< mlp
->nitems
; j
++)
488 message_ty
*mp
= mlp
->item
[j
];
489 message_ty
*tmp
= mp
->tmp
;
492 /* No need to discard unneeded weak translations here;
493 they have already been filtered out above. */
494 if (use_first
|| tmp
->used
== 1 || tmp
->used
== -1)
496 /* Copy mp, as only message, into tmp. */
497 tmp
->msgstr
= mp
->msgstr
;
498 tmp
->msgstr_len
= mp
->msgstr_len
;
501 for (i
= 0; i
< mp
->comment
->nitems
; i
++)
502 message_comment_append (tmp
, mp
->comment
->item
[i
]);
504 for (i
= 0; i
< mp
->comment_dot
->nitems
; i
++)
505 message_comment_dot_append (tmp
,
506 mp
->comment_dot
->item
[i
]);
507 for (i
= 0; i
< mp
->filepos_count
; i
++)
508 message_comment_filepos (tmp
, mp
->filepos
[i
].file_name
,
509 mp
->filepos
[i
].line_number
);
510 tmp
->is_fuzzy
= mp
->is_fuzzy
;
511 for (i
= 0; i
< NFORMATS
; i
++)
512 tmp
->is_format
[i
] = mp
->is_format
[i
];
513 tmp
->do_wrap
= mp
->do_wrap
;
514 tmp
->obsolete
= mp
->obsolete
;
516 else if (msgcomm_mode
)
518 /* Copy mp, as only message, into tmp. */
519 if (tmp
->msgstr
== NULL
)
521 tmp
->msgstr
= mp
->msgstr
;
522 tmp
->msgstr_len
= mp
->msgstr_len
;
524 tmp
->is_fuzzy
= mp
->is_fuzzy
;
526 if (mp
->comment
&& tmp
->comment
== NULL
)
527 for (i
= 0; i
< mp
->comment
->nitems
; i
++)
528 message_comment_append (tmp
, mp
->comment
->item
[i
]);
529 if (mp
->comment_dot
&& tmp
->comment_dot
== NULL
)
530 for (i
= 0; i
< mp
->comment_dot
->nitems
; i
++)
531 message_comment_dot_append (tmp
,
532 mp
->comment_dot
->item
[i
]);
533 for (i
= 0; i
< mp
->filepos_count
; i
++)
534 message_comment_filepos (tmp
, mp
->filepos
[i
].file_name
,
535 mp
->filepos
[i
].line_number
);
536 for (i
= 0; i
< NFORMATS
; i
++)
537 if (tmp
->is_format
[i
] == undecided
)
538 tmp
->is_format
[i
] = mp
->is_format
[i
];
539 if (tmp
->do_wrap
== undecided
)
540 tmp
->do_wrap
= mp
->do_wrap
;
541 tmp
->obsolete
= false;
545 /* Copy mp, among others, into tmp. */
546 char *id
= xasprintf ("#-#-#-#-# %s #-#-#-#-#",
547 identifications
[n
][k
]);
550 if (tmp
->alternative_count
== 0)
553 i
= tmp
->alternative_count
;
554 nbytes
= (i
+ 1) * sizeof (struct altstr
);
555 tmp
->alternative
= xrealloc (tmp
->alternative
, nbytes
);
556 tmp
->alternative
[i
].msgstr
= mp
->msgstr
;
557 tmp
->alternative
[i
].msgstr_len
= mp
->msgstr_len
;
558 tmp
->alternative
[i
].msgstr_end
=
559 tmp
->alternative
[i
].msgstr
+ tmp
->alternative
[i
].msgstr_len
;
560 tmp
->alternative
[i
].comment
= mp
->comment
;
561 tmp
->alternative
[i
].comment_dot
= mp
->comment_dot
;
562 tmp
->alternative
[i
].id
= id
;
563 tmp
->alternative_count
= i
+ 1;
565 for (i
= 0; i
< mp
->filepos_count
; i
++)
566 message_comment_filepos (tmp
, mp
->filepos
[i
].file_name
,
567 mp
->filepos
[i
].line_number
);
569 tmp
->is_fuzzy
= false;
570 for (i
= 0; i
< NFORMATS
; i
++)
571 if (mp
->is_format
[i
] == yes
)
572 tmp
->is_format
[i
] = yes
;
573 else if (mp
->is_format
[i
] == no
574 && tmp
->is_format
[i
] == undecided
)
575 tmp
->is_format
[i
] = no
;
576 if (mp
->do_wrap
== no
)
579 tmp
->obsolete
= false;
584 for (k
= 0; k
< total_mdlp
->nitems
; k
++)
586 message_list_ty
*mlp
= total_mdlp
->item
[k
]->messages
;
588 for (j
= 0; j
< mlp
->nitems
; j
++)
590 message_ty
*tmp
= mlp
->item
[j
];
592 if (tmp
->alternative_count
> 0)
594 /* Test whether all alternative translations are equal. */
595 struct altstr
*first
= &tmp
->alternative
[0];
598 for (i
= 0; i
< tmp
->alternative_count
; i
++)
599 if (!(tmp
->alternative
[i
].msgstr_len
== first
->msgstr_len
600 && memcmp (tmp
->alternative
[i
].msgstr
, first
->msgstr
,
601 first
->msgstr_len
) == 0))
604 if (i
== tmp
->alternative_count
)
606 /* All alternatives are equal. */
607 tmp
->msgstr
= first
->msgstr
;
608 tmp
->msgstr_len
= first
->msgstr_len
;
612 /* Concatenate the alternative msgstrs into a single one,
613 separated by markers. */
621 for (i
= 0; i
< tmp
->alternative_count
; i
++)
623 size_t id_len
= strlen (tmp
->alternative
[i
].id
);
625 len
+= tmp
->alternative
[i
].msgstr_len
;
627 p
= tmp
->alternative
[i
].msgstr
;
628 p_end
= tmp
->alternative
[i
].msgstr_end
;
629 for (; p
< p_end
; p
+= strlen (p
) + 1)
633 new_msgstr
= (char *) xmalloc (len
);
637 /* Test whether there's one more plural form to process. */
639 for (i
= 0; i
< tmp
->alternative_count
; i
++)
640 if (tmp
->alternative
[i
].msgstr
641 < tmp
->alternative
[i
].msgstr_end
)
643 if (i
== tmp
->alternative_count
)
646 /* Process next plural form. */
647 for (i
= 0; i
< tmp
->alternative_count
; i
++)
648 if (tmp
->alternative
[i
].msgstr
649 < tmp
->alternative
[i
].msgstr_end
)
651 if (np
> new_msgstr
&& np
[-1] != '\0'
655 len
= strlen (tmp
->alternative
[i
].id
);
656 memcpy (np
, tmp
->alternative
[i
].id
, len
);
660 len
= strlen (tmp
->alternative
[i
].msgstr
);
661 memcpy (np
, tmp
->alternative
[i
].msgstr
, len
);
663 tmp
->alternative
[i
].msgstr
+= len
+ 1;
666 /* Plural forms are separated by NUL bytes. */
669 tmp
->msgstr
= new_msgstr
;
670 tmp
->msgstr_len
= np
- new_msgstr
;
672 tmp
->is_fuzzy
= true;
675 /* Test whether all alternative comments are equal. */
676 for (i
= 0; i
< tmp
->alternative_count
; i
++)
677 if (tmp
->alternative
[i
].comment
== NULL
678 || !string_list_equal (tmp
->alternative
[i
].comment
,
682 if (i
== tmp
->alternative_count
)
683 /* All alternatives are equal. */
684 tmp
->comment
= first
->comment
;
686 /* Concatenate the alternative comments into a single one,
687 separated by markers. */
688 for (i
= 0; i
< tmp
->alternative_count
; i
++)
690 string_list_ty
*slp
= tmp
->alternative
[i
].comment
;
696 message_comment_append (tmp
, tmp
->alternative
[i
].id
);
697 for (l
= 0; l
< slp
->nitems
; l
++)
698 message_comment_append (tmp
, slp
->item
[l
]);
702 /* Test whether all alternative dot comments are equal. */
703 for (i
= 0; i
< tmp
->alternative_count
; i
++)
704 if (tmp
->alternative
[i
].comment_dot
== NULL
705 || !string_list_equal (tmp
->alternative
[i
].comment_dot
,
709 if (i
== tmp
->alternative_count
)
710 /* All alternatives are equal. */
711 tmp
->comment_dot
= first
->comment_dot
;
713 /* Concatenate the alternative dot comments into a single one,
714 separated by markers. */
715 for (i
= 0; i
< tmp
->alternative_count
; i
++)
717 string_list_ty
*slp
= tmp
->alternative
[i
].comment_dot
;
723 message_comment_dot_append (tmp
,
724 tmp
->alternative
[i
].id
);
725 for (l
= 0; l
< slp
->nitems
; l
++)
726 message_comment_dot_append (tmp
, slp
->item
[l
]);