1 /* Writing Qt .qm files.
2 Copyright (C) 2003, 2005 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
36 #include "po-charset.h"
37 #include "msgl-iconv.h"
38 #include "hash-string.h"
39 #include "utf8-ucs4.h"
42 #include "binary-io.h"
43 #include "fwriteerror.h"
47 #define _(str) gettext (str)
49 /* Qt .qm files are read by the QTranslator::load() function and written
50 by the Qt QTranslator::save() function.
52 The Qt tool 'msg2qm' uses the latter function and can convert PO files
53 to .qm files. But since 'msg2qm' is marked as an "old" tool in Qt 3.0.5's
54 i18n.html documentation and therefore likely to disappear, we provide the
55 same functionality here.
57 The format of .qm files, as reverse engineered from the functions
58 QTranslator::save(const QString& filename, SaveMode mode)
59 QTranslator::squeeze(SaveMode mode)
60 QTranslatorMessage::write(QDataStream& stream, bool strip, Prefix prefix)
61 elfHash(const char* name)
62 in qt-3.0.5, is as follows:
64 It's a binary data format. Elements are u8 (byte), u16, u32. They are
65 written in big-endian order.
67 The file starts with a magic string of 16 bytes:
68 3C B8 64 18 CA EF 9C 95 CD 21 1C BF 60 A1 BD DD
70 Then come three sections. Each of the three sections is optional. Each
73 u8 section_type; // 0x42 = hashes, 0x69 = messages, 0x2f = contexts
74 u32 length; // number of bytes of the data
78 In the first section, the hashes section, the data has the following
80 It's a sorted array of
82 u32 hashcode; // elfHash of the concatenation of msgid and
83 // disambiguating-comment
84 u32 offset; // offset within the data[] of the messages section
86 It's sorted in ascending order by hashcode as primary sorting criterion
87 and - when the hashcodes are the same - by offset as secondary criterion.
89 In the second section, the messages section, the data has the following
91 It's a sequence of records, each representing a message, in no
92 particular order. Each record is a sequence of subsections, each
93 introduced by a particular subsection tag. The possible subsection tags
94 are (and they usually occur in this order):
95 - 03: Translation. Followed by the msgstr in UCS-2 or UTF-16 format:
100 - 08: Disambiguating-comment. Followed by the NUL-terminated,
101 ISO-8859-1 encoded, disambiguating-comment string:
103 u32 length; // number of bytes including the NUL at the end
106 - 06: SourceText, i.e. msgid. Followed by the NUL-terminated,
107 ISO-8859-1 encoded, msgid:
109 u32 length; // number of bytes including the NUL at the end
112 - 02: SourceText16, i.e. msgid. Encoded as UCS-2, but must actually
118 This subsection tag is obsoleted by SourceText.
119 - 07: Context. Followed by the NUL-terminated, ISO-8859-1 encoded,
120 context string (usually a C++ class name or empty):
122 u32 length; // number of bytes including the NUL at the end
125 - 04: Context16. Encoded as UCS-2, but must actually be ISO-8859-1.
130 This subsection tag is obsoleted by Context.
131 - 05: Hash. Followed by
133 u32 hashcode; // elfHash of the concatenation of msgid and
134 // disambiguating-comment
136 - 01: End. Designates the end of the record. No further data.
137 Usually the following subsections are written, but some of them are
140 - 08: Disambiguating-comment (optional).
141 - 06: SourceText (optional).
142 - 07: Context (optional).
145 A subsection can be omitted if the value to be output is the same as
146 for the previous record.
148 In the third section, the contexts section, the data contains a hash
149 table. Quite complicated.
151 The elfHash function is the same as our hash_string function, except that
152 at the end it maps a hash code of 0x00000000 to 0x00000001.
154 When we convert from PO file format, all disambiguating-comments and
155 contexts are empty, and therefore the contexts section can be omitted. */
/* Write a u8 (a single byte) to the output stream.
   OUTPUT_FILE is the stream to write to, VALUE the byte to emit.
   The result of putc is not examined here; the stream's error indicator
   is left for the caller to check once all output has been produced.  */
static inline void
write_u8 (FILE *output_file, unsigned char value)
{
  putc (value, output_file);
}
/* Write a u16 (two bytes) to the output stream, most significant byte
   first (the .qm format is big-endian).
   OUTPUT_FILE is the stream to write to, VALUE the 16-bit quantity.
   The result of fwrite is not examined here; the stream's error indicator
   is left for the caller to check once all output has been produced.  */
static inline void
write_u16 (FILE *output_file, unsigned short value)
{
  unsigned char data[2];

  data[0] = (value >> 8) & 0xff;
  data[1] = value & 0xff;

  fwrite (data, 2, 1, output_file);
}
/* Write a u32 (four bytes) to the output stream, most significant byte
   first (the .qm format is big-endian).
   OUTPUT_FILE is the stream to write to, VALUE the 32-bit quantity.
   The result of fwrite is not examined here; the stream's error indicator
   is left for the caller to check once all output has been produced.  */
static inline void
write_u32 (FILE *output_file, unsigned int value)
{
  unsigned char data[4];

  data[0] = (value >> 24) & 0xff;
  data[1] = (value >> 16) & 0xff;
  data[2] = (value >> 8) & 0xff;
  data[3] = value & 0xff;

  fwrite (data, 4, 1, output_file);
}
/* Allocation hooks picked up by the obstack_init macro: chunks are
   obtained with xmalloc and released with free.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free
/* Add a u8 (a single byte) to an obstack.
   MEMPOOL is the obstack whose current object is grown, VALUE the byte
   to append.  */
static inline void
append_u8 (struct obstack *mempool, unsigned char value)
{
  unsigned char data[1];

  /* The buffer must be filled before it is handed to obstack_grow;
     otherwise an indeterminate byte would be appended.  */
  data[0] = value;

  obstack_grow (mempool, data, 1);
}
/* Add a u16 (two bytes) to an obstack, most significant byte first.
   MEMPOOL is the obstack whose current object is grown, VALUE the 16-bit
   quantity to append.  */
static inline void
append_u16 (struct obstack *mempool, unsigned short value)
{
  unsigned char data[2];

  data[0] = (value >> 8) & 0xff;
  data[1] = value & 0xff;

  obstack_grow (mempool, data, 2);
}
/* Add a u32 (four bytes) to an obstack, most significant byte first.
   MEMPOOL is the obstack whose current object is grown, VALUE the 32-bit
   quantity to append.  */
static inline void
append_u32 (struct obstack *mempool, unsigned int value)
{
  unsigned char data[4];

  data[0] = (value >> 24) & 0xff;
  data[1] = (value >> 16) & 0xff;
  data[2] = (value >> 8) & 0xff;
  data[3] = value & 0xff;

  obstack_grow (mempool, data, 4);
}
/* Add an ISO-8859-1 encoded string to an obstack.
   The string is emitted as a u32 byte count followed by the bytes of
   STRING; both the count and the data include the terminating NUL.  */
static inline void
append_base_string (struct obstack *mempool, const char *string)
{
  size_t length = strlen (string) + 1;

  /* The length field of the file format is only 32 bits wide.  */
  append_u32 (mempool, (unsigned int) length);
  obstack_grow (mempool, string, length);
}
/* Add an UTF-16 encoded string to an obstack.
   STRING points to LENGTH 16-bit code units.  The string is emitted as a
   u32 byte count (LENGTH * 2) followed by the code units, each written
   as a big-endian u16.  No terminator is appended.  */
static inline void
append_unicode_string (struct obstack *mempool, const unsigned short *string,
                       size_t length)
{
  /* The length field of the file format is only 32 bits wide.  */
  append_u32 (mempool, (unsigned int) (length * 2));
  for (; length > 0; string++, length--)
    append_u16 (mempool, *string);
}
/* Retrieve a 4-byte big-endian integer from memory.
   P must point to at least 4 readable bytes.  Returns the value whose
   most significant byte is P[0] and least significant byte is P[3].  */
static inline unsigned int
peek_u32 (const unsigned char *p)
{
  /* Convert each byte to unsigned int before shifting: an unsigned char
     promotes to (signed) int, and shifting a byte >= 0x80 left by 24
     bits would overflow a 32-bit int, which is undefined behaviour.  */
  return ((unsigned int) p[0] << 24)
         | ((unsigned int) p[1] << 16)
         | ((unsigned int) p[2] << 8)
         | (unsigned int) p[3];
}
/* Convert an UTF-8 string to ISO-8859-1.
   STRING is a NUL-terminated UTF-8 string that the caller is expected to
   have already checked for representability in ISO-8859-1; a character
   outside that range is treated as an internal error.
   Returns a freshly allocated (xmalloc) NUL-terminated string which the
   caller must free.  */
static char *
conv_to_iso_8859_1 (const char *string)
{
  size_t length = strlen (string);
  const char *str = string;
  const char *str_limit = string + length;
  /* Conversion to ISO-8859-1 can only reduce the number of bytes.  */
  char *result = (char *) xmalloc (length + 1);
  char *q = result;

  while (str < str_limit)
    {
      unsigned int uc;

      str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
      /* It has already been verified that the string fits in ISO-8859-1,
         so anything larger indicates a bug; do not silently truncate.  */
      if (!(uc < 0x100))
        abort ();
      /* Store as ISO-8859-1.  */
      *q++ = (unsigned char) uc;
    }
  *q = '\0';
  /* The output never needs more bytes than the input.  */
  assert ((size_t) (q - result) <= length);

  return result;
}
/* Convert an UTF-8 string to UTF-16, returning its size (number of UTF-16
   code units) in *SIZEP.
   STRING is a NUL-terminated UTF-8 string.  Characters below 0x10000 are
   stored as a single code unit, all others as a surrogate pair.
   Returns a freshly allocated (xmalloc) array, not NUL-terminated, which
   the caller must free.  */
static unsigned short *
conv_to_utf16 (const char *string, size_t *sizep)
{
  size_t length = strlen (string);
  const char *str = string;
  const char *str_limit = string + length;
  /* Conversion to UTF-16 can at most double the number of bytes, i.e. it
     produces at most LENGTH 16-bit code units (a surrogate pair is only
     produced from a multi-byte sequence; NOTE(review): this relies on
     u8_mbtouc consuming at least 2 bytes for characters >= 0x10000).  */
  unsigned short *result =
    (unsigned short *) xmalloc (length * sizeof (unsigned short));
  unsigned short *q = result;

  while (str < str_limit)
    {
      unsigned int uc;

      str += u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
      if (uc < 0x10000)
        /* UCS-2 character.  */
        *q++ = (unsigned short) uc;
      else
        {
          /* UTF-16 surrogate.  */
          *q++ = 0xd800 + ((uc - 0x10000) >> 10);
          *q++ = 0xdc00 + ((uc - 0x10000) & 0x3ff);
        }
    }
  /* The buffer holds LENGTH elements, so that is the bound to check
     (q - result counts elements, not bytes).  */
  assert ((size_t) (q - result) <= length);

  *sizep = q - result;
  return result;
}
/* Return the Qt hash code of a string.
   This is the value of hash_string, except that - like Qt's elfHash -
   a hash code of 0x00000000 is mapped to 0x00000001.  */
static unsigned int
string_hashcode (const char *str)
{
  unsigned int h;

  h = hash_string (str);
  if (h == 0)
    h = 1;

  return h;
}
/* Compare two entries of the hashes section.
   VA and VB each point to an 8-byte entry: a big-endian u32 hashcode
   followed by a big-endian u32 offset.  Entries are ordered by ascending
   hashcode and, for equal hashcodes, by ascending offset.
   Returns a negative, zero or positive value, as expected by qsort.  */
static int
cmp_hashes (const void *va, const void *vb)
{
  const unsigned char *a = (const unsigned char *) va;
  const unsigned char *b = (const unsigned char *) vb;
  unsigned int a_hashcode = peek_u32 (a);
  unsigned int b_hashcode = peek_u32 (b);

  if (a_hashcode != b_hashcode)
    return (a_hashcode >= b_hashcode ? 1 : -1);
  else
    {
      /* Same hashcode: fall back to the offset as secondary key.  */
      unsigned int a_offset = peek_u32 (a + 4);
      unsigned int b_offset = peek_u32 (b + 4);

      if (a_offset != b_offset)
        return (a_offset >= b_offset ? 1 : -1);
      else
        return 0;
    }
}
/* Write a section to the output stream.
   A section consists of a u8 TAG, a u32 byte count and SIZE bytes of
   DATA.  Nothing at all is written when SIZE is 0.  */
static void
write_section (FILE *output_file, unsigned char tag, void *data, size_t size)
{
  /* A section can be omitted if it is empty.  */
  if (size > 0)
    {
      write_u8 (output_file, tag);
      /* The length field of the file format is only 32 bits wide.  */
      write_u32 (output_file, (unsigned int) size);
      fwrite (data, size, 1, output_file);
    }
}
367 /* Write an entire .qm file. */
369 write_qm (FILE *output_file
, message_list_ty
*mlp
)
371 static unsigned char magic
[16] =
373 0x3C, 0xB8, 0x64, 0x18, 0xCA, 0xEF, 0x9C, 0x95,
374 0xCD, 0x21, 0x1C, 0xBF, 0x60, 0xA1, 0xBD, 0xDD
376 struct obstack hashes_pool
;
377 struct obstack messages_pool
;
380 obstack_init (&hashes_pool
);
381 obstack_init (&messages_pool
);
383 /* Prepare the hashes section and the messages section. */
384 for (j
= 0; j
< mlp
->nitems
; j
++)
386 message_ty
*mp
= mlp
->item
[j
];
388 /* No need to emit the header entry, it's not needed at runtime. */
389 if (mp
->msgid
[0] != '\0')
391 char *msgid_as_iso_8859_1
= conv_to_iso_8859_1 (mp
->msgid
);
393 unsigned short *msgstr_as_utf16
=
394 conv_to_utf16 (mp
->msgstr
, &msgstr_len
);
395 unsigned int hashcode
= string_hashcode (msgid_as_iso_8859_1
);
396 unsigned int offset
= obstack_object_size (&messages_pool
);
398 /* Add a record to the hashes section. */
399 append_u32 (&hashes_pool
, hashcode
);
400 append_u32 (&hashes_pool
, offset
);
402 /* Add a record to the messages section. */
404 append_u8 (&messages_pool
, 0x03);
405 append_unicode_string (&messages_pool
, msgstr_as_utf16
, msgstr_len
);
407 append_u8 (&messages_pool
, 0x08);
408 append_base_string (&messages_pool
, "");
410 append_u8 (&messages_pool
, 0x06);
411 append_base_string (&messages_pool
, msgid_as_iso_8859_1
);
413 append_u8 (&messages_pool
, 0x07);
414 append_base_string (&messages_pool
, "");
416 append_u8 (&messages_pool
, 0x05);
417 append_u32 (&messages_pool
, hashcode
);
419 append_u8 (&messages_pool
, 0x01);
421 free (msgstr_as_utf16
);
422 free (msgid_as_iso_8859_1
);
426 /* Sort the hashes section. */
428 size_t nstrings
= obstack_object_size (&hashes_pool
) / 8;
430 qsort (obstack_base (&hashes_pool
), nstrings
, 8, cmp_hashes
);
433 /* Write the magic number. */
434 fwrite (magic
, sizeof (magic
), 1, output_file
);
436 /* Write the hashes section. */
437 write_section (output_file
, 0x42, obstack_base (&hashes_pool
),
438 obstack_object_size (&hashes_pool
));
440 /* Write the messages section. */
441 write_section (output_file
, 0x69, obstack_base (&messages_pool
),
442 obstack_object_size (&messages_pool
));
444 /* Omit the contexts section. */
446 write_section (output_file
, 0x2f, ...);
449 obstack_free (&messages_pool
, NULL
);
450 obstack_free (&hashes_pool
, NULL
);
455 msgdomain_write_qt (message_list_ty
*mlp
, const char *canon_encoding
,
456 const char *domain_name
, const char *file_name
)
460 /* If no entry for this domain don't even create the file. */
461 if (mlp
->nitems
!= 0)
463 /* Determine whether mlp has plural entries. */
469 for (j
= 0; j
< mlp
->nitems
; j
++)
470 if (mlp
->item
[j
]->msgid_plural
!= NULL
)
474 multiline_error (xstrdup (""),
476 message catalog has plural form translations\n\
477 but the Qt message catalog format doesn't support plural handling\n")));
482 /* Convert the messages to Unicode. */
483 iconv_message_list (mlp
, canon_encoding
, po_charset_utf8
, NULL
);
485 /* Determine whether mlp has non-ISO-8859-1 msgid entries. */
489 for (j
= 0; j
< mlp
->nitems
; j
++)
491 const char *string
= mlp
->item
[j
]->msgid
;
493 /* An UTF-8 encoded string fits in ISO-8859-1 if and only if all
494 its bytes are < 0xc4. */
495 for (; *string
; string
++)
496 if ((unsigned char) *string
>= 0xc4)
498 multiline_error (xstrdup (""),
500 message catalog has msgid strings containing characters outside ISO-8859-1\n\
501 but the Qt message catalog format supports Unicode only in the translated\n\
502 strings, not in the untranslated strings\n")));
508 if (strcmp (domain_name
, "-") == 0)
510 output_file
= stdout
;
511 SET_BINARY (fileno (output_file
));
515 output_file
= fopen (file_name
, "wb");
516 if (output_file
== NULL
)
518 error (0, errno
, _("error while opening \"%s\" for writing"),
524 if (output_file
!= NULL
)
526 write_qm (output_file
, mlp
);
528 /* Make sure nothing went wrong. */
529 if (fwriteerror (output_file
))
530 error (EXIT_FAILURE
, errno
, _("error while writing \"%s\" file"),