2 * Copyright 2002-2009, Haiku, Inc. All rights reserved.
3 * Distributed under the terms of the MIT License.
7 * Axel Dörfler, axeld@pinc-software.de
11 #include "STXTTranslator.h"
15 #include <CharacterSet.h>
16 #include <CharacterSetRoster.h>
28 using namespace BPrivate
;
31 #undef B_TRANSLATION_CONTEXT
32 #define B_TRANSLATION_CONTEXT "STXTTranslator"
// Size in bytes of the chunk buffers that translate_from_stxt() and
// translate_from_text() read stream data into.
34 #define READ_BUFFER_SIZE 32768
// Upper bound on the total number of bytes identify_text() reads into its
// data buffer for type detection.
35 #define DATA_BUFFER_SIZE 256
37 // The input formats that this translator supports.
38 static const translation_format sInputFormats
[] = {
57 // The output formats that this translator supports.
58 static const translation_format sOutputFormats
[] = {
77 // Default settings for the Translator
78 static const TranSetting sDefaultSettings
[] = {
79 {B_TRANSLATOR_EXT_HEADER_ONLY
, TRAN_SETTING_BOOL
, false},
80 {B_TRANSLATOR_EXT_DATA_ONLY
, TRAN_SETTING_BOOL
, false}
// Element counts of the three static tables above, computed with the
// sizeof(array) / sizeof(element) idiom so they follow the initializers.
83 const uint32 kNumInputFormats
= sizeof(sInputFormats
) / sizeof(translation_format
);
84 const uint32 kNumOutputFormats
= sizeof(sOutputFormats
) / sizeof(translation_format
);
85 const uint32 kNumDefaultSettings
= sizeof(sDefaultSettings
) / sizeof(TranSetting
);
87 // ---------------------------------------------------------------
88 // make_nth_translator
90 // Creates a STXTTranslator object to be used by BTranslatorRoster
94 // Parameters: n, The translator to return. Since
95 // STXTTranslator only publishes one
96 // translator, it only returns a
97 // STXTTranslator if n == 0
99 // you, The image_id of the add-on that
100 // contains code (not used).
102 // flags, Has no meaning yet, should be 0.
106 // Returns: NULL if n is not zero,
107 // a new STXTTranslator if n is zero
108 // ---------------------------------------------------------------
110 make_nth_translator(int32 n
, image_id you
, uint32 flags
, ...)
113 return new (std::nothrow
) STXTTranslator();
119 // #pragma mark - ascmagic.c from the BSD file tool
121 * The following code has been taken from version 4.17 of the BSD file tool,
122 * file ascmagic.c, modified for our purpose.
126 * Copyright (c) Ian F. Darwin 1986-1995.
127 * Software written by Ian F. Darwin and others;
128 * maintained 1995-present by Christos Zoulas and others.
130 * Redistribution and use in source and binary forms, with or without
131 * modification, are permitted provided that the following conditions
133 * 1. Redistributions of source code must retain the above copyright
134 * notice immediately at the beginning of the file, without modification,
135 * this list of conditions, and the following disclaimer.
136 * 2. Redistributions in binary form must reproduce the above copyright
137 * notice, this list of conditions and the following disclaimer in the
138 * documentation and/or other materials provided with the distribution.
140 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
141 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
142 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
143 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
144 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
145 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
146 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
147 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
148 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
149 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
153 * ASCII magic -- file types that we know based on keywords
154 * that can appear anywhere in the file.
155 * bool found = false;
156 if (subtypeMimeSpecific != NULL) {
157 mimeType->SetTo(subtypeMimeSpecific);
158 if (mimeType->IsInstalled())
161 if (!found && subtypeMimeGeneric != NULL) {
162 mimeType->SetTo(subtypeMimeGeneric);
163 if (mimeType->IsInstalled())
167 mimeType->SetTo("text/plain");
169 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
170 * to handle character codes other than ASCII on a unified basis.
172 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
173 * international characters, now subsumed into this file.
// Holds one decoded character; the looks_*() helpers fill arrays of this
// type with one element per character of the input.
184 typedef unsigned long my_unichar
;
// Threshold used when checking whether a line is longer than is sane.
186 #define MAXLINELEN 300 /* longest sane line length */
// True for the characters treated as whitespace here: space, tab, carriage
// return, line feed, form feed and 0x85 (the "next line" control character).
187 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
188 || (x) == 0x85 || (x) == '\f')
// Forward declarations of the file-local helpers used by file_ascmagic().
// Each looks_*() helper takes the raw bytes and their count, plus an output
// array of my_unichar and an output character count; file_ascmagic() treats
// a non-zero return as "the buffer matches this character set".
190 static int looks_ascii(const unsigned char *, size_t, my_unichar
*, size_t *);
191 static int looks_utf8(const unsigned char *, size_t, my_unichar
*, size_t *);
192 static int looks_unicode(const unsigned char *, size_t, my_unichar
*, size_t *);
193 static int looks_latin1(const unsigned char *, size_t, my_unichar
*, size_t *);
194 static int looks_extended(const unsigned char *, size_t, my_unichar
*, size_t *);
// Translates a byte buffer from EBCDIC to ASCII into the output buffer.
195 static void from_ebcdic(const unsigned char *, size_t, unsigned char *);
// Compares a byte string against a run of my_unichar characters of the given
// length (used by file_ascmagic() to match words against the token list).
196 static int ascmatch(const unsigned char *, const my_unichar
*, size_t);
200 file_ascmagic(const unsigned char *buf
, size_t nbytes
, BMimeType
* mimeType
,
201 const char*& encoding
)
204 unsigned char *nbuf
= NULL
;
205 my_unichar
*ubuf
= NULL
;
210 const char *code
= NULL
;
212 const char *type
= NULL
;
213 const char *subtype
= NULL
;
214 const char *subtypeMimeGeneric
= NULL
;
215 const char *subtypeMimeSpecific
= NULL
;
218 int has_backspace
= 0;
226 int last_line_end
= -1;
227 int has_long_lines
= 0;
229 if ((nbuf
= (unsigned char*)malloc((nbytes
+ 1) * sizeof(nbuf
[0]))) == NULL
)
231 if ((ubuf
= (my_unichar
*)malloc((nbytes
+ 1) * sizeof(ubuf
[0]))) == NULL
)
235 * Then try to determine whether it's any character code we can
236 * identify. Each of these tests, if it succeeds, will leave
237 * the text converted into one-my_unichar-per-character Unicode in
238 * ubuf, and the number of characters converted in ulen.
241 code
= "UTF-8 Unicode";
242 encoding
= NULL
; // "UTF-8";
245 } else if (looks_ascii(buf
, nbytes
, ubuf
, &ulen
)) {
247 encoding
= NULL
; //"us-ascii";
253 } else if (looks_utf8(buf
, nbytes
, ubuf
, &ulen
)) {
254 code
= "UTF-8 Unicode";
255 encoding
= NULL
; // "UTF-8";
257 } else if ((i
= looks_unicode(buf
, nbytes
, ubuf
, &ulen
)) != 0) {
259 code
= "Little-endian UTF-16 Unicode";
262 code
= "Big-endian UTF-16 Unicode";
266 type
= "character data";
267 } else if (looks_latin1(buf
, nbytes
, ubuf
, &ulen
)) {
270 encoding
= "iso-8859-1";
271 } else if (looks_extended(buf
, nbytes
, ubuf
, &ulen
)) {
272 code
= "Non-ISO extended-ASCII";
274 encoding
= "unknown";
276 from_ebcdic(buf
, nbytes
, nbuf
);
278 if (looks_ascii(nbuf
, nbytes
, ubuf
, &ulen
)) {
280 type
= "character data";
282 } else if (looks_latin1(nbuf
, nbytes
, ubuf
, &ulen
)) {
283 code
= "International EBCDIC";
284 type
= "character data";
288 goto done
; /* doesn't look like text at all */
299 * for troff, look for . + letter + letter or .\";
300 * this must be done to disambiguate tar archives' ./file
301 * and other trash from real troff input.
303 * I believe Plan 9 troff allows non-ASCII characters in the names
304 * of macros, so this test might possibly fail on such a file.
307 my_unichar
*tp
= ubuf
+ 1;
310 ++tp
; /* skip leading whitespace */
311 if ((tp
[0] == '\\' && tp
[1] == '\"') ||
312 (isascii((unsigned char)tp
[0]) &&
313 isalnum((unsigned char)tp
[0]) &&
314 isascii((unsigned char)tp
[1]) &&
315 isalnum((unsigned char)tp
[1]) &&
317 subtypeMimeGeneric
= "text/x-source-code";
318 subtypeMimeSpecific
= "text/troff";
319 subtype
= "troff or preprocessor input";
320 goto subtype_identified
;
324 if ((*buf
== 'c' || *buf
== 'C') && ISSPC(buf
[1])) {
325 subtypeMimeGeneric
= "text/x-source-code";
326 subtypeMimeSpecific
= "text/fortran";
327 subtype
= "fortran program";
328 goto subtype_identified
;
331 /* look for tokens from names.h - this is expensive! */
338 * skip past any leading space
340 while (i
< ulen
&& ISSPC(ubuf
[i
]))
346 * find the next whitespace
348 for (end
= i
+ 1; end
< nbytes
; end
++)
349 if (ISSPC(ubuf
[end
]))
353 * compare the word thus isolated against the token list
355 for (p
= names
; p
< names
+ NNAMES
; p
++) {
356 if (ascmatch((const unsigned char *)p
->name
, ubuf
+ i
,
358 subtype
= types
[p
->type
].human
;
359 subtypeMimeGeneric
= types
[p
->type
].generic_mime
;
360 subtypeMimeSpecific
= types
[p
->type
].specific_mime
;
361 goto subtype_identified
;
371 * Now try to discover other details about the file.
373 for (i
= 0; i
< ulen
; i
++) {
374 if (ubuf
[i
] == '\n') {
383 seen_cr
= (ubuf
[i
] == '\r');
387 if (ubuf
[i
] == 0x85) { /* X3.64/ECMA-43 "next line" character */
392 /* If this line is _longer_ than MAXLINELEN, remember it. */
393 if ((int)i
> last_line_end
+ MAXLINELEN
)
396 if (ubuf
[i
] == '\033')
410 // If we have identified the subtype, return it, otherwise just
414 if (subtypeMimeSpecific
!= NULL
) {
415 mimeType
->SetTo(subtypeMimeSpecific
);
416 if (mimeType
->IsInstalled())
419 if (!found
&& subtypeMimeGeneric
!= NULL
) {
420 mimeType
->SetTo(subtypeMimeGeneric
);
421 if (mimeType
->IsInstalled())
425 mimeType
->SetTo("text/plain");
432 ascmatch(const unsigned char *s
, const my_unichar
*us
, size_t ulen
)
436 for (i
= 0; i
< ulen
; i
++) {
448 * This table reflects a particular philosophy about what constitutes
449 * "text," and there is room for disagreement about it.
451 * Version 3.31 of the file command considered a file to be ASCII if
452 * each of its characters was approved by either the isascii() or
453 * isalpha() function. On most systems, this would mean that any
454 * file consisting only of characters in the range 0x00 ... 0x7F
455 * would be called ASCII text, but many systems might reasonably
456 * consider some characters outside this range to be alphabetic,
457 * so the file command would call such characters ASCII. It might
458 * have been more accurate to call this "considered textual on the
459 * local system" than "ASCII."
461 * It considered a file to be "International language text" if each
462 * of its characters was either an ASCII printing character (according
463 * to the real ASCII standard, not the above test), a character in
464 * the range 0x80 ... 0xFF, or one of the following control characters:
465 * backspace, tab, line feed, vertical tab, form feed, carriage return,
466 * escape. No attempt was made to determine the language in which files
467 * of this type were written.
470 * The table below considers a file to be ASCII if all of its characters
471 * are either ASCII printing characters (again, according to the X3.4
472 * standard, not isascii()) or any of the following controls: bell,
473 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
475 * I include bell because some programs (particularly shell scripts)
476 * use it literally, even though it is rare in normal text. I exclude
477 * vertical tab because it never seems to be used in real text. I also
478 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
479 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
480 * character to. It might be more appropriate to include it in the 8859
481 * set instead of the ASCII set, but it's got to be included in *something*
482 * we recognize or EBCDIC files aren't going to be considered textual.
483 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
484 * and Latin characters, so these should possibly be allowed. But they
485 * make a real mess on VT100-style displays if they're not paired properly,
486 * so we are probably better off not calling them text.
488 * A file is considered to be ISO-8859 text if its characters are all
489 * either ASCII, according to the above definition, or printing characters
490 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
492 * Finally, a file is considered to be international text from some other
493 * character code if its characters are all either ISO-8859 (according to
494 * the above definition) or characters in the range 0x80 ... 0x9F, which
495 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
496 * consider to be printing characters.
// Classification codes stored in text_chars[] below; the looks_*() helpers
// compare the table entry for each input byte against these values.
499 #define F 0 /* character never appears in text */
500 #define T 1 /* character appears in plain ASCII text */
501 #define I 2 /* character appears in ISO-8859 text */
502 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
504 static char text_chars
[256] = {
505 /* BEL BS HT LF FF CR */
506 F
, F
, F
, F
, F
, F
, F
, T
, T
, T
, T
, F
, T
, T
, F
, F
, /* 0x0X */
508 F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, F
, T
, F
, F
, F
, F
, /* 0x1X */
509 T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, /* 0x2X */
510 T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, /* 0x3X */
511 T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, /* 0x4X */
512 T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, /* 0x5X */
513 T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, /* 0x6X */
514 T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, T
, F
, /* 0x7X */
516 X
, X
, X
, X
, X
, T
, X
, X
, X
, X
, X
, X
, X
, X
, X
, X
, /* 0x8X */
517 X
, X
, X
, X
, X
, X
, X
, X
, X
, X
, X
, X
, X
, X
, X
, X
, /* 0x9X */
518 I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, /* 0xaX */
519 I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, /* 0xbX */
520 I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, /* 0xcX */
521 I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, /* 0xdX */
522 I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, /* 0xeX */
523 I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
, I
/* 0xfX */
527 looks_ascii(const unsigned char *buf
, size_t nbytes
, my_unichar
*ubuf
,
534 for (i
= 0; i
< (int)nbytes
; i
++) {
535 int t
= text_chars
[buf
[i
]];
540 ubuf
[(*ulen
)++] = buf
[i
];
547 looks_latin1(const unsigned char *buf
, size_t nbytes
, my_unichar
*ubuf
, size_t *ulen
)
553 for (i
= 0; i
< (int)nbytes
; i
++) {
554 int t
= text_chars
[buf
[i
]];
556 if (t
!= T
&& t
!= I
)
559 ubuf
[(*ulen
)++] = buf
[i
];
566 looks_extended(const unsigned char *buf
, size_t nbytes
, my_unichar
*ubuf
,
573 for (i
= 0; i
< (int)nbytes
; i
++) {
574 int t
= text_chars
[buf
[i
]];
576 if (t
!= T
&& t
!= I
&& t
!= X
)
579 ubuf
[(*ulen
)++] = buf
[i
];
586 looks_utf8(const unsigned char *buf
, size_t nbytes
, my_unichar
*ubuf
, size_t *ulen
)
594 for (i
= 0; i
< (int)nbytes
; i
++) {
595 if ((buf
[i
] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
597 * Even if the whole file is valid UTF-8 sequences,
598 * still reject it if it uses weird control characters.
601 if (text_chars
[buf
[i
]] != T
)
604 ubuf
[(*ulen
)++] = buf
[i
];
605 } else if ((buf
[i
] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
607 } else { /* 11xxxxxx begins UTF-8 */
610 if ((buf
[i
] & 0x20) == 0) { /* 110xxxxx */
613 } else if ((buf
[i
] & 0x10) == 0) { /* 1110xxxx */
616 } else if ((buf
[i
] & 0x08) == 0) { /* 11110xxx */
619 } else if ((buf
[i
] & 0x04) == 0) { /* 111110xx */
622 } else if ((buf
[i
] & 0x02) == 0) { /* 1111110x */
628 for (n
= 0; n
< following
; n
++) {
630 if (i
>= (int)nbytes
)
633 if ((buf
[i
] & 0x80) == 0 || (buf
[i
] & 0x40))
636 c
= (c
<< 6) + (buf
[i
] & 0x3f);
644 return gotone
; /* don't claim it's UTF-8 if it's all 7-bit */
648 looks_unicode(const unsigned char *buf
, size_t nbytes
, my_unichar
*ubuf
,
657 if (buf
[0] == 0xff && buf
[1] == 0xfe)
659 else if (buf
[0] == 0xfe && buf
[1] == 0xff)
666 for (i
= 2; i
+ 1 < (int)nbytes
; i
+= 2) {
667 /* XXX fix to properly handle chars > 65536 */
670 ubuf
[(*ulen
)++] = buf
[i
+ 1] + 256 * buf
[i
];
672 ubuf
[(*ulen
)++] = buf
[i
] + 256 * buf
[i
+ 1];
674 if (ubuf
[*ulen
- 1] == 0xfffe)
676 if (ubuf
[*ulen
- 1] < 128 &&
677 text_chars
[(size_t)ubuf
[*ulen
- 1]] != T
)
690 * This table maps each EBCDIC character to an (8-bit extended) ASCII
691 * character, as specified in the rationale for the dd(1) command in
692 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
694 * Unfortunately it does not seem to correspond exactly to any of the
695 * five variants of EBCDIC documented in IBM's _Enterprise Systems
696 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
697 * Edition, July, 1999, pp. I-1 - I-4.
699 * Fortunately, though, all versions of EBCDIC, including this one, agree
700 * on most of the printing characters that also appear in (7-bit) ASCII.
701 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
703 * Fortunately too, there is general agreement that codes 0x00 through
704 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
705 * remainder printing characters.
707 * This is sufficient to allow us to identify EBCDIC text and to distinguish
708 * between old-style and internationalized examples of text.
711 static unsigned char ebcdic_to_ascii
[] = {
712 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
713 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
714 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
715 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
716 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
717 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
718 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
719 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
720 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
721 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
722 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
723 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
724 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
725 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
726 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
727 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
732 * The following EBCDIC-to-ASCII table may relate more closely to reality,
733 * or at least to modern reality. It comes from
735 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html
737 * and maps the characters of EBCDIC code page 1047 (the code used for
738 * Unix-derived software on IBM's 390 systems) to the corresponding
739 * characters from ISO 8859-1.
741 * If this table is used instead of the above one, some of the special
742 * cases for the NEL character can be taken out of the code.
745 static unsigned char ebcdic_1047_to_8859
[] = {
746 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
747 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
748 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
749 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
750 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
751 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
752 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
753 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
754 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
755 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
756 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
757 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
758 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
759 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
760 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
761 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
766 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
769 from_ebcdic(const unsigned char *buf
, size_t nbytes
, unsigned char *out
)
773 for (i
= 0; i
< (int)nbytes
; i
++) {
774 out
[i
] = ebcdic_to_ascii
[buf
[i
]];
783 Determines if the data in inSource is of the STXT format.
785 \param header the STXT stream header read in by Identify() or Translate()
786 \param inSource the stream with the STXT data
787 \param outInfo information about the type of data from inSource is stored here
788 \param outType the desired output type for the data in inSource
789 \param ptxtheader if this is not NULL, the TEXT header from
790 inSource is copied to it
793 identify_stxt_header(const TranslatorStyledTextStreamHeader
&header
,
794 BPositionIO
*inSource
, translator_info
*outInfo
, uint32 outType
,
795 TranslatorStyledTextTextHeader
*ptxtheader
= NULL
)
797 const ssize_t ktxtsize
= sizeof(TranslatorStyledTextTextHeader
);
798 const ssize_t kstylsize
= sizeof(TranslatorStyledTextStyleHeader
);
800 uint8 buffer
[max(ktxtsize
, kstylsize
)];
802 // Check the TEXT header
803 TranslatorStyledTextTextHeader txtheader
;
804 if (inSource
->Read(buffer
, ktxtsize
) != ktxtsize
)
805 return B_NO_TRANSLATOR
;
807 memcpy(&txtheader
, buffer
, ktxtsize
);
808 if (swap_data(B_UINT32_TYPE
, &txtheader
, ktxtsize
,
809 B_SWAP_BENDIAN_TO_HOST
) != B_OK
)
812 if (txtheader
.header
.magic
!= 'TEXT'
813 || txtheader
.header
.header_size
!= sizeof(TranslatorStyledTextTextHeader
)
814 || txtheader
.charset
!= B_UNICODE_UTF8
)
815 return B_NO_TRANSLATOR
;
817 // skip the text data
818 off_t seekresult
, pos
;
819 pos
= header
.header
.header_size
+ txtheader
.header
.header_size
820 + txtheader
.header
.data_size
;
821 seekresult
= inSource
->Seek(txtheader
.header
.data_size
,
823 if (seekresult
< pos
)
824 return B_NO_TRANSLATOR
;
825 if (seekresult
> pos
)
828 // check the STYL header (not all STXT files have this)
830 TranslatorStyledTextStyleHeader stylheader
;
831 read
= inSource
->Read(buffer
, kstylsize
);
834 if (read
!= kstylsize
&& read
!= 0)
835 return B_NO_TRANSLATOR
;
837 // If there is a STYL header
838 if (read
== kstylsize
) {
839 memcpy(&stylheader
, buffer
, kstylsize
);
840 if (swap_data(B_UINT32_TYPE
, &stylheader
, kstylsize
,
841 B_SWAP_BENDIAN_TO_HOST
) != B_OK
)
844 if (stylheader
.header
.magic
!= 'STYL'
845 || stylheader
.header
.header_size
!=
846 sizeof(TranslatorStyledTextStyleHeader
))
847 return B_NO_TRANSLATOR
;
850 // if output TEXT header is supplied, fill it with data
852 ptxtheader
->header
.magic
= txtheader
.header
.magic
;
853 ptxtheader
->header
.header_size
= txtheader
.header
.header_size
;
854 ptxtheader
->header
.data_size
= txtheader
.header
.data_size
;
855 ptxtheader
->charset
= txtheader
.charset
;
858 // return information about the data in the stream
859 outInfo
->type
= B_STYLED_TEXT_FORMAT
;
860 outInfo
->group
= B_TRANSLATOR_TEXT
;
861 outInfo
->quality
= STXT_IN_QUALITY
;
862 outInfo
->capability
= STXT_IN_CAPABILITY
;
863 strlcpy(outInfo
->name
, B_TRANSLATE("Be styled text file"),
864 sizeof(outInfo
->name
));
865 strcpy(outInfo
->MIME
, "text/x-vnd.Be-stxt");
872 Determines if the data in \a source is of the UTF8 plain
874 \param data buffer containing data already read (must be at
875 least DATA_BUFFER_SIZE bytes large)
876 \param bytesRead number of bytes that have already been read from the stream
877 \param header the STXT stream header read in by Identify() or Translate()
878 \param source the stream with the data to be identified
879 \param outInfo information about the type of data from source is stored here
880 \param outType the desired output type for the data in source
883 identify_text(uint8
* data
, int32 bytesRead
, BPositionIO
* source
,
884 translator_info
* outInfo
, uint32 outType
, const char*& encoding
)
886 ssize_t readLater
= source
->Read(data
+ bytesRead
, DATA_BUFFER_SIZE
- bytesRead
);
887 if (readLater
< B_OK
)
888 return B_NO_TRANSLATOR
;
890 bytesRead
+= readLater
;
892 // TODO: identify the encoding where possible!
894 if (!file_ascmagic((const unsigned char*)data
, bytesRead
, &type
, encoding
))
895 return B_NO_TRANSLATOR
;
897 float capability
= TEXT_IN_CAPABILITY
;
901 // return information about the data in the stream
902 outInfo
->type
= B_TRANSLATOR_TEXT
;
903 outInfo
->group
= B_TRANSLATOR_TEXT
;
904 outInfo
->quality
= TEXT_IN_QUALITY
;
905 outInfo
->capability
= capability
;
907 char description
[B_MIME_TYPE_LENGTH
];
908 if (type
.GetLongDescription(description
) == B_OK
)
909 strlcpy(outInfo
->name
, description
, sizeof(outInfo
->name
));
911 strlcpy(outInfo
->name
, B_TRANSLATE("Plain text file"),
912 sizeof(outInfo
->name
));
914 //strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME));
915 strcpy(outInfo
->MIME
, "text/plain");
920 // ---------------------------------------------------------------
921 // translate_from_stxt
923 // Translates the data in inSource to the type outType and stores
924 // the translated data in outDestination.
928 // Parameters: inSource, the data to be translated
930 // outDestination, where the translated data is
933 // outType, the type to convert inSource to
935 // txtheader, the TEXT header from inSource
940 // Returns: B_BAD_VALUE, if outType is invalid
942 // B_NO_TRANSLATOR, if this translator doesn't understand the data
944 // B_ERROR, if there was an error allocating memory or converting
947 // B_OK, if all went well
948 // ---------------------------------------------------------------
950 translate_from_stxt(BPositionIO
*inSource
, BPositionIO
*outDestination
,
951 uint32 outType
, const TranslatorStyledTextTextHeader
&txtheader
)
953 if (inSource
->Seek(0, SEEK_SET
) != 0)
956 const ssize_t kstxtsize
= sizeof(TranslatorStyledTextStreamHeader
);
957 const ssize_t ktxtsize
= sizeof(TranslatorStyledTextTextHeader
);
960 if (outType
== B_TRANSLATOR_TEXT
)
962 else if (outType
== B_STYLED_TEXT_FORMAT
)
967 uint8 buffer
[READ_BUFFER_SIZE
];
968 ssize_t nread
= 0, nwritten
= 0, nreed
= 0, ntotalread
= 0;
970 // skip to the actual text data when outputting a
973 if (inSource
->Seek(kstxtsize
+ ktxtsize
, SEEK_CUR
) !=
974 kstxtsize
+ ktxtsize
)
978 // Read data from inSource
979 // When outputting B_TRANSLATOR_TEXT, the loop stops when all of
980 // the text data has been read and written.
981 // When outputting B_STYLED_TEXT_FORMAT, the loop stops when all
982 // of the data from inSource has been read and written.
984 nreed
= min((size_t)READ_BUFFER_SIZE
,
985 (size_t)txtheader
.header
.data_size
- ntotalread
);
987 nreed
= READ_BUFFER_SIZE
;
988 nread
= inSource
->Read(buffer
, nreed
);
990 nwritten
= outDestination
->Write(buffer
, nread
);
991 if (nwritten
!= nread
)
996 nreed
= min((size_t)READ_BUFFER_SIZE
,
997 (size_t)txtheader
.header
.data_size
- ntotalread
);
999 nreed
= READ_BUFFER_SIZE
;
1000 nread
= inSource
->Read(buffer
, nreed
);
1003 if (btoplain
&& static_cast<ssize_t
>(txtheader
.header
.data_size
) !=
1005 // If not all of the text data was able to be read...
1006 return B_NO_TRANSLATOR
;
1011 // ---------------------------------------------------------------
1014 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT
1015 // to outDestination, setting the data_size member of the text header
1016 // to text_data_size
1020 // Parameters: outDestination, where the translated data is
1023 // text_data_size, number of bytes in data section
1024 // of the TEXT header
1031 // B_ERROR, if there was an error writing to outDestination or
1032 // an error with converting the byte order
1034 // B_OK, if all went well
1035 // ---------------------------------------------------------------
1037 output_headers(BPositionIO
*outDestination
, uint32 text_data_size
)
1039 const int32 kHeadersSize
= sizeof(TranslatorStyledTextStreamHeader
) +
1040 sizeof(TranslatorStyledTextTextHeader
);
1042 TranslatorStyledTextStreamHeader stxtheader
;
1043 TranslatorStyledTextTextHeader txtheader
;
1045 uint8 buffer
[kHeadersSize
];
1047 stxtheader
.header
.magic
= 'STXT';
1048 stxtheader
.header
.header_size
= sizeof(TranslatorStyledTextStreamHeader
);
1049 stxtheader
.header
.data_size
= 0;
1050 stxtheader
.version
= 100;
1051 memcpy(buffer
, &stxtheader
, stxtheader
.header
.header_size
);
1053 txtheader
.header
.magic
= 'TEXT';
1054 txtheader
.header
.header_size
= sizeof(TranslatorStyledTextTextHeader
);
1055 txtheader
.header
.data_size
= text_data_size
;
1056 txtheader
.charset
= B_UNICODE_UTF8
;
1057 memcpy(buffer
+ stxtheader
.header
.header_size
, &txtheader
,
1058 txtheader
.header
.header_size
);
1060 // write out headers in Big Endian byte order
1061 result
= swap_data(B_UINT32_TYPE
, buffer
, kHeadersSize
,
1062 B_SWAP_HOST_TO_BENDIAN
);
1063 if (result
== B_OK
) {
1064 ssize_t nwritten
= 0;
1065 nwritten
= outDestination
->Write(buffer
, kHeadersSize
);
1066 if (nwritten
!= kHeadersSize
)
1075 // ---------------------------------------------------------------
1078 // Writes out the actual style information into outDestination
1079 // using the data from pflatRunArray
1083 // Parameters: outDestination, where the translated data is
1086 // text_size, size in bytes of the text in
1089 // data_size, size of pflatRunArray
1095 // B_ERROR, if there was an error writing to outDestination or
1096 // an error with converting the byte order
1098 // B_OK, if all went well
1099 // ---------------------------------------------------------------
1101 output_styles(BPositionIO
*outDestination
, uint32 text_size
,
1102 uint8
*pflatRunArray
, ssize_t data_size
)
1104 const ssize_t kstylsize
= sizeof(TranslatorStyledTextStyleHeader
);
1106 uint8 buffer
[kstylsize
];
1108 // output STYL header
1109 TranslatorStyledTextStyleHeader stylheader
;
1110 stylheader
.header
.magic
= 'STYL';
1111 stylheader
.header
.header_size
=
1112 sizeof(TranslatorStyledTextStyleHeader
);
1113 stylheader
.header
.data_size
= data_size
;
1114 stylheader
.apply_offset
= 0;
1115 stylheader
.apply_length
= text_size
;
1117 memcpy(buffer
, &stylheader
, kstylsize
);
1118 if (swap_data(B_UINT32_TYPE
, buffer
, kstylsize
,
1119 B_SWAP_HOST_TO_BENDIAN
) != B_OK
)
1121 if (outDestination
->Write(buffer
, kstylsize
) != kstylsize
)
1124 // output actual style information
1125 if (outDestination
->Write(pflatRunArray
,
1126 data_size
) != data_size
)
1134 Convert the plain text (UTF8) from \a source to plain or
1135 styled text in \a destination
1138 translate_from_text(BPositionIO
* source
, const char* encoding
, bool forceEncoding
,
1139 BPositionIO
* destination
, uint32 outType
)
1141 if (outType
!= B_TRANSLATOR_TEXT
&& outType
!= B_STYLED_TEXT_FORMAT
)
1144 // find the length of the text
1145 off_t size
= source
->Seek(0, SEEK_END
);
1147 return (status_t
)size
;
1148 if (size
> UINT32_MAX
&& outType
== B_STYLED_TEXT_FORMAT
)
1149 return B_NOT_SUPPORTED
;
1151 status_t status
= source
->Seek(0, SEEK_SET
);
1155 if (outType
== B_STYLED_TEXT_FORMAT
) {
1156 // output styled text headers
1157 status
= output_headers(destination
, (uint32
)size
);
1162 class MallocBuffer
{
1164 MallocBuffer() : fBuffer(NULL
), fSize(0) {}
1165 ~MallocBuffer() { free(fBuffer
); }
1167 void* Buffer() { return fBuffer
; }
1168 size_t Size() const { return fSize
; }
1171 Allocate(size_t size
)
1173 fBuffer
= malloc(size
);
1174 if (fBuffer
!= NULL
) {
1185 BMallocIO encodingIO
;
1186 uint32 encodingID
= 0;
1187 // defaults to UTF-8 or no encoding
1189 BNode
* node
= dynamic_cast<BNode
*>(source
);
1191 // determine encoding, if available
1192 const BCharacterSet
* characterSet
= NULL
;
1193 bool hasAttribute
= false;
1194 if (encoding
!= NULL
&& !forceEncoding
) {
1196 if (node
->ReadAttrString("be:encoding", &name
) == B_OK
) {
1197 encoding
= name
.String();
1198 hasAttribute
= true;
1201 ssize_t bytesRead
= node
->ReadAttr("be:encoding", B_INT32_TYPE
, 0,
1202 &value
, sizeof(value
));
1203 if (bytesRead
== (ssize_t
)sizeof(value
)) {
1204 hasAttribute
= true;
1206 characterSet
= BCharacterSetRoster::GetCharacterSetByConversionID(value
);
1210 hasAttribute
= true;
1211 // we don't write the encoding in this case
1213 if (characterSet
== NULL
&& encoding
!= NULL
)
1214 characterSet
= BCharacterSetRoster::FindCharacterSetByName(encoding
);
1216 if (characterSet
!= NULL
) {
1217 encodingID
= characterSet
->GetConversionID();
1218 encodingBuffer
.Allocate(READ_BUFFER_SIZE
* 4);
1221 if (!hasAttribute
&& encoding
!= NULL
) {
1222 // add encoding attribute, so that someone opening the file can
1223 // retrieve it for persistance
1224 node
->WriteAttr("be:encoding", B_STRING_TYPE
, 0, encoding
,
1229 off_t outputSize
= 0;
1233 // output the actual text part of the data
1235 uint8 buffer
[READ_BUFFER_SIZE
];
1236 bytesRead
= source
->Read(buffer
, READ_BUFFER_SIZE
);
1237 if (bytesRead
< B_OK
)
1242 if (encodingBuffer
.Size() == 0) {
1243 // default, no encoding
1244 ssize_t bytesWritten
= destination
->Write(buffer
, bytesRead
);
1245 if (bytesWritten
!= bytesRead
) {
1246 if (bytesWritten
< B_OK
)
1247 return bytesWritten
;
1252 outputSize
+= bytesRead
;
1254 // decode text file to UTF-8
1255 char* pos
= (char*)buffer
;
1256 int32 encodingLength
= encodingIO
.BufferLength();
1257 int32 bytesLeft
= bytesRead
;
1260 encodingLength
= READ_BUFFER_SIZE
* 4;
1263 status
= convert_to_utf8(encodingID
, pos
, &bytes
,
1264 (char*)encodingBuffer
.Buffer(), &encodingLength
, &state
);
1268 ssize_t bytesWritten
= destination
->Write(encodingBuffer
.Buffer(),
1270 if (bytesWritten
< encodingLength
) {
1271 if (bytesWritten
< B_OK
)
1272 return bytesWritten
;
1279 outputSize
+= encodingLength
;
1280 } while (encodingLength
> 0 && bytesLeft
> 0);
1282 } while (bytesRead
> 0);
1284 if (outType
!= B_STYLED_TEXT_FORMAT
)
1287 if (encodingBuffer
.Size() != 0 && size
!= outputSize
) {
1288 if (outputSize
> UINT32_MAX
)
1289 return B_NOT_SUPPORTED
;
1291 // we need to update the header as the decoded text size has changed
1292 status
= destination
->Seek(0, SEEK_SET
);
1294 status
= output_headers(destination
, (uint32
)outputSize
);
1296 status
= destination
->Seek(0, SEEK_END
);
1302 // Read file attributes if outputting styled data
1303 // and source is a BNode object
1308 // Try to read styles - we only propagate an error if the actual on-disk
1309 // data is likely to be okay
1311 const char *kAttrName
= "styles";
1313 if (node
->GetAttrInfo(kAttrName
, &info
) != B_OK
)
1316 if (info
.type
!= B_RAW_TYPE
|| info
.size
< 160) {
1317 // styles seem to be broken, but since we got the text,
1318 // we don't propagate the error
1322 uint8
* flatRunArray
= new (std::nothrow
) uint8
[info
.size
];
1323 if (flatRunArray
== NULL
)
1326 bytesRead
= node
->ReadAttr(kAttrName
, B_RAW_TYPE
, 0, flatRunArray
, info
.size
);
1327 if (bytesRead
!= info
.size
)
1330 output_styles(destination
, size
, flatRunArray
, info
.size
);
1332 delete[] flatRunArray
;
1340 STXTTranslator::STXTTranslator()
1341 : BaseTranslator(B_TRANSLATE("StyledEdit files"),
1342 B_TRANSLATE("StyledEdit file translator"),
1343 STXT_TRANSLATOR_VERSION
,
1344 sInputFormats
, kNumInputFormats
,
1345 sOutputFormats
, kNumOutputFormats
,
1346 "STXTTranslator_Settings",
1347 sDefaultSettings
, kNumDefaultSettings
,
1348 B_TRANSLATOR_TEXT
, B_STYLED_TEXT_FORMAT
)
1353 STXTTranslator::~STXTTranslator()
1359 STXTTranslator::Identify(BPositionIO
*inSource
,
1360 const translation_format
*inFormat
, BMessage
*ioExtension
,
1361 translator_info
*outInfo
, uint32 outType
)
1364 outType
= B_TRANSLATOR_TEXT
;
1365 if (outType
!= B_TRANSLATOR_TEXT
&& outType
!= B_STYLED_TEXT_FORMAT
)
1366 return B_NO_TRANSLATOR
;
1368 const ssize_t kstxtsize
= sizeof(TranslatorStyledTextStreamHeader
);
1370 uint8 buffer
[DATA_BUFFER_SIZE
];
1372 // Read in the header to determine
1373 // if the data is supported
1374 nread
= inSource
->Read(buffer
, kstxtsize
);
1378 // read in enough data to fill the stream header
1379 if (nread
== kstxtsize
) {
1380 TranslatorStyledTextStreamHeader header
;
1381 memcpy(&header
, buffer
, kstxtsize
);
1382 if (swap_data(B_UINT32_TYPE
, &header
, kstxtsize
,
1383 B_SWAP_BENDIAN_TO_HOST
) != B_OK
)
1386 if (header
.header
.magic
== B_STYLED_TEXT_FORMAT
1387 && header
.header
.header_size
== (int32
)kstxtsize
1388 && header
.header
.data_size
== 0
1389 && header
.version
== 100)
1390 return identify_stxt_header(header
, inSource
, outInfo
, outType
);
1393 // if the data is not styled text, check if it is plain text
1394 const char* encoding
;
1395 return identify_text(buffer
, nread
, inSource
, outInfo
, outType
, encoding
);
1400 STXTTranslator::Translate(BPositionIO
* source
, const translator_info
* info
,
1401 BMessage
* ioExtension
, uint32 outType
, BPositionIO
* outDestination
)
1404 outType
= B_TRANSLATOR_TEXT
;
1405 if (outType
!= B_TRANSLATOR_TEXT
&& outType
!= B_STYLED_TEXT_FORMAT
)
1406 return B_NO_TRANSLATOR
;
1408 const ssize_t headerSize
= sizeof(TranslatorStyledTextStreamHeader
);
1409 uint8 buffer
[DATA_BUFFER_SIZE
];
1411 translator_info outInfo
;
1412 // Read in the header to determine
1413 // if the data is supported
1414 ssize_t bytesRead
= source
->Read(buffer
, headerSize
);
1418 // read in enough data to fill the stream header
1419 if (bytesRead
== headerSize
) {
1420 TranslatorStyledTextStreamHeader header
;
1421 memcpy(&header
, buffer
, headerSize
);
1422 if (swap_data(B_UINT32_TYPE
, &header
, headerSize
,
1423 B_SWAP_BENDIAN_TO_HOST
) != B_OK
)
1426 if (header
.header
.magic
== B_STYLED_TEXT_FORMAT
1427 && header
.header
.header_size
== sizeof(TranslatorStyledTextStreamHeader
)
1428 && header
.header
.data_size
== 0
1429 && header
.version
== 100) {
1430 TranslatorStyledTextTextHeader textHeader
;
1431 result
= identify_stxt_header(header
, source
, &outInfo
, outType
,
1436 return translate_from_stxt(source
, outDestination
, outType
, textHeader
);
1440 // if the data is not styled text, check if it is ASCII text
1441 bool forceEncoding
= false;
1442 const char* encoding
= NULL
;
1443 result
= identify_text(buffer
, bytesRead
, source
, &outInfo
, outType
, encoding
);
1447 if (ioExtension
!= NULL
) {
1449 if (ioExtension
->FindString("be:encoding", &value
) == B_OK
1451 // override encoding
1453 forceEncoding
= true;
1457 return translate_from_text(source
, encoding
, forceEncoding
, outDestination
, outType
);
1462 STXTTranslator::NewConfigView(TranslatorSettings
*settings
)
1464 return new STXTView(BRect(0, 0, 225, 175),
1465 B_TRANSLATE("STXTTranslator Settings"),
1466 B_FOLLOW_ALL
, B_WILL_DRAW
, settings
);