src/add-ons/translators/stxt/STXTTranslator.cpp

   1 /*
   2  * Copyright 2002-2009, Haiku, Inc. All rights reserved.
   3  * Distributed under the terms of the MIT License.
   4  *
   5  * Authors:
   6  *              Michael Wilber
   7  *              Axel Dörfler, axeld@pinc-software.de
   8  */
   9
  10
  11 #include "STXTTranslator.h"
  12 #include "STXTView.h"
  13
  14 #include <Catalog.h>
  15 #include <CharacterSet.h>
  16 #include <CharacterSetRoster.h>
  17 #include <MimeType.h>
  18 #include <String.h>
  19 #include <UTF8.h>
  20
  21 #include <algorithm>
  22 #include <new>
  23 #include <string.h>
  24 #include <stdio.h>
  25 #include <stdint.h>
  26
  27
  28 using namespace BPrivate;
  29 using namespace std;
  30
  31 #undef B_TRANSLATION_CONTEXT
  32 #define B_TRANSLATION_CONTEXT "STXTTranslator"
  33
  34 #define READ_BUFFER_SIZE 32768
  35 #define DATA_BUFFER_SIZE 256
  36
  37 // The input formats that this translator supports.
  38 static const translation_format sInputFormats[] = {
  39         {
  40                 B_TRANSLATOR_TEXT,
  41                 B_TRANSLATOR_TEXT,
  42                 TEXT_IN_QUALITY,
  43                 TEXT_IN_CAPABILITY,
  44                 "text/plain",
  45                 "Plain text file"
  46         },
  47         {
  48                 B_STYLED_TEXT_FORMAT,
  49                 B_TRANSLATOR_TEXT,
  50                 STXT_IN_QUALITY,
  51                 STXT_IN_CAPABILITY,
  52                 "text/x-vnd.Be-stxt",
  53                 "Be styled text file"
  54         }
  55 };
  56
  57 // The output formats that this translator supports.
  58 static const translation_format sOutputFormats[] = {
  59         {
  60                 B_TRANSLATOR_TEXT,
  61                 B_TRANSLATOR_TEXT,
  62                 TEXT_OUT_QUALITY,
  63                 TEXT_OUT_CAPABILITY,
  64                 "text/plain",
  65                 "Plain text file"
  66         },
  67         {
  68                 B_STYLED_TEXT_FORMAT,
  69                 B_TRANSLATOR_TEXT,
  70                 STXT_OUT_QUALITY,
  71                 STXT_OUT_CAPABILITY,
  72                 "text/x-vnd.Be-stxt",
  73                 "Be styled text file"
  74         }
  75 };
  76
  77 // Default settings for the Translator
  78 static const TranSetting sDefaultSettings[] = {
  79         {B_TRANSLATOR_EXT_HEADER_ONLY, TRAN_SETTING_BOOL, false},
  80         {B_TRANSLATOR_EXT_DATA_ONLY, TRAN_SETTING_BOOL, false}
  81 };
  82
  83 const uint32 kNumInputFormats = sizeof(sInputFormats) / sizeof(translation_format);
  84 const uint32 kNumOutputFormats = sizeof(sOutputFormats) / sizeof(translation_format);
  85 const uint32 kNumDefaultSettings = sizeof(sDefaultSettings) / sizeof(TranSetting);
  86
  87 // ---------------------------------------------------------------
  88 // make_nth_translator
  89 //
  90 // Creates a STXTTranslator object to be used by BTranslatorRoster
  91 //
  92 // Preconditions:
  93 //
  94 // Parameters: n,               The translator to return. Since
  95 //                                              STXTTranslator only publishes one
  96 //                                              translator, it only returns a
  97 //                                              STXTTranslator if n == 0
  98 //
  99 //             you,     The image_id of the add-on that
 100 //                                              contains code (not used).
 101 //
 102 //             flags,   Has no meaning yet, should be 0.
 103 //
 104 // Postconditions:
 105 //
 106 // Returns: NULL if n is not zero,
 107 //          a new STXTTranslator if n is zero
 108 // ---------------------------------------------------------------
 109 BTranslator *
 110 make_nth_translator(int32 n, image_id you, uint32 flags, ...)
 111 {
 112         if (!n)
 113                 return new (std::nothrow) STXTTranslator();
 114
 115         return NULL;
 116 }
 117
 118
 119 // #pragma mark - ascmagic.c from the BSD file tool
 120 /*
 121  * The following code has been taken from version 4.17 of the BSD file tool,
 122  * file ascmagic.c, modified for our purpose.
 123  */
 124
 125 /*
 126  * Copyright (c) Ian F. Darwin 1986-1995.
 127  * Software written by Ian F. Darwin and others;
 128  * maintained 1995-present by Christos Zoulas and others.
 129  *
 130  * Redistribution and use in source and binary forms, with or without
 131  * modification, are permitted provided that the following conditions
 132  * are met:
 133  * 1. Redistributions of source code must retain the above copyright
 134  *    notice immediately at the beginning of the file, without modification,
 135  *    this list of conditions, and the following disclaimer.
 136  * 2. Redistributions in binary form must reproduce the above copyright
 137  *    notice, this list of conditions and the following disclaimer in the
 138  *    documentation and/or other materials provided with the distribution.
 139  *
 140  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 141  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 142  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 143  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 144  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 145  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 146  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 147  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 148  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 149  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 150  * SUCH DAMAGE.
 151  */
 152 /*
 153  * ASCII magic -- file types that we know based on keywords
 154  * that can appear anywhere in the file.
 *
 169  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
 170  * to handle character codes other than ASCII on a unified basis.
 171  *
 172  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
 173  * international characters, now subsumed into this file.
 174  */
 175
 176 #include <stdio.h>
 177 #include <string.h>
 178 #include <memory.h>
 179 #include <ctype.h>
 180 #include <stdlib.h>
 181 #include <unistd.h>
 182 #include "names.h"
 183
 184 typedef unsigned long my_unichar;
 185
 186 #define MAXLINELEN 300  /* longest sane line length */
 187 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
 188                   || (x) == 0x85 || (x) == '\f')
 189
 190 static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *);
 191 static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *);
 192 static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *);
 193 static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *);
 194 static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *);
 195 static void from_ebcdic(const unsigned char *, size_t, unsigned char *);
 196 static int ascmatch(const unsigned char *, const my_unichar *, size_t);
 197
 198
 199 static int
 200 file_ascmagic(const unsigned char *buf, size_t nbytes, BMimeType* mimeType,
 201         const char*& encoding)
 202 {
 203         size_t i;
 204         unsigned char *nbuf = NULL;
 205         my_unichar *ubuf = NULL;
 206         size_t ulen;
 207         struct names *p;
 208         int rv = -1;
 209
 210         const char *code = NULL;
 211         encoding = NULL;
 212         const char *type = NULL;
 213         const char *subtype = NULL;
 214         const char *subtypeMimeGeneric = NULL;
 215         const char *subtypeMimeSpecific = NULL;
 216
 217         int has_escapes = 0;
 218         int has_backspace = 0;
 219         int seen_cr = 0;
 220
 221         int n_crlf = 0;
 222         int n_lf = 0;
 223         int n_cr = 0;
 224         int n_nel = 0;
 225
 226         int last_line_end = -1;
 227         int has_long_lines = 0;
 228
 229         if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
 230                 goto done;
 231         if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
 232                 goto done;
 233
 234         /*
 235          * Then try to determine whether it's any character code we can
 236          * identify.  Each of these tests, if it succeeds, will leave
 237          * the text converted into one-my_unichar-per-character Unicode in
 238          * ubuf, and the number of characters converted in ulen.
 239          */
 240         if (nbytes == 0) {
 241                 code = "UTF-8 Unicode";
 242                 encoding = NULL; // "UTF-8";
 243                 type = "text";
 244                 rv = 1;
 245         } else if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
 246                 code = "ASCII";
 247                 encoding = NULL; //"us-ascii";
 248                 type = "text";
 249                 if (nbytes == 1) {
 250                         // no further tests
 251                         rv = 1;
 252                 }
 253         } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
 254                 code = "UTF-8 Unicode";
 255                 encoding = NULL; // "UTF-8";
 256                 type = "text";
 257         } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
 258                 if (i == 1) {
 259                         code = "Little-endian UTF-16 Unicode";
 260                         encoding = "UTF-16";
 261                 } else {
 262                         code = "Big-endian UTF-16 Unicode";
 263                         encoding = "UTF-16";
 264                 }
 265
 266                 type = "character data";
 267         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
 268                 code = "ISO-8859";
 269                 type = "text";
 270                 encoding = "iso-8859-1";
 271         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
 272                 code = "Non-ISO extended-ASCII";
 273                 type = "text";
 274                 encoding = "unknown";
 275         } else {
 276                 from_ebcdic(buf, nbytes, nbuf);
 277
 278                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
 279                         code = "EBCDIC";
 280                         type = "character data";
 281                         encoding = "ebcdic";
 282                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
 283                         code = "International EBCDIC";
 284                         type = "character data";
 285                         encoding = "ebcdic";
 286                 } else {
 287                         rv = 0;
 288                         goto done;  /* doesn't look like text at all */
 289                 }
 290         }
 291
 292         if (nbytes <= 1) {
 293                 if (rv == -1)
 294                         rv = 0;
 295                 goto done;
 296         }
 297
 298         /*
 299          * for troff, look for . + letter + letter or .\";
 300          * this must be done to disambiguate tar archives' ./file
 301          * and other trash from real troff input.
 302          *
 303          * I believe Plan 9 troff allows non-ASCII characters in the names
 304          * of macros, so this test might possibly fail on such a file.
 305          */
 306         if (*ubuf == '.') {
 307                 my_unichar *tp = ubuf + 1;
 308
 309                 while (ISSPC(*tp))
 310                         ++tp;   /* skip leading whitespace */
 311                 if ((tp[0] == '\\' && tp[1] == '\"') ||
 312                     (isascii((unsigned char)tp[0]) &&
 313                      isalnum((unsigned char)tp[0]) &&
 314                      isascii((unsigned char)tp[1]) &&
 315                      isalnum((unsigned char)tp[1]) &&
 316                      ISSPC(tp[2]))) {
 317                     subtypeMimeGeneric = "text/x-source-code";
 318                         subtypeMimeSpecific = "text/troff";
 319                         subtype = "troff or preprocessor input";
 320                         goto subtype_identified;
 321                 }
 322         }
 323
 324         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
 325                 subtypeMimeGeneric = "text/x-source-code";
 326                 subtypeMimeSpecific = "text/fortran";
 327                 subtype = "fortran program";
 328                 goto subtype_identified;
 329         }
 330
 331         /* look for tokens from names.h - this is expensive! */
 332
 333         i = 0;
 334         while (i < ulen) {
 335                 size_t end;
 336
 337                 /*
 338                  * skip past any leading space
 339                  */
 340                 while (i < ulen && ISSPC(ubuf[i]))
 341                         i++;
 342                 if (i >= ulen)
 343                         break;
 344
 345                 /*
 346                  * find the next whitespace
 347                  */
 348                 for (end = i + 1; end < nbytes; end++)
 349                         if (ISSPC(ubuf[end]))
 350                                 break;
 351
 352                 /*
 353                  * compare the word thus isolated against the token list
 354                  */
 355                 for (p = names; p < names + NNAMES; p++) {
 356                         if (ascmatch((const unsigned char *)p->name, ubuf + i,
 357                             end - i)) {
 358                                 subtype = types[p->type].human;
 359                                 subtypeMimeGeneric = types[p->type].generic_mime;
 360                                 subtypeMimeSpecific = types[p->type].specific_mime;
 361                                 goto subtype_identified;
 362                         }
 363                 }
 364
 365                 i = end;
 366         }
 367
 368 subtype_identified:
 369
 370         /*
 371          * Now try to discover other details about the file.
 372          */
 373         for (i = 0; i < ulen; i++) {
 374                 if (ubuf[i] == '\n') {
 375                         if (seen_cr)
 376                                 n_crlf++;
 377                         else
 378                                 n_lf++;
 379                         last_line_end = i;
 380                 } else if (seen_cr)
 381                         n_cr++;
 382
 383                 seen_cr = (ubuf[i] == '\r');
 384                 if (seen_cr)
 385                         last_line_end = i;
 386
 387                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
 388                         n_nel++;
 389                         last_line_end = i;
 390                 }
 391
 392                 /* If this line is _longer_ than MAXLINELEN, remember it. */
 393                 if ((int)i > last_line_end + MAXLINELEN)
 394                         has_long_lines = 1;
 395
 396                 if (ubuf[i] == '\033')
 397                         has_escapes = 1;
 398                 if (ubuf[i] == '\b')
 399                         has_backspace = 1;
 400         }
 401
 402         rv = 1;
 403 done:
 404         if (nbuf)
 405                 free(nbuf);
 406         if (ubuf)
 407                 free(ubuf);
 408
 409         if (rv) {
 410                 // If we have identified the subtype, return it, otherwise just
 411                 // text/plain.
 412
 413                 bool found = false;
 414                 if (subtypeMimeSpecific != NULL) {
 415                         mimeType->SetTo(subtypeMimeSpecific);
 416                         if (mimeType->IsInstalled())
 417                                 found = true;
 418                 }
 419                 if (!found && subtypeMimeGeneric != NULL) {
 420                         mimeType->SetTo(subtypeMimeGeneric);
 421                         if (mimeType->IsInstalled())
 422                                 found = true;
 423                 }
 424                 if (!found)
 425                         mimeType->SetTo("text/plain");
 426         }
 427
 428         return rv;
 429 }
 430
 431 static int
 432 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
 433 {
 434         size_t i;
 435
 436         for (i = 0; i < ulen; i++) {
 437                 if (s[i] != us[i])
 438                         return 0;
 439         }
 440
 441         if (s[i])
 442                 return 0;
 443         else
 444                 return 1;
 445 }
 446
 447 /*
 448  * This table reflects a particular philosophy about what constitutes
 449  * "text," and there is room for disagreement about it.
 450  *
 451  * Version 3.31 of the file command considered a file to be ASCII if
 452  * each of its characters was approved by either the isascii() or
 453  * isalpha() function.  On most systems, this would mean that any
 454  * file consisting only of characters in the range 0x00 ... 0x7F
 455  * would be called ASCII text, but many systems might reasonably
 456  * consider some characters outside this range to be alphabetic,
 457  * so the file command would call such characters ASCII.  It might
 458  * have been more accurate to call this "considered textual on the
 459  * local system" than "ASCII."
 460  *
 461  * It considered a file to be "International language text" if each
 462  * of its characters was either an ASCII printing character (according
 463  * to the real ASCII standard, not the above test), a character in
 464  * the range 0x80 ... 0xFF, or one of the following control characters:
 465  * backspace, tab, line feed, vertical tab, form feed, carriage return,
 466  * escape.  No attempt was made to determine the language in which files
 467  * of this type were written.
 468  *
 469  *
 470  * The table below considers a file to be ASCII if all of its characters
 471  * are either ASCII printing characters (again, according to the X3.4
 472  * standard, not isascii()) or any of the following controls: bell,
 473  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 474  *
 475  * I include bell because some programs (particularly shell scripts)
 476  * use it literally, even though it is rare in normal text.  I exclude
 477  * vertical tab because it never seems to be used in real text.  I also
 478  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 479  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 480  * character to.  It might be more appropriate to include it in the 8859
 481  * set instead of the ASCII set, but it's got to be included in *something*
 482  * we recognize or EBCDIC files aren't going to be considered textual.
 483  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 484  * and Latin characters, so these should possibly be allowed.  But they
 485  * make a real mess on VT100-style displays if they're not paired properly,
 486  * so we are probably better off not calling them text.
 487  *
 488  * A file is considered to be ISO-8859 text if its characters are all
 489  * either ASCII, according to the above definition, or printing characters
 490  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 491  *
 492  * Finally, a file is considered to be international text from some other
 493  * character code if its characters are all either ISO-8859 (according to
 494  * the above definition) or characters in the range 0x80 ... 0x9F, which
 495  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 496  * consider to be printing characters.
 497  */
 498
 499 #define F 0   /* character never appears in text */
 500 #define T 1   /* character appears in plain ASCII text */
 501 #define I 2   /* character appears in ISO-8859 text */
 502 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
 503
 504 static char text_chars[256] = {
 505         /*                  BEL BS HT LF    FF CR    */
 506         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
 507         /*                              ESC          */
 508         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
 509         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
 510         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
 511         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
 512         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
 513         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
 514         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
 515         /*            NEL                            */
 516         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
 517         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
 518         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
 519         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
 520         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
 521         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
 522         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
 523         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
 524 };
 525
 526 static int
 527 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
 528     size_t *ulen)
 529 {
 530         int i;
 531
 532         *ulen = 0;
 533
 534         for (i = 0; i < (int)nbytes; i++) {
 535                 int t = text_chars[buf[i]];
 536
 537                 if (t != T)
 538                         return 0;
 539
 540                 ubuf[(*ulen)++] = buf[i];
 541         }
 542
 543         return 1;
 544 }
 545
 546 static int
 547 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
 548 {
 549         int i;
 550
 551         *ulen = 0;
 552
 553         for (i = 0; i < (int)nbytes; i++) {
 554                 int t = text_chars[buf[i]];
 555
 556                 if (t != T && t != I)
 557                         return 0;
 558
 559                 ubuf[(*ulen)++] = buf[i];
 560         }
 561
 562         return 1;
 563 }
 564
 565 static int
 566 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
 567     size_t *ulen)
 568 {
 569         int i;
 570
 571         *ulen = 0;
 572
 573         for (i = 0; i < (int)nbytes; i++) {
 574                 int t = text_chars[buf[i]];
 575
 576                 if (t != T && t != I && t != X)
 577                         return 0;
 578
 579                 ubuf[(*ulen)++] = buf[i];
 580         }
 581
 582         return 1;
 583 }
 584
 585 static int
 586 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
 587 {
 588         int i, n;
 589         my_unichar c;
 590         int gotone = 0;
 591
 592         *ulen = 0;
 593
 594         for (i = 0; i < (int)nbytes; i++) {
 595                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
 596                         /*
 597                          * Even if the whole file is valid UTF-8 sequences,
 598                          * still reject it if it uses weird control characters.
 599                          */
 600
 601                         if (text_chars[buf[i]] != T)
 602                                 return 0;
 603
 604                         ubuf[(*ulen)++] = buf[i];
 605                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
 606                         return 0;
 607                 } else {                           /* 11xxxxxx begins UTF-8 */
 608                         int following;
 609
 610                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
 611                                 c = buf[i] & 0x1f;
 612                                 following = 1;
 613                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
 614                                 c = buf[i] & 0x0f;
 615                                 following = 2;
 616                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
 617                                 c = buf[i] & 0x07;
 618                                 following = 3;
 619                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
 620                                 c = buf[i] & 0x03;
 621                                 following = 4;
 622                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
 623                                 c = buf[i] & 0x01;
 624                                 following = 5;
 625                         } else
 626                                 return 0;
 627
 628                         for (n = 0; n < following; n++) {
 629                                 i++;
 630                                 if (i >= (int)nbytes)
 631                                         goto done;
 632
 633                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
 634                                         return 0;
 635
 636                                 c = (c << 6) + (buf[i] & 0x3f);
 637                         }
 638
 639                         ubuf[(*ulen)++] = c;
 640                         gotone = 1;
 641                 }
 642         }
 643 done:
 644         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
 645 }
 646
 647 static int
 648 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
 649     size_t *ulen)
 650 {
 651         int bigend;
 652         int i;
 653
 654         if (nbytes < 2)
 655                 return 0;
 656
 657         if (buf[0] == 0xff && buf[1] == 0xfe)
 658                 bigend = 0;
 659         else if (buf[0] == 0xfe && buf[1] == 0xff)
 660                 bigend = 1;
 661         else
 662                 return 0;
 663
 664         *ulen = 0;
 665
 666         for (i = 2; i + 1 < (int)nbytes; i += 2) {
 667                 /* XXX fix to properly handle chars > 65536 */
 668
 669                 if (bigend)
 670                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
 671                 else
 672                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
 673
 674                 if (ubuf[*ulen - 1] == 0xfffe)
 675                         return 0;
 676                 if (ubuf[*ulen - 1] < 128 &&
 677                     text_chars[(size_t)ubuf[*ulen - 1]] != T)
 678                         return 0;
 679         }
 680
 681         return 1 + bigend;
 682 }
 683
 684 #undef F
 685 #undef T
 686 #undef I
 687 #undef X
 688
 689 /*
 690  * This table maps each EBCDIC character to an (8-bit extended) ASCII
 691  * character, as specified in the rationale for the dd(1) command in
 692  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 693  *
 694  * Unfortunately it does not seem to correspond exactly to any of the
 695  * five variants of EBCDIC documented in IBM's _Enterprise Systems
 696  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 697  * Edition, July, 1999, pp. I-1 - I-4.
 698  *
 699  * Fortunately, though, all versions of EBCDIC, including this one, agree
 700  * on most of the printing characters that also appear in (7-bit) ASCII.
 701  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 702  *
 703  * Fortunately too, there is general agreement that codes 0x00 through
 704  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 705  * remainder printing characters.
 706  *
 707  * This is sufficient to allow us to identify EBCDIC text and to distinguish
 708  * between old-style and internationalized examples of text.
 709  */
 710
 711 static unsigned char ebcdic_to_ascii[] = {
 712   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 713  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
 714 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
 715 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
 716 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
 717 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
 718 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
 719 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
 720 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
 721 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
 722 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
 723 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
 724 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
 725 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
 726 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
 727 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
 728 };
 729
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

// 16 rows of 16 entries (256 in total), one per possible input byte value.
// The table is only compiled if "notdef" is defined.
static unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
 764
 765 /*
 766  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
 767  */
 768 static void
 769 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
 770 {
 771         int i;
 772
 773         for (i = 0; i < (int)nbytes; i++) {
 774                 out[i] = ebcdic_to_ascii[buf[i]];
 775         }
 776 }
 777
 778
 779 //      #pragma mark -
 780
 781
 782 /*!
 783         Determines if the data in inSource is of the STXT format.
 784
 785         \param header the STXT stream header read in by Identify() or Translate()
 786         \param inSource the stream with the STXT data
 787         \param outInfo information about the type of data from inSource is stored here
 788         \param outType the desired output type for the data in inSource
 789         \param ptxtheader if this is not NULL, the TEXT header from
 790                 inSource is copied to it
 791 */
 792 status_t
 793 identify_stxt_header(const TranslatorStyledTextStreamHeader &header,
 794         BPositionIO *inSource, translator_info *outInfo, uint32 outType,
 795         TranslatorStyledTextTextHeader *ptxtheader = NULL)
 796 {
 797         const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
 798         const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
 799
 800         uint8 buffer[max(ktxtsize, kstylsize)];
 801
 802         // Check the TEXT header
 803         TranslatorStyledTextTextHeader txtheader;
 804         if (inSource->Read(buffer, ktxtsize) != ktxtsize)
 805                 return B_NO_TRANSLATOR;
 806
 807         memcpy(&txtheader, buffer, ktxtsize);
 808         if (swap_data(B_UINT32_TYPE, &txtheader, ktxtsize,
 809                 B_SWAP_BENDIAN_TO_HOST) != B_OK)
 810                 return B_ERROR;
 811
 812         if (txtheader.header.magic != 'TEXT'
 813                 || txtheader.header.header_size != sizeof(TranslatorStyledTextTextHeader)
 814                 || txtheader.charset != B_UNICODE_UTF8)
 815                 return B_NO_TRANSLATOR;
 816
 817         // skip the text data
 818         off_t seekresult, pos;
 819         pos = header.header.header_size + txtheader.header.header_size
 820                 + txtheader.header.data_size;
 821         seekresult = inSource->Seek(txtheader.header.data_size,
 822                 SEEK_CUR);
 823         if (seekresult < pos)
 824                 return B_NO_TRANSLATOR;
 825         if (seekresult > pos)
 826                 return B_ERROR;
 827
 828         // check the STYL header (not all STXT files have this)
 829         ssize_t read = 0;
 830         TranslatorStyledTextStyleHeader stylheader;
 831         read = inSource->Read(buffer, kstylsize);
 832         if (read < 0)
 833                 return read;
 834         if (read != kstylsize && read != 0)
 835                 return B_NO_TRANSLATOR;
 836
 837         // If there is a STYL header
 838         if (read == kstylsize) {
 839                 memcpy(&stylheader, buffer, kstylsize);
 840                 if (swap_data(B_UINT32_TYPE, &stylheader, kstylsize,
 841                         B_SWAP_BENDIAN_TO_HOST) != B_OK)
 842                         return B_ERROR;
 843
 844                 if (stylheader.header.magic != 'STYL'
 845                         || stylheader.header.header_size !=
 846                                 sizeof(TranslatorStyledTextStyleHeader))
 847                         return B_NO_TRANSLATOR;
 848         }
 849
 850         // if output TEXT header is supplied, fill it with data
 851         if (ptxtheader) {
 852                 ptxtheader->header.magic = txtheader.header.magic;
 853                 ptxtheader->header.header_size = txtheader.header.header_size;
 854                 ptxtheader->header.data_size = txtheader.header.data_size;
 855                 ptxtheader->charset = txtheader.charset;
 856         }
 857
 858         // return information about the data in the stream
 859         outInfo->type = B_STYLED_TEXT_FORMAT;
 860         outInfo->group = B_TRANSLATOR_TEXT;
 861         outInfo->quality = STXT_IN_QUALITY;
 862         outInfo->capability = STXT_IN_CAPABILITY;
 863         strlcpy(outInfo->name, B_TRANSLATE("Be styled text file"),
 864                 sizeof(outInfo->name));
 865         strcpy(outInfo->MIME, "text/x-vnd.Be-stxt");
 866
 867         return B_OK;
 868 }
 869
 870
 871 /*!
 872         Determines if the data in \a inSource is of the UTF8 plain
 873
 874         \param data buffer containing data already read (must be at
 875                 least DATA_BUFFER_SIZE bytes large)
 876         \param nread number of bytes that have already been read from the stream
 877         \param header the STXT stream header read in by Identify() or Translate()
 878         \param inSource the stream with the STXT data
 879         \param outInfo information about the type of data from inSource is stored here
 880         \param outType the desired output type for the data in inSource
 881 */
 882 status_t
 883 identify_text(uint8* data, int32 bytesRead, BPositionIO* source,
 884         translator_info* outInfo, uint32 outType, const char*& encoding)
 885 {
 886         ssize_t readLater = source->Read(data + bytesRead, DATA_BUFFER_SIZE - bytesRead);
 887         if (readLater < B_OK)
 888                 return B_NO_TRANSLATOR;
 889
 890         bytesRead += readLater;
 891
 892         // TODO: identify encoding as possible!
 893         BMimeType type;
 894         if (!file_ascmagic((const unsigned char*)data, bytesRead, &type, encoding))
 895                 return B_NO_TRANSLATOR;
 896
 897         float capability = TEXT_IN_CAPABILITY;
 898         if (bytesRead < 20)
 899                 capability = .1f;
 900
 901         // return information about the data in the stream
 902         outInfo->type = B_TRANSLATOR_TEXT;
 903         outInfo->group = B_TRANSLATOR_TEXT;
 904         outInfo->quality = TEXT_IN_QUALITY;
 905         outInfo->capability = capability;
 906
 907         char description[B_MIME_TYPE_LENGTH];
 908         if (type.GetLongDescription(description) == B_OK)
 909                 strlcpy(outInfo->name, description, sizeof(outInfo->name));
 910         else
 911                 strlcpy(outInfo->name, B_TRANSLATE("Plain text file"),
 912                         sizeof(outInfo->name));
 913
 914         //strlcpy(outInfo->MIME, type.Type(), sizeof(outInfo->MIME));
 915         strcpy(outInfo->MIME, "text/plain");
 916         return B_OK;
 917 }
 918
 919
 920 // ---------------------------------------------------------------
 921 // translate_from_stxt
 922 //
 923 // Translates the data in inSource to the type outType and stores
 924 // the translated data in outDestination.
 925 //
 926 // Preconditions:
 927 //
 928 // Parameters:  inSource,       the data to be translated
 929 //
 930 //                              outDestination, where the translated data is
 931 //                                                              put
 932 //
 933 //                              outType,        the type to convert inSource to
 934 //
 935 //                              txtheader,      the TEXT header from inSource
 936 //
 937 //
 938 // Postconditions:
 939 //
 940 // Returns: B_BAD_VALUE, if outType is invalid
 941 //
 942 // B_NO_TRANSLATOR, if this translator doesn't understand the data
 943 //
 944 // B_ERROR, if there was an error allocating memory or converting
 945 //          data
 946 //
 947 // B_OK, if all went well
 948 // ---------------------------------------------------------------
 949 status_t
 950 translate_from_stxt(BPositionIO *inSource, BPositionIO *outDestination,
 951                 uint32 outType, const TranslatorStyledTextTextHeader &txtheader)
 952 {
 953         if (inSource->Seek(0, SEEK_SET) != 0)
 954                 return B_ERROR;
 955
 956         const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
 957         const ssize_t ktxtsize = sizeof(TranslatorStyledTextTextHeader);
 958
 959         bool btoplain;
 960         if (outType == B_TRANSLATOR_TEXT)
 961                 btoplain = true;
 962         else if (outType == B_STYLED_TEXT_FORMAT)
 963                 btoplain = false;
 964         else
 965                 return B_BAD_VALUE;
 966
 967         uint8 buffer[READ_BUFFER_SIZE];
 968         ssize_t nread = 0, nwritten = 0, nreed = 0, ntotalread = 0;
 969
 970         // skip to the actual text data when outputting a
 971         // plain text file
 972         if (btoplain) {
 973                 if (inSource->Seek(kstxtsize + ktxtsize, SEEK_CUR) !=
 974                         kstxtsize + ktxtsize)
 975                         return B_ERROR;
 976         }
 977
 978         // Read data from inSource
 979         // When outputing B_TRANSLATOR_TEXT, the loop stops when all of
 980         // the text data has been read and written.
 981         // When outputting B_STYLED_TEXT_FORMAT, the loop stops when all
 982         // of the data from inSource has been read and written.
 983         if (btoplain)
 984                 nreed = min((size_t)READ_BUFFER_SIZE,
 985                         (size_t)txtheader.header.data_size - ntotalread);
 986         else
 987                 nreed = READ_BUFFER_SIZE;
 988         nread = inSource->Read(buffer, nreed);
 989         while (nread > 0) {
 990                 nwritten = outDestination->Write(buffer, nread);
 991                 if (nwritten != nread)
 992                         return B_ERROR;
 993
 994                 if (btoplain) {
 995                         ntotalread += nread;
 996                         nreed = min((size_t)READ_BUFFER_SIZE,
 997                                 (size_t)txtheader.header.data_size - ntotalread);
 998                 } else
 999                         nreed = READ_BUFFER_SIZE;
1000                 nread = inSource->Read(buffer, nreed);
1001         }
1002
1003         if (btoplain && static_cast<ssize_t>(txtheader.header.data_size) !=
1004                 ntotalread)
1005                 // If not all of the text data was able to be read...
1006                 return B_NO_TRANSLATOR;
1007         else
1008                 return B_OK;
1009 }
1010
1011 // ---------------------------------------------------------------
1012 // output_headers
1013 //
1014 // Outputs the Stream and Text headers from the B_STYLED_TEXT_FORMAT
1015 // to outDestination, setting the data_size member of the text header
1016 // to text_data_size
1017 //
1018 // Preconditions:
1019 //
1020 // Parameters:  outDestination, where the translated data is
1021 //                                                              put
1022 //
1023 //                              text_data_size, number of bytes in data section
1024 //                                                          of the TEXT header
1025 //
1026 //
1027 // Postconditions:
1028 //
1029 // Returns:
1030 //
1031 // B_ERROR, if there was an error writing to outDestination or
1032 //      an error with converting the byte order
1033 //
1034 // B_OK, if all went well
1035 // ---------------------------------------------------------------
1036 status_t
1037 output_headers(BPositionIO *outDestination, uint32 text_data_size)
1038 {
1039         const int32 kHeadersSize = sizeof(TranslatorStyledTextStreamHeader) +
1040                 sizeof(TranslatorStyledTextTextHeader);
1041         status_t result;
1042         TranslatorStyledTextStreamHeader stxtheader;
1043         TranslatorStyledTextTextHeader txtheader;
1044
1045         uint8 buffer[kHeadersSize];
1046
1047         stxtheader.header.magic = 'STXT';
1048         stxtheader.header.header_size = sizeof(TranslatorStyledTextStreamHeader);
1049         stxtheader.header.data_size = 0;
1050         stxtheader.version = 100;
1051         memcpy(buffer, &stxtheader, stxtheader.header.header_size);
1052
1053         txtheader.header.magic = 'TEXT';
1054         txtheader.header.header_size = sizeof(TranslatorStyledTextTextHeader);
1055         txtheader.header.data_size = text_data_size;
1056         txtheader.charset = B_UNICODE_UTF8;
1057         memcpy(buffer + stxtheader.header.header_size, &txtheader,
1058                 txtheader.header.header_size);
1059
1060         // write out headers in Big Endian byte order
1061         result = swap_data(B_UINT32_TYPE, buffer, kHeadersSize,
1062                 B_SWAP_HOST_TO_BENDIAN);
1063         if (result == B_OK) {
1064                 ssize_t nwritten = 0;
1065                 nwritten = outDestination->Write(buffer, kHeadersSize);
1066                 if (nwritten != kHeadersSize)
1067                         return B_ERROR;
1068                 else
1069                         return B_OK;
1070         }
1071
1072         return result;
1073 }
1074
1075 // ---------------------------------------------------------------
1076 // output_styles
1077 //
1078 // Writes out the actual style information into outDestination
1079 // using the data from pflatRunArray
1080 //
1081 // Preconditions:
1082 //
1083 // Parameters:  outDestination, where the translated data is
1084 //                                                              put
1085 //
1086 //                              text_size,              size in bytes of the text in
1087 //                                                              outDestination
1088 //
1089 //                              data_size,              size of pflatRunArray
1090 //
1091 // Postconditions:
1092 //
1093 // Returns:
1094 //
1095 // B_ERROR, if there was an error writing to outDestination or
1096 //      an error with converting the byte order
1097 //
1098 // B_OK, if all went well
1099 // ---------------------------------------------------------------
1100 status_t
1101 output_styles(BPositionIO *outDestination, uint32 text_size,
1102         uint8 *pflatRunArray, ssize_t data_size)
1103 {
1104         const ssize_t kstylsize = sizeof(TranslatorStyledTextStyleHeader);
1105
1106         uint8 buffer[kstylsize];
1107
1108         // output STYL header
1109         TranslatorStyledTextStyleHeader stylheader;
1110         stylheader.header.magic = 'STYL';
1111         stylheader.header.header_size =
1112                 sizeof(TranslatorStyledTextStyleHeader);
1113         stylheader.header.data_size = data_size;
1114         stylheader.apply_offset = 0;
1115         stylheader.apply_length = text_size;
1116
1117         memcpy(buffer, &stylheader, kstylsize);
1118         if (swap_data(B_UINT32_TYPE, buffer, kstylsize,
1119                 B_SWAP_HOST_TO_BENDIAN) != B_OK)
1120                 return B_ERROR;
1121         if (outDestination->Write(buffer, kstylsize) != kstylsize)
1122                 return B_ERROR;
1123
1124         // output actual style information
1125         if (outDestination->Write(pflatRunArray,
1126                 data_size) != data_size)
1127                 return B_ERROR;
1128
1129         return B_OK;
1130 }
1131
1132
1133 /*!
1134         Convert the plain text (UTF8) from inSource to plain or
1135         styled text in outDestination
1136 */
1137 status_t
1138 translate_from_text(BPositionIO* source, const char* encoding, bool forceEncoding,
1139         BPositionIO* destination, uint32 outType)
1140 {
1141         if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1142                 return B_BAD_VALUE;
1143
1144         // find the length of the text
1145         off_t size = source->Seek(0, SEEK_END);
1146         if (size < 0)
1147                 return (status_t)size;
1148         if (size > UINT32_MAX && outType == B_STYLED_TEXT_FORMAT)
1149                 return B_NOT_SUPPORTED;
1150
1151         status_t status = source->Seek(0, SEEK_SET);
1152         if (status < B_OK)
1153                 return status;
1154
1155         if (outType == B_STYLED_TEXT_FORMAT) {
1156                 // output styled text headers
1157                 status = output_headers(destination, (uint32)size);
1158                 if (status != B_OK)
1159                         return status;
1160         }
1161
1162         class MallocBuffer {
1163                 public:
1164                         MallocBuffer() : fBuffer(NULL), fSize(0) {}
1165                         ~MallocBuffer() { free(fBuffer); }
1166
1167                         void* Buffer() { return fBuffer; }
1168                         size_t Size() const { return fSize; }
1169
1170                         status_t
1171                         Allocate(size_t size)
1172                         {
1173                                 fBuffer = malloc(size);
1174                                 if (fBuffer != NULL) {
1175                                         fSize = size;
1176                                         return B_OK;
1177                                 }
1178                                 return B_NO_MEMORY;
1179                         }
1180
1181                 private:
1182                         void*   fBuffer;
1183                         size_t  fSize;
1184         } encodingBuffer;
1185         BMallocIO encodingIO;
1186         uint32 encodingID = 0;
1187                 // defaults to UTF-8 or no encoding
1188
1189         BNode* node = dynamic_cast<BNode*>(source);
1190         if (node != NULL) {
1191                 // determine encoding, if available
1192                 const BCharacterSet* characterSet = NULL;
1193                 bool hasAttribute = false;
1194                 if (encoding != NULL && !forceEncoding) {
1195                         BString name;
1196                         if (node->ReadAttrString("be:encoding", &name) == B_OK) {
1197                                 encoding = name.String();
1198                                 hasAttribute = true;
1199                         } else {
1200                                 int32 value;
1201                                 ssize_t bytesRead = node->ReadAttr("be:encoding", B_INT32_TYPE, 0,
1202                                         &value, sizeof(value));
1203                                 if (bytesRead == (ssize_t)sizeof(value)) {
1204                                         hasAttribute = true;
1205                                         if (value != 65535)
1206                                                 characterSet = BCharacterSetRoster::GetCharacterSetByConversionID(value);
1207                                 }
1208                         }
1209                 } else {
1210                         hasAttribute = true;
1211                                 // we don't write the encoding in this case
1212                 }
1213                 if (characterSet == NULL && encoding != NULL)
1214                         characterSet = BCharacterSetRoster::FindCharacterSetByName(encoding);
1215
1216                 if (characterSet != NULL) {
1217                         encodingID = characterSet->GetConversionID();
1218                         encodingBuffer.Allocate(READ_BUFFER_SIZE * 4);
1219                 }
1220
1221                 if (!hasAttribute && encoding != NULL) {
1222                         // add encoding attribute, so that someone opening the file can
1223                         // retrieve it for persistance
1224                         node->WriteAttr("be:encoding", B_STRING_TYPE, 0, encoding,
1225                                 strlen(encoding));
1226                 }
1227         }
1228
1229         off_t outputSize = 0;
1230         ssize_t bytesRead;
1231         int32 state = 0;
1232
1233         // output the actual text part of the data
1234         do {
1235                 uint8 buffer[READ_BUFFER_SIZE];
1236                 bytesRead = source->Read(buffer, READ_BUFFER_SIZE);
1237                 if (bytesRead < B_OK)
1238                         return bytesRead;
1239                 if (bytesRead == 0)
1240                         break;
1241
1242                 if (encodingBuffer.Size() == 0) {
1243                         // default, no encoding
1244                         ssize_t bytesWritten = destination->Write(buffer, bytesRead);
1245                         if (bytesWritten != bytesRead) {
1246                                 if (bytesWritten < B_OK)
1247                                         return bytesWritten;
1248
1249                                 return B_ERROR;
1250                         }
1251
1252                         outputSize += bytesRead;
1253                 } else {
1254                         // decode text file to UTF-8
1255                         char* pos = (char*)buffer;
1256                         int32 encodingLength = encodingIO.BufferLength();
1257                         int32 bytesLeft = bytesRead;
1258                         int32 bytes;
1259                         do {
1260                                 encodingLength = READ_BUFFER_SIZE * 4;
1261                                 bytes = bytesLeft;
1262
1263                                 status = convert_to_utf8(encodingID, pos, &bytes,
1264                                         (char*)encodingBuffer.Buffer(), &encodingLength, &state);
1265                                 if (status < B_OK)
1266                                         return status;
1267
1268                                 ssize_t bytesWritten = destination->Write(encodingBuffer.Buffer(),
1269                                         encodingLength);
1270                                 if (bytesWritten < encodingLength) {
1271                                         if (bytesWritten < B_OK)
1272                                                 return bytesWritten;
1273
1274                                         return B_ERROR;
1275                                 }
1276
1277                                 pos += bytes;
1278                                 bytesLeft -= bytes;
1279                                 outputSize += encodingLength;
1280                         } while (encodingLength > 0 && bytesLeft > 0);
1281                 }
1282         } while (bytesRead > 0);
1283
1284         if (outType != B_STYLED_TEXT_FORMAT)
1285                 return B_OK;
1286
1287         if (encodingBuffer.Size() != 0 && size != outputSize) {
1288                 if (outputSize > UINT32_MAX)
1289                         return B_NOT_SUPPORTED;
1290
1291                 // we need to update the header as the decoded text size has changed
1292                 status = destination->Seek(0, SEEK_SET);
1293                 if (status == B_OK)
1294                         status = output_headers(destination, (uint32)outputSize);
1295                 if (status == B_OK)
1296                         status = destination->Seek(0, SEEK_END);
1297
1298                 if (status < B_OK)
1299                         return status;
1300         }
1301
1302         // Read file attributes if outputting styled data
1303         // and source is a BNode object
1304
1305         if (node == NULL)
1306                 return B_OK;
1307
1308         // Try to read styles - we only propagate an error if the actual on-disk
1309         // data is likely to be okay
1310
1311         const char *kAttrName = "styles";
1312         attr_info info;
1313         if (node->GetAttrInfo(kAttrName, &info) != B_OK)
1314                 return B_OK;
1315
1316         if (info.type != B_RAW_TYPE || info.size < 160) {
1317                 // styles seem to be broken, but since we got the text,
1318                 // we don't propagate the error
1319                 return B_OK;
1320         }
1321
1322         uint8* flatRunArray = new (std::nothrow) uint8[info.size];
1323         if (flatRunArray == NULL)
1324                 return B_NO_MEMORY;
1325
1326         bytesRead = node->ReadAttr(kAttrName, B_RAW_TYPE, 0, flatRunArray, info.size);
1327         if (bytesRead != info.size)
1328                 return B_OK;
1329
1330         output_styles(destination, size, flatRunArray, info.size);
1331
1332         delete[] flatRunArray;
1333         return B_OK;
1334 }
1335
1336
1337 //      #pragma mark -
1338
1339
/*!
        Constructs the translator. Everything is handed to the BaseTranslator
        constructor: two localized strings describing the translator, its
        version, the input and output format tables with their counts,
        "STXTTranslator_Settings" (presumably the name of the settings file) with
        the default settings and their count, and finally B_TRANSLATOR_TEXT and
        B_STYLED_TEXT_FORMAT.
*/
STXTTranslator::STXTTranslator()
        : BaseTranslator(B_TRANSLATE("StyledEdit files"),
                B_TRANSLATE("StyledEdit file translator"),
                STXT_TRANSLATOR_VERSION,
                sInputFormats, kNumInputFormats,
                sOutputFormats, kNumOutputFormats,
                "STXTTranslator_Settings",
                sDefaultSettings, kNumDefaultSettings,
                B_TRANSLATOR_TEXT, B_STYLED_TEXT_FORMAT)
{
}
1351
1352
/*!
        Destructor; this class has nothing of its own to release here.
*/
STXTTranslator::~STXTTranslator()
{
}
1356
1357
1358 status_t
1359 STXTTranslator::Identify(BPositionIO *inSource,
1360         const translation_format *inFormat, BMessage *ioExtension,
1361         translator_info *outInfo, uint32 outType)
1362 {
1363         if (!outType)
1364                 outType = B_TRANSLATOR_TEXT;
1365         if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1366                 return B_NO_TRANSLATOR;
1367
1368         const ssize_t kstxtsize = sizeof(TranslatorStyledTextStreamHeader);
1369
1370         uint8 buffer[DATA_BUFFER_SIZE];
1371         status_t nread = 0;
1372         // Read in the header to determine
1373         // if the data is supported
1374         nread = inSource->Read(buffer, kstxtsize);
1375         if (nread < 0)
1376                 return nread;
1377
1378         // read in enough data to fill the stream header
1379         if (nread == kstxtsize) {
1380                 TranslatorStyledTextStreamHeader header;
1381                 memcpy(&header, buffer, kstxtsize);
1382                 if (swap_data(B_UINT32_TYPE, &header, kstxtsize,
1383                                 B_SWAP_BENDIAN_TO_HOST) != B_OK)
1384                         return B_ERROR;
1385
1386                 if (header.header.magic == B_STYLED_TEXT_FORMAT
1387                         && header.header.header_size == (int32)kstxtsize
1388                         && header.header.data_size == 0
1389                         && header.version == 100)
1390                         return identify_stxt_header(header, inSource, outInfo, outType);
1391         }
1392
1393         // if the data is not styled text, check if it is plain text
1394         const char* encoding;
1395         return identify_text(buffer, nread, inSource, outInfo, outType, encoding);
1396 }
1397
1398
1399 status_t
1400 STXTTranslator::Translate(BPositionIO* source, const translator_info* info,
1401         BMessage* ioExtension, uint32 outType, BPositionIO* outDestination)
1402 {
1403         if (!outType)
1404                 outType = B_TRANSLATOR_TEXT;
1405         if (outType != B_TRANSLATOR_TEXT && outType != B_STYLED_TEXT_FORMAT)
1406                 return B_NO_TRANSLATOR;
1407
1408         const ssize_t headerSize = sizeof(TranslatorStyledTextStreamHeader);
1409         uint8 buffer[DATA_BUFFER_SIZE];
1410         status_t result;
1411         translator_info outInfo;
1412         // Read in the header to determine
1413         // if the data is supported
1414         ssize_t bytesRead = source->Read(buffer, headerSize);
1415         if (bytesRead < 0)
1416                 return bytesRead;
1417
1418         // read in enough data to fill the stream header
1419         if (bytesRead == headerSize) {
1420                 TranslatorStyledTextStreamHeader header;
1421                 memcpy(&header, buffer, headerSize);
1422                 if (swap_data(B_UINT32_TYPE, &header, headerSize,
1423                                 B_SWAP_BENDIAN_TO_HOST) != B_OK)
1424                         return B_ERROR;
1425
1426                 if (header.header.magic == B_STYLED_TEXT_FORMAT
1427                         && header.header.header_size == sizeof(TranslatorStyledTextStreamHeader)
1428                         && header.header.data_size == 0
1429                         && header.version == 100) {
1430                         TranslatorStyledTextTextHeader textHeader;
1431                         result = identify_stxt_header(header, source, &outInfo, outType,
1432                                 &textHeader);
1433                         if (result != B_OK)
1434                                 return result;
1435
1436                         return translate_from_stxt(source, outDestination, outType, textHeader);
1437                 }
1438         }
1439
1440         // if the data is not styled text, check if it is ASCII text
1441         bool forceEncoding = false;
1442         const char* encoding = NULL;
1443         result = identify_text(buffer, bytesRead, source, &outInfo, outType, encoding);
1444         if (result != B_OK)
1445                 return result;
1446
1447         if (ioExtension != NULL) {
1448                 const char* value;
1449                 if (ioExtension->FindString("be:encoding", &value) == B_OK
1450                         && value[0]) {
1451                         // override encoding
1452                         encoding = value;
1453                         forceEncoding = true;
1454                 }
1455         }
1456
1457         return translate_from_text(source, encoding, forceEncoding, outDestination, outType);
1458 }
1459
1460
1461 BView *
1462 STXTTranslator::NewConfigView(TranslatorSettings *settings)
1463 {
1464         return new STXTView(BRect(0, 0, 225, 175),
1465                 B_TRANSLATE("STXTTranslator Settings"),
1466                 B_FOLLOW_ALL, B_WILL_DRAW, settings);
1467 }
1468