src/kits/storage/mime/TextSnifferAddon.cpp

   1 /*
   2  * Copyright 2006-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
   3  * Distributed under the terms of the MIT License.
   4  */
   5
   6
   7 #include <mime/TextSnifferAddon.h>
   8
   9 #include <MimeType.h>
  10
  11 #include <mime/DatabaseLocation.h>
  12
  13
  14 using BPrivate::Storage::Mime::DatabaseLocation;
  15
  16
  17 static int file_ascmagic(DatabaseLocation* databaseLocation,
  18         const unsigned char *buf, size_t nbytes, BMimeType* mimeType);
  19
  20
  21 namespace BPrivate {
  22 namespace Storage {
  23 namespace Mime {
  24
  25
  26 // constructor
  27 TextSnifferAddon::TextSnifferAddon(DatabaseLocation* databaseLocation)
  28         :
  29         fDatabaseLocation(databaseLocation)
  30 {
  31 }
  32
  33 // destructor
  34 TextSnifferAddon::~TextSnifferAddon()
  35 {
  36 }
  37
  38 // MinimalBufferSize
  39 size_t
  40 TextSnifferAddon::MinimalBufferSize()
  41 {
  42         return 512;
  43 }
  44
  45 // GuessMimeType
  46 float
  47 TextSnifferAddon::GuessMimeType(const char* fileName, BMimeType* type)
  48 {
  49         // we check content only
  50         return -1;
  51 }
  52
  53 // GuessMimeType
  54 float
  55 TextSnifferAddon::GuessMimeType(BFile* file, const void* buffer, int32 length,
  56         BMimeType* type)
  57 {
  58         if (file_ascmagic(fDatabaseLocation, (const unsigned char*)buffer, length,
  59                         type)) {
  60                 // If the buffer is very short, we return a lower priority. Maybe
  61                 // someone else knows better.
  62                 if (length < 20)
  63                         return .0f;
  64                 return 0.25f;
  65         }
  66
  67         return -1;
  68 }
  69
  70
  71 } // namespace Mime
  72 } // namespace Storage
  73 } // namespace BPrivate
  74
  75
  76 // #pragma mark - ascmagic.c from the BSD file tool
  77 /*
  78  * The following code has been taken from version 4.17 of the BSD file tool,
  79  * file ascmagic.c, modified for our purpose.
  80  */
  81
  82 /*
  83  * Copyright (c) Ian F. Darwin 1986-1995.
  84  * Software written by Ian F. Darwin and others;
  85  * maintained 1995-present by Christos Zoulas and others.
  86  *
  87  * Redistribution and use in source and binary forms, with or without
  88  * modification, are permitted provided that the following conditions
  89  * are met:
  90  * 1. Redistributions of source code must retain the above copyright
  91  *    notice immediately at the beginning of the file, without modification,
  92  *    this list of conditions, and the following disclaimer.
  93  * 2. Redistributions in binary form must reproduce the above copyright
  94  *    notice, this list of conditions and the following disclaimer in the
  95  *    documentation and/or other materials provided with the distribution.
  96  *
  97  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  98  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  99  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 100  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 101  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 102  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 103  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 104  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 105  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 106  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 107  * SUCH DAMAGE.
 108  */
 109 /*
 110  * ASCII magic -- file types that we know based on keywords
 111  * that can appear anywhere in the file.
 112  *
 113  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
 114  * to handle character codes other than ASCII on a unified basis.
 115  *
 116  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
 117  * international characters, now subsumed into this file.
 118  */
 119
 120 #include <stdio.h>
 121 #include <string.h>
 122 #include <memory.h>
 123 #include <ctype.h>
 124 #include <stdlib.h>
 125 #include <unistd.h>
 126 #include "names.h"
 127
 128 typedef unsigned long my_unichar;
 129
 130 #define MAXLINELEN 300  /* longest sane line length */
 131 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
 132                   || (x) == 0x85 || (x) == '\f')
 133
 134 static int looks_ascii(const unsigned char *, size_t, my_unichar *, size_t *);
 135 static int looks_utf8(const unsigned char *, size_t, my_unichar *, size_t *);
 136 static int looks_unicode(const unsigned char *, size_t, my_unichar *, size_t *);
 137 static int looks_latin1(const unsigned char *, size_t, my_unichar *, size_t *);
 138 static int looks_extended(const unsigned char *, size_t, my_unichar *, size_t *);
 139 static void from_ebcdic(const unsigned char *, size_t, unsigned char *);
 140 static int ascmatch(const unsigned char *, const my_unichar *, size_t);
 141
 142
 143 static int
 144 file_ascmagic(DatabaseLocation* databaseLocation, const unsigned char *buf,
 145         size_t nbytes, BMimeType* mimeType)
 146 {
 147         size_t i;
 148         unsigned char *nbuf = NULL;
 149         my_unichar *ubuf = NULL;
 150         size_t ulen;
 151         struct names *p;
 152         int rv = -1;
 153
 154         const char *code = NULL;
 155         const char *code_mime = NULL;
 156         const char *type = NULL;
 157         const char *subtype = NULL;
 158         const char *subtypeMimeGeneric = NULL;
 159         const char *subtypeMimeSpecific = NULL;
 160
 161         int has_escapes = 0;
 162         int has_backspace = 0;
 163         int seen_cr = 0;
 164
 165         int n_crlf = 0;
 166         int n_lf = 0;
 167         int n_cr = 0;
 168         int n_nel = 0;
 169
 170         int last_line_end = -1;
 171         int has_long_lines = 0;
 172
 173         if ((nbuf = (unsigned char*)malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
 174                 goto done;
 175         if ((ubuf = (my_unichar*)malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
 176                 goto done;
 177
 178         /*
 179          * Then try to determine whether it's any character code we can
 180          * identify.  Each of these tests, if it succeeds, will leave
 181          * the text converted into one-my_unichar-per-character Unicode in
 182          * ubuf, and the number of characters converted in ulen.
 183          */
 184         if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
 185                 code = "ASCII";
 186                 code_mime = "us-ascii";
 187                 type = "text";
 188         } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
 189                 code = "UTF-8 Unicode";
 190                 code_mime = "utf-8";
 191                 type = "text";
 192         } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
 193                 if (i == 1)
 194                         code = "Little-endian UTF-16 Unicode";
 195                 else
 196                         code = "Big-endian UTF-16 Unicode";
 197
 198                 type = "character data";
 199                 code_mime = "utf-16";    /* is this defined? */
 200         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
 201                 code = "ISO-8859";
 202                 type = "text";
 203                 code_mime = "iso-8859-1";
 204         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
 205                 code = "Non-ISO extended-ASCII";
 206                 type = "text";
 207                 code_mime = "unknown";
 208         } else {
 209                 from_ebcdic(buf, nbytes, nbuf);
 210
 211                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
 212                         code = "EBCDIC";
 213                         type = "character data";
 214                         code_mime = "ebcdic";
 215                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
 216                         code = "International EBCDIC";
 217                         type = "character data";
 218                         code_mime = "ebcdic";
 219                 } else {
 220                         rv = 0;
 221                         goto done;  /* doesn't look like text at all */
 222                 }
 223         }
 224
 225         if (nbytes <= 1) {
 226                 rv = 0;
 227                 goto done;
 228         }
 229
 230         /*
 231          * for troff, look for . + letter + letter or .\";
 232          * this must be done to disambiguate tar archives' ./file
 233          * and other trash from real troff input.
 234          *
 235          * I believe Plan 9 troff allows non-ASCII characters in the names
 236          * of macros, so this test might possibly fail on such a file.
 237          */
 238         if (*ubuf == '.') {
 239                 my_unichar *tp = ubuf + 1;
 240
 241                 while (ISSPC(*tp))
 242                         ++tp;   /* skip leading whitespace */
 243                 if ((tp[0] == '\\' && tp[1] == '\"') ||
 244                     (isascii((unsigned char)tp[0]) &&
 245                      isalnum((unsigned char)tp[0]) &&
 246                      isascii((unsigned char)tp[1]) &&
 247                      isalnum((unsigned char)tp[1]) &&
 248                      ISSPC(tp[2]))) {
 249                     subtypeMimeGeneric = "text/x-source-code";
 250                         subtypeMimeSpecific = "text/troff";
 251                         subtype = "troff or preprocessor input";
 252                         goto subtype_identified;
 253                 }
 254         }
 255
 256         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
 257                 subtypeMimeGeneric = "text/x-source-code";
 258                 subtypeMimeSpecific = "text/fortran";
 259                 subtype = "fortran program";
 260                 goto subtype_identified;
 261         }
 262
 263         /* look for tokens from names.h - this is expensive! */
 264
 265         i = 0;
 266         while (i < ulen) {
 267                 size_t end;
 268
 269                 /*
 270                  * skip past any leading space
 271                  */
 272                 while (i < ulen && ISSPC(ubuf[i]))
 273                         i++;
 274                 if (i >= ulen)
 275                         break;
 276
 277                 /*
 278                  * find the next whitespace
 279                  */
 280                 for (end = i + 1; end < nbytes; end++)
 281                         if (ISSPC(ubuf[end]))
 282                                 break;
 283
 284                 /*
 285                  * compare the word thus isolated against the token list
 286                  */
 287                 for (p = names; p < names + NNAMES; p++) {
 288                         if (ascmatch((const unsigned char *)p->name, ubuf + i,
 289                             end - i)) {
 290                                 subtype = types[p->type].human;
 291                                 subtypeMimeGeneric = types[p->type].generic_mime;
 292                                 subtypeMimeSpecific = types[p->type].specific_mime;
 293                                 goto subtype_identified;
 294                         }
 295                 }
 296
 297                 i = end;
 298         }
 299
 300         (void)code;
 301         (void)code_mime;
 302         (void)type;
 303         (void)subtype;
 304         (void)has_escapes;
 305         (void)has_backspace;
 306         (void)has_long_lines;
 307
 308 subtype_identified:
 309
 310         /*
 311          * Now try to discover other details about the file.
 312          */
 313         for (i = 0; i < ulen; i++) {
 314                 if (ubuf[i] == '\n') {
 315                         if (seen_cr)
 316                                 n_crlf++;
 317                         else
 318                                 n_lf++;
 319                         last_line_end = i;
 320                 } else if (seen_cr)
 321                         n_cr++;
 322
 323                 seen_cr = (ubuf[i] == '\r');
 324                 if (seen_cr)
 325                         last_line_end = i;
 326
 327                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
 328                         n_nel++;
 329                         last_line_end = i;
 330                 }
 331
 332                 /* If this line is _longer_ than MAXLINELEN, remember it. */
 333                 if ((int)i > last_line_end + MAXLINELEN)
 334                         has_long_lines = 1;
 335
 336                 if (ubuf[i] == '\033')
 337                         has_escapes = 1;
 338                 if (ubuf[i] == '\b')
 339                         has_backspace = 1;
 340         }
 341
 342         rv = 1;
 343 done:
 344         if (nbuf)
 345                 free(nbuf);
 346         if (ubuf)
 347                 free(ubuf);
 348
 349         if (rv) {
 350                 // If we have identified the subtype, return it, otherwise just
 351                 // text/plain.
 352
 353                 bool found = false;
 354                 if (subtypeMimeSpecific != NULL) {
 355                         if (databaseLocation->IsInstalled(subtypeMimeSpecific)) {
 356                                 mimeType->SetTo(subtypeMimeSpecific);
 357                                 found = true;
 358                         }
 359                 }
 360                 if (!found && subtypeMimeGeneric != NULL) {
 361                         if (databaseLocation->IsInstalled(subtypeMimeGeneric)) {
 362                                 mimeType->SetTo(subtypeMimeGeneric);
 363                                 found = true;
 364                         }
 365                 }
 366                 if (!found)
 367                         mimeType->SetTo("text/plain");
 368         }
 369
 370         return rv;
 371 }
 372
 373 static int
 374 ascmatch(const unsigned char *s, const my_unichar *us, size_t ulen)
 375 {
 376         size_t i;
 377
 378         for (i = 0; i < ulen; i++) {
 379                 if (s[i] != us[i])
 380                         return 0;
 381         }
 382
 383         if (s[i])
 384                 return 0;
 385         else
 386                 return 1;
 387 }
 388
 389 /*
 390  * This table reflects a particular philosophy about what constitutes
 391  * "text," and there is room for disagreement about it.
 392  *
 393  * Version 3.31 of the file command considered a file to be ASCII if
 394  * each of its characters was approved by either the isascii() or
 395  * isalpha() function.  On most systems, this would mean that any
 396  * file consisting only of characters in the range 0x00 ... 0x7F
 397  * would be called ASCII text, but many systems might reasonably
 398  * consider some characters outside this range to be alphabetic,
 399  * so the file command would call such characters ASCII.  It might
 400  * have been more accurate to call this "considered textual on the
 401  * local system" than "ASCII."
 402  *
 403  * It considered a file to be "International language text" if each
 404  * of its characters was either an ASCII printing character (according
 405  * to the real ASCII standard, not the above test), a character in
 406  * the range 0x80 ... 0xFF, or one of the following control characters:
 407  * backspace, tab, line feed, vertical tab, form feed, carriage return,
 408  * escape.  No attempt was made to determine the language in which files
 409  * of this type were written.
 410  *
 411  *
 412  * The table below considers a file to be ASCII if all of its characters
 413  * are either ASCII printing characters (again, according to the X3.4
 414  * standard, not isascii()) or any of the following controls: bell,
 415  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 416  *
 417  * I include bell because some programs (particularly shell scripts)
 418  * use it literally, even though it is rare in normal text.  I exclude
 419  * vertical tab because it never seems to be used in real text.  I also
 420  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 421  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 422  * character to.  It might be more appropriate to include it in the 8859
 423  * set instead of the ASCII set, but it's got to be included in *something*
 424  * we recognize or EBCDIC files aren't going to be considered textual.
 425  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 426  * and Latin characters, so these should possibly be allowed.  But they
 427  * make a real mess on VT100-style displays if they're not paired properly,
 428  * so we are probably better off not calling them text.
 429  *
 430  * A file is considered to be ISO-8859 text if its characters are all
 431  * either ASCII, according to the above definition, or printing characters
 432  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 433  *
 434  * Finally, a file is considered to be international text from some other
 435  * character code if its characters are all either ISO-8859 (according to
 436  * the above definition) or characters in the range 0x80 ... 0x9F, which
 437  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 438  * consider to be printing characters.
 439  */
 440
 441 #define F 0   /* character never appears in text */
 442 #define T 1   /* character appears in plain ASCII text */
 443 #define I 2   /* character appears in ISO-8859 text */
 444 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
 445
 446 static char text_chars[256] = {
 447         /*                  BEL BS HT LF    FF CR    */
 448         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
 449         /*                              ESC          */
 450         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
 451         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
 452         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
 453         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
 454         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
 455         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
 456         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
 457         /*            NEL                            */
 458         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
 459         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
 460         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
 461         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
 462         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
 463         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
 464         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
 465         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
 466 };
 467
 468 static int
 469 looks_ascii(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
 470     size_t *ulen)
 471 {
 472         int i;
 473
 474         *ulen = 0;
 475
 476         for (i = 0; i < (int)nbytes; i++) {
 477                 int t = text_chars[buf[i]];
 478
 479                 if (t != T)
 480                         return 0;
 481
 482                 ubuf[(*ulen)++] = buf[i];
 483         }
 484
 485         return 1;
 486 }
 487
 488 static int
 489 looks_latin1(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
 490 {
 491         int i;
 492
 493         *ulen = 0;
 494
 495         for (i = 0; i < (int)nbytes; i++) {
 496                 int t = text_chars[buf[i]];
 497
 498                 if (t != T && t != I)
 499                         return 0;
 500
 501                 ubuf[(*ulen)++] = buf[i];
 502         }
 503
 504         return 1;
 505 }
 506
 507 static int
 508 looks_extended(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
 509     size_t *ulen)
 510 {
 511         int i;
 512
 513         *ulen = 0;
 514
 515         for (i = 0; i < (int)nbytes; i++) {
 516                 int t = text_chars[buf[i]];
 517
 518                 if (t != T && t != I && t != X)
 519                         return 0;
 520
 521                 ubuf[(*ulen)++] = buf[i];
 522         }
 523
 524         return 1;
 525 }
 526
 527 static int
 528 looks_utf8(const unsigned char *buf, size_t nbytes, my_unichar *ubuf, size_t *ulen)
 529 {
 530         int i, n;
 531         my_unichar c;
 532         int gotone = 0;
 533
 534         *ulen = 0;
 535
 536         for (i = 0; i < (int)nbytes; i++) {
 537                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
 538                         /*
 539                          * Even if the whole file is valid UTF-8 sequences,
 540                          * still reject it if it uses weird control characters.
 541                          */
 542
 543                         if (text_chars[buf[i]] != T)
 544                                 return 0;
 545
 546                         ubuf[(*ulen)++] = buf[i];
 547                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
 548                         return 0;
 549                 } else {                           /* 11xxxxxx begins UTF-8 */
 550                         int following;
 551
 552                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
 553                                 c = buf[i] & 0x1f;
 554                                 following = 1;
 555                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
 556                                 c = buf[i] & 0x0f;
 557                                 following = 2;
 558                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
 559                                 c = buf[i] & 0x07;
 560                                 following = 3;
 561                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
 562                                 c = buf[i] & 0x03;
 563                                 following = 4;
 564                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
 565                                 c = buf[i] & 0x01;
 566                                 following = 5;
 567                         } else
 568                                 return 0;
 569
 570                         for (n = 0; n < following; n++) {
 571                                 i++;
 572                                 if (i >= (int)nbytes)
 573                                         goto done;
 574
 575                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
 576                                         return 0;
 577
 578                                 c = (c << 6) + (buf[i] & 0x3f);
 579                         }
 580
 581                         ubuf[(*ulen)++] = c;
 582                         gotone = 1;
 583                 }
 584         }
 585 done:
 586         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
 587 }
 588
 589 static int
 590 looks_unicode(const unsigned char *buf, size_t nbytes, my_unichar *ubuf,
 591     size_t *ulen)
 592 {
 593         int bigend;
 594         int i;
 595
 596         if (nbytes < 2)
 597                 return 0;
 598
 599         if (buf[0] == 0xff && buf[1] == 0xfe)
 600                 bigend = 0;
 601         else if (buf[0] == 0xfe && buf[1] == 0xff)
 602                 bigend = 1;
 603         else
 604                 return 0;
 605
 606         *ulen = 0;
 607
 608         for (i = 2; i + 1 < (int)nbytes; i += 2) {
 609                 /* XXX fix to properly handle chars > 65536 */
 610
 611                 if (bigend)
 612                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
 613                 else
 614                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
 615
 616                 if (ubuf[*ulen - 1] == 0xfffe)
 617                         return 0;
 618                 if (ubuf[*ulen - 1] < 128 &&
 619                     text_chars[(size_t)ubuf[*ulen - 1]] != T)
 620                         return 0;
 621         }
 622
 623         return 1 + bigend;
 624 }
 625
 626 #undef F
 627 #undef T
 628 #undef I
 629 #undef X
 630
 631 /*
 632  * This table maps each EBCDIC character to an (8-bit extended) ASCII
 633  * character, as specified in the rationale for the dd(1) command in
 634  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 635  *
 636  * Unfortunately it does not seem to correspond exactly to any of the
 637  * five variants of EBCDIC documented in IBM's _Enterprise Systems
 638  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 639  * Edition, July, 1999, pp. I-1 - I-4.
 640  *
 641  * Fortunately, though, all versions of EBCDIC, including this one, agree
 642  * on most of the printing characters that also appear in (7-bit) ASCII.
 643  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 644  *
 645  * Fortunately too, there is general agreement that codes 0x00 through
 646  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 647  * remainder printing characters.
 648  *
 649  * This is sufficient to allow us to identify EBCDIC text and to distinguish
 650  * between old-style and internationalized examples of text.
 651  */
 652
 653 static unsigned char ebcdic_to_ascii[] = {
 654   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 655  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
 656 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
 657 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
 658 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
 659 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
 660 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
 661 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
 662 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
 663 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
 664 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
 665 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
 666 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
 667 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
 668 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
 669 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
 670 };
 671
 672 #ifdef notdef
 673 /*
 674  * The following EBCDIC-to-ASCII table may relate more closely to reality,
 675  * or at least to modern reality.  It comes from
 676  *
 677  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 678  *
 679  * and maps the characters of EBCDIC code page 1047 (the code used for
 680  * Unix-derived software on IBM's 390 systems) to the corresponding
 681  * characters from ISO 8859-1.
 682  *
 683  * If this table is used instead of the above one, some of the special
 684  * cases for the NEL character can be taken out of the code.
 685  */
 686
 687 static unsigned char ebcdic_1047_to_8859[] = {
 688 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
 689 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
 690 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
 691 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
 692 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
 693 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
 694 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
 695 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
 696 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
 697 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
 698 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
 699 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
 700 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
 701 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
 702 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
 703 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
 704 };
 705 #endif
 706
 707 /*
 708  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
 709  */
 710 static void
 711 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
 712 {
 713         int i;
 714
 715         for (i = 0; i < (int)nbytes; i++) {
 716                 out[i] = ebcdic_to_ascii[buf[i]];
 717         }
 718 }