glue/wv1-glue.c

   1 /*
   2  * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
   3  * Microsoft Word documents).
   4  *
   5  * Copyright (C) 2004 Novell, Inc.
   6  *
   7  * Author: Veerapuram Varadhan <vvaradhan@novell.com>
   8  * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
   9  *
  10  */
  11
  12 /*
  13  * Permission is hereby granted, free of charge, to any person obtaining a
  14  * copy of this software and associated documentation files (the "Software"),
  15  * to deal in the Software without restriction, including without limitation
  16  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  17  * and/or sell copies of the Software, and to permit persons to whom the
  18  * Software is furnished to do so, subject to the following conditions:
  19  *
  20  * The above copyright notice and this permission notice shall be included in
  21  * all copies or substantial portions of the Software.
  22  *
  23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  28  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  29  * DEALINGS IN THE SOFTWARE.
  30  */
  31
  32 #include <wv.h>
  33 #include <string.h>
  34
/* Number of structural breaks (line, paragraph or section ends)
 * to accumulate in the text/hot pools before the pooled text is
 * handed over for indexing.  A larger value means fewer hand-overs
 * and therefore better indexing performance, but it also needs
 * more memory to hold the buffered text; how much depends on the
 * length of the text between two structural breaks.
 */
  43 #define BUFFERED_STRUCT_BREAK 12
  44
  45
/* Callback to handle "text" (or words) extracted out of
 * M$ Word documents.
 *
 * text:          buffer holding the extracted text/words.
 *
 * len:           length of 'text'.
 *
 * hotText:       buffer holding the "hot" text, i.e. text carrying
 *                attributes such as bold, italic, underline,
 *                superscript or subscript.
 *
 * hotLen:        length of 'hotText'.
 *
 * needStructBrk: structural-break flag supplied by the caller of
 *                the callback.
 */

typedef void (* wvTextHandlerCallback) (U8* text, int len,
                                        U8* hotText, int hotLen,
                                        U8 needStructBrk);
  58
  59 typedef struct _UserData {
  60   /* formatting variables */
  61
  62   int cFontSize;
  63   int cCol;
  64
  65   /* boolean formats */
  66   int bIsBold:1;
  67   int bIsItalic:1;
  68   int bIsStrike:1;
  69   int bIsUl:1;
  70   int bIsSup:1;
  71   int bIsSub:1;
  72   int bIsSplStyle:1;
  73   int bIgnore:1;
  74
  75   /* beagle specifc formats */
  76   U8 bIsHot;
  77
  78   /* beagle specifc formats - for partially formatted
  79    *  texts.
  80    */
  81   U8 bWasHot;
  82
  83   /* buffer to hold text */
  84   GString* txtWord;
  85
  86   /* buffer to hold hot-pool-text */
  87   GString* txtHotPool;
  88
  89   /* buffer to hold normal-pool-text */
  90   GString* txtPool;
  91
  92   /* hold number of "structural breaks" encountered
  93    * since last-update-to-filter.
  94    */
  95   short structBrkCount;
  96
  97   wvTextHandlerCallback WordHandler;
  98
  99 } UserData;
 100
 101
 102 /*
 103  * append_char: fills the txtWord buffer with the character 'ch'
 104  * converted to UTF8 encoding.  Calls the "WordHandler" for every
 105  * word/line/end of a paragraph or for every 1023 characters,
 106  * whichever comes first.
 107  *
 108  * ud : carries the UserData filled-in appropriately to hold the
 109  *      character (text) attributes.
 110  *
 111  * ch : unicode character
 112  *
 113  */
 114
 115 void
 116 append_char (UserData * ud, U16 ch)
 117 {
 118   gchar tmpBuf[64];
 119   int len = 0;
 120   U8 bNeedStructBrk = 0;
 121
 122   if (ud->bIgnore)
 123     return;
 124
 125   switch (ch) {
 126   case 0x0B: /* hard line break */
 127   case 0x0D: /* paragraph end */
 128   case 0x0C:
 129   case '\n': /* new-line */
 130     bNeedStructBrk = 1;
 131     ch = 0x00;
 132     break;
 133
 134   case 0x20: /* space */
 135       g_string_append_c (ud->txtWord, ch);
 136     break;
 137   default:
 138     len =  g_unichar_to_utf8 (ch, tmpBuf);
 139     int i;
 140     /*  FIXME: This is not good, pretty hacky code
 141      *  to get rid of unwanted characters, especially
 142      *  some graphic symbols used in a document.
 143      *  Ex: a tick mark, a smiley blah blah blah...
 144      *  in a much sane way without blocking
 145      *  printable-non-iso characters ;)
 146      */
 147     /*
 148       for (i = 0; i < len; i++)
 149       if (tmpBuf[i] > 0)
 150       g_string_append_c (ud->txtWord, tmpBuf[i]);
 151     */
 152     g_string_append_len (ud->txtWord, tmpBuf, len);
 153     break;
 154   }
 155
 156   if (ch == 0x00 || ch == 0x20) {
 157     if (ud->bWasHot)
 158       g_string_append (ud->txtHotPool, ud->txtWord->str);
 159
 160     g_string_append (ud->txtPool, ud->txtWord->str);
 161
 162     /*      printf ("TxtWord: %s\n", ud->txtWord->str);
 163             printf ("TxtPool: %s\n", ud->txtPool->str);
 164             printf ("HotTxtPool: %s\n", ud->txtHotPool->str);
 165     */
 166
 167     if (bNeedStructBrk) {
 168       g_string_append_c (ud->txtPool, '\n');
 169       g_string_append_c (ud->txtHotPool, ' ');
 170       ud->structBrkCount++;
 171     }
 172
 173     if (ud->structBrkCount >= BUFFERED_STRUCT_BREAK) {
 174       (*(ud->WordHandler))(ud->txtPool->str, ud->txtPool->len,
 175                            ud->txtHotPool->str, ud->txtHotPool->len, bNeedStructBrk);
 176       g_string_erase (ud->txtPool, 0, -1);
 177       g_string_erase (ud->txtHotPool, 0, -1);
 178       ud->structBrkCount = 0;
 179     }
 180     g_string_erase (ud->txtWord, 0, -1);
 181     ud->bWasHot = 0;
 182   }
 183 }
 184
 185 /*
 186  * fill_UserData: fills the UserData structure from the
 187  * CHP structure that represents the Character Property
 188  * Information like bold, italic, striked, underlined,
 189  * superscript, subscript, fontsize, color, fontface etc.
 190  *
 191  */
 192 void
 193 fill_UserData (UserData * ud, CHP * chp, wvParseStruct * ps)
 194 {
 195   ud->cCol = 0;
 196   if (chp->ico)
 197     ud->cCol = chp->ico - 1;
 198
 199   ud->cFontSize = chp->hps;
 200   ud->bIsBold = (chp->fBold);
 201   ud->bIsItalic = (chp->fItalic);
 202   ud->bIsUl = (chp->kul);
 203   ud->bIsStrike = (chp->fStrike);
 204   ud->bIsSup = (chp->iss == 1);
 205   ud->bIsSub = (chp->iss == 2);
 206
 207   if ((ud->bIsBold
 208        || ud->bIsItalic
 209        || ud->bIsUl
 210        || ud->bIsSup
 211        || ud->bIsSub
 212        || ud->bIsSplStyle) &&
 213       (!ud->bIgnore))
 214       ud->bIsHot = 1;
 215   else
 216     ud->bIsHot = 0;
 217 }
 218
 219 /* This is a callback that handles the individual
 220  * character that are extracted from M$ word file.
 221  */
 222 static int
 223 charProc (wvParseStruct * ps, U16 eachchar, U8 chartype, U16 lid)
 224 {
 225
 226   /* convert incoming character to unicode */
 227   if (chartype) {
 228     eachchar = wvHandleCodePage (eachchar, lid);
 229   }
 230
 231   /* take care of any oddities in Microsoft's character "encoding" */
 232   /* TODO: does the above code page handler take care of these? */
 233   if (chartype == 1 && eachchar == 146)
 234     eachchar = 39;              /* apostrophe */
 235
 236   switch (eachchar)
 237     {
 238     case 14:                    /* column break */
 239       break;
 240
 241     case 19:                    /* field begin */
 242       /* flush current text buffer */
 243       ps->fieldstate++;
 244       ps->fieldmiddle = 0;
 245       return 0;
 246     case 20:                    /* field separator */
 247       ps->fieldmiddle = 1;
 248       return 0;
 249     case 21:                    /* field end */
 250       ps->fieldstate--;
 251       ps->fieldmiddle = 0;
 252       return 0;
 253     case 7:                     /* Cell/Row mark, end of a cell/row*/
 254       eachchar = 0x20;
 255       break;
 256     default:
 257       break;
 258     }
 259
 260   if (eachchar == 0x14)
 261     return 0;
 262
 263   /* To handle partially-formatted-texts, Bug#157100,
 264    * which is applicable to all word-processor-generated
 265    * documents.
 266    *
 267    * ud->bIsHot is updated for every CHARPROPBEGIN element
 268    * ud->bWasHot is updated on reading every *word*.
 269    */
 270   UserData *ud = (UserData *) ps->userData;
 271   if (!ud->bWasHot)
 272     ud->bWasHot = ud->bIsHot;
 273
 274   append_char (ps->userData, eachchar);
 275   return 0;
 276 }
 277
 278 /* This is a callback that handles the special
 279  * character that are specific to M$ word file.
 280  */
 281 static int
 282 specCharProc (wvParseStruct * ps, U16 eachchar, CHP * achp)
 283 {
 284   Blip blip;
 285   wvStream *fil;
 286   long pos;
 287   FSPA *fspa;
 288   PICF picf;
 289   FDOA *fdoa;
 290
 291   switch (eachchar)
 292     {
 293     case 19:                    /* field begin */
 294       ps->fieldstate++;
 295       ps->fieldmiddle = 0;
 296       return 0;
 297     case 20:                    /* field separator */
 298       if (achp->fOle2)
 299         {
 300 /*        printf ("Field has an embedded OLE2 object\n"); */
 301         }
 302       ps->fieldmiddle = 1;
 303       return 0;
 304     case 21:                    /* field end */
 305       ps->fieldstate--;
 306       ps->fieldmiddle = 0;
 307       return 0;
 308     case 7:                     /* Cell/Row mark, end of a cell/row */
 309       append_char (ps->userData, 0x20);
 310       break;
 311     default:
 312       break;
 313     }
 314
 315   if (ps->fieldstate)
 316     {
 317       if (eachchar == 0x13 || eachchar == 0x14)
 318         return 0;
 319     }
 320
 321   return 0;
 322 }
 323
 324 /* This is a callback that handles the individual
 325  * elements that are marked by libwv1.
 326  */
 327
 328 static int
 329 eleProc (wvParseStruct * ps, wvTag tag, void *props, int dirty)
 330 {
 331   /* some word structures */
 332   PAP *apap;
 333   CHP *achp;
 334   SEP *asep;
 335   int iRes;
 336
 337   UserData *ud = (UserData *) ps->userData;
 338
 339   switch (tag)
 340     {
 341     case PARABEGIN:
 342       apap = (PAP *)props;
 343       switch (ps->stsh.std[apap->istd].sti) {
 344       case 29:    /* Footnote Text   */
 345       case 30:    /* Annotation text */
 346       case 31:    /* Header          */
 347       case 32:    /* Footer          */
 348       case 33:    /* Index Heading   */
 349       case 34:    /* Caption         */
 350       case 43:    /* Endnote Text    */
 351       case 62:    /* Title           */
 352       case 74:    /* Sub title       */
 353         ud->bIsSplStyle = 1;
 354         break;
 355       default:
 356         ud->bIsSplStyle = 0;
 357         break;
 358       }
 359       break;
 360
 361     case SECTIONEND:
 362       append_char (ud, '\n');
 363       break;
 364
 365     case PARAEND:               /* pretty much nothing */
 366       ud->bIsSplStyle = 0;
 367       append_char (ud, '\n');
 368       break;
 369
 370     case CHARPROPBEGIN:
 371       achp = (CHP *) props;
 372       /*      switch (ps->stsh.std[achp->istd].sti) {
 373       case 38:
 374       case 39:
 375       case 40:
 376       case 41:
 377       case 42:
 378         ud->bIgnore = 1;
 379         break;
 380       default:
 381         ud->bIgnore = 0;
 382         break;
 383       }
 384       */
 385       fill_UserData (ud, achp, ps);
 386       break;
 387
 388
 389       /* Do not call fill_UserData, as it resets the
 390        * *Hot* flag in the ud structure.
 391        */
 392     case CHARPROPEND:
 393       achp = (CHP *) props;
 394       /*fill_UserData (ud, achp, ps);*/
 395       break;
 396
 397     default:
 398       break;
 399     }
 400
 401   return 0;
 402 }
 403
 404 /* This is a callback that handles the document
 405  * level tags that are marked by libwv1.
 406  */
 407
 408 static int
 409 docProc (wvParseStruct * ps, wvTag tag)
 410 {
 411   UserData *ud = (UserData *) ps->userData;
 412   switch (tag)
 413     {
 414     case DOCEND:
 415       /* flush the text/hot pools at the EOD */
 416       ud->structBrkCount = BUFFERED_STRUCT_BREAK;
 417       append_char (ps->userData, 0x00);
 418       break;
 419
 420     default:
 421       break;
 422     }
 423
 424   return 0;
 425 }
 426
 427 /*
 428  * wv1_init (): Initialize the wv1 library
 429  * NOTE: Do not call this more than once for an application.
 430  */
 431
 432 int
 433 wv1_init ()
 434 {
 435   return (wvInit());
 436 }
 437
 438
 439 /*
 440  * wv1_glue_init_doc_parsing: Initiates the document parsing
 441  * procedure.  Sets up all the required handlers and the parser.
 442  *
 443  * fname: Name of the file to parse. (essentially a M$ word file)
 444  *
 445  * wvTextHandlerCallback: The callback routine that will be called
 446  * on extraction of each word.
 447  *
 448  * Return: 0 -> success
 449  *        -1 -> failure.
 450  */
 451
 452 int
 453 wv1_glue_init_doc_parsing (char* fname, wvTextHandlerCallback callback)
 454 {
 455   FILE *input;
 456   int ret = 0;
 457
 458   wvParseStruct ps;
 459   char *dir = NULL;
 460
 461   UserData ud;
 462
 463   input = fopen (fname, "rb");
 464   if (!input)
 465       return -1;
 466   fclose (input);
 467
 468   ret = wvInitParser (&ps, fname);
 469   if (ret & 0x8000)
 470     ret = -2;
 471   else if (ret)
 472     ret = -3;
 473
 474   if (ret) {
 475     wvOLEFree (&ps);
 476     return ret;
 477   }
 478
 479   ps.filename = fname;
 480   ps.dir = dir;
 481
 482   /* set to 0 */
 483   memset (&ud, 0, sizeof (UserData));
 484   ud.WordHandler = callback;
 485   ud.txtWord = g_string_sized_new (32);
 486   ud.txtHotPool = g_string_sized_new (1024);
 487   ud.txtPool = g_string_sized_new (1024);
 488   ps.userData = &ud;
 489
 490   wvSetElementHandler (&ps, eleProc);
 491   wvSetDocumentHandler (&ps, docProc);
 492   wvSetCharHandler (&ps, charProc);
 493   wvSetSpecialCharHandler (&ps, specCharProc);
 494
 495   wvText (&ps);
 496
 497   /* free associated memory */
 498   wvOLEFree (&ps);
 499
 500   /* free userdata memory */
 501   g_string_free (ud.txtWord, TRUE);
 502
 503   /* free text pool memory */
 504   g_string_free (ud.txtPool, TRUE);
 505
 506   /* free hot text pool memory */
 507   g_string_free (ud.txtHotPool, TRUE);
 508
 509   return 0;
 510 }
 511
 512 void *
 513 wv1_glue_get_ole_stream (const char* fname)
 514 {
 515     MsOle *ole = NULL;
 516     ms_ole_open (&ole, fname);
 517     return ((void *)ole);
 518 }
 519
 520 void *
 521 wv1_glue_get_ole_summary_stream (MsOle *stream)
 522 {
 523   MsOle *oleStream = (MsOle *)stream;
 524   MsOleSummary *summary = NULL;
 525   summary = ms_ole_summary_open (oleStream);
 526   return ((void *)summary);
 527 }
 528
 529 char *
 530 wv1_glue_get_title (MsOleSummary* smryStream)
 531 {
 532   int ret;
 533   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TITLE, &ret));
 534 }
 535
 536 char *
 537 wv1_glue_get_subject (MsOleSummary* smryStream)
 538 {
 539   int ret;
 540   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_SUBJECT, &ret));
 541 }
 542
 543 char *
 544 wv1_glue_get_author (MsOleSummary* smryStream)
 545 {
 546   int ret;
 547   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_AUTHOR, &ret));
 548 }
 549
 550 char *
 551 wv1_glue_get_keywords (MsOleSummary* smryStream)
 552 {
 553   int ret;
 554   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_KEYWORDS, &ret));
 555 }
 556
 557 char *
 558 wv1_glue_get_comments (MsOleSummary* smryStream)
 559 {
 560   int ret;
 561   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_COMMENTS, &ret));
 562 }
 563
 564 char *
 565 wv1_glue_get_template (MsOleSummary* smryStream)
 566 {
 567   int ret;
 568   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_TEMPLATE, &ret));
 569 }
 570
 571 char *
 572 wv1_glue_get_lastsavedby (MsOleSummary* smryStream)
 573 {
 574   int ret;
 575   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_LASTAUTHOR, &ret));
 576 }
 577
 578 char *
 579 wv1_glue_get_revision_number (MsOleSummary* smryStream)
 580 {
 581   int ret;
 582   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_REVNUMBER, &ret));
 583 }
 584
 585 char *
 586 wv1_glue_get_appname (MsOleSummary* smryStream)
 587 {
 588   int ret;
 589   return (ms_ole_summary_get_string (smryStream, MS_OLE_SUMMARY_APPNAME, &ret));
 590 }
 591
 592 long
 593 wv1_glue_get_page_count (MsOleSummary* smryStream)
 594 {
 595   int ret;
 596   return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_PAGECOUNT, &ret));
 597 }
 598
 599 long
 600 wv1_glue_get_word_count (MsOleSummary* smryStream)
 601 {
 602   int ret;
 603   return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_WORDCOUNT, &ret));
 604 }
 605
 606 long
 607 wv1_glue_get_character_count (MsOleSummary* smryStream)
 608 {
 609   int ret;
 610   return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_CHARCOUNT, &ret));
 611 }
 612
 613 long
 614 wv1_glue_get_security (MsOleSummary* smryStream)
 615 {
 616   int ret;
 617   return (ms_ole_summary_get_long (smryStream, MS_OLE_SUMMARY_SECURITY, &ret));
 618 }
 619
 620 short
 621 wv1_glue_get_codepage (MsOleSummary* smryStream)
 622 {
 623   int ret;
 624   return (ms_ole_summary_get_short (smryStream, MS_OLE_SUMMARY_CODEPAGE, &ret));
 625 }
 626
 627 void
 628 wv1_glue_close_stream (MsOle* oleStream, MsOleSummary* summary)
 629 {
 630     ms_ole_summary_close (summary);
 631     ms_ole_destroy (&oleStream);
 632 }