glue/wv1-glue.c

   1 /*
   2  * wv1-glue.c : A "C" wrapper for using wv1 (library to parse
   3  * Microsoft Word documents).
   4  *
   5  * Copyright (C) 2004 Novell, Inc.
   6  *
   7  * Author: Veerapuram Varadhan <vvaradhan@novell.com>
   8  * [Basic framework of this file is taken from wvRTF.c of wv-1.0]
   9  *
  10  */
  11
  12 /*
  13  * Permission is hereby granted, free of charge, to any person obtaining a
  14  * copy of this software and associated documentation files (the "Software"),
  15  * to deal in the Software without restriction, including without limitation
  16  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  17  * and/or sell copies of the Software, and to permit persons to whom the
  18  * Software is furnished to do so, subject to the following conditions:
  19  *
  20  * The above copyright notice and this permission notice shall be included in
  21  * all copies or substantial portions of the Software.
  22  *
  23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  25  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  28  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  29  * DEALINGS IN THE SOFTWARE.
  30  */
  31
  32 #include <wv.h>
  33 #include <string.h>
  34
/* Number of structural breaks (line, paragraph or section ends)
 * to accumulate in the text and hot-text pools before the pools
 * are handed to the WordHandler callback.  A larger value means
 * fewer callback invocations, at the cost of holding more text
 * in memory; how much depends on the length of each
 * structurally-broken chunk.
 */
#define BUFFERED_STRUCT_BREAK 12
  44
  45
/* Callback that receives the text extracted from a
 * Microsoft Word document.
 *
 * text:          buffer holding the extracted text.
 * len:           number of bytes in "text".
 * hotText:       buffer holding the part of the extracted text
 *                that was flagged as "hot" (bold, italic,
 *                underline, superscript, subscript or a special
 *                paragraph style).
 * hotLen:        number of bytes in "hotText".
 * needStructBrk: non-zero when the character that triggered the
 *                call was a structural break (line, paragraph or
 *                section end, or the end of the document).
 */

typedef void (* wvTextHandlerCallback) (U8* text, int len,
                                        U8* hotText, int hotLen,
                                        U8 needStructBrk);
  58
  59 typedef struct _UserData {
  60   /* formatting variables */
  61
  62   int cFontSize;
  63   int cCol;
  64
  65   /* boolean formats */
  66   int bIsBold:1;
  67   int bIsItalic:1;
  68   int bIsStrike:1;
  69   int bIsUl:1;
  70   int bIsSup:1;
  71   int bIsSub:1;
  72   int bIsSplStyle:1;
  73   int bIgnore:1;
  74
  75   /* beagle specifc formats */
  76   U8 bIsHot;
  77
  78   /* beagle specifc formats - for partially formatted
  79    *  texts.
  80    */
  81   U8 bWasHot:1;
  82
  83   /*
  84     specifies end of para, used to send data to managed code
  85   */
  86   int bParaEnd:1;
  87
  88   /* buffer to hold text */
  89   GString* txtWord;
  90
  91   /* buffer to hold hot-pool-text */
  92   GString* txtHotPool;
  93
  94   /* buffer to hold normal-pool-text */
  95   GString* txtPool;
  96
  97   /* hold number of "structural breaks" encountered
  98    * since last-update-to-filter.
  99    */
 100   short structBrkCount;
 101
 102   wvTextHandlerCallback WordHandler;
 103
 104 } UserData;
 105
 106
 107 /*
 108  * append_char: fills the txtWord buffer with the character 'ch'
 109  * converted to UTF8 encoding.  Calls the "WordHandler" for every
 110  * word/line/end of a paragraph or for every 1023 characters,
 111  * whichever comes first.
 112  *
 113  * ud : carries the UserData filled-in appropriately to hold the
 114  *      character (text) attributes.
 115  *
 116  * ch : unicode character
 117  *
 118  */
 119
 120 void
 121 append_char (UserData * ud, U16 ch)
 122 {
 123   gchar tmpBuf[64];
 124   int len = 0;
 125   U8 bNeedStructBrk = 0;
 126
 127   if (ud->bIgnore)
 128     return;
 129
 130   switch (ch) {
 131   case 0x00: /* End of Document */
 132     bNeedStructBrk = 1;
 133     break;
 134
 135   case 0x0B: /* hard line break */
 136   case 0x0D: /* paragraph end */
 137   case 0x0C:
 138   case '\n': /* new-line */
 139     bNeedStructBrk = 1;
 140     ch = 0x0A;
 141     break;
 142
 143   case 0x20: /* space */
 144     g_string_append_c (ud->txtWord, ch);
 145     break;
 146
 147   default:
 148     len =  g_unichar_to_utf8 (ch, tmpBuf);
 149     /*  FIXME: This is not good, pretty hacky code
 150      *  to get rid of unwanted characters, especially
 151      *  some graphic symbols used in a document.
 152      *  Ex: a tick mark, a smiley blah blah blah...
 153      *  in a much sane way without blocking
 154      *  printable-non-iso characters ;)
 155      */
 156     /*
 157       int i;
 158       for (i = 0; i < len; i++)
 159       if (tmpBuf[i] > 0)
 160       g_string_append_c (ud->txtWord, tmpBuf[i]);
 161     */
 162     g_string_append_len (ud->txtWord, tmpBuf, len);
 163     break;
 164   }
 165
 166   if (ch == 0x00 || ch == 0x20 || ch == 0x0A) {
 167     if (ud->bWasHot)
 168       g_string_append_len (ud->txtHotPool, ud->txtWord->str, ud->txtWord->len);
 169
 170     /*
 171     printf ("TxtWord: %s, len: %d\n", ud->txtWord->str, ud->txtWord->len);
 172     printf ("TxtPool: %s, len: %d\n", ud->txtPool->str, ud->txtPool->len);
 173     printf ("HotTxtPool: %s, len: %d\n", ud->txtHotPool->str, ud->txtHotPool->len);
 174     */
 175
 176     g_string_append_len (ud->txtPool, ud->txtWord->str, ud->txtWord->len);
 177     if (bNeedStructBrk) {
 178       g_string_append_c (ud->txtPool, '\n');
 179       g_string_append_c (ud->txtHotPool, ' ');
 180       ud->structBrkCount++;
 181     }
 182
 183     if (ud->structBrkCount >= BUFFERED_STRUCT_BREAK ||
 184         ud->bParaEnd) {
 185       (*(ud->WordHandler))(ud->txtPool->str, ud->txtPool->len,
 186                            ud->txtHotPool->str, ud->txtHotPool->len, bNeedStructBrk);
 187       /*
 188          g_string_erase () can be used here to erase
 189          the previous content, however, using this
 190          call will free the "erased-content-memory"
 191          and thereby causing memory fragmentation for
 192          every time we transfer data from unmanaged
 193          to managed code.  Setting "len" to 0 results
 194          in the same way g_string_erase () does, but
 195          doesn't do memory-[de/re]allocation
 196       */
 197
 198       /*
 199         ch == 0x00 refers to EOD.  Do not reset len to
 200         zero, we have to free the gstrings.
 201        */
 202       if (ch != 0x00) {
 203         ud->txtPool->len = 0;
 204         ud->txtHotPool->len = 0;
 205         ud->structBrkCount = 0;
 206       }
 207     }
 208     if (ch != 0x00)
 209       ud->txtWord->len = 0;
 210     ud->bWasHot = 0;
 211   }
 212 }
 213
 214 /*
 215  * fill_UserData: fills the UserData structure from the
 216  * CHP structure that represents the Character Property
 217  * Information like bold, italic, striked, underlined,
 218  * superscript, subscript, fontsize, color, fontface etc.
 219  *
 220  */
 221 void
 222 fill_UserData (UserData * ud, CHP * chp, wvParseStruct * ps)
 223 {
 224   if (!chp || !ud)
 225     return;
 226
 227   ud->cCol = 0;
 228   if (chp->ico)
 229     ud->cCol = chp->ico - 1;
 230
 231   ud->cFontSize = chp->hps;
 232   ud->bIsBold = (chp->fBold);
 233   ud->bIsItalic = (chp->fItalic);
 234   ud->bIsUl = (chp->kul);
 235   ud->bIsStrike = (chp->fStrike);
 236   ud->bIsSup = (chp->iss == 1);
 237   ud->bIsSub = (chp->iss == 2);
 238
 239   if ((ud->bIsBold
 240        || ud->bIsItalic
 241        || ud->bIsUl
 242        || ud->bIsSup
 243        || ud->bIsSub
 244        || ud->bIsSplStyle) &&
 245       (!ud->bIgnore))
 246       ud->bIsHot = 1;
 247   else
 248     ud->bIsHot = 0;
 249 }
 250
 251 /* This is a callback that handles the individual
 252  * character that are extracted from M$ word file.
 253  */
 254 static int
 255 charProc (wvParseStruct * ps, U16 eachchar, U8 chartype, U16 lid)
 256 {
 257   /* convert incoming character to unicode */
 258   if (chartype) {
 259     eachchar = wvHandleCodePage (eachchar, lid);
 260   }
 261
 262   /* take care of any oddities in Microsoft's character "encoding" */
 263   /* TODO: does the above code page handler take care of these? */
 264   if (chartype == 1 && eachchar == 146)
 265     eachchar = 39;              /* apostrophe */
 266
 267   switch (eachchar)
 268     {
 269     case 14:                    /* column break */
 270       break;
 271
 272     case 19:                    /* field begin */
 273       /* flush current text buffer */
 274       ps->fieldstate++;
 275       ps->fieldmiddle = 0;
 276       return 0;
 277     case 20:                    /* field separator */
 278       ps->fieldmiddle = 1;
 279       return 0;
 280     case 21:                    /* field end */
 281       ps->fieldstate--;
 282       ps->fieldmiddle = 0;
 283       return 0;
 284     case 7:                     /* Cell/Row mark, end of a cell/row*/
 285       eachchar = 0x20;
 286       break;
 287     default:
 288       break;
 289     }
 290
 291   if (eachchar == 0x14)
 292     return 0;
 293
 294   /* To handle partially-formatted-texts, Bug#157100,
 295    * which is applicable to all word-processor-generated
 296    * documents.
 297    *
 298    * ud->bIsHot is updated for every CHARPROPBEGIN element
 299    * ud->bWasHot is updated on reading every *word*.
 300    */
 301   UserData *ud = (UserData *) ps->userData;
 302   if (!ud->bWasHot)
 303     ud->bWasHot = ud->bIsHot;
 304
 305   append_char (ps->userData, eachchar);
 306   return 0;
 307 }
 308
 309 /* This is a callback that handles the special
 310  * character that are specific to M$ word file.
 311  */
 312 static int
 313 specCharProc (wvParseStruct * ps, U16 eachchar, CHP * achp)
 314 {
 315   Blip blip;
 316   wvStream *fil;
 317   long pos;
 318   FSPA *fspa;
 319   PICF picf;
 320   FDOA *fdoa;
 321
 322   switch (eachchar)
 323     {
 324     case 19:                    /* field begin */
 325       ps->fieldstate++;
 326       ps->fieldmiddle = 0;
 327       return 0;
 328     case 20:                    /* field separator */
 329       if (achp->fOle2)
 330         {
 331 /*        printf ("Field has an embedded OLE2 object\n"); */
 332         }
 333       ps->fieldmiddle = 1;
 334       return 0;
 335     case 21:                    /* field end */
 336       ps->fieldstate--;
 337       ps->fieldmiddle = 0;
 338       return 0;
 339     case 7:                     /* Cell/Row mark, end of a cell/row */
 340       append_char (ps->userData, 0x20);
 341       break;
 342     default:
 343       break;
 344     }
 345
 346   if (ps->fieldstate)
 347     {
 348       if (eachchar == 0x13 || eachchar == 0x14)
 349         return 0;
 350     }
 351
 352   return 0;
 353 }
 354
 355 /* This is a callback that handles the individual
 356  * elements that are marked by libwv1.
 357  */
 358
 359 static int
 360 eleProc (wvParseStruct * ps, wvTag tag, void *props, int dirty)
 361 {
 362   /* some word structures */
 363   PAP *apap;
 364   CHP *achp;
 365   SEP *asep;
 366   int iRes;
 367
 368   UserData *ud = (UserData *) ps->userData;
 369
 370   switch (tag)
 371     {
 372     case PARABEGIN:
 373       apap = (PAP *)props;
 374       switch (ps->stsh.std[apap->istd].sti) {
 375       case 29:    /* Footnote Text   */
 376       case 30:    /* Annotation text */
 377       case 31:    /* Header          */
 378       case 32:    /* Footer          */
 379       case 33:    /* Index Heading   */
 380       case 34:    /* Caption         */
 381       case 43:    /* Endnote Text    */
 382       case 62:    /* Title           */
 383       case 74:    /* Sub title       */
 384         ud->bIsSplStyle = 1;
 385         break;
 386       default:
 387         ud->bIsSplStyle = 0;
 388         break;
 389       }
 390       ud->bParaEnd = 0;
 391       break;
 392
 393     case SECTIONEND:
 394       append_char (ud, '\n');
 395       break;
 396
 397     case PARAEND:               /* pretty much nothing */
 398       ud->bIsSplStyle = 0;
 399       ud->bParaEnd = 1;
 400       append_char (ud, '\n');
 401       break;
 402
 403     case CHARPROPBEGIN:
 404       achp = (CHP *) props;
 405       /*      switch (ps->stsh.std[achp->istd].sti) {
 406       case 38:
 407       case 39:
 408       case 40:
 409       case 41:
 410       case 42:
 411         ud->bIgnore = 1;
 412         break;
 413       default:
 414         ud->bIgnore = 0;
 415         break;
 416       }
 417       */
 418       fill_UserData (ud, achp, ps);
 419       break;
 420
 421
 422       /* Do not call fill_UserData, as it resets the
 423        * *Hot* flag in the ud structure.
 424        */
 425     case CHARPROPEND:
 426       achp = (CHP *) props;
 427       /*fill_UserData (ud, achp, ps);*/
 428       break;
 429
 430     default:
 431       break;
 432     }
 433
 434   return 0;
 435 }
 436
 437 /* This is a callback that handles the document
 438  * level tags that are marked by libwv1.
 439  */
 440
 441 static int
 442 docProc (wvParseStruct * ps, wvTag tag)
 443 {
 444   UserData *ud = (UserData *) ps->userData;
 445
 446   switch (tag)
 447     {
 448     case DOCEND:
 449       /* flush the text/hot pools at the EOD */
 450       ud->structBrkCount = BUFFERED_STRUCT_BREAK;
 451       append_char (ps->userData, 0x00);
 452
 453       break;
 454
 455     default:
 456       break;
 457     }
 458
 459   return 0;
 460 }
 461
 462 /*
 463  * wv1_glue_init_doc_parsing: Initiates the document parsing
 464  * procedure.  Sets up all the required handlers and the parser.
 465  *
 466  * fname: Name of the file to parse. (essentially a M$ word file)
 467  *
 468  * wvTextHandlerCallback: The callback routine that will be called
 469  * on extraction of each word.
 470  *
 471  * Return: 0 -> success
 472  *        -1 -> failure.
 473  */
 474
 475 int
 476 wv1_glue_init_doc_parsing (char* fname, wvTextHandlerCallback callback)
 477 {
 478   FILE *input;
 479   int ret = 0;
 480
 481   wvParseStruct ps;
 482   char *dir = NULL;
 483
 484   UserData ud;
 485
 486   input = fopen (fname, "rb");
 487   if (!input)
 488       return -1;
 489   fclose (input);
 490
 491   ret = wvInitParser (&ps, fname);
 492   if (ret & 0x8000)
 493     ret = -2;
 494   else if (ret)
 495     ret = -3;
 496
 497   if (ret) {
 498     wvOLEFree (&ps);
 499     return ret;
 500   }
 501
 502   ps.filename = fname;
 503   ps.dir = dir;
 504
 505   /* set to 0 */
 506   memset (&ud, 0, sizeof (UserData));
 507   ud.WordHandler = callback;
 508   ud.txtWord = g_string_sized_new (32);
 509   ud.txtHotPool = g_string_sized_new (1024);
 510   ud.txtPool = g_string_sized_new (1024);
 511   ps.userData = &ud;
 512
 513   wvSetElementHandler (&ps, eleProc);
 514   wvSetDocumentHandler (&ps, docProc);
 515   wvSetCharHandler (&ps, charProc);
 516   wvSetSpecialCharHandler (&ps, specCharProc);
 517
 518   wvText (&ps);
 519
 520   /* free userdata memory */
 521   g_string_free (ud.txtWord, TRUE);
 522
 523   /* free text pool memory */
 524   g_string_free (ud.txtPool, TRUE);
 525
 526   /* free hot text pool memory */
 527   g_string_free (ud.txtHotPool, TRUE);
 528
 529   /* free associated memory */
 530   wvOLEFree (&ps);
 531
 532   ud.txtPool = NULL;
 533   ud.txtWord = NULL;
 534   ud.txtHotPool = NULL;
 535
 536   return 0;
 537 }
 538
 539 /*
 540  * wv1_init (): Initialize the wv1 library
 541  * NOTE: Do not call this more than once for an application.
 542  */
 543
 544 int
 545 wv1_init ()
 546 {
 547   return (wvInit());
 548 }
 549