components/library/indri/patches/pia.patch

   1 Add our PIA wrapper to indri sources. This patch does several things:
   2  - Add pia wrapper sources to indri source tree
   3  - Add new tokenizer which does not treat '_' as a separator
   4    - TextTokenizerPIA.l differs from TextTokenizer.l in only a single character:
   5       -[a-zA-Z0-9']+  { byte_position += tokleng; return ASCII_TOKEN; }
   6       +[a-zA-Z0-9_']+ { byte_position += tokleng; return ASCII_TOKEN; }
   7    - plus many symbol renames so that the parsers can coexist (toktext -> piatoktext etc.)
   8    - TextTokenizerPIA.hpp contains only symbol renames
   9  - Rest are modifications to make indri build PIA wrapper
  10
  11
  12 --- indri-5.4/pia_wrapper.cpp   po Ärc 15 14:30:41 2013
  13 +++ indri-5.4/pia_wrapper.cpp   po Ärc 15 14:29:09 2013
  14 @@ -0,0 +1,222 @@
  15 +/*
  16 + * TO compile :
  17 + *      g++ -o libpia_wrapper.so -shared -fPIC -I../vlad-libs/sparc/usr/include/ -L../vlad-libs/sparc/usr/lib/ -lclucene-core -lnvpair pia_wrapper.cc
  18 + *
  19 + */
  20 +
  21 +#include <sys/stat.h>
  22 +#include <strings.h>
  23 +#include <stdio.h>
  24 +#include <libnvpair.h>
  25 +
  26 +#include <iostream>
  27 +#include <string>
  28 +#include <sstream>
  29 +#include <fstream>
  30 +
  31 +#include <vector>
  32 +#include "indri/QueryEnvironment.hpp"
  33 +#include "indri/SnippetBuilder.hpp"
  34 +#include "indri/Repository.hpp"
  35 +
  36 +using namespace std;
  37 +
  38 +using namespace indri::api;
  39 +
  40 +#define MAX_RESULTS 3
  41 +#define PIA_DATABASE "/var/db/piadb"
  42 +#define PIA_DATABASE_STORAGE PIA_DATABASE "/collection/storage"
  43 +
  44 +indri::collection::Repository repository;
  45 +
  46 +std::string
  47 +getFieldText(int documentID, std::string field) {
  48 +       /*
  49 +        * Rebuild the text of `field` in document `documentID` from the
  50 +        * term list of the repository's first index. Returns "" when the
  51 +        * field id is below 1 or the document has no term list.
  52 +        */
  53 +       indri::collection::Repository::index_state indexes = repository.indexes();
  54 +       indri::index::Index *index = (*indexes)[0];
  55 +
  56 +       const int wantedField = index->field(field);
  57 +       if (wantedField < 1) {
  58 +               return "";
  59 +       }
  60 +
  61 +       const indri::index::TermList *docTerms = index->termList(documentID);
  62 +       if (docTerms == NULL) {
  63 +               return "";
  64 +       }
  65 +
  66 +       std::string text;
  67 +       indri::utility::greedy_vector< indri::index::FieldExtent > extents = docTerms->fields();
  68 +
  69 +       for (indri::utility::greedy_vector< indri::index::FieldExtent >::iterator ext = extents.begin();
  70 +           ext != extents.end(); ext++) {
  71 +               if ((*ext).id != wantedField) {
  72 +                       continue;
  73 +               }
  74 +
  75 +               /* Positions run from begin (inclusive) to end (exclusive). */
  76 +               const int lastTerm = (*ext).end;
  77 +               for (int pos = (*ext).begin; pos < lastTerm; pos++) {
  78 +                       int termID = docTerms->terms()[pos];
  79 +                       text += index->term(termID);
  80 +                       text += " ";    /* every term is followed by one space */
  81 +               }
  82 +       }
  83 +
  84 +       /* release the term list obtained above */
  85 +       delete docTerms;
  86 +       return text;
  87 +}
  88 +
  89 +/* Free `count` nvlists held in `list`, then the array itself; NULL is accepted. */
  90 +static void
  91 +free_result_array (nvlist_t **list, size_t count) {
  92 +       for (size_t i = 0; list != NULL && i < count; i++) {
  93 +               nvlist_free(list[i]);
  94 +       }
  95 +       free(list);
  96 +}
  97 +
  98 +/*
  99 + * Run the panic stack stored under "stack" in search_params as an Indri
 100 + * "#combine" query against PIA_DATABASE. The returned nvlist has a "results"
 101 + * array of at most MAX_RESULTS entries (pia-bugid, pia-stack, pia-signature,
 102 + * pia-score). If Indri throws, it holds one entry: "error" = "Indri Error".
 103 + * Returns NULL on failure, with *errmsg set to a strdup()'ed message.
 104 + */
 105 +nvlist *
 106 +search (nvlist_t *search_params, char **errmsg) {
 107 +       const char *index_path = PIA_DATABASE;
 108 +       const char *failure = NULL;             /* first failure message, if any */
 109 +       nvlist_t **nvl_list_result = NULL;
 110 +       nvlist_t *results = NULL;
 111 +       size_t allocated = 0;                   /* entries of nvl_list_result to free */
 112 +       bool repository_open = false;
 113 +       bool indri_failed = false;
 114 +
 115 +       if (nvlist_alloc(&results, NV_UNIQUE_NAME, 0) != 0) {
 116 +               *errmsg = strdup("nvlist_alloc failed\n");
 117 +               return NULL;
 118 +       }
 119 +
 120 +       try {
 121 +               char *panicstack = NULL;        /* stays NULL if the lookup fails */
 122 +               if (nvlist_lookup_string(search_params, "stack", &panicstack) != 0 || panicstack == NULL) {
 123 +                       nvlist_free(results);
 124 +                       *errmsg = strdup("Lookup of search stack failed\n");
 125 +                       return NULL;
 126 +               }
 127 +
 128 +               QueryEnvironment indriEnvironment;
 129 +               indriEnvironment.addIndex(index_path);
 130 +
 131 +               /* Create Indri query */
 132 +               std::string query = "#combine (" + std::string(panicstack) + ")";
 133 +
 134 +               /* The annotation is ours to delete: copy the results out, then release it. */
 135 +               QueryAnnotation *QAresults=indriEnvironment.runAnnotatedQuery(query.c_str(), MAX_RESULTS);
 136 +               std::vector<indri::api::ScoredExtentResult> resultVector=QAresults->getResults();
 137 +               delete QAresults;
 138 +
 139 +               size_t results_to_return = (resultVector.size() < MAX_RESULTS) ? resultVector.size() : MAX_RESULTS;
 140 +
 141 +               /* Open Repository */
 142 +               repository.openRead(index_path);
 143 +               repository_open = true;
 144 +
 145 +               nvl_list_result = (nvlist_t **) calloc(results_to_return, sizeof(nvlist_t *));
 146 +               if (nvl_list_result == NULL && results_to_return > 0) {
 147 +                       failure = "malloc failed\n";
 148 +               }
 149 +
 150 +               for ( size_t i=0; failure == NULL && i < results_to_return; i++ ) {
 151 +                       std::string ret="";
 152 +                       int thisResultDocID=resultVector[i].document;
 153 +                       if (nvlist_alloc(&nvl_list_result[i], NV_UNIQUE_NAME, 0) != 0) {
 154 +                               failure = "nvlist_alloc failed\n";
 155 +                               break;
 156 +                       }
 157 +                       allocated = i + 1;
 158 +                       if ((ret = getFieldText(thisResultDocID, "bug")) == "") {
 159 +                               failure = "Lookup of bugid failed\n";
 160 +                       } else if (nvlist_add_string(nvl_list_result[i], "pia-bugid", ret.c_str())) {
 161 +                               failure = "nvlist_add bugid failed\n";
 162 +                       } else if ((ret = getFieldText(thisResultDocID, "stack")) == "") {
 163 +                               failure = "Lookup of stack failed\n";
 164 +                       } else if (nvlist_add_string(nvl_list_result[i], "pia-stack", ret.c_str())) {
 165 +                               failure = "nvlist_add stack failed\n";
 166 +                       } else if ((ret = getFieldText(thisResultDocID, "signature")) == "") {
 167 +                               failure = "Lookup of signature failed\n";
 168 +                       } else if (nvlist_add_string(nvl_list_result[i], "pia-signature", ret.c_str())) {
 169 +                               failure = "nvlist_add signature failed\n";
 170 +                       } else {
 171 +                               /* Scale before truncating so the fractional part of the score survives. */
 172 +                               int indri_score = 1000 + (int)(resultVector[i].score * 1000);
 173 +                               if (nvlist_add_int32(nvl_list_result[i], "pia-score", indri_score)) {
 174 +                                       failure = "nvlist_add score failed\n";
 175 +                               }
 176 +                       }
 177 +               }
 178 +               repository.close();
 179 +               repository_open = false;
 180 +
 181 +               if (failure == NULL && nvlist_add_nvlist_array(results, "results", nvl_list_result, results_to_return) != 0) {
 182 +                       failure = "nvlist_add results failed\n";
 183 +               }
 184 +       } catch(...){
 185 +               indri_failed = true;
 186 +       }
 187 +       /* On success `results` holds its own copy of the entries; on failure they are abandoned. */
 188 +       free_result_array(nvl_list_result, allocated);
 189 +       if (repository_open) {
 190 +               try { repository.close(); } catch (...) { /* best effort while unwinding an earlier error */ }
 191 +       }
 192 +       if (indri_failed && failure == NULL) {
 193 +               nvlist_t *nvl_error = NULL;     /* the single "error" entry reported for an Indri exception */
 194 +               if (nvlist_alloc(&nvl_error, NV_UNIQUE_NAME, 0) != 0) {
 195 +                       failure = "nvlist_alloc failed\n";
 196 +               } else if (nvlist_add_string(nvl_error, "error", "Indri Error") ||
 197 +                   nvlist_add_nvlist_array(results, "results", &nvl_error, 1)) {
 198 +                       failure = "nvlist_add error failed\n";
 199 +               }
 200 +               if (nvl_error != NULL) {
 201 +                       nvlist_free(nvl_error);
 202 +               }
 203 +       }
 204 +
 205 +       if (failure != NULL) {
 206 +               nvlist_free(results);
 207 +               *errmsg = strdup(failure);
 208 +               return NULL;
 209 +       }
 210 +       return results;
 211 +}
 212 +
 213 +extern "C" nvlist*
 214 +pia_search (nvlist_t *search_params, char **errmsg) {
 215 +       /* C-linkage entry point: forwards its arguments unchanged to search(). */
 216 +       nvlist *found = search (search_params, errmsg);
 217 +       return found;
 218 +}
 219 +
 220 +int
 221 +init () {
 222 +       /*
 223 +        * Returns 0 when stat() succeeds on PIA_DATABASE_STORAGE and 1
 224 +        * when it fails for any reason.
 225 +        */
 226 +       struct stat storage_info;
 227 +
 228 +       return (stat(PIA_DATABASE_STORAGE, &storage_info) == 0) ? 0 : 1;
 229 +}
 230 +
 231 +extern "C" int
 232 +pia_init () {
 233 +       /* C-linkage entry point: forwards to init() and returns its status. */
 234 +       const int status = init ();
 235 +       return status;
 236 +}
 237 --- indri-5.4/src/TextTokenizerPIA.l    po Ärc 15 14:38:12 2013
 238 +++ indri-5.4/src/TextTokenizerPIA.l    po Ärc 15 14:36:55 2013
 239 @@ -0,0 +1,588 @@
 240 +%option noyywrap
 241 +%option never-interactive
 242 +%option prefix="piatok"
 243 +
 244 +%{
 245 +
 246 +/*==========================================================================
 247 + * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
 248 + *
 249 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 250 + * is subject to the terms of the software license set forth in the LICENSE
 251 + * file included with this software, and also available at
 252 + * http://www.lemurproject.org/license.html
 253 + *
 254 + *==========================================================================
 255 + */
 256 +
 257 +//
 258 +// TextTokenizerPIA
 259 +//
 260 +// 15 September 2005 -- mwb
 261 +//
 262 +
 263 +#include <string.h>
 264 +#include <ctype.h>
 265 +#include "indri/TextTokenizerPIA.hpp"
 266 +#include "indri/TermExtent.hpp"
 267 +#include "indri/TagEvent.hpp"
 268 +#include "indri/TokenizedDocument.hpp"
 269 +#include "indri/UnparsedDocument.hpp"
 270 +#include "indri/UTF8Transcoder.hpp"
 271 +#include "indri/AttributeValuePair.hpp"
 272 +
 273 +static long byte_position;
 274 +
 275 +#define ZAP           1
 276 +#define TAG           2
 277 +#define ASCII_TOKEN   3
 278 +#define UTF8_TOKEN    4
 279 +
 280 +%}
 281 +%start COMMENT
 282 +%%
 283 +
 284 +"<!--" { BEGIN(COMMENT); byte_position += piatokleng; return ZAP; }
 285 +<COMMENT>[^-]+ { byte_position += piatokleng; return ZAP; }
 286 +<COMMENT>"-->" { BEGIN(INITIAL); byte_position += piatokleng; return ZAP; }
 287 +<COMMENT>"-" { byte_position += piatokleng; return ZAP; }
 288 +"<!"[^\>]*">" { byte_position += piatokleng; return ZAP; }
 289 +\<[a-zA-Z/][^\>]*\>                                             { byte_position += piatokleng; return TAG; }
 290 +[&]([a-zA-Z]+|[#]([0-9]+|[xX][a-fA-F0-9]+))[;]         { byte_position += piatokleng; return ZAP; /* symbols */ }
 291 +[A-Z0-9]"."([A-Z0-9]".")*                                        { byte_position += piatokleng; return ASCII_TOKEN; }
 292 +[a-zA-Z0-9_']+                                        { byte_position += piatokleng; return ASCII_TOKEN; }
 293 +"-"[0-9]+("."[0-9]+)?                                  { byte_position += piatokleng; return ASCII_TOKEN; }
 294 +[a-zA-Z0-9\x80-\xFD]+                               { byte_position += piatokleng; return UTF8_TOKEN; }
 295 +
 296 +[\n]                                                   { byte_position += piatokleng; return ZAP; }
 297 +.                                                      { byte_position += piatokleng; return ZAP; }
 298 +
 299 +%%
 300 +
 301 +indri::parse::TokenizedDocument* indri::parse::TextTokenizerPIA::tokenize( indri::parse::UnparsedDocument* document ) {
 302 +  // Scan document->content with the piatok flex scanner, refill the member _document and return its address.
 303 +  _termBuffer.clear();
 304 +  if ( _tokenize_entire_words)
 305 +    _termBuffer.grow( document->textLength * 4);
 306 +  else
 307 +    _termBuffer.grow( document->textLength * 8 ); // extra null per char.
 308 +
 309 +  _document.terms.clear();
 310 +  _document.tags.clear();
 311 +  _document.positions.clear();
 312 +
 313 +  _document.metadata = document->metadata;
 314 +  _document.text = document->text;
 315 +  _document.textLength = document->textLength;
 316 +  _document.content = document->content;
 317 +  _document.contentLength = document->contentLength;
 318 +
 319 +  // byte offset of the content within the full document text
 320 +  byte_position = document->content - document->text;
 321 +
 322 +  piatok_scan_bytes( document->content, document->contentLength );
 323 +
 324 +  // Main Tokenizer loop
 325 +
 326 +  int type;
 327 +
 328 +  while ( type = piatoklex() ) { // assignment is intended; the loop ends when piatoklex() returns 0
 329 +
 330 +    switch ( type ) {
 331 +
 332 +    case ASCII_TOKEN: processASCIIToken(); break;
 333 +
 334 +    case UTF8_TOKEN: processUTF8Token(); break;
 335 +
 336 +    case TAG: if ( _tokenize_markup ) processTag(); break; // tags are dropped when markup tokenizing is off
 337 +
 338 +    default:
 339 +    case ZAP:
 340 +      break;
 341 +
 342 +    }
 343 +
 344 +  }
 345 +
 346 +  piatok_delete_buffer( YY_CURRENT_BUFFER );
 347 +
 348 +  return &_document; // points at a member: the next tokenize() call clears and refills it
 349 +}
 350 +
 351 +// Member functions for processing tokenization events as dispatched
 352 +// from the main tokenizer loop
 353 +
 354 +void indri::parse::TextTokenizerPIA::processTag() {
 355 +  // Parse the tag in piatoktext; recognised open and close tags are recorded in _document.tags.
 356 +  // Here, we parse the tag in a fashion that is relatively robust to
 357 +  // malformed markup.  piatoktext matches this pattern: <[^>]+>
 358 +
 359 +  if ( piatoktext[1] == '?' || piatoktext[1] == '!' ) {
 360 +
 361 +    // XML declaration like <? ... ?> and <!DOCTYPE ... >
 362 +    return; // ignore
 363 +
 364 +  } else if ( piatoktext[1] == '/' ) { // close tag, eg. </FOO>
 365 +
 366 +    // Downcase the tag name.
 367 +
 368 +    int len = 0;
 369 +
 370 +    for ( char *c = piatoktext + 2;
 371 +#ifndef WIN32
 372 +          isalnum( *c ) || *c == '-' || *c == '_' || *c == ':' ; c++ ) {
 373 +#else
 374 +          ((*c >= 0) && isalnum( *c )) || *c == '-' || *c == '_' || *c == ':' ; c++ ) {
 375 +#endif
 376 +
 377 +      *c = tolower( *c );
 378 +      if ( *c == ':' ) *c = '_'; /* replace colon (from namespaces) */
 379 +      len++;
 380 +    }
 381 +
 382 +    TagEvent te;
 383 +
 384 +    te.open_tag = false;
 385 +
 386 +    // We need to write len characters, plus a NULL
 387 +    char* write_loc = _termBuffer.write( len + 1 );
 388 +    strncpy( write_loc, piatoktext + 2, len );
 389 +    write_loc[len] = '\0';
 390 +    te.name = write_loc;
 391 +
 392 +    // token position of tag event w/r/t token string
 393 +    te.pos = _document.terms.size();
 394 +
 395 +    te.begin = byte_position - piatokleng;
 396 +    te.end = byte_position;
 397 +
 398 +    _document.tags.push_back( te );
 399 +
 400 +#ifndef WIN32
 401 +    } else if ( isalpha( piatoktext[1] ) ) {
 402 +#else
 403 +    } else if ( (piatoktext[1]  >= 0) && (isalpha( piatoktext[1] ) )) {
 404 +#endif
 405 +
 406 +    // Try to extract the tag name:
 407 +
 408 +    char* c = piatoktext + 1;
 409 +    int i = 0;
 410 +    int offset = 1; // current offset w/r/t byte_position - piatokleng
 411 +    // it starts at one because it is incremented when c is, and c starts at one.
 412 +    char* write_loc;
 413 +
 414 +#ifndef WIN32
 415 +    while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++;
 416 +#else
 417 +    while ( ( (c[i] >= 0) && isalnum( c[i] )) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++;
 418 +#endif
 419 +    if ( c[i] == '>' ) {
 420 +
 421 +      // open tag with no attributes, eg. <title>
 422 +
 423 +      // Ensure tag name is downcased
 424 +      for ( int j = 0; j < i; j++ ) {
 425 +        c[j] = tolower( c[j] );
 426 +        if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */
 427 +      }
 428 +
 429 +      TagEvent te;
 430 +
 431 +      te.open_tag = true;
 432 +
 433 +      // need to write i characters, plus a NULL
 434 +      char* write_loc = _termBuffer.write( i + 1 );
 435 +      strncpy( write_loc, c, i );
 436 +      write_loc[i] = '\0';
 437 +      te.name = write_loc;
 438 +
 439 +      te.pos = _document.terms.size();
 440 +
 441 +      te.begin = byte_position - piatokleng;
 442 +      te.end = byte_position;
 443 +
 444 +      _document.tags.push_back( te );
 445 +
 446 +#ifndef WIN32
 447 +    } else if ( isspace( c[i] ) ) {
 448 +#else
 449 +    } else if ( (c[i]  >= 0) && (isspace( c[i] ) )) {
 450 +#endif
 451 +
 452 +      // open tag with attributes, eg. <A HREF="www.foo.com/bar">
 453 +
 454 +      TagEvent te;
 455 +
 456 +      te.open_tag = true;
 457 +
 458 +      // Ensure tag name is downcased
 459 +      for ( int j = 0; j < i; j++ ) {
 460 +        c[j] = tolower( c[j] );
 461 +        if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */
 462 +      }
 463 +
 464 +      // need to write i characters, plus a NULL
 465 +      char* write_loc = _termBuffer.write( i + 1 );
 466 +      strncpy( write_loc, c, i );
 467 +      write_loc[i] = '\0';
 468 +      te.name = write_loc;
 469 +      c += i;
 470 +      offset += i;
 471 +
 472 +#ifndef WIN32
 473 +    while ( isspace( *c ) ) { c++; offset++; }
 474 +#else
 475 +    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
 476 +#endif
 477 +
 478 +      te.pos = _document.terms.size();
 479 +
 480 +      te.begin = byte_position - piatokleng;
 481 +      te.end = byte_position;
 482 +
 483 +      // Now search for attributes:
 484 +
 485 +      while ( *c != '>' && *c != '\0' ) {
 486 +
 487 +        AttributeValuePair avp;
 488 +
 489 +        // Try to extract attribute name:
 490 +
 491 +        i = 0;
 492 +#ifndef WIN32
 493 +        while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' ) i++;
 494 +#else
 495 +        while ( (c[i] >= 0) && isalnum( c[i] ) || c[i] == '-' || c[i] == '_') i++;
 496 +#endif
 497 +
 498 +        if ( i == 0 ) break;
 499 +
 500 +        // Ensure attribute name is downcased
 501 +        for ( int j = 0; j < i; j++ )
 502 +          c[j] = tolower( c[j] );
 503 +
 504 +        // need to write i characters, plus a NULL
 505 +        write_loc = _termBuffer.write( i + 1 );
 506 +        strncpy( write_loc, c, i );
 507 +        write_loc[i] = '\0';
 508 +        avp.attribute = write_loc;
 509 +        c += i;
 510 +        offset += i;
 511 +
 512 +        // attributes can be foo\s*=\s*"bar[">] or foo\s*=\s*bar
 513 +
 514 +               // ignore any spaces
 515 +#ifndef WIN32
 516 +    while ( isspace( *c ) ) { c++; offset++; }
 517 +#else
 518 +    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
 519 +#endif
 520 +
 521 +        if ( *c == '=' ) {
 522 +
 523 +          c++; // get past the '=' sign.
 524 +          offset++;
 525 +
 526 +#ifndef WIN32
 527 +    while ( isspace( *c ) ) { c++; offset++; }
 528 +#else
 529 +    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
 530 +#endif
 531 +
 532 +          if ( *c == '>' ) {
 533 +
 534 +            // common malformed markup <a href=>
 535 +
 536 +            // Insert empty attribute value
 537 +            // need to write a single NULL
 538 +            write_loc = _termBuffer.write( 1 );
 539 +            write_loc[0] = '\0';
 540 +            avp.value = write_loc;
 541 +            avp.begin = byte_position - piatokleng + offset;
 542 +            avp.end = byte_position - piatokleng + offset;
 543 +
 544 +          } else {
 545 +
 546 +            bool quoted = true;
 547 +            char quote_char;
 548 +            if ( *c == '"' || *c =='\'' ) { quote_char = *c; c++; offset++; }
 549 +            else quoted = false;
 550 +
 551 +            // Attribute value starts here.
 552 +
 553 +            i = 0;
 554 +// make sure the opening and closing quote character match...
 555 +            if ( quoted )
 556 +//              while ( c[i] != '"' && c[i] != '>' && c[i] !='\'') i++;
 557 +              while ( c[i] != quote_char && c[i] != '>') i++;
 558 +            else
 559 +#ifndef WIN32
 560 +              while ( ! isspace( c[i] ) && c[i] != '>' ) i++;
 561 +#else
 562 +              while ( ((c[i] >= 0)  && ! isspace( c[i] ) ) && c[i] != '>' ) i++;
 563 +#endif
 564 +
 565 +            // need to write i characters, plus a NULL
 566 +            write_loc = _termBuffer.write( i + 1 );
 567 +            strncpy( write_loc, c, i );
 568 +            write_loc[i] = '\0';
 569 +            avp.value = write_loc;
 570 +            avp.begin = byte_position - piatokleng + offset;
 571 +            avp.end = byte_position - piatokleng + offset + i;
 572 +            c += i;
 573 +            offset += i;
 574 +
 575 +          }
 576 +        } else {
 577 +
 578 +          // Insert empty attribute value
 579 +          // need to write a single NULL
 580 +          write_loc = _termBuffer.write( 1 );
 581 +          write_loc[0] = '\0';
 582 +          avp.value = write_loc;
 583 +          avp.begin = byte_position - piatokleng + offset;
 584 +          avp.end = byte_position - piatokleng + offset;
 585 +        }
 586 +#ifndef WIN32
 587 +        while ( isspace( *c ) || *c == '"' ) { c++; offset++; }
 588 +#else
 589 +        while ( ((*c >= 0) && isspace( *c )) || *c == '"' ) { c++; offset++; }
 590 +#endif
 591 +
 592 +        te.attributes.push_back( avp );
 593 +      }
 594 +
 595 +      _document.tags.push_back( te );
 596 +
 597 +    }
 598 +
 599 +    // One of the cases that is ignored is this common malformed
 600 +    // markup <foo=bar> with no tag name.  Another is the case
 601 +    // of an email address <foo@bar.com>
 602 +
 603 +
 604 +  }
 605 +}
 606 +
 607 +void indri::parse::TextTokenizerPIA::processUTF8Token() {
 608 +  // Split the run in piatoktext by Unicode character class and emit the pieces through writeToken().
 609 +  // A UTF-8 token, as recognized by flex, could actually be
 610 +  // a mixed ASCII/UTF-8 string containing any number of
 611 +  // UTF-8 characters, so we re-tokenize it here.
 612 +
 613 +  indri::utility::HashTable<UINT64,const int>& unicode = _transcoder.unicode();
 614 +
 615 +  int len = strlen( piatoktext );
 616 +
 617 +  UINT64* unicode_chars = new UINT64[len + 1];
 618 +  int* offsets = new int[len + 1];
 619 +  int* lengths = new int[len + 1];
 620 +  _transcoder.utf8_decode( piatoktext, &unicode_chars, NULL, NULL,
 621 +                           &offsets, &lengths );
 622 +
 623 +  const int* p;
 624 +  int cls;             // Character class of current UTF-8 character
 625 +  // offset of current UTF-8 character w/r/t piatoktext stored in offsets[i]
 626 +  // byte length of current UTF-8 character stored in lengths[i]
 627 +
 628 +  int offset = 0;      // Position of start of current *token* (not character) w/r/t piatoktext
 629 +  int extent = 0;      // Extent for this *token* including trailing punct
 630 +  int piatoken_len = 0;   // Same as above, minus the trailing punctuation
 631 +
 632 +  char buf[64]; // NOTE(review): not referenced anywhere else in this function
 633 +
 634 +  // If this flag is true, we have punctuation symbols at the end of a
 635 +  // token, so do not attach another letter to this token.
 636 +  bool no_letter = false;
 637 +
 638 +  // In case there are malformed characters preceding the good
 639 +  // characters:
 640 +  offset = offsets[0];
 641 +
 642 +  for ( int i = 0; unicode_chars[i] != 0; i++ ) {
 643 +
 644 +    p = unicode.find( unicode_chars[i] );
 645 +    cls = p ? *p : 0;
 646 +
 647 +    if ( ! _tokenize_entire_words ) { // Tokenize by character
 648 +
 649 +      if ( cls != 0 && cls != 3 && cls != 5 && cls != 9 ) {
 650 +
 651 +        writeToken( piatoktext + offsets[i], lengths[i],
 652 +                    byte_position - piatokleng + offsets[i],
 653 +                    byte_position - piatokleng + offsets[i] + lengths[i] );
 654 +      }
 655 +      continue;
 656 +    }
 657 +
 658 +    // If this is not the first time through this loop, we need
 659 +    // to check to see if any bytes in piatoktext were skipped
 660 +    // during the UTF-8 analysis:
 661 +
 662 +    if ( i != 0 && offset + piatoken_len != offsets[i] ) {
 663 +
 664 +      // Write out the token we are working on, if any:
 665 +
 666 +      if ( piatoken_len > 0 ) {
 667 +
 668 +        writeToken( piatoktext + offset, piatoken_len,
 669 +                    byte_position - piatokleng + offset,
 670 +                    byte_position - piatokleng + offset + extent );
 671 +      }
 672 +
 673 +      extent = 0;
 674 +      piatoken_len = 0;
 675 +      no_letter = false;
 676 +      offset = offsets[i];
 677 +    }
 678 +
 679 +    // Tokenize by word:
 680 +
 681 +    switch ( cls ) {
 682 +
 683 +    case 4: // Currency symbol: always extracted alone
 684 +      // Action: write the token we are working on,
 685 +      // and write this symbol as a separate token
 686 +      writeToken( piatoktext + offset, extent,
 687 +                  byte_position - piatokleng + offset,
 688 +                  byte_position - piatokleng + offset + extent );
 689 +
 690 +      offset += extent;
 691 +
 692 +      writeToken( piatoktext + offset, lengths[i],
 693 +                  byte_position - piatokleng + offset,
 694 +                  byte_position - piatokleng + offset + lengths[i] );
 695 +
 696 +      offset += lengths[i];
 697 +      piatoken_len = 0;
 698 +      extent = 0;
 699 +      no_letter = false;
 700 +      break;
 701 +
 702 +    case 1: // Apostrophe
 703 +    case 10: // Decimal separator
 704 +    case 6: // Letter
 705 +    case 7: // Digit
 706 +      // Action: add this character to the end of the token we are
 707 +      // working on
 708 +      if ( no_letter ) { // This is a token boundary
 709 +        writeToken( piatoktext + offset, piatoken_len,
 710 +                    byte_position - piatokleng + offset,
 711 +                    byte_position - piatokleng + offset + extent );
 712 +
 713 +        offset += extent;
 714 +        extent = 0;
 715 +        piatoken_len = 0;
 716 +        no_letter = false;
 717 +
 718 +      }
 719 +
 720 +      extent += lengths[i];
 721 +      piatoken_len += lengths[i];
 722 +      break;
 723 +
 724 +    case 2: // Percent
 725 +    case 8: // Punctuation
 726 +    case 12: // Thousands separator
 727 +    case 11: // Hyphen
 728 +      // Action: These characters are included in the extent of the
 729 +      // token we are working on.
 730 +      no_letter = true;
 731 +      extent += lengths[i];
 732 +      break;
 733 +
 734 +    case 0: // No character class!
 735 +    case 3: // Control character
 736 +    case 5: // Non-punctuation symbol
 737 +    case 9: // Whitespace
 738 +    default:
 739 +      // Action: write the token we are working on.  Do not include
 740 +      // this character in any future token.
 741 +      writeToken( piatoktext + offset, piatoken_len,
 742 +                  byte_position - piatokleng + offset,
 743 +                  byte_position - piatokleng + offset + extent );
 744 +
 745 +      offset += (extent + lengths[i]); // Include current character
 746 +      extent = 0;
 747 +      piatoken_len = 0;
 748 +      no_letter = false;
 749 +
 750 +      break;
 751 +    }
 752 +  }
 753 +
 754 +  // Write out last token
 755 +  if ( piatoken_len > 0 )
 756 +    writeToken( piatoktext + offset, piatoken_len,
 757 +                byte_position - piatokleng + offset,
 758 +                byte_position - piatokleng + offset + extent );
 759 +
 760 +  delete[] unicode_chars;
 761 +  delete[] offsets;
 762 +  delete[] lengths;
 763 +}
 764 +
 765 +void indri::parse::TextTokenizerPIA::processASCIIToken() {
 766 +  // Emit piatoktext as one term, or as one term per character when _tokenize_entire_words is false.
 767 +  int piatoken_len = strlen( piatoktext );
 768 +
 769 +  // piatoken_len here is the length of the token without
 770 +  // any trailing punctuation.
 771 +
 772 +  for ( int i = piatoken_len - 1; i > 0; i-- ) { // stops before index 0, so the first character is always kept
 773 +
 774 +    if ( ! ispunct( piatoktext[i] ) )
 775 +      break;
 776 +    else
 777 +      piatoken_len--;
 778 +  }
 779 +
 780 +  if ( _tokenize_entire_words ) {
 781 +
 782 +    writeToken( piatoktext, piatoken_len, byte_position - piatokleng, byte_position );
 783 +
 784 +  } else {
 785 +
 786 +    for ( int i = 0; i < piatoken_len; i++ )
 787 +      writeToken( piatoktext + i, 1, byte_position - piatokleng + i,
 788 +                  byte_position - piatokleng + i + 1 );
 789 +  }
 790 +}
 791 +
 792 +
 793 +// ObjectHandler implementation
 794 +
 795 +void indri::parse::TextTokenizerPIA::handle( indri::parse::UnparsedDocument* document ) {
 796 +  // Tokenize the document and hand the result to _handler (no null check is made here).
 797 +  _handler->handle( tokenize( document ) );
 798 +}
 799 +
 800 +void indri::parse::TextTokenizerPIA::setHandler( ObjectHandler<indri::parse::TokenizedDocument>& h ) {
 801 +  // Store the address of h; handle() forwards tokenized documents to it.
 802 +  _handler = &h;
 803 +}
 804 +
 805 +void indri::parse::TextTokenizerPIA::writeToken( char* token, int piatoken_len,
 806 +                                              int extent_begin, int extent_end ) {
 807 +  // Append one term: its byte extent goes to _document.positions, a NUL-terminated copy to _document.terms.
 808 +
 809 +  // The TermExtent for a token will include trailing punctuation.
 810 +  // The purpose for this is that it makes for a nicer display when a
 811 +  // sequence of tokens (say, a sentence) is retrieved and shown to
 812 +  // the user.
 813 +
 814 +  TermExtent extent;
 815 +  extent.begin = extent_begin;
 816 +  extent.end = extent_end;
 817 +  _document.positions.push_back( extent );
 818 +
 819 +  // The terms entry for a token won't include the punctuation.
 820 +
 821 +  char* write_loc = _termBuffer.write( piatoken_len + 1 ); // piatoken_len characters plus the terminator
 822 +  strncpy( write_loc, token, piatoken_len );
 823 +  write_loc[piatoken_len] = '\0';
 824 +  _document.terms.push_back( write_loc );
 825 +}
 826 +
 827 +
 828 --- indri-5.4/include/indri/TextTokenizerPIA.hpp        po Ärc 15 14:38:50 2013
 829 +++ indri-5.4/include/indri/TextTokenizerPIA.hpp        po Ärc 15 14:36:54 2013
 830 @@ -0,0 +1,73 @@
 831 +/*==========================================================================
 832 + * Copyright (c) 2003-2005 University of Massachusetts.  All Rights Reserved.
 833 + *
 834 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
 835 + * is subject to the terms of the software license set forth in the LICENSE
 836 + * file included with this software, and also available at
 837 + * http://www.lemurproject.org/license.html
 838 + *
 839 + *==========================================================================
 840 + */
 841 +
 842 +//
 843 +// TextTokenizerPIA
 844 +//
 845 +// 15 September 2005 -- mwb
 846 +//
 847 +
 848 +#ifndef INDRI_TEXTTOKENIZERPIA_HPP
 849 +#define INDRI_TEXTTOKENIZERPIA_HPP
 850 +
 851 +#include <stdio.h>
 852 +#include <string>
 853 +#include <map>
 854 +
 855 +#include "indri/IndriTokenizer.hpp"
 856 +#include "indri/Buffer.hpp"
 857 +#include "indri/TagEvent.hpp"
 858 +#include "indri/UnparsedDocument.hpp"
 859 +#include "indri/TokenizedDocument.hpp"
 860 +#include "indri/UTF8Transcoder.hpp"
 861 +
 862 +namespace indri {
 863 +  namespace parse {
 864 +
 865 +    class TextTokenizerPIA : public Tokenizer {
 866 +      // Tokenizer whose scanner (TextTokenizerPIA.l) keeps '_' inside ASCII tokens.
 867 +    public:
 868 +      TextTokenizerPIA( bool tokenize_markup = true, bool tokenize_entire_words = true ) : _handler(0) {
 869 +
 870 +        _tokenize_markup = tokenize_markup;
 871 +        _tokenize_entire_words = tokenize_entire_words;
 872 +      }
 873 +
 874 +      ~TextTokenizerPIA() {}
 875 +
 876 +      TokenizedDocument* tokenize( UnparsedDocument* document ); // returns the address of the member _document
 877 +
 878 +      void handle( UnparsedDocument* document ); // tokenizes, then forwards to _handler
 879 +      void setHandler( ObjectHandler<TokenizedDocument>& h );
 880 +
 881 +    protected:
 882 +      void processASCIIToken();
 883 +      void processUTF8Token();
 884 +      void processTag();
 885 +
 886 +      indri::utility::Buffer _termBuffer; // storage for term, tag and attribute strings
 887 +      UTF8Transcoder _transcoder;
 888 +
 889 +      bool _tokenize_markup; // false: tokenize() skips TAG tokens
 890 +      bool _tokenize_entire_words; // false: one term per character instead of per word
 891 +
 892 +    private:
 893 +      ObjectHandler<TokenizedDocument>* _handler; // 0 until setHandler() is called
 894 +      TokenizedDocument _document; // reused across tokenize() calls
 895 +
 896 +      void writeToken( char* token, int token_len, int extent_begin,
 897 +                       int extent_end );
 898 +    };
 899 +  }
 900 +}
 901 +
 902 +#endif // INDRI_TEXTTOKENIZERPIA_HPP
 903 +
 904 --- indri-5.4/src/TokenizerFactory.cpp  po Ärc 15 14:39:30 2013
 905 +++ indri-5.4/src/TokenizerFactory.cpp  po Ärc 15 14:29:11 2013
 906 @@ -22,6 +22,7 @@
 907
 908  #include "indri/TextTokenizer.hpp"
 909  // Add an #include for your Tokenizer here.
 910 +#include "indri/TextTokenizerPIA.hpp"
 911
 912
 913  #define TOKENIZER_WORD ("Word")
 914 @@ -29,6 +30,8 @@
 915  #define TOKENIZER_CHAR ("Char")
 916  #define TOKENIZER_CHAR_NO_MARKUP ("Char without Markup")
 917  // Add a #define for your Tokenizer here.
 918 +#define TOKENIZER_PIA ("PIA")
 919 +#define TOKENIZER_PIA_NO_MARKUP ("PIA without Markup")
 920
 921
 922  //
 923 @@ -78,8 +81,23 @@
 924      // got "char"
 925      return TOKENIZER_CHAR;
 926
 927 +  } else if ( ( name[0] == 'p' || name[0] == 'P' ) &&
 928 +       ( name[1] == 'i' || name[1] == 'I' ) &&
 929 +       ( name[2] == 'a' || name[2] == 'A' ) ) {
 930 +    // "pia" occupies indices 0-2, so the optional "-no..." suffix is tested at indices 3-5.
 931 +    if ( name[3] == '-' &&
 932 +         ( name[4] == 'n' || name[4] == 'N' ) &&
 933 +         ( name[5] == 'o' || name[5] == 'O' ) ) {
 934 +
 935 +      // got "pia-nomarkup"
 936 +      return TOKENIZER_PIA_NO_MARKUP;
 937 +    }
 938 +
 939 +    // got "pia"
 940 +    return TOKENIZER_PIA;
 941    }
 942
 943 +
 944    return "";
 945  }
 946
 947 @@ -105,6 +123,14 @@
 948
 949      tokenizer = new indri::parse::TextTokenizer( false, false );
 950
 951 +  } else if ( preferred == TOKENIZER_PIA ) {
 952 +
 953 +    tokenizer = new indri::parse::TextTokenizerPIA();
 954 +
 955 +  } else if ( preferred == TOKENIZER_PIA_NO_MARKUP ) {
 956 +
 957 +    tokenizer = new indri::parse::TextTokenizerPIA( false );
 958 +
 959    } else {
 960
 961      LEMUR_THROW( LEMUR_RUNTIME_ERROR, name + " is not a known tokenizer." );
 962 --- indri-5.4/src/FileClassEnvironmentFactory.cpp       po Ärc 15 14:40:19 2013
 963 +++ indri-5.4/src/FileClassEnvironmentFactory.cpp       po Ärc 15 14:29:12 2013
 964 @@ -189,6 +189,20 @@
 965      trec_conflations      // conflations
 966    },
 967    {
 968 +    "trecpia",           // name
 969 +    "xml",                // parser
 970 +    "pia",               // tokenizer
 971 +    "tagged",             // iterator
 972 +    "<DOC>",              // startDocTag
 973 +    "</DOC>",             // endDocTag
 974 +    NULL,                 // endMetadataTag
 975 +    trec_include_tags,    // includeTags
 976 +    NULL,                 // excludeTags
 977 +    trec_index_tags,      // indexTags
 978 +    trec_metadata_tags,   // metadataTags
 979 +    trec_conflations      // conflations
 980 +  },
 981 +  {
 982      "trecchar",           // name
 983      "xml",                // parser
 984      "char",               // tokenizer
 985 --- indri-5.4/Makefile.app.in   2013-09-04 06:31:06.740210927 -0700
 986 +++ indri-5.4/Makefile.app.in   2013-09-04 06:27:24.857989779 -0700
 987 @@ -1,22 +1,26 @@
 988 +include MakeDefns
 989 +
 990  ## your application name here
 991 -APP=
 992 +APP=pia_wrapper
 993  SRC=$(APP).cpp
 994  ## extra object files for your app here
 995  OBJ=
 996 +OUTPUT=lib$(APP).so.1
 997
 998  prefix = @prefix@
 999  exec_prefix = ${prefix}
1000  libdir = @libdir@
1001  includedir = @includedir@
1002 -INCPATH=-I$(includedir)
1003 -LIBPATH=-L$(libdir)
1004 +INCPATH=-Iinclude -Icontrib/lemur/include
1005 +LIBPATH=-Lobj
1006  CXXFLAGS=@DEFS@ @CPPFLAGS@ @CXXFLAGS@ $(INCPATH)
1007 -CPPLDFLAGS  = @LDFLAGS@ -lindri @LIBS@
1008 +CPPLDFLAGS  = @LDFLAGS@ -lnvpair -lindri @LIBS@
1009
1010  all:
1011 -       $(CXX) $(CXXFLAGS) $(SRC) -o $(APP) $(OBJ) $(LIBPATH) $(CPPLDFLAGS)
1012 +       $(CXX) $(CXXFLAGS) $(SRC) -fpic -shared -static-libgcc -h $(OUTPUT) -o $(OUTPUT) $(OBJ) $(LIBPATH) $(CPPLDFLAGS)
1013
1014  clean:
1015         rm -f $(APP)
1016
1017 -
1018 +install:
1019 +       cp $(OUTPUT) $(libdir)
1020 --- indri-5.4/Makefile  2013-09-12 07:39:16.027125829 -0700
1021 +++ indri-5.4/Makefile  2013-09-12 07:38:44.720450641 -0700
1022 @@ -73,5 +73,6 @@
1023         $(MAKE) install -C doc
1024         $(MAKE) -C site-search install
1025         $(INSTALL_DATA) Makefile.app $(pkgdatadir)
1026 +       $(MAKE) -f Makefile.app install
1027
1028  test: