openexr: make architecture independent
[oi-userland.git] / components / library / indri / patches / pia.patch
blob4121fd3c0fb6e2da31719e4d0840b450e2c66818
1 Add our PIA wrapper to indri sources. This patch does several things:
2 - Add pia wrapper sources to indri source tree
3 - Add new tokenizer which does not treat '_' as a separator
4 - TextTokenizerPIA.l differs from TextTokenizer.l in only a single character of one rule:
5 -[a-zA-Z0-9']+ { byte_position += tokleng; return ASCII_TOKEN; }
6 +[a-zA-Z0-9_']+ { byte_position += tokleng; return ASCII_TOKEN; }
7 - plus many symbol renames so that the parsers can coexist (toktext -> piatoktext etc.)
8 - TextTokenizerPIA.hpp contains only symbol renames
9 - The rest are modifications to make indri build the PIA wrapper
12 --- indri-5.4/pia_wrapper.cpp po črc 15 14:30:41 2013
13 +++ indri-5.4/pia_wrapper.cpp po črc 15 14:29:09 2013
14 @@ -0,0 +1,222 @@
15 +/*
16 + * To compile:
17 + * g++ -o libpia_wrapper.so -shared -fPIC -I../vlad-libs/sparc/usr/include/ -L../vlad-libs/sparc/usr/lib/ -lclucene-core -lnvpair pia_wrapper.cc
18 + *
19 + */
21 +#include <sys/stat.h>
22 +#include <strings.h>
23 +#include <stdio.h>
24 +#include <libnvpair.h>
26 +#include <iostream>
27 +#include <string>
28 +#include <sstream>
29 +#include <fstream>
31 +#include <vector>
32 +#include "indri/QueryEnvironment.hpp"
33 +#include "indri/SnippetBuilder.hpp"
34 +#include "indri/Repository.hpp"
36 +using namespace std;
38 +using namespace indri::api;
40 +#define MAX_RESULTS 3
41 +#define PIA_DATABASE "/var/db/piadb"
42 +#define PIA_DATABASE_STORAGE PIA_DATABASE "/collection/storage"
44 +indri::collection::Repository repository;
46 +std::string
47 +getFieldText(int documentID, std::string field) {
48 + std::string ret_val = "";
49 + indri::collection::Repository::index_state repIndexState = repository.indexes();
50 + indri::index::Index *thisIndex=(*repIndexState)[0];
51 + int fieldID=thisIndex->field(field);
53 + if (fieldID < 1) {
54 + return "";
55 + }
57 + const indri::index::TermList *termList=thisIndex->termList(documentID);
59 + if (!termList) {
60 + return "";
61 + }
63 + indri::utility::greedy_vector< indri::index::FieldExtent > fieldVec=termList->fields();
64 + indri::utility::greedy_vector< indri::index::FieldExtent >::iterator fIter=fieldVec.begin();
65 + while (fIter!=fieldVec.end()) {
67 + if ((*fIter).id==fieldID) {
68 + int beginTerm=(*fIter).begin;
69 + int endTerm=(*fIter).end;
71 + /*
72 + * note that the text is inclusive of the beginning
73 + * but exclusive of the ending
74 + */
75 + for (int t=beginTerm; t < endTerm; t++) {
76 + int thisTermID=termList->terms()[t];
77 + ret_val = ret_val + thisIndex->term(thisTermID) + " ";
78 + }
79 + }
81 + fIter++;
82 + }
84 + delete termList;
85 + termList=NULL;
86 + return ret_val;
89 +/*
90 + * Returns NULL on failure
91 + * nvlist *
92 + * search(
93 + * nvlist_t *search_params,
94 + * char **errmsg // Similar to pia_index()
95 + * );
96 + */
97 +nvlist *
98 +search (nvlist_t *search_params, char **errmsg) {
100 + char *index_path = PIA_DATABASE;
101 + nvlist_t **nvl_list_result;
102 + nvlist_t *nvl_return;
103 + nvlist_t *nvl_result;
104 + nvlist_t *results = NULL;
106 + if (nvlist_alloc(&results, NV_UNIQUE_NAME, 0) != 0) {
107 + *errmsg = strdup("nvlist_alloc failed\n");
108 + return NULL;
111 + try {
112 + std::string query;
113 + char *panicstack;
114 + (void) nvlist_lookup_string(search_params, "stack", &panicstack);
116 + QueryEnvironment indriEnvironment;
117 + indriEnvironment.addIndex(index_path);
119 + /* Create Indri query */
120 + query = "#combine (" + std::string(panicstack) + ")";
122 + QueryAnnotation *QAresults=indriEnvironment.runAnnotatedQuery(query.c_str(), MAX_RESULTS);
124 + std::vector<indri::api::ScoredExtentResult> resultVector=QAresults->getResults();
126 + int totalNumResults=resultVector.size();
128 + /* Get Parsed document of the results */
129 + std::vector<ParsedDocument*> parsedDocs=indriEnvironment.documents(resultVector);
131 + int results_to_return = 0;
132 + for ( size_t i=0; i < totalNumResults && i < MAX_RESULTS; i++ ) {
133 + results_to_return++;
136 + /* Open Repository */
137 + repository.openRead(index_path);
139 + nvl_list_result = (nvlist_t **) malloc(results_to_return * sizeof(nvlist_t *));
141 + for ( size_t i=0; i < results_to_return; i++ ) {
143 + std::string ret="";
145 + int thisResultDocID=resultVector[i].document;
147 + if (nvlist_alloc(&nvl_list_result[i], NV_UNIQUE_NAME, 0) != 0) {
148 + *errmsg = strdup("nvlist_alloc failed\n");
149 + return NULL;
152 + if ((ret = getFieldText(thisResultDocID, "bug")) == "") {
153 + *errmsg = strdup("Lookup of bugid failed\n");
154 + return NULL;
155 + } else if (nvlist_add_string(nvl_list_result[i], "pia-bugid", ret.c_str())) {
156 + *errmsg = strdup("nvlist_add bugid failed\n");
157 + return NULL;
160 + if ((ret = getFieldText(thisResultDocID, "stack")) == "") {
161 + *errmsg = strdup("Lookup of stack failed\n");
162 + return NULL;
163 + } else if (nvlist_add_string(nvl_list_result[i], "pia-stack", ret.c_str())) {
164 + *errmsg = strdup("nvlist_add stack failed\n");
165 + return NULL;
168 + if ((ret = getFieldText(thisResultDocID, "signature")) == "") {
169 + *errmsg = strdup("Lookup of signature failed\n");
170 + return NULL;
171 + } else if (nvlist_add_string(nvl_list_result[i], "pia-signature", ret.c_str())) {
172 + *errmsg = strdup("nvlist_add signature failed\n");
173 + return NULL;
176 + int indri_score = 1000 + (int)(resultVector[i].score*1000); // scale first, then truncate: the cast binds tighter than '*', so the old form dropped the fractional score digits
177 + if (nvlist_add_int32(nvl_list_result[i], "pia-score", indri_score)) {
178 + *errmsg = strdup("nvlist_add score failed\n");
179 + return NULL;
182 + repository.close();
184 + nvlist_add_nvlist_array(results, "results", nvl_list_result, results_to_return);
186 + for (int i=0; i<results_to_return; i++) {
187 + nvlist_free(nvl_list_result[i]);
190 + return results;
192 + } catch(...){
193 + nvl_list_result = (nvlist_t **) malloc(1 * sizeof(nvlist_t **));
195 + if (nvlist_alloc(&nvl_result, NV_UNIQUE_NAME, 0) != 0) {
196 + *errmsg = strdup("nvlist_alloc failed\n");
197 + return NULL;
200 + if (nvlist_add_string(nvl_result, "error", "Indri Error")) {
201 + *errmsg = strdup("nvlist_add error failed\n");
202 + return NULL;
205 + nvlist_dup(nvl_result, &nvl_list_result[0], 0);
206 + nvlist_free(nvl_result);
207 + nvlist_add_nvlist_array(results, "results", nvl_list_result, 1);
209 + return results;
213 +extern "C" nvlist*
214 +pia_search (nvlist_t *search_params, char **errmsg) {
216 + return search (search_params, errmsg);
220 +int
221 +init () {
223 + struct stat sb;
224 + if (stat(PIA_DATABASE_STORAGE, &sb) != 0) {
225 + return 1;
228 + return 0;
231 +extern "C" int
232 +pia_init () {
234 + return init ();
237 --- indri-5.4/src/TextTokenizerPIA.l po črc 15 14:38:12 2013
238 +++ indri-5.4/src/TextTokenizerPIA.l po črc 15 14:36:55 2013
239 @@ -0,0 +1,588 @@
240 +%option noyywrap
241 +%option never-interactive
242 +%option prefix="piatok"
246 +/*==========================================================================
247 + * Copyright (c) 2004 University of Massachusetts. All Rights Reserved.
249 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
250 + * is subject to the terms of the software license set forth in the LICENSE
251 + * file included with this software, and also available at
252 + * http://www.lemurproject.org/license.html
254 + *==========================================================================
255 + */
258 +// TextTokenizerPIA
260 +// 15 September 2005 -- mwb
263 +#include <string.h>
264 +#include <ctype.h>
265 +#include "indri/TextTokenizerPIA.hpp"
266 +#include "indri/TermExtent.hpp"
267 +#include "indri/TagEvent.hpp"
268 +#include "indri/TokenizedDocument.hpp"
269 +#include "indri/UnparsedDocument.hpp"
270 +#include "indri/UTF8Transcoder.hpp"
271 +#include "indri/AttributeValuePair.hpp"
273 +static long byte_position;
275 +#define ZAP 1
276 +#define TAG 2
277 +#define ASCII_TOKEN 3
278 +#define UTF8_TOKEN 4
281 +%start COMMENT
284 +"<!--" { BEGIN(COMMENT); byte_position += piatokleng; return ZAP; }
285 +<COMMENT>[^-]+ { byte_position += piatokleng; return ZAP; }
286 +<COMMENT>"-->" { BEGIN(INITIAL); byte_position += piatokleng; return ZAP; }
287 +<COMMENT>"-" { byte_position += piatokleng; return ZAP; }
288 +"<!"[^\>]*">" { byte_position += piatokleng; return ZAP; }
289 +\<[a-zA-Z/][^\>]*\> { byte_position += piatokleng; return TAG; }
290 +[&]([a-zA-Z]+|[#]([0-9]+|[xX][a-fA-F0-9]+))[;] { byte_position += piatokleng; return ZAP; /* symbols */ }
291 +[A-Z0-9]"."([A-Z0-9]".")* { byte_position += piatokleng; return ASCII_TOKEN; }
292 +[a-zA-Z0-9_']+ { byte_position += piatokleng; return ASCII_TOKEN; }
293 +"-"[0-9]+("."[0-9]+)? { byte_position += piatokleng; return ASCII_TOKEN; }
294 +[a-zA-Z0-9\x80-\xFD]+ { byte_position += piatokleng; return UTF8_TOKEN; }
296 +[\n] { byte_position += piatokleng; return ZAP; }
297 +. { byte_position += piatokleng; return ZAP; }
301 +indri::parse::TokenizedDocument* indri::parse::TextTokenizerPIA::tokenize( indri::parse::UnparsedDocument* document ) {
303 + _termBuffer.clear();
304 + if ( _tokenize_entire_words)
305 + _termBuffer.grow( document->textLength * 4);
306 + else
307 + _termBuffer.grow( document->textLength * 8 ); // extra null per char.
309 + _document.terms.clear();
310 + _document.tags.clear();
311 + _document.positions.clear();
313 + _document.metadata = document->metadata;
314 + _document.text = document->text;
315 + _document.textLength = document->textLength;
316 + _document.content = document->content;
317 + _document.contentLength = document->contentLength;
319 + // byte offset
320 + byte_position = document->content - document->text;
322 + piatok_scan_bytes( document->content, document->contentLength );
324 + // Main Tokenizer loop
326 + int type;
328 + while ( type = piatoklex() ) {
330 + switch ( type ) {
332 + case ASCII_TOKEN: processASCIIToken(); break;
334 + case UTF8_TOKEN: processUTF8Token(); break;
336 + case TAG: if ( _tokenize_markup ) processTag(); break;
338 + default:
339 + case ZAP:
340 + break;
346 + piatok_delete_buffer( YY_CURRENT_BUFFER );
348 + return &_document;
351 +// Member functions for processing tokenization events as dispatched
352 +// from the main tokenizer loop
354 +void indri::parse::TextTokenizerPIA::processTag() {
356 + // Here, we parse the tag in a fashion that is relatively robust to
357 + // malformed markup. piatoktext matches this pattern: <[^>]+>
359 + if ( piatoktext[1] == '?' || piatoktext[1] == '!' ) {
361 + // XML declaration like <? ... ?> and <!DOCTYPE ... >
362 + return; // ignore
364 + } else if ( piatoktext[1] == '/' ) { // close tag, eg. </FOO>
366 + // Downcase the tag name.
368 + int len = 0;
370 + for ( char *c = piatoktext + 2;
371 +#ifndef WIN32
372 + isalnum( *c ) || *c == '-' || *c == '_' || *c == ':' ; c++ ) {
373 +#else
374 + ((*c >= 0) && isalnum( *c )) || *c == '-' || *c == '_' || *c == ':' ; c++ ) {
375 +#endif
377 + *c = tolower( *c );
378 + if ( *c == ':' ) *c = '_'; /* replace colon (from namespaces) */
379 + len++;
382 + TagEvent te;
384 + te.open_tag = false;
386 + // We need to write len characters, plus a NULL
387 + char* write_loc = _termBuffer.write( len + 1 );
388 + strncpy( write_loc, piatoktext + 2, len );
389 + write_loc[len] = '\0';
390 + te.name = write_loc;
392 + // token position of tag event w/r/t token string
393 + te.pos = _document.terms.size();
395 + te.begin = byte_position - piatokleng;
396 + te.end = byte_position;
398 + _document.tags.push_back( te );
400 +#ifndef WIN32
401 + } else if ( isalpha( piatoktext[1] ) ) {
402 +#else
403 + } else if ( (piatoktext[1] >= 0) && (isalpha( piatoktext[1] ) )) {
404 +#endif
406 + // Try to extract the tag name:
408 + char* c = piatoktext + 1;
409 + int i = 0;
410 + int offset = 1; // current offset w/r/t byte_position - piatokleng
411 + // it starts at one because it is incremented when c is, and c starts at one.
412 + char* write_loc;
414 +#ifndef WIN32
415 + while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++;
416 +#else
417 + while ( ( (c[i] >= 0) && isalnum( c[i] )) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++;
418 +#endif
419 + if ( c[i] == '>' ) {
421 + // open tag with no attributes, eg. <title>
423 + // Ensure tag name is downcased
424 + for ( int j = 0; j < i; j++ ) {
425 + c[j] = tolower( c[j] );
426 + if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */
429 + TagEvent te;
431 + te.open_tag = true;
433 + // need to write i characters, plus a NULL
434 + char* write_loc = _termBuffer.write( i + 1 );
435 + strncpy( write_loc, c, i );
436 + write_loc[i] = '\0';
437 + te.name = write_loc;
439 + te.pos = _document.terms.size();
441 + te.begin = byte_position - piatokleng;
442 + te.end = byte_position;
444 + _document.tags.push_back( te );
446 +#ifndef WIN32
447 + } else if ( isspace( c[i] ) ) {
448 +#else
449 + } else if ( (c[i] >= 0) && (isspace( c[i] ) )) {
450 +#endif
452 + // open tag with attributes, eg. <A HREF="www.foo.com/bar">
454 + TagEvent te;
456 + te.open_tag = true;
458 + // Ensure tag name is downcased
459 + for ( int j = 0; j < i; j++ ) {
460 + c[j] = tolower( c[j] );
461 + if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */
464 + // need to write i characters, plus a NULL
465 + char* write_loc = _termBuffer.write( i + 1 );
466 + strncpy( write_loc, c, i );
467 + write_loc[i] = '\0';
468 + te.name = write_loc;
469 + c += i;
470 + offset += i;
472 +#ifndef WIN32
473 + while ( isspace( *c ) ) { c++; offset++; }
474 +#else
475 + while (((*c) >=0) && isspace( *c )) { c++; offset++; }
476 +#endif
478 + te.pos = _document.terms.size();
480 + te.begin = byte_position - piatokleng;
481 + te.end = byte_position;
483 + // Now search for attributes:
485 + while ( *c != '>' && *c != '\0' ) {
487 + AttributeValuePair avp;
489 + // Try to extract attribute name:
491 + i = 0;
492 +#ifndef WIN32
493 + while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' ) i++;
494 +#else
495 + while ( (c[i] >= 0) && isalnum( c[i] ) || c[i] == '-' || c[i] == '_') i++;
496 +#endif
498 + if ( i == 0 ) break;
500 + // Ensure attribute name is downcased
501 + for ( int j = 0; j < i; j++ )
502 + c[j] = tolower( c[j] );
504 + // need to write i characters, plus a NULL
505 + write_loc = _termBuffer.write( i + 1 );
506 + strncpy( write_loc, c, i );
507 + write_loc[i] = '\0';
508 + avp.attribute = write_loc;
509 + c += i;
510 + offset += i;
512 + // attributes can be foo\s*=\s*"bar[">] or foo\s*=\s*bar
514 + // ignore any spaces
515 +#ifndef WIN32
516 + while ( isspace( *c ) ) { c++; offset++; }
517 +#else
518 + while (((*c) >=0) && isspace( *c )) { c++; offset++; }
519 +#endif
521 + if ( *c == '=' ) {
523 + c++; // get past the '=' sign.
524 + offset++;
526 +#ifndef WIN32
527 + while ( isspace( *c ) ) { c++; offset++; }
528 +#else
529 + while (((*c) >=0) && isspace( *c )) { c++; offset++; }
530 +#endif
532 + if ( *c == '>' ) {
534 + // common malformed markup <a href=>
536 + // Insert empty attribute value
537 + // need to write a single NULL
538 + write_loc = _termBuffer.write( 1 );
539 + write_loc[0] = '\0';
540 + avp.value = write_loc;
541 + avp.begin = byte_position - piatokleng + offset;
542 + avp.end = byte_position - piatokleng + offset;
544 + } else {
546 + bool quoted = true;
547 + char quote_char;
548 + if ( *c == '"' || *c =='\'' ) { quote_char = *c; c++; offset++; }
549 + else quoted = false;
551 + // Attribute value starts here.
553 + i = 0;
554 +// make sure the opening and closing quote character match...
555 + if ( quoted )
556 +// while ( c[i] != '"' && c[i] != '>' && c[i] !='\'') i++;
557 + while ( c[i] != quote_char && c[i] != '>') i++;
558 + else
559 +#ifndef WIN32
560 + while ( ! isspace( c[i] ) && c[i] != '>' ) i++;
561 +#else
562 + while ( ((c[i] >= 0) && ! isspace( c[i] ) ) && c[i] != '>' ) i++;
563 +#endif
565 + // need to write i characters, plus a NULL
566 + write_loc = _termBuffer.write( i + 1 );
567 + strncpy( write_loc, c, i );
568 + write_loc[i] = '\0';
569 + avp.value = write_loc;
570 + avp.begin = byte_position - piatokleng + offset;
571 + avp.end = byte_position - piatokleng + offset + i;
572 + c += i;
573 + offset += i;
576 + } else {
578 + // Insert empty attribute value
579 + // need to write a single NULL
580 + write_loc = _termBuffer.write( 1 );
581 + write_loc[0] = '\0';
582 + avp.value = write_loc;
583 + avp.begin = byte_position - piatokleng + offset;
584 + avp.end = byte_position - piatokleng + offset;
586 +#ifndef WIN32
587 + while ( isspace( *c ) || *c == '"' ) { c++; offset++; }
588 +#else
589 + while ( ((*c >= 0) && isspace( *c )) || *c == '"' ) { c++; offset++; }
590 +#endif
592 + te.attributes.push_back( avp );
595 + _document.tags.push_back( te );
599 + // One of the cases that is ignored is this common malformed
600 + // markup <foo=bar> with no tag name. Another is the case
601 + // of an email address <foo@bar.com>
607 +void indri::parse::TextTokenizerPIA::processUTF8Token() {
609 + // A UTF-8 token, as recognized by flex, could actually be
610 + // a mixed ASCII/UTF-8 string containing any number of
611 + // UTF-8 characters, so we re-tokenize it here.
613 + indri::utility::HashTable<UINT64,const int>& unicode = _transcoder.unicode();
615 + int len = strlen( piatoktext );
617 + UINT64* unicode_chars = new UINT64[len + 1];
618 + int* offsets = new int[len + 1];
619 + int* lengths = new int[len + 1];
620 + _transcoder.utf8_decode( piatoktext, &unicode_chars, NULL, NULL,
621 + &offsets, &lengths );
623 + const int* p;
624 + int cls; // Character class of current UTF-8 character
625 + // offset of current UTF-8 character w/r/t piatoktext stored in offsets[i]
626 + // byte length of current UTF-8 character stored in lengths[i]
628 + int offset = 0; // Position of start of current *token* (not character) w/r/t piatoktext
629 + int extent = 0; // Extent for this *token* including trailing punct
630 + int piatoken_len = 0; // Same as above, minus the trailing punctuation
632 + char buf[64];
634 + // If this flag is true, we have punctuation symbols at the end of a
635 + // token, so do not attach another letter to this token.
636 + bool no_letter = false;
638 + // In case there are malformed characters preceding the good
639 + // characters:
640 + offset = offsets[0];
642 + for ( int i = 0; unicode_chars[i] != 0; i++ ) {
644 + p = unicode.find( unicode_chars[i] );
645 + cls = p ? *p : 0;
647 + if ( ! _tokenize_entire_words ) { // Tokenize by character
649 + if ( cls != 0 && cls != 3 && cls != 5 && cls != 9 ) {
651 + writeToken( piatoktext + offsets[i], lengths[i],
652 + byte_position - piatokleng + offsets[i],
653 + byte_position - piatokleng + offsets[i] + lengths[i] );
655 + continue;
658 + // If this is not the first time through this loop, we need
659 + // to check to see if any bytes in piatoktext were skipped
660 + // during the UTF-8 analysis:
662 + if ( i != 0 && offset + piatoken_len != offsets[i] ) {
664 + // Write out the token we are working on, if any:
666 + if ( piatoken_len > 0 ) {
668 + writeToken( piatoktext + offset, piatoken_len,
669 + byte_position - piatokleng + offset,
670 + byte_position - piatokleng + offset + extent );
673 + extent = 0;
674 + piatoken_len = 0;
675 + no_letter = false;
676 + offset = offsets[i];
679 + // Tokenize by word:
681 + switch ( cls ) {
683 + case 4: // Currency symbol: always extracted alone
684 + // Action: write the token we are working on,
685 + // and write this symbol as a separate token
686 + writeToken( piatoktext + offset, extent,
687 + byte_position - piatokleng + offset,
688 + byte_position - piatokleng + offset + extent );
690 + offset += extent;
692 + writeToken( piatoktext + offset, lengths[i],
693 + byte_position - piatokleng + offset,
694 + byte_position - piatokleng + offset + lengths[i] );
696 + offset += lengths[i];
697 + piatoken_len = 0;
698 + extent = 0;
699 + no_letter = false;
700 + break;
702 + case 1: // Apostrophe
703 + case 10: // Decimal separator
704 + case 6: // Letter
705 + case 7: // Digit
706 + // Action: add this character to the end of the token we are
707 + // working on
708 + if ( no_letter ) { // This is a token boundary
709 + writeToken( piatoktext + offset, piatoken_len,
710 + byte_position - piatokleng + offset,
711 + byte_position - piatokleng + offset + extent );
713 + offset += extent;
714 + extent = 0;
715 + piatoken_len = 0;
716 + no_letter = false;
720 + extent += lengths[i];
721 + piatoken_len += lengths[i];
722 + break;
724 + case 2: // Percent
725 + case 8: // Punctuation
726 + case 12: // Thousands separator
727 + case 11: // Hyphen
728 + // Action: These characters are included in the extent of the
729 + // token we are working on.
730 + no_letter = true;
731 + extent += lengths[i];
732 + break;
734 + case 0: // No character class!
735 + case 3: // Control character
736 + case 5: // Non-punctuation symbol
737 + case 9: // Whitespace
738 + default:
739 + // Action: write the token we are working on. Do not include
740 + // this character in any future token.
741 + writeToken( piatoktext + offset, piatoken_len,
742 + byte_position - piatokleng + offset,
743 + byte_position - piatokleng + offset + extent );
745 + offset += (extent + lengths[i]); // Include current character
746 + extent = 0;
747 + piatoken_len = 0;
748 + no_letter = false;
750 + break;
754 + // Write out last token
755 + if ( piatoken_len > 0 )
756 + writeToken( piatoktext + offset, piatoken_len,
757 + byte_position - piatokleng + offset,
758 + byte_position - piatokleng + offset + extent );
760 + delete[] unicode_chars;
761 + delete[] offsets;
762 + delete[] lengths;
765 +void indri::parse::TextTokenizerPIA::processASCIIToken() {
767 + int piatoken_len = strlen( piatoktext );
769 + // piatoken_len here is the length of the token without
770 + // any trailing punctuation.
772 + for ( int i = piatoken_len - 1; i > 0; i-- ) {
774 + if ( ! ispunct( piatoktext[i] ) )
775 + break;
776 + else
777 + piatoken_len--;
780 + if ( _tokenize_entire_words ) {
782 + writeToken( piatoktext, piatoken_len, byte_position - piatokleng, byte_position );
784 + } else {
786 + for ( int i = 0; i < piatoken_len; i++ )
787 + writeToken( piatoktext + i, 1, byte_position - piatokleng + i,
788 + byte_position - piatokleng + i + 1 );
793 +// ObjectHandler implementation
795 +void indri::parse::TextTokenizerPIA::handle( indri::parse::UnparsedDocument* document ) {
797 + _handler->handle( tokenize( document ) );
800 +void indri::parse::TextTokenizerPIA::setHandler( ObjectHandler<indri::parse::TokenizedDocument>& h ) {
802 + _handler = &h;
805 +void indri::parse::TextTokenizerPIA::writeToken( char* token, int piatoken_len,
806 + int extent_begin, int extent_end ) {
809 + // The TermExtent for a token will include trailing punctuation.
810 + // The purpose for this is that it makes for a nicer display when a
811 + // sequence of tokens (say, a sentence) is retrieved and shown to
812 + // the user.
814 + TermExtent extent;
815 + extent.begin = extent_begin;
816 + extent.end = extent_end;
817 + _document.positions.push_back( extent );
819 + // The terms entry for a token won't include the punctuation.
821 + char* write_loc = _termBuffer.write( piatoken_len + 1 );
822 + strncpy( write_loc, token, piatoken_len );
823 + write_loc[piatoken_len] = '\0';
824 + _document.terms.push_back( write_loc );
828 --- indri-5.4/include/indri/TextTokenizerPIA.hpp po črc 15 14:38:50 2013
829 +++ indri-5.4/include/indri/TextTokenizerPIA.hpp po črc 15 14:36:54 2013
830 @@ -0,0 +1,73 @@
831 +/*==========================================================================
832 + * Copyright (c) 2003-2005 University of Massachusetts. All Rights Reserved.
834 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
835 + * is subject to the terms of the software license set forth in the LICENSE
836 + * file included with this software, and also available at
837 + * http://www.lemurproject.org/license.html
839 + *==========================================================================
840 + */
843 +// TextTokenizerPIA
845 +// 15 September 2005 -- mwb
848 +#ifndef INDRI_TEXTTOKENIZERPIA_HPP
849 +#define INDRI_TEXTTOKENIZERPIA_HPP
851 +#include <stdio.h>
852 +#include <string>
853 +#include <map>
855 +#include "indri/IndriTokenizer.hpp"
856 +#include "indri/Buffer.hpp"
857 +#include "indri/TagEvent.hpp"
858 +#include "indri/UnparsedDocument.hpp"
859 +#include "indri/TokenizedDocument.hpp"
860 +#include "indri/UTF8Transcoder.hpp"
862 +namespace indri {
863 + namespace parse {
865 + class TextTokenizerPIA : public Tokenizer {
867 + public:
868 + TextTokenizerPIA( bool tokenize_markup = true, bool tokenize_entire_words = true ) : _handler(0) {
870 + _tokenize_markup = tokenize_markup;
871 + _tokenize_entire_words = tokenize_entire_words;
874 + ~TextTokenizerPIA() {}
876 + TokenizedDocument* tokenize( UnparsedDocument* document );
878 + void handle( UnparsedDocument* document );
879 + void setHandler( ObjectHandler<TokenizedDocument>& h );
881 + protected:
882 + void processASCIIToken();
883 + void processUTF8Token();
884 + void processTag();
886 + indri::utility::Buffer _termBuffer;
887 + UTF8Transcoder _transcoder;
889 + bool _tokenize_markup;
890 + bool _tokenize_entire_words;
892 + private:
893 + ObjectHandler<TokenizedDocument>* _handler;
894 + TokenizedDocument _document;
896 + void writeToken( char* token, int token_len, int extent_begin,
897 + int extent_end );
898 + };
902 +#endif // INDRI_TEXTTOKENIZERPIA_HPP
904 --- indri-5.4/src/TokenizerFactory.cpp po črc 15 14:39:30 2013
905 +++ indri-5.4/src/TokenizerFactory.cpp po črc 15 14:29:11 2013
906 @@ -22,6 +22,7 @@
908 #include "indri/TextTokenizer.hpp"
909 // Add an #include for your Tokenizer here.
910 +#include "indri/TextTokenizerPIA.hpp"
913 #define TOKENIZER_WORD ("Word")
914 @@ -29,6 +30,8 @@
915 #define TOKENIZER_CHAR ("Char")
916 #define TOKENIZER_CHAR_NO_MARKUP ("Char without Markup")
917 // Add a #define for your Tokenizer here.
918 +#define TOKENIZER_PIA ("PIA")
919 +#define TOKENIZER_PIA_NO_MARKUP ("PIA without Markup")
923 @@ -78,8 +81,23 @@
924 // got "char"
925 return TOKENIZER_CHAR;
927 + } else if ( ( name[0] == 'p' || name[0] == 'P' ) &&
928 + ( name[1] == 'i' || name[1] == 'I' ) &&
929 + ( name[2] == 'a' || name[2] == 'A' ) ) { // "pia" occupies indices 0-2, in either case
931 + if ( name[3] == '-' && // the "-no..." suffix starts right after the 3-letter prefix
932 + ( name[4] == 'n' || name[4] == 'N' ) &&
933 + ( name[5] == 'o' || name[5] == 'O' ) ) {
935 + // got "pia-nomarkup"
936 + return TOKENIZER_PIA_NO_MARKUP;
939 + // got "pia"
940 + return TOKENIZER_PIA;
944 return "";
947 @@ -105,6 +123,14 @@
949 tokenizer = new indri::parse::TextTokenizer( false, false );
951 + } else if ( preferred == TOKENIZER_PIA ) {
953 + tokenizer = new indri::parse::TextTokenizerPIA();
955 + } else if ( preferred == TOKENIZER_PIA_NO_MARKUP ) {
957 + tokenizer = new indri::parse::TextTokenizerPIA( false );
959 } else {
961 LEMUR_THROW( LEMUR_RUNTIME_ERROR, name + " is not a known tokenizer." );
962 --- indri-5.4/src/FileClassEnvironmentFactory.cpp po črc 15 14:40:19 2013
963 +++ indri-5.4/src/FileClassEnvironmentFactory.cpp po črc 15 14:29:12 2013
964 @@ -189,6 +189,20 @@
965 trec_conflations // conflations
968 + "trecpia", // name
969 + "xml", // parser
970 + "pia", // tokenizer
971 + "tagged", // iterator
972 + "<DOC>", // startDocTag
973 + "</DOC>", // endDocTag
974 + NULL, // endMetadataTag
975 + trec_include_tags, // includeTags
976 + NULL, // excludeTags
977 + trec_index_tags, // indexTags
978 + trec_metadata_tags, // metadataTags
979 + trec_conflations // conflations
980 + },
982 "trecchar", // name
983 "xml", // parser
984 "char", // tokenizer
985 --- indri-5.4/Makefile.app.in 2013-09-04 06:31:06.740210927 -0700
986 +++ indri-5.4/Makefile.app.in 2013-09-04 06:27:24.857989779 -0700
987 @@ -1,22 +1,26 @@
988 +include MakeDefns
990 ## your application name here
991 -APP=
992 +APP=pia_wrapper
993 SRC=$(APP).cpp
994 ## extra object files for your app here
995 OBJ=
996 +OUTPUT=lib$(APP).so.1
998 prefix = @prefix@
999 exec_prefix = ${prefix}
1000 libdir = @libdir@
1001 includedir = @includedir@
1002 -INCPATH=-I$(includedir)
1003 -LIBPATH=-L$(libdir)
1004 +INCPATH=-Iinclude -Icontrib/lemur/include
1005 +LIBPATH=-Lobj
1006 CXXFLAGS=@DEFS@ @CPPFLAGS@ @CXXFLAGS@ $(INCPATH)
1007 -CPPLDFLAGS = @LDFLAGS@ -lindri @LIBS@
1008 +CPPLDFLAGS = @LDFLAGS@ -lnvpair -lindri @LIBS@
1010 all:
1011 - $(CXX) $(CXXFLAGS) $(SRC) -o $(APP) $(OBJ) $(LIBPATH) $(CPPLDFLAGS)
1012 + $(CXX) $(CXXFLAGS) $(SRC) -fpic -shared -static-libgcc -h $(OUTPUT) -o $(OUTPUT) $(OBJ) $(LIBPATH) $(CPPLDFLAGS)
1014 clean:
1015 rm -f $(APP)
1018 +install:
1019 + cp $(OUTPUT) $(libdir)
1020 --- indri-5.4/Makefile 2013-09-12 07:39:16.027125829 -0700
1021 +++ indri-5.4/Makefile 2013-09-12 07:38:44.720450641 -0700
1022 @@ -73,5 +73,6 @@
1023 $(MAKE) install -C doc
1024 $(MAKE) -C site-search install
1025 $(INSTALL_DATA) Makefile.app $(pkgdatadir)
1026 + $(MAKE) -f Makefile.app install
1028 test: