2 ** index-xapian-trec.cc
5 ** Internal use only: for xapian paper experiments
20 #include <sys/types.h>
28 #include "htmlparse.h"
31 #include "indextext.h"
33 using namespace Xapian
;
36 /* #define MAXPATH 100 maximum pathlength */
37 #define NO_OF_FILES 0 /* number of files to process in every directory */
38 #define START_FROM 0 /* which file to start from */
// Name of the stopword list file; main() passes it to Read_SW_File.
39 #define SW_FILE "stop.words"
40 #define LINE_SIZE 80 /* maximum size of a line in a document */
43 /* chamber (from hashbld) is where the input bundles are decompressed.*/
// CHAMBER_SIZE is the element count of the chamber array and is also passed
// to decompress_bundle together with the array.
44 #define CHAMBER_SIZE 10000000
// File-scope buffer: processfile() hands it to decompress_bundle and
// get_document() reads documents back out of it.
45 char chamber
[CHAMBER_SIZE
];
// HtmlParser specialisation for TREC documents. The callbacks declared here
// fill the string members:
//   sample   - decoded when a meta tag named "description" is seen,
//   keywords - extended when a meta tag named "keywords" is seen,
//   dump     - text appended by process_text(),
//   docno    - presumably set when a "docno" tag closes (that branch's body
//              is not visible here; confirm).
// processfile() reads p.keywords and p.docno directly, so those members must
// be accessible from outside the class.
47 class TrecHtmlParser
: public HtmlParser
{
51 string docno
, sample
, keywords
, dump
;
// Initialised to true by the constructor below; opening_tag() sets it to
// false when a robots meta tag contains "none" or "noindex".
52 bool indexing_allowed
;
// Text callback (see definition).
53 void process_text(const string
&text
);
// Start-tag callback; p maps attribute names to attribute values.
54 void opening_tag(const string
&tag
, const map
<string
,string
> &p
);
// End-tag callback.
55 void closing_tag(const string
&tag
);
// Constructor initialiser/body: indexing is allowed until a robots meta tag
// says otherwise.
59 indexing_allowed(true) { }
// Text callback. Unless the parser is currently inside a script or style
// element, appends `text` to dump, starting at its first character that is
// not a space, tab, newline or carriage return. In the lines shown, text made
// up only of those characters is not appended.
63 TrecHtmlParser::process_text(const string
&text
)
65 // some tags are meaningful mid-word so this is simplistic at best...
67 if (!in_script_tag
&& !in_style_tag
) {
// Index of the first non-whitespace character, or npos if there is none.
68 string::size_type firstchar
= text
.find_first_not_of(" \t\n\r");
69 if (firstchar
!= string::npos
) {
// Leading whitespace is dropped; everything from firstchar on is kept.
70 dump
+= text
.substr(firstchar
);
// Start-tag callback. `tag` is the element name and `p` its attributes.
// Visible behaviour: writes the attributes to cout, interprets
// name/content attribute pairs (description, keywords, robots), and tracks
// entry into script elements. Several lines of the body are not visible, so
// the enclosing conditions of some statements cannot be confirmed here.
77 TrecHtmlParser::opening_tag(const string
&tag
, const map
<string
,string
> &p
)
81 map
<string
, string
>::const_iterator x
;
// Writes every attribute as  name="value"  (with a leading space) to cout.
// NOTE(review): looks like debugging output; whether it is guarded by a
// condition is not visible here - confirm it is intended during indexing.
82 for (x
= p
.begin(); x
!= p
.end(); x
++) {
83 cout
<< " " << x
->first
<< "=\"" << x
->second
<< "\"";
// i locates the "content" attribute, j the "name" attribute; the handling
// below only runs when both are present.
89 map
<string
, string
>::const_iterator i
, j
;
90 if ((i
= p
.find("content")) != p
.end()) {
91 if ((j
= p
.find("name")) != p
.end()) {
92 string name
= j
->second
;
94 if (name
== "description") {
// sample has its HTML entities decoded in place (the assignment to sample is
// not visible here).
97 decode_entities(sample
);
99 } else if (name
== "keywords") {
// A single space separates keyword lists coming from successive meta tags.
100 if (!keywords
.empty()) keywords
+= ' ';
101 string tmp
= i
->second
;
102 decode_entities(tmp
);
104 } else if (name
== "robots") {
105 string val
= i
->second
;
106 decode_entities(val
);
// "none" or "noindex" anywhere in the robots content disables indexing.
// The match is a plain substring search.
108 if (val
.find("none") != string::npos
||
109 val
.find("noindex") != string::npos
) {
110 indexing_allowed
= false;
116 } else if (tag
== "script") {
117 in_script_tag
= true;
// NOTE(review): the body of the "style" branch is not visible here.
// closing_tag() clears in_style_tag and process_text() tests it, so confirm
// that it is set to true in this branch.
118 } else if (tag
== "style") {
120 } else if (tag
== "body") {
// End-tag callback. Clears the in_script_tag / in_style_tag flags when the
// matching element closes. The bodies of the "docno" and "body" branches are
// not visible here.
126 TrecHtmlParser::closing_tag(const string
&tag
)
128 if (tag
== "docno") {
131 } else if (tag
== "script") {
132 in_script_tag
= false;
133 } else if (tag
== "style") {
134 in_style_tag
= false;
135 } else if (tag
== "body") {
// Character predicate; its body is not visible here (presumably true for
// characters that are not alphanumeric - confirm).
141 p_notalnum(unsigned int c
)
// Character predicate: true exactly when c is '+' or '-'. Used by
// index_text() to scan '+'/'-' characters that follow a term.
147 p_plusminus(unsigned int c
)
149 return c
== '+' || c
== '-';
// Splits the string s into terms and adds them to doc.
//   s   - text to tokenise; it is read through AccentNormalisingItor.
//   doc - Xapian document that receives the terms via add_term().
// Large parts of the body (including the enclosing loop and the declaration
// of `term`) are not visible here; the comments below describe only what the
// visible lines show.
152 void index_text(const string
&s
, Xapian::Document
&doc
)
// j is the scanning position; s_end marks the end of the input.
155 AccentNormalisingItor
j(s
.begin());
156 const AccentNormalisingItor
s_end(s
.end());
// Skip ahead to the next alphanumeric character; stop when the input is
// exhausted.
158 AccentNormalisingItor first
= j
;
159 while (first
!= s_end
&& !isalnum(*first
)) ++first
;
160 if (first
== s_end
) break;
161 AccentNormalisingItor last
;
// Upper-case start: the loop below consumes '.' followed by another
// upper-case character (presumably to recognise dotted initials such as
// "U.S." - confirm against the lines not shown).
163 if (isupper(*first
)) {
166 while (++j
!= s_end
&& *j
== '.' && ++j
!= s_end
&& isupper(*j
)) {
// A result shorter than 2 characters, or one directly followed by an
// alphanumeric character, takes this branch (presumably the dotted reading
// is abandoned - confirm).
169 if (term
.length() < 2 || (j
!= s_end
&& isalnum(*j
))) {
// Consume a run of alphanumeric characters.
176 while (isalnum(*j
)) {
179 if (j
== s_end
) break;
// Look-ahead copy of the position; scanning stops when nothing alphanumeric
// follows.
181 AccentNormalisingItor next
= j
;
183 if (next
== s_end
|| !isalnum(*next
)) break;
// len records the term length before any '+'/'-' characters are scanned.
188 string::size_type len
= term
.length();
190 while (j
!= s_end
&& p_plusminus(*j
)) {
194 if (j
!= s_end
&& isalnum(*j
)) {
// Only terms no longer than MAX_PROB_TERM_LENGTH are lower-cased and added.
200 if (term
.length() <= MAX_PROB_TERM_LENGTH
) {
201 lowercase_term(term
);
202 doc
.add_term( term
);
// Extracts one document from the global chamber buffer.
//   text    - not referenced in the lines visible here.
//   curpos  - in/out position in chamber; the search for "</DOC>" starts
//             here and it is advanced by the length of the extracted text.
//   uncolen - upper bound on the chamber index used while copying.
// Presumably returns `document` (its declaration and the return statement
// are not visible here).
209 string
get_document( string
& text
, int & curpos
, int uncolen
) {
211 string enddoc
= "</DOC>";
// First occurrence of "</DOC>" at or after curpos. strstr yields a null
// pointer when there is none.
212 char *end
= strstr( &chamber
[curpos
], enddoc
.c_str() );
// NOTE(review): the copy starts at index 0, not at curpos. For curpos > 0
// the result therefore also contains everything before curpos, and curpos is
// then advanced by that larger length. Confirm whether `i = curpos` was
// intended.
// NOTE(review): if end is null the `!= end` test never stops the loop, so
// the copy runs up to uncolen.
214 for (int i
=0; i
< uncolen
&& &chamber
[i
] != end
; i
++)
215 document
+= chamber
[i
];
// The terminator is appended unconditionally because the loop stops before
// copying it.
216 document
+= "</DOC>";
217 curpos
+= document
.length();
// Builds a new document containing only those terms of doc for which
// IsStopWord() returns false.
//   doc      - source document (taken by value).
//   sw_store - stopword store handed to IsStopWord().
// Presumably returns wordlist (the return statement is not visible here).
223 Xapian::Document
remove_stopwords( Xapian::Document doc
, SW_STORE
& sw_store
) {
224 // take a list of keywords and remove
226 Xapian::Document wordlist
;
229 for( TermIterator t
= doc
.termlist_begin(); t
!= doc
.termlist_end(); t
++ ) {
// Copies the term character by character into `word` (declared outside the
// lines visible here) for IsStopWord().
// NOTE(review): no terminating '\0' is written and no bound is checked in
// this line; confirm that word is large enough and properly terminated
// before it is read.
230 for( int i
=0; i
< (*t
).size(); i
++ ) word
[i
] = (*t
)[i
];
231 if(!IsStopWord( sw_store
, word
)) wordlist
.add_term( *t
);
237 } // END remove_stopwords
// Builds a new document whose terms are the stemmed forms of the terms of
// doc, using a Stem object constructed for "english".
// Presumably returns wordlist (the return statement is not visible here).
239 Xapian::Document
stem_document( Xapian::Document
& doc
) {
241 Stem
stemmer("english");
242 Xapian::Document wordlist
;
// Each term is passed through stem_word() and the result is added to the
// new document; doc itself is only read.
244 for( TermIterator t
= doc
.termlist_begin(); t
!= doc
.termlist_end(); t
++ ) {
245 wordlist
.add_term(stemmer
.stem_word(*t
) );
252 } // END stem_document
255 /**********************************************************************/
256 /* Process one compressed bundle */
257 /**********************************************************************/
// Decompresses the bundle at path fp into chamber and adds one Xapian
// document per extracted document to db.
//   fp       - path of the compressed bundle.
//   db       - database that receives the documents.
//   sw_store - stopword store used by remove_stopwords().
// The declarations of filen, uncolen, pointer and the parser object p are
// not visible here.
259 static int processfile(string fp
, Xapian::WritableDatabase
&db
, SW_STORE
& sw_store
) {
260 /* A file believed to be a compressed doc bundle has been encountered. Its
261 full path is fp. It is to be decompressed and processed as requested. */
267 // uncompress the file
// NOTE(review): the message has no separator between "Processing" and the
// path.
268 cout
<< "Processing" << fp
<< "\n";
// Copies the path character by character into filen.
// NOTE(review): no terminating '\0' is written and no bound is checked in
// this line; confirm filen is large enough and terminated elsewhere.
269 for( int i
=0; i
< fp
.size(); i
++ ) filen
[i
] = fp
[i
];
// uncolen receives decompress_bundle's result; it is later used as the end
// position for the document loop.
270 uncolen
= decompress_bundle(filen
, (u_char
*) chamber
, CHAMBER_SIZE
);
272 // index the file in Xapian
275 Xapian::Document newdoc
;
277 // parse the document
// NOTE(review): text is passed to get_document inside its own initialiser,
// i.e. before it has been constructed. get_document does not reference that
// parameter in the lines visible in this listing, but confirm.
278 string text
= get_document( text
, pointer
, uncolen
);
282 // index the document
// Only p.keywords is indexed in the lines visible here. The terms then go
// through stopword removal and stemming; p.docno becomes the document data.
283 index_text(p
.keywords
, newdoc
);
284 Xapian::Document doc_stopsremoved
= remove_stopwords( newdoc
, sw_store
);
285 Xapian::Document stemdoc
= stem_document( doc_stopsremoved
);
286 stemdoc
.set_data(p
.docno
); // set the data
287 db
.add_document(stemdoc
);
// Repeat until pointer (advanced by get_document) reaches uncolen.
289 } while( pointer
< uncolen
);
292 } /* END processfile */
// Walks a directory: recurses into sub-directories and calls processfile()
// on regular files. The rest of the parameter list and the declarations of
// path, d, ent, file, statbuf and ext are not visible here; recursive calls
// pass (file, db, sw_store).
294 static void index_directory( const string
&dir
, Xapian::WritableDatabase
& db
,
301 cout
<< "[Entering directory " << dir
<< "]" << endl
;
303 d
= opendir(path
.c_str());
// Reported when the directory cannot be opened (the test itself is not
// visible here).
305 cout
<< "Can't open directory \"" << path
<< "\" - skipping\n";
308 while ((ent
= readdir(d
)) != NULL
) {
310 // ".", "..", and other hidden files
311 if (ent
->d_name
[0] == '.') continue;
// Make sure file ends in '/' when it is non-empty (presumably before the
// entry name is appended - that line is not visible here).
313 if (!file
.empty() && file
[file
.size() - 1] != '/') file
+= '/';
315 if (stat(file
.c_str(), &statbuf
) == -1) {
316 cout
<< "Can't stat \"" << file
<< "\" - skipping\n";
320 if (S_ISDIR(statbuf
.st_mode
)) {
321 // file is a directory
323 index_directory( file
, db
, sw_store
);
// Message written by an exception handler around the recursive call (the
// handler itself is not visible here).
326 cout
<< "Caught unknown exception in index_directory, rethrowing" << endl
;
332 if (S_ISREG(statbuf
.st_mode
)) {
333 // file is a regular indexable text file
// ext is the part of the path after its last '.'; how ext is used is not
// visible here.
335 string::size_type dot
= file
.find_last_of('.');
336 if (dot
!= string::npos
) ext
= file
.substr(dot
+ 1);
338 processfile( file
, db
, sw_store
);
// Reported for entries that are neither directories nor regular files.
342 cout
<< "Not a regular file \"" << file
<< "\" - skipping\n";
// NOTE(review): no closedir(d) call is visible in this listing; confirm the
// directory handle is closed.
346 } // END index_directory
// Program entry point: loads the stopword list, opens (or creates) the
// Xapian database named by argv[1] and indexes a directory tree into it.
// Xapian errors are caught and their message is written to cout.
349 int main (int argc
, char *argv
[]) {
351 // There must be four command line arguments passed in the order:
352 // database_name, query file name, results file name, db type, run type,
// NOTE(review): the comment above does not match the usage message below,
// which names a single <database> argument; the argument-count test itself
// is not visible here. Confirm which is correct.
354 cerr
<< "ERROR: Insufficient arguments passed to program\n";
355 cerr
<< "USAGE: index-xapian-trec <database> \n";
359 // Catch any Error exceptions thrown
362 /* load the stopword list */
364 Read_SW_File( SW_FILE
, &sw_store
);
366 /* set up xapian indexing */
367 Xapian::WritableDatabase
db(Xapian::Flint::open(argv
[1], Xapian::DB_CREATE_OR_OPEN
));
369 /* scan the directories/files and put them in an index */
// NOTE(review): argv[1] is used both as the database path above and as the
// directory to index here; confirm this is intended.
370 index_directory( argv
[1], db
, sw_store
);
372 } catch (const Error
&error
) {
373 cout
<< "Exception: " << error
.get_msg() << endl
;