1 /* trec_index.cc: indexer for trec experiments
3 * ----START-LICENCE----
4 * Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2003 Olly Betts
6 * Copyright 2003 Andy MacFarlane, City University
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
22 * -----END-LICENCE-----
30 #include <sys/types.h>
38 #include "htmlparse.h"
40 #include "config_file.h"
41 #include "indextext.h"
46 using namespace Xapian
;
// Marker searched for by get_document to detect the end of one document.
49 #define ENDDOC "</DOC>"
// NOTE(review): no use of MAX_URL_LENGTH is visible in this listing.
51 static const unsigned int MAX_URL_LENGTH
= 240;
52 // chamber (from hashbld) is where the input bundles are decompressed.
// CHAMBER_SIZE is the number of chars in chamber; index_file also passes it
// as the third argument of decompress_bundle.
53 #define CHAMBER_SIZE 30000000
// File-scope buffer read directly by getline and filled from index_file.
54 char chamber
[CHAMBER_SIZE
];
// Running totals; ttextsize is incremented in index_file by
// (decompressed length / 1048576.0) and both are printed by main.
56 float ttextsize
=0; // total amount of text in mb indexed
57 int totaldocs
=0; // total number of documents indexed
// HtmlParser subclass whose callbacks collect text from one document.
// process_text appends text to dump, opening_tag inspects tag attributes,
// and closing_tag reacts to "docno", "script" and "style".
// NOTE(review): access specifiers, the constructor head and the closing
// brace of this class are not visible in this listing — confirm against the
// complete file before editing.
59 class SGMLParser
: public HtmlParser
{
// Text collected while parsing; index_file reads title and keywords.
63 string title
, sample
, keywords
, dump
;
// Initialised to true by the constructor initialiser below; the only
// assignment to false in opening_tag is commented out.
64 bool indexing_allowed
;
65 void process_text(const string
&text
);
66 void opening_tag(const string
&tag
, const map
<string
,string
> &p
);
67 void closing_tag(const string
&tag
);
// Tail of the constructor's member-initialiser list (head not visible).
71 indexing_allowed(true) { }
// Text callback. When the parser is not inside a script or style element,
// the text from its first character that is not a space, tab, newline or
// carriage return onwards is appended to dump. Text consisting only of
// those characters is not appended by the visible code.
// NOTE(review): the lines following the append are not visible here.
74 void SGMLParser::process_text(const string
&text
) {
75 // some tags are meaningful mid-word so this is simplistic at best...
77 if (!in_script_tag
&& !in_style_tag
) {
// npos means the text contained nothing but the whitespace characters above.
78 string::size_type firstchar
= text
.find_first_not_of(" \t\n\r");
79 if (firstchar
!= string::npos
) {
80 dump
+= text
.substr(firstchar
);
// Opening-tag callback. tag is the tag name and p maps attribute names to
// attribute values.
// Visible handling:
//  - when p has both "content" and "name", the value of "name" selects one
//    of the "description", "keywords" or "robots" branches;
//  - tag "script" sets in_script_tag;
//  - tags "style" and "body" have branches whose bodies are not visible.
// NOTE(review): the condition enclosing the attribute lookups (presumably a
// test for a meta tag) and several branch bodies are missing from this
// listing — confirm against the complete file.
87 SGMLParser::opening_tag(const string
&tag
, const map
<string
,string
> &p
) {
// i locates the "content" attribute, j the "name" attribute.
91 map
<string
, string
>::const_iterator i
, j
;
92 if ((i
= p
.find("content")) != p
.end()) {
93 if ((j
= p
.find("name")) != p
.end()) {
94 string name
= j
->second
;
96 if (name
== "description") {
// NOTE(review): the assignment that fills sample before this call is not
// visible in this listing.
99 decode_entities(sample
);
101 } else if (name
== "keywords") {
// A single space separates successive keyword lists.
102 if (!keywords
.empty()) keywords
+= ' ';
103 string tmp
= i
->second
;
104 decode_entities(tmp
);
106 } else if (name
== "robots") {
107 string val
= i
->second
;
108 decode_entities(val
);
// Robots directives are detected but, in the visible code, not acted on:
// both statements inside this branch are commented out.
110 if (val
.find("none") != string::npos
||
111 val
.find("noindex") != string::npos
) {
112 //indexing_allowed = false;
113 //cout << "HELP!) found a robot tag which is difficult to index :(" << endl;
118 } else if (tag
== "script") {
119 in_script_tag
= true;
120 } else if (tag
== "style") {
122 } else if (tag
== "body") {
// Closing-tag callback.
//  - "docno": tests whether dump holds fewer than 30 characters and, on one
//    of the resulting paths, sets title to "DOCNO-ERROR".
//  - "script" / "style": clear in_script_tag / in_style_tag.
// NOTE(review): the lines between the size test and the "DOCNO-ERROR"
// assignment are missing from this listing, so it cannot be determined here
// which branch the assignment belongs to — confirm against the complete file.
128 SGMLParser::closing_tag(const string
&tag
)
130 //cout << "closing_tag) : " << tag << endl;
131 if (tag
== "docno") {
132 if( dump
.size() < 30 ) // nasty hack to get round problems on terabyte track with robot tags
135 title
= "DOCNO-ERROR";
137 } else if (tag
== "script") {
138 in_script_tag
= false;
139 } else if (tag
== "style") {
140 in_style_tag
= false;
// Reads one line out of the global chamber buffer.
// curpos  - in/out read position in chamber; advanced past each character
//           that is consumed.
// uncolen - number of valid bytes in chamber; reading stops there.
// Characters are appended to line until a '\n' is met or uncolen is reached.
// NOTE(review): the declaration of line, the handling of the '\n' itself and
// the return statement are not visible in this listing.
144 string
getline( int & curpos
, int uncolen
) {
147 for( ; curpos
< uncolen
&& chamber
[curpos
] !='\n'; curpos
++ )
148 line
+= chamber
[curpos
];
// Reads lines from chamber via getline until one contains ENDDOC.
// curpos  - in/out read position, forwarded to getline.
// uncolen - number of valid bytes in chamber, forwarded to getline.
// NOTE(review): how the lines are accumulated and what is returned is not
// visible in this listing. No check of curpos against uncolen is visible in
// the loop either — confirm the loop terminates when the buffer ends without
// an ENDDOC marker.
154 string
get_document( int & curpos
, int uncolen
) {
155 // alternative version of get document
159 while( !end_found
) {
160 string line
= getline( curpos
, uncolen
);
// A line containing ENDDOC anywhere ends the current document.
162 string::size_type pos
= line
.find(ENDDOC
,0);
163 if( pos
!= string::npos
) end_found
=1;
168 } // END get_document
// Builds a new document (wordlist) holding the terms of doc for which
// IsStopWord reports false.
// doc      - source document; taken by value.
// sw_store - stop-word store handed to IsStopWord.
// Only add_term is called on wordlist in the visible code.
// NOTE(review): the declaration of word, any terminator written after the
// copy, and the return statement are not visible in this listing — confirm
// word is large enough for the longest term and is terminated before it is
// passed to IsStopWord.
170 Xapian::Document
remove_stopwords( Xapian::Document doc
, SW_STORE
& sw_store
) {
171 // take a list of keywords and remove
173 Xapian::Document wordlist
;
176 for( TermIterator t
= doc
.termlist_begin(); t
!= doc
.termlist_end(); t
++ ) {
// Copy the term character by character into word for IsStopWord.
177 for( int i
=0; i
< (*t
).size(); i
++ ) word
[i
] = (*t
)[i
];
178 if(!IsStopWord( sw_store
, word
)) wordlist
.add_term( *t
);
184 } // END remove_stopwords
// Builds a new document (wordlist) whose terms are the stemmed forms of the
// terms of doc.
// doc - source document, taken by reference.
// The stemmer is constructed for "english" here, whereas index_file builds
// its stemmer from config.get_language().
// Only add_term is called on wordlist in the visible code.
// NOTE(review): the return statement is not visible in this listing.
186 Xapian::Document
stem_document( Xapian::Document
& doc
) {
188 Stem
stemmer("english");
189 Xapian::Document wordlist
;
191 for( TermIterator t
= doc
.termlist_begin(); t
!= doc
.termlist_end(); t
++ ) {
192 wordlist
.add_term(stemmer(*t
) );
199 } // END stem_document
// Predicate: true when c is '+' or '-'. index_text uses it to scan the
// run of plus/minus characters that follows a term.
// NOTE(review): the return-type line is not visible in this listing.
202 p_plusminus(unsigned int c
)
204 return c
== '+' || c
== '-';
// Splits s into terms and adds them to doc.
// s       - text to index, walked through AccentNormalisingItor.
// doc     - document receiving the terms.
// stemmer - applied to each term before it is added with prefix.
// wdfinc  - passed as the last argument of add_posting / add_term.
// prefix  - prepended to stemmed terms; rprefix (derived from it) is
//           prepended to the extra term added for capitalised words.
// Visible outline of the loop:
//  1. skip non-alphanumeric characters; stop when the input is exhausted;
//  2. if the term starts with an upper-case letter, first try to read it as
//     capitals separated by '.' characters;
//  3. otherwise read alphanumeric characters, then examine any following
//     run of '+' / '-' characters;
//  4. a term no longer than MAX_PROB_TERM_LENGTH is lower-cased, added with
//     rprefix when its first character was upper-case, then stemmed and
//     added with prefix.
// Callers assign the result to a Xapian::termpos (see index_file).
// NOTE(review): many lines of this function (the remaining parameter(s),
// the outer loop header, most branch bodies and the return) are missing
// from this listing; the outline above covers only what is visible.
208 index_text(const string
&s
, Xapian::Document
&doc
, Xapian::Stem
&stemmer
,
209 Xapian::termcount wdfinc
, const string
&prefix
,
212 string rprefix
= prefix
;
213 // If we're using a multi-character prefix, make sure to add a colon when
214 // generating raw (R) terms as otherwise XFOO + Rterm will collide with
216 if (rprefix
.size() > 1 && rprefix
[rprefix
.size() - 1] != ':')
// j is the scan position in s; s_end marks the end of the input.
220 AccentNormalisingItor
j(s
.begin());
221 const AccentNormalisingItor
s_end(s
.end());
// Find the first alphanumeric character at or after j.
223 AccentNormalisingItor first
= j
;
224 while (first
!= s_end
&& !isalnum(*first
)) ++first
;
225 if (first
== s_end
) break;
226 AccentNormalisingItor last
;
228 if (isupper(*first
)) {
// Consume "X." pairs: a '.' followed by another upper-case character.
231 while (++j
!= s_end
&& *j
== '.' && ++j
!= s_end
&& isupper(*j
)) {
// Fewer than two characters collected, or an alphanumeric character
// follows directly.
// NOTE(review): the body of this branch is not visible here.
234 if (term
.length() < 2 || (j
!= s_end
&& isalnum(*j
))) {
241 while (isalnum(*j
)) {
244 if (j
== s_end
) break;
246 AccentNormalisingItor next
= j
;
248 if (next
== s_end
|| !isalnum(*next
)) break;
// len records the term length before the '+' / '-' scan below.
253 string::size_type len
= term
.length();
255 while (j
!= s_end
&& p_plusminus(*j
)) {
259 if (j
!= s_end
&& isalnum(*j
)) {
265 if (term
.length() <= MAX_PROB_TERM_LENGTH
) {
266 lowercase_term(term
);
// Capitalised words additionally get a term built with rprefix.
267 if (isupper(*first
)) {
// pos equal to static_cast<Xapian::termpos>(-1) appears to select
// add_term instead of add_posting — the branch structure between the two
// calls is not visible in this listing.
268 if (pos
!= static_cast<Xapian::termpos
>(-1)
269 // Not in GCC 2.95.2 numeric_limits<Xapian::termpos>::max()
271 doc
.add_posting(rprefix
+ term
, pos
, wdfinc
);
273 doc
.add_term(rprefix
+ term
, wdfinc
);
277 term
= stemmer(term
);
278 if (pos
!= static_cast<Xapian::termpos
>(-1)
279 // Not in GCC 2.95.2 numeric_limits<Xapian::termpos>::max()
// pos is advanced only here, after the stemmed term is added.
281 doc
.add_posting(prefix
+ term
, pos
++, wdfinc
);
283 doc
.add_term(prefix
+ term
, wdfinc
);
// Indexes one bundle file that holds a number of SGML/HTML documents.
// file     - path of the bundle; passed to decompress_bundle.
// db       - database that receives one document per parsed record.
// sw_store - stop-word store; taken by value here, whereas remove_stopwords
//            takes it by reference.
// Visible steps: decompress the bundle into chamber, add its size to
// ttextsize, then for each document returned by get_document: parse it,
// index the parsed title and keywords with index_text, pass the result
// through remove_stopwords and stem_document, store the title as document
// data and add the document to db.
// NOTE(review): index_text already stems each term before adding it, and
// stem_document stems the terms again; the visible code of remove_stopwords
// and stem_document also rebuilds documents with add_term only. Confirm both
// are intended.
// NOTE(review): the remaining parameter(s), the readability check on file,
// the declarations of uncolen / curpos / p and the totaldocs update are not
// visible in this listing.
290 static void index_file( const string
&file
,
292 Xapian::WritableDatabase
& db
,
293 SW_STORE sw_store
) {
294 // index a file containing a number of SGML/HTML documents
297 cout
<< "can't read \"" << file
<< "\" - skipping\n";
299 } //else cout << "Indexing [" << file << "]" << endl;
302 Xapian::Stem
stemmer( config
.get_language() );
// uncolen receives the result of decompress_bundle; the visible code uses
// it as the number of valid bytes in chamber.
305 uncolen
= decompress_bundle( (u_char
*)file
.c_str(), (u_char
*) chamber
, CHAMBER_SIZE
);
306 //cout << "DEBUG) decompresses file done, size = " << uncolen << endl;
308 // accumulate the text size read in
309 ttextsize
+= ( (float) uncolen
/ 1048576.0);
311 while( curpos
< uncolen
) {
314 string rawdoc
= get_document( curpos
, uncolen
);
315 //cout << "DEBUG) got a document, size = " << rawdoc.size() <<
316 // ", curpos = " << curpos << endl;
// Documents of at most one character are skipped.
318 if( rawdoc
.size() > 1 ) {
320 // parse the document for the data
322 p
.parse_html(rawdoc
);
324 // Add postings for terms to the document
325 Xapian::Document doc
;
326 Xapian::termpos pos
= 1;
327 pos
= index_text( p
.title
, doc
, stemmer
, pos
);
// Keyword positions start one past the position returned for the title.
328 pos
= index_text( p
.keywords
, doc
, stemmer
, pos
+ 1);
330 // index the document
331 Xapian::Document doc_stopsremoved
= remove_stopwords( doc
, sw_store
);
332 Xapian::Document stemdoc
= stem_document( doc_stopsremoved
);
333 //cout << "DOCID = " << p.title << endl;
334 stemdoc
.set_data(p
.title
); // set the data
335 db
.add_document(stemdoc
);
337 // record the total no of docs done
339 //if( (totaldocs % 10000) == 0 ) cout << "DOCUMENTS PROCESSED) " << totaldocs << endl;
// Walks a directory tree and indexes the regular files found in it.
// dir    - directory to scan.
// config - configuration forwarded to index_file and to recursive calls.
// db     - database forwarded to index_file and to recursive calls.
// Entries whose name starts with '.' are skipped. Entries that cannot be
// stat'ed, and entries that are neither directories nor regular files, are
// reported on cout and skipped. Subdirectories are handled by recursion.
// NOTE(review): the remaining parameter(s), the construction of path and
// file, the try/catch around the recursive call and the closedir call are
// not visible in this listing.
346 static void index_directory( const string
&dir
, CONFIG_TREC
& config
, Xapian::WritableDatabase
& db
,
353 //cout << "[Entering directory " << dir << "]" << endl;
355 d
= opendir(path
.c_str());
357 cout
<< "Can't open directory \"" << path
<< "\" - skipping\n";
360 while ((ent
= readdir(d
)) != NULL
) {
362 // ".", "..", and other hidden files
363 if (ent
->d_name
[0] == '.') continue;
// Ensure a '/' separator at the end of file.
365 if (!file
.empty() && file
[file
.size() - 1] != '/') file
+= '/';
367 if (stat(file
.c_str(), &statbuf
) == -1) {
368 cout
<< "Can't stat \"" << file
<< "\" - skipping\n";
372 if (S_ISDIR(statbuf
.st_mode
)) {
373 // file is a directory
375 index_directory( file
, config
, db
, sw_store
);
378 cout
<< "Caught unknown exception in index_directory, rethrowing" << endl
;
384 if (S_ISREG(statbuf
.st_mode
)) {
385 // file is a regular indexable text file
// ext is the text after the last '.' in file.
// NOTE(review): no use of ext is visible in this listing.
387 string::size_type dot
= file
.find_last_of('.');
388 if (dot
!= string::npos
) ext
= file
.substr(dot
+ 1);
390 index_file( file
, config
, db
, sw_store
);
394 cout
<< "Not a regular file \"" << file
<< "\" - skipping\n";
398 } // END index_directory
// Program entry point: trec_index <config file>.
// Visible steps: load and validate the configuration named by argv[1], read
// the stop-word file, open the Flint database named in the configuration
// with DB_CREATE_OR_OPEN, index the configured text collection with
// index_directory while timing it, then print the elapsed time, document
// count and text size. Xapian::Error, string and const char* exceptions are
// caught and reported on cout, as are unknown exceptions.
// NOTE(review): the argument-count test, the declaration of sw_store, the
// try keyword and all return / exit statements are not visible in this
// listing.
400 int main(int argc
, char **argv
)
403 // check for proper usage of program
405 cout
<< "usage: " << argv
[0] << " <config file>" << endl
;
409 CONFIG_TREC trec_config
;
410 trec_config
.setup_config( string(argv
[1]) );
412 if( !trec_config
.check_index_config() ) {
413 cout
<< "ERROR - configure file invalid, pls check" << endl
;
// Load the stop-word list used later by remove_stopwords.
418 string stopsfilename
= trec_config
.get_stopsfile();
419 Read_SW_File( (char *) stopsfilename
.c_str(), &sw_store
);
421 // Catch any Xapian::Error exceptions thrown
424 Xapian::WritableDatabase
db(Xapian::Flint::open(trec_config
.get_db().c_str(), Xapian::DB_CREATE_OR_OPEN
));
426 struct timeval start_time
, finish_time
, timelapse
; /* timing variables */
429 gettimeofday( &start_time
, 0 );
431 // index the text collection
432 index_directory( trec_config
.get_textfile(), trec_config
, db
, sw_store
);
436 gettimeofday( &finish_time
, 0 );
438 // print the total time, and average time per query -
439 diff_time( finish_time
, start_time
, &timelapse
);
440 cout
<< "Total time for " << totaldocs
<< " documents is " << time_real( timelapse
) << " secs, text size = " << ttextsize
442 cout
<< "Total number of documents in the database is now " << db
.get_doccount() << " docs" << endl
;
444 } catch (const Xapian::Error
&e
) {
445 cout
<< "Exception: " << e
.get_msg() << endl
;
447 } catch (const string
&s
) {
448 cout
<< "Exception: " << s
<< endl
;
450 } catch (const char *s
) {
451 cout
<< "Exception: " << s
<< endl
;
454 cout
<< "Caught unknown exception" << endl
;