Add README
[xapian-trec.git] / index-xapian-trec.cc
blob1b357fa8f37cbc0d02a45c06f562a767fefb489b
1 /*
2 ** index-xapian-trec.cc
3 **
4 ** AM 6/12/2006
5 ** Internal use only: for xapian paper experiments
6 **
7 */
9 #include <xapian.h>
10 #include <iostream>
11 #include <fstream>
12 #include <string>
13 #include <vector>
14 #include <stdio.h>
15 #include <fcntl.h>
16 #include <errno.h>
17 #include <sys/uio.h>
18 #include <sys/time.h>
19 #include <unistd.h>
20 #include <sys/types.h>
21 #include <sys/wait.h>
22 #include <signal.h>
23 #include <sys/mman.h>
24 #include <sys/stat.h>
25 #include <dirent.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "htmlparse.h"
29 #include "P98_gzip.h"
30 #include "stopword.h"
31 #include "indextext.h"
33 using namespace Xapian;
34 using namespace std;
/* Indexer limits and file names. */
#define NO_OF_FILES 0           /* number of files to process in every directory */
#define START_FROM 0            /* which file to start from */
#define SW_FILE "stop.words"    /* stopword file handed to Read_SW_File() */
#define LINE_SIZE 80            /* maximum size of a line in a document */
#define MAXPATH 100             /* maximum path length */

/* chamber (from hashbld) is where the input bundles are decompressed. */
#define CHAMBER_SIZE 10000000
char chamber[CHAMBER_SIZE];
47 class TrecHtmlParser : public HtmlParser {
48 public:
49 bool in_script_tag;
50 bool in_style_tag;
51 string docno, sample, keywords, dump;
52 bool indexing_allowed;
53 void process_text(const string &text);
54 void opening_tag(const string &tag, const map<string,string> &p);
55 void closing_tag(const string &tag);
56 TrecHtmlParser() :
57 in_script_tag(false),
58 in_style_tag(false),
59 indexing_allowed(true) { }
62 void
63 TrecHtmlParser::process_text(const string &text)
65 // some tags are meaningful mid-word so this is simplistic at best...
67 if (!in_script_tag && !in_style_tag) {
68 string::size_type firstchar = text.find_first_not_of(" \t\n\r");
69 if (firstchar != string::npos) {
70 dump += text.substr(firstchar);
71 dump += " ";
76 void
77 TrecHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
79 #if 0
80 cout << "<" << tag;
81 map<string, string>::const_iterator x;
82 for (x = p.begin(); x != p.end(); x++) {
83 cout << " " << x->first << "=\"" << x->second << "\"";
85 cout << ">\n";
86 #endif
88 if (tag == "meta") {
89 map<string, string>::const_iterator i, j;
90 if ((i = p.find("content")) != p.end()) {
91 if ((j = p.find("name")) != p.end()) {
92 string name = j->second;
93 lowercase_term(name);
94 if (name == "description") {
95 if (sample.empty()) {
96 sample = i->second;
97 decode_entities(sample);
99 } else if (name == "keywords") {
100 if (!keywords.empty()) keywords += ' ';
101 string tmp = i->second;
102 decode_entities(tmp);
103 keywords += tmp;
104 } else if (name == "robots") {
105 string val = i->second;
106 decode_entities(val);
107 lowercase_term(val);
108 if (val.find("none") != string::npos ||
109 val.find("noindex") != string::npos) {
110 indexing_allowed = false;
111 throw true;
116 } else if (tag == "script") {
117 in_script_tag = true;
118 } else if (tag == "style") {
119 in_style_tag = true;
120 } else if (tag == "body") {
121 dump = "";
125 void
126 TrecHtmlParser::closing_tag(const string &tag)
128 if (tag == "docno") {
129 docno = dump;
130 dump = "";
131 } else if (tag == "script") {
132 in_script_tag = false;
133 } else if (tag == "style") {
134 in_style_tag = false;
135 } else if (tag == "body") {
136 throw true;
/* Predicate: true when c is neither a letter nor a digit. */
inline static bool
p_notalnum(unsigned int c)
{
    return isalnum(c) == 0;
}
/* Predicate: true when c is a plus or a minus sign. */
inline static bool
p_plusminus(unsigned int c)
{
    switch (c) {
        case '+':
        case '-':
            return true;
        default:
            return false;
    }
}
152 void index_text(const string &s, Xapian::Document &doc )
155 AccentNormalisingItor j(s.begin());
156 const AccentNormalisingItor s_end(s.end());
157 while (true) {
158 AccentNormalisingItor first = j;
159 while (first != s_end && !isalnum(*first)) ++first;
160 if (first == s_end) break;
161 AccentNormalisingItor last;
162 string term;
163 if (isupper(*first)) {
164 j = first;
165 term = *j;
166 while (++j != s_end && *j == '.' && ++j != s_end && isupper(*j)) {
167 term += *j;
169 if (term.length() < 2 || (j != s_end && isalnum(*j))) {
170 term = "";
172 last = j;
174 if (term.empty()) {
175 j = first;
176 while (isalnum(*j)) {
177 term += *j;
178 ++j;
179 if (j == s_end) break;
180 if (*j == '&') {
181 AccentNormalisingItor next = j;
182 ++next;
183 if (next == s_end || !isalnum(*next)) break;
184 term += '&';
185 j = next;
188 string::size_type len = term.length();
189 last = j;
190 while (j != s_end && p_plusminus(*j)) {
191 term += *j;
192 ++j;
194 if (j != s_end && isalnum(*j)) {
195 term.resize(len);
196 } else {
197 last = j;
200 if (term.length() <= MAX_PROB_TERM_LENGTH) {
201 lowercase_term(term);
202 doc.add_term( term );
207 } // END index_text
209 string get_document( string & text, int & curpos, int uncolen ) {
211 string enddoc = "</DOC>";
212 char *end = strstr( &chamber[curpos], enddoc.c_str() );
213 string document;
214 for (int i=0; i < uncolen && &chamber[i] != end; i++)
215 document += chamber[i];
216 document += "</DOC>";
217 curpos += document.length();
219 return document;
221 } // get_document
223 Xapian::Document remove_stopwords( Xapian::Document doc, SW_STORE & sw_store ) {
224 // take a list of keywords and remove
226 Xapian::Document wordlist;
227 char word[100];
229 for( TermIterator t = doc.termlist_begin(); t != doc.termlist_end(); t++ ) {
230 for( int i=0; i < (*t).size(); i++ ) word[i] = (*t)[i];
231 if(!IsStopWord( sw_store, word )) wordlist.add_term( *t );
233 } // END for
235 return wordlist;
237 } // END remove_stopwords
239 Xapian::Document stem_document( Xapian::Document & doc ) {
241 Stem stemmer("english");
242 Xapian::Document wordlist;
244 for( TermIterator t = doc.termlist_begin(); t != doc.termlist_end(); t++ ) {
245 wordlist.add_term(stemmer.stem_word(*t) );
247 } // END for
249 return wordlist;
252 } // END stem_document
255 /**********************************************************************/
256 /* Process one compressed bundle */
257 /**********************************************************************/
259 static int processfile(string fp, Xapian::WritableDatabase &db, SW_STORE & sw_store ) {
260 /* A file believed to be a compressed doc bundle has been encountered. It's
261 full path is fp. It is to be decompressed and processed as requested. */
263 int uncolen;
264 int pointer=0;
265 u_char filen[100];
267 // uncompress the file
268 cout << "Processing" << fp << "\n";
269 for( int i=0; i < fp.size(); i++ ) filen[i] = fp[i];
270 uncolen = decompress_bundle(filen, (u_char *) chamber, CHAMBER_SIZE);
272 // index the file in Xapian
273 do {
275 Xapian::Document newdoc;
277 // parse the document
278 string text = get_document( text, pointer, uncolen );
279 TrecHtmlParser p;
280 p.parse_html(text);
282 // index the document
283 index_text(p.keywords, newdoc );
284 Xapian::Document doc_stopsremoved = remove_stopwords( newdoc, sw_store );
285 Xapian::Document stemdoc = stem_document( doc_stopsremoved );
286 stemdoc.set_data(p.docno); // set the data
287 db.add_document(stemdoc);
289 } while( pointer < uncolen );
292 } /* END processfile */
294 static void index_directory( const string &dir, Xapian::WritableDatabase & db,
295 SW_STORE sw_store )
297 DIR *d;
298 struct dirent *ent;
299 string path = dir;
301 cout << "[Entering directory " << dir << "]" << endl;
303 d = opendir(path.c_str());
304 if (d == NULL) {
305 cout << "Can't open directory \"" << path << "\" - skipping\n";
306 return;
308 while ((ent = readdir(d)) != NULL) {
309 struct stat statbuf;
310 // ".", "..", and other hidden files
311 if (ent->d_name[0] == '.') continue;
312 string file = dir;
313 if (!file.empty() && file[file.size() - 1] != '/') file += '/';
314 file += ent->d_name;
315 if (stat(file.c_str(), &statbuf) == -1) {
316 cout << "Can't stat \"" << file << "\" - skipping\n";
317 continue;
318 } // END if
320 if (S_ISDIR(statbuf.st_mode)) {
321 // file is a directory
322 try {
323 index_directory( file, db, sw_store );
325 catch (...) {
326 cout << "Caught unknown exception in index_directory, rethrowing" << endl;
327 throw;
329 continue;
330 } // END if
332 if (S_ISREG(statbuf.st_mode)) {
333 // file is a regular indexable text file
334 string ext;
335 string::size_type dot = file.find_last_of('.');
336 if (dot != string::npos) ext = file.substr(dot + 1);
338 processfile( file, db, sw_store );
339 continue;
340 } // END if
342 cout << "Not a regular file \"" << file << "\" - skipping\n";
344 closedir(d);
346 } // END index_directory
349 int main (int argc, char *argv[]) {
351 // There must be four command line arguments passed in the order:
352 // database_name, query file name, results file name, db type, run type,
353 if (argc < 2) {
354 cerr << "ERROR: Insufficient arguments passed to program\n";
355 cerr << "USAGE: index-xapian-trec <database> \n";
356 exit(0);
357 } // END if
359 // Catch any Error exceptions thrown
360 try {
362 /* load the stopword list */
363 SW_STORE sw_store;
364 Read_SW_File( SW_FILE, &sw_store );
366 /* set up xapian indexing */
367 Xapian::WritableDatabase db(Xapian::Flint::open(argv[1], Xapian::DB_CREATE_OR_OPEN));
369 /* scan the directories/files and put them in an index */
370 index_directory( argv[1], db, sw_store );
372 } catch (const Error &error) {
373 cout << "Exception: " << error.get_msg() << endl;
374 exit(1);
375 } // END try/catch
377 } // END main