Update NEWS from git log
[xapian.git] / xapian-applications / omega / omega.cc
blobe33f25f207a9db637da5e7960b13b6166044e5fc
1 /** @file
2 * @brief Main module for omega (example CGI frontend for Xapian)
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015,2016,2018 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
25 #include <config.h>
27 #include <cerrno>
28 #include <cstdio>
29 #include <ctime>
31 #include <algorithm>
32 #include <cstring>
33 #include <iostream>
34 #include <set>
36 #include "safefcntl.h"
37 #include "safeunistd.h"
39 #include "omega.h"
40 #include "utils.h"
41 #include "cgiparam.h"
42 #include "query.h"
43 #include "str.h"
44 #include "stringutils.h"
45 #include "expand.h"
47 using namespace std;
49 static const char DEFAULT_STEM_LANGUAGE[] = "english";
51 // A character which doesn't require URL encoding, and isn't likely to appear
52 // in filter values.
53 const char filter_sep = '~';
55 // What we used for filter_sep in Omega < 1.3.4.
56 const char filter_sep_old = '-';
58 Xapian::Enquire * enquire;
59 Xapian::Database db;
61 map<string, string> option;
63 bool set_content_type = false;
65 bool suppress_http_headers = false;
67 string dbname;
68 string fmtname;
69 string filters, old_filters;
71 Xapian::docid hits_per_page = 0;
72 Xapian::docid min_hits = 0;
74 // percentage cut-off
75 int threshold = 0;
77 Xapian::MultiValueKeyMaker* sort_keymaker = NULL;
78 Xapian::valueno sort_key = Xapian::BAD_VALUENO; // Don't sort.
79 bool reverse_sort = true;
80 bool sort_after = false;
81 Xapian::Enquire::docid_order docid_order = Xapian::Enquire::ASCENDING;
83 Xapian::valueno collapse_key = 0;
84 bool collapse = false;
86 static string
87 map_dbname_to_dir(const string &database_name)
89 return database_dir + database_name;
92 static void
93 add_database(const string& this_dbname)
95 if (!dbname.empty()) dbname += '/';
96 dbname += this_dbname;
98 Xapian::Database this_db(map_dbname_to_dir(this_dbname));
99 db.add_database(this_db);
101 size_t this_db_size = this_db.size();
102 size_t db_size = db.size();
103 size_t i = 0;
104 while (subdbs.size() != db_size) {
105 subdbs.emplace_back(this_dbname, i++, this_db_size);
109 // Get database(s) to search.
110 template<typename IT>
111 void
112 parse_db_params(const pair<IT, IT>& dbs)
114 dbname.resize(0);
115 // Only add a repeated db once.
116 set<string> seen;
117 for (auto i = dbs.first; i != dbs.second; ++i) {
118 const string& v = i->second;
119 if (v.empty()) continue;
120 size_t p = 0, q;
121 while (true) {
122 q = v.find('/', p);
123 string s(v, p, q - p);
124 if (!s.empty() && seen.find(s) == seen.end()) {
125 add_database(s);
126 seen.insert(s);
128 if (q == string::npos) break;
129 p = q + 1;
134 int main(int argc, char *argv[])
135 try {
137 // Check for SERVER_PROTOCOL=INCLUDED, which is set when we're being
138 // included in a page via a server-side include directive. In this
139 // case we suppress sending a Content-Type: header.
140 const char* p = getenv("SERVER_PROTOCOL");
141 if (p && strcmp(p, "INCLUDED") == 0) {
142 suppress_http_headers = true;
146 read_config_file();
148 option["flag_default"] = "true";
150 // set default thousands and decimal separators: e.g. "16,729 hits" "1.4K"
151 option["decimal"] = ".";
152 option["thousand"] = ",";
154 // set the default stemming language
155 option["stemmer"] = DEFAULT_STEM_LANGUAGE;
157 // FIXME: set cout to linebuffered not stdout. Or just flush regularly...
158 // setvbuf(stdout, NULL, _IOLBF, 0);
160 const char * method = getenv("REQUEST_METHOD");
161 if (method == NULL) {
162 if (argc > 1 && (argv[1][0] != '-' || strchr(argv[1], '='))) {
163 // omega 'P=information retrieval' DB=papers
164 // check for a leading '-' on the first arg so "omega --version",
165 // "omega --help", and similar take the next branch
166 decode_argv(argv + 1);
167 } else {
168 // Seems we're running from the command line so give version
169 // and allow a query to be entered for testing
170 cout << PROGRAM_NAME " - " PACKAGE " " VERSION "\n";
171 if (argc > 1) exit(0);
172 cout << "Enter NAME=VALUE lines, end with blank line\n";
173 decode_test();
175 } else {
176 if (*method == 'P')
177 decode_post();
178 else
179 decode_get();
182 try {
183 parse_db_params(cgi_params.equal_range("DB"));
184 if (dbname.empty()) {
185 add_database(default_db);
187 enquire = new Xapian::Enquire(db);
188 } catch (const Xapian::Error &) {
189 enquire = NULL;
190 db = Xapian::Database();
193 hits_per_page = 0;
194 auto val = cgi_params.find("HITSPERPAGE");
195 if (val != cgi_params.end()) hits_per_page = atol(val->second.c_str());
196 if (hits_per_page == 0) {
197 hits_per_page = 10;
198 } else if (hits_per_page > 1000) {
199 hits_per_page = 1000;
202 val = cgi_params.find("DEFAULTOP");
203 if (val != cgi_params.end()) {
204 const string & v = val->second;
205 if (v == "OR" || v == "or")
206 default_op = Xapian::Query::OP_OR;
209 val = cgi_params.find("FMT");
210 if (val != cgi_params.end()) {
211 const string & v = val->second;
212 if (!v.empty()) fmtname = v;
214 if (fmtname.empty())
215 fmtname = default_template;
217 auto ml = cgi_params.equal_range("MORELIKE");
218 if (enquire && ml.first != ml.second) {
219 Xapian::RSet tmprset;
220 for (auto i = ml.first; i != ml.second; ++i) {
221 const string& v = i->second;
222 Xapian::docid docid = atol(v.c_str());
223 if (docid == 0) {
224 // Assume it's MORELIKE=Quid1138 and that Quid1138 is a UID
225 // from an external source - we just find the correspond docid.
226 Xapian::PostingIterator p = db.postlist_begin(v);
227 if (p != db.postlist_end(v)) docid = *p;
229 if (docid != 0) {
230 tmprset.add_document(docid);
234 if (!tmprset.empty()) {
235 OmegaExpandDecider decider(db);
236 set_expansion_scheme(*enquire, option);
237 Xapian::ESet eset(enquire->get_eset(40, tmprset, &decider));
238 string morelike_query;
239 for (auto&& term : eset) {
240 if (!morelike_query.empty()) {
241 if (default_op == Xapian::Query::OP_OR) {
242 morelike_query += ' ';
243 } else {
244 morelike_query += " OR ";
247 morelike_query += pretty_term(term);
249 add_query_string(string(), morelike_query);
251 } else {
252 // add expand/topterms terms if appropriate
253 string expand_terms;
254 if (cgi_params.find("ADD") != cgi_params.end()) {
255 auto g = cgi_params.equal_range("X");
256 for (auto i = g.first; i != g.second; ++i) {
257 const string & v = i->second;
258 if (!v.empty()) {
259 if (!expand_terms.empty())
260 expand_terms += ' ';
261 expand_terms += v;
266 // collect the unprefixed prob fields
267 auto g = cgi_params.equal_range("P");
268 for (auto i = g.first; i != g.second; ++i) {
269 const string & v = i->second;
270 if (!v.empty()) {
271 // If there are expand terms, append them to the first
272 // non-empty P parameter.
273 if (!expand_terms.empty()) {
274 string q = v;
275 q += ' ';
276 q += expand_terms;
277 add_query_string(string(), q);
278 expand_terms = string();
279 } else {
280 add_query_string(string(), v);
285 if (!expand_terms.empty()) {
286 add_query_string(string(), expand_terms);
290 auto begin = cgi_params.lower_bound("P.");
291 auto end = cgi_params.lower_bound("P/"); // '/' is '.' + 1.
292 for (auto i = begin; i != end; ++i) {
293 const string & v = i->second;
294 if (!v.empty()) {
295 string pfx(i->first, 2, string::npos);
296 add_query_string(pfx, v);
300 // set any boolean filters
301 auto g = cgi_params.equal_range("B");
302 if (g.first != g.second) {
303 vector<string> filter_v;
304 for (auto i = g.first; i != g.second; ++i) {
305 const string & v = i->second;
306 // we'll definitely get empty B fields from "-ALL-" options
307 if (!v.empty() && C_isalnum(v[0])) {
308 add_bterm(v);
309 filter_v.push_back(v);
312 sort(filter_v.begin(), filter_v.end());
313 vector<string>::const_iterator j;
314 for (j = filter_v.begin(); j != filter_v.end(); ++j) {
315 const string & bterm = *j;
316 string::size_type e = bterm.find(filter_sep);
317 if (usual(e == string::npos)) {
318 filters += bterm;
319 } else {
320 // If a filter contains filter_sep then double it to escape.
321 // Each filter must start with an alnum (checked above) and
322 // the value after the last filter is the default op, which
323 // is encoded as a non-alnum so filter_sep followed by
324 // something other than filter_sep must be separating filters.
325 string::size_type b = 0;
326 while (e != string::npos) {
327 filters.append(bterm, b, e + 1 - b);
328 b = e;
329 e = bterm.find(filter_sep, b + 1);
331 filters.append(bterm, b, string::npos);
333 filters += filter_sep;
334 old_filters += bterm;
335 old_filters += filter_sep_old;
339 // set any negated boolean filters
340 g = cgi_params.equal_range("N");
341 if (g.first != g.second) {
342 vector<string> filter_v;
343 for (auto i = g.first; i != g.second; ++i) {
344 const string & v = i->second;
345 // we'll definitely get empty N fields from "-ALL-" options
346 if (!v.empty() && C_isalnum(v[0])) {
347 add_nterm(v);
348 filter_v.push_back(v);
351 sort(filter_v.begin(), filter_v.end());
352 vector<string>::const_iterator j;
353 for (j = filter_v.begin(); j != filter_v.end(); ++j) {
354 const string & nterm = *j;
355 string::size_type e = nterm.find(filter_sep);
356 filters += '!';
357 if (usual(e == string::npos)) {
358 filters += nterm;
359 } else {
360 // If a filter contains filter_sep then double it to escape.
361 // Each filter must start with an alnum (checked above) and
362 // the value after the last filter is the default op, which
363 // is encoded as a non-alnum so filter_sep followed by
364 // something other than filter_sep must be separating filters.
365 string::size_type b = 0;
366 while (e != string::npos) {
367 filters.append(nterm, b, e + 1 - b);
368 b = e;
369 e = nterm.find(filter_sep, b + 1);
371 filters.append(nterm, b, string::npos);
373 filters += filter_sep;
374 // old_filters predates 'N' terms, so if there are 'N' terms this
375 // is definitely a different query.
376 old_filters.clear();
380 // date range filters
381 struct date_range {
382 string start, end, span;
384 map<Xapian::valueno, date_range> date_ranges;
385 begin = cgi_params.lower_bound("START.");
386 end = cgi_params.lower_bound("START/"); // '/' is '.' + 1.
387 for (auto i = begin; i != end; ++i) {
388 const string & v = i->second;
389 if (!v.empty()) {
390 Xapian::valueno slot = atoi(i->first.c_str() +
391 CONST_STRLEN("START."));
392 date_ranges[slot].start = v;
395 begin = cgi_params.lower_bound("END.");
396 end = cgi_params.lower_bound("END/"); // '/' is '.' + 1.
397 for (auto i = begin; i != end; ++i) {
398 const string & v = i->second;
399 if (!v.empty()) {
400 Xapian::valueno slot = atoi(i->first.c_str() +
401 CONST_STRLEN("END."));
402 date_ranges[slot].end = v;
405 begin = cgi_params.lower_bound("SPAN.");
406 end = cgi_params.lower_bound("SPAN/"); // '/' is '.' + 1.
407 for (auto i = begin; i != end; ++i) {
408 const string & v = i->second;
409 if (!v.empty()) {
410 Xapian::valueno slot = atoi(i->first.c_str() +
411 CONST_STRLEN("SPAN."));
412 date_ranges[slot].span = v;
415 if (!date_ranges.empty()) {
416 // old_filters predates START.N, END.N and SPAN.N so use of any of
417 // these means this is definitely a different query.
418 old_filters.clear();
420 for (auto i : date_ranges) {
421 auto slot = i.first;
422 auto r = i.second;
423 add_date_filter(r.start, r.end, r.span, slot);
424 filters += '$';
425 filters += str(slot);
426 filters += '$';
427 filters += r.start;
428 filters += '$';
429 filters += r.end;
430 filters += '$';
431 filters += r.span;
434 string date_start, date_end, date_span;
435 val = cgi_params.find("START");
436 if (val != cgi_params.end()) date_start = val->second;
437 val = cgi_params.find("END");
438 if (val != cgi_params.end()) date_end = val->second;
439 val = cgi_params.find("SPAN");
440 if (val != cgi_params.end()) date_span = val->second;
441 val = cgi_params.find("DATEVALUE");
442 Xapian::valueno date_value_slot = Xapian::BAD_VALUENO;
443 if (val != cgi_params.end()) date_value_slot = string_to_int(val->second);
444 add_date_filter(date_start, date_end, date_span, date_value_slot);
446 // If more default_op values are supported, encode them as non-alnums
447 // other than filter_sep, '!' or '$'.
448 filters += (default_op == Xapian::Query::OP_AND ? '.' : '-');
449 filters += date_start;
450 filters += filter_sep;
451 filters += date_end;
452 filters += filter_sep;
453 filters += date_span;
454 if (date_value_slot != Xapian::BAD_VALUENO) {
455 // This means we'll force the first page when reloading or changing
456 // page starting from existing URLs upon upgrade to 1.4.1, but the
457 // exact same existing URL could be for a search without the date
458 // filter where we want to force the first page, so there's an inherent
459 // ambiguity there. Forcing first page in this case seems the least
460 // problematic side-effect.
461 filters += filter_sep;
462 filters += str(date_value_slot);
465 if (!old_filters.empty()) {
466 old_filters += date_start;
467 old_filters += filter_sep_old;
468 old_filters += date_end;
469 old_filters += filter_sep_old;
470 old_filters += date_span;
471 old_filters += (default_op == Xapian::Query::OP_AND ? 'A' : 'O');
474 // Percentage relevance cut-off
475 val = cgi_params.find("THRESHOLD");
476 if (val != cgi_params.end()) {
477 threshold = atoi(val->second.c_str());
478 if (threshold < 0) threshold = 0;
479 if (threshold > 100) threshold = 100;
482 // collapsing
483 val = cgi_params.find("COLLAPSE");
484 if (val != cgi_params.end()) {
485 const string & v = val->second;
486 if (!v.empty()) {
487 collapse_key = atoi(v.c_str());
488 collapse = true;
489 filters += filter_sep;
490 filters += str(collapse_key);
491 if (!old_filters.empty()) {
492 old_filters += filter_sep_old;
493 old_filters += str(collapse_key);
497 if (!collapse && date_value_slot != Xapian::BAD_VALUENO) {
498 // We need to either omit filter_sep for both or neither, or else the
499 // encoding is ambiguous.
500 filters += filter_sep;
503 // docid order
504 val = cgi_params.find("DOCIDORDER");
505 if (val != cgi_params.end()) {
506 const string & v = val->second;
507 if (!v.empty()) {
508 char ch = v[0];
509 if (ch == 'D') {
510 docid_order = Xapian::Enquire::DESCENDING;
511 filters += 'D';
512 if (!old_filters.empty()) old_filters += 'D';
513 } else if (ch != 'A') {
514 docid_order = Xapian::Enquire::DONT_CARE;
515 } else {
516 // This is a bug (should add nothing here and 'X' in the (ch !=
517 // 'A') case, but the current "DONT_CARE" implementation
518 // actually always results in ascending docid order so it's not
519 // worth breaking compatibility to fix - let's just do it next
520 // time we change the encoding $filters uses.
521 filters += 'X';
522 if (!old_filters.empty()) old_filters += 'X';
527 // sorting
528 val = cgi_params.find("SORT");
529 if (val != cgi_params.end()) {
530 const char * base = val->second.c_str();
531 const char * p = base;
532 do {
533 bool rev = (*p != '+');
534 if (*p == '-' || *p == '+') {
535 // old_filters predates support for direction in SORT, so if
536 // there's a direction specified this is definitely a different
537 // query.
538 old_filters.clear();
539 ++p;
541 if (!C_isdigit(*p)) {
542 // Invalid.
543 break;
545 errno = 0;
546 char * q;
547 Xapian::valueno slot = strtoul(p, &q, 10);
548 p = q;
549 if (errno != 0) {
550 // Invalid.
551 break;
554 if (sort_key != Xapian::BAD_VALUENO) {
555 // Multiple sort keys specified, so we need a KeyMaker.
557 // Omit leading '+'.
558 if (reverse_sort) filters += '-';
559 filters += str(sort_key);
561 sort_keymaker = new Xapian::MultiValueKeyMaker;
562 sort_keymaker->add_value(sort_key, !reverse_sort);
563 sort_key = Xapian::BAD_VALUENO;
564 reverse_sort = true;
565 // old_filters predates multiple sort keys, so if there are
566 // multiple sort keys this is definitely a different query.
567 old_filters.clear();
570 if (sort_keymaker) {
571 filters += (rev ? '-' : '+');
572 filters += str(slot);
573 sort_keymaker->add_value(slot, !rev);
574 } else {
575 sort_key = slot;
576 reverse_sort = rev;
578 while (C_isspace(*p) || *p == ',') ++p;
579 } while (*p);
581 val = cgi_params.find("SORTREVERSE");
582 if (val != cgi_params.end() && atoi(val->second.c_str()) != 0) {
583 reverse_sort = !reverse_sort;
586 val = cgi_params.find("SORTAFTER");
587 if (val != cgi_params.end()) {
588 sort_after = (atoi(val->second.c_str()) != 0);
591 // Add the sorting related options to filters too.
593 // Note: old_filters really does encode a reversed sort as 'F' and a
594 // non-reversed sort as 'R' or 'r'.
596 // filters has them the other way around for sanity (except in
597 // development snapshot 1.3.4, which was when the new filter encoding
598 // was introduced).
599 if (!sort_keymaker) filters += str(sort_key);
600 if (!old_filters.empty()) old_filters += str(sort_key);
601 if (sort_after) {
602 if (reverse_sort) {
603 filters += 'R';
604 if (!old_filters.empty()) old_filters += 'F';
605 } else {
606 filters += 'F';
607 if (!old_filters.empty()) old_filters += 'R';
609 } else {
610 if (!reverse_sort) {
611 filters += 'f';
612 if (!old_filters.empty()) old_filters += 'r';
617 if (old_filters.empty()) old_filters = filters;
619 // min_hits (fill mset past topdoc+(hits_per_page+1) to
620 // topdoc+max(hits_per_page+1,min_hits)
621 val = cgi_params.find("MINHITS");
622 if (val != cgi_params.end()) {
623 min_hits = atol(val->second.c_str());
626 parse_omegascript();
627 } catch (const Xapian::Error &e) {
628 if (!set_content_type && !suppress_http_headers)
629 cout << "Content-Type: text/html\n\n";
630 cout << "Exception: " << html_escape(e.get_description()) << endl;
631 } catch (const std::exception &e) {
632 if (!set_content_type && !suppress_http_headers)
633 cout << "Content-Type: text/html\n\n";
634 cout << "Exception: std::exception " << html_escape(e.what()) << endl;
635 } catch (const string &s) {
636 if (!set_content_type && !suppress_http_headers)
637 cout << "Content-Type: text/html\n\n";
638 cout << "Exception: " << html_escape(s) << endl;
639 } catch (const char *s) {
640 if (!set_content_type && !suppress_http_headers)
641 cout << "Content-Type: text/html\n\n";
642 cout << "Exception: " << html_escape(s) << endl;
643 } catch (...) {
644 if (!set_content_type && !suppress_http_headers)
645 cout << "Content-Type: text/html\n\n";
646 cout << "Caught unknown exception" << endl;