Fix bug in PL2+ implementation
[xapian.git] / xapian-applications / omega / omega.cc
blobf74e305554db275419a12ed03caa23e7bef56a80
1 /** @file
2 * @brief Main module for omega (example CGI frontend for Xapian)
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2024 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
25 #include <config.h>
27 #include <cerrno>
28 #include <cstdio>
29 #include <ctime>
31 #include <algorithm>
32 #include <cstring>
33 #include <iostream>
34 #include <set>
36 #include "safefcntl.h"
37 #include "safeunistd.h"
39 #include "omega.h"
40 #include "utils.h"
41 #include "cgiparam.h"
42 #include "query.h"
43 #include "str.h"
44 #include "stringutils.h"
45 #include "expand.h"
46 #include "parseint.h"
48 using namespace std;
// Value main() stores in option["stemmer"] before the request is decoded.
50 static const char DEFAULT_STEM_LANGUAGE[] = "english";
52 // A character which doesn't require URL encoding, and isn't likely to appear
53 // in filter values.
54 const char filter_sep = '~';
// Created by main() over `db`; main() sets it to NULL instead if opening the
// database(s) throws Xapian::Error.
56 Xapian::Enquire * enquire;
// Combined database - add_database() adds each opened database to this.
57 Xapian::Database db;
// Named option values.  main() seeds "flag_default", "decimal", "thousand"
// and "stemmer" here.
59 map<string, string> option;
// main()'s exception handlers only emit their own Content-Type header when
// both this and suppress_http_headers are false.
61 bool set_content_type = false;
// Set by main() when the SERVER_PROTOCOL environment variable is "INCLUDED".
63 bool suppress_http_headers = false;
// '/'-separated list of the database names passed to add_database().
65 string dbname;
// Taken from the FMT parameter by main(), falling back to default_template.
66 string fmtname;
// Serialised filter settings in the current format and in the older format
// (both formats are described in a comment inside main()).
67 string filters, old_filters;
// From the HITSPERPAGE parameter; main() maps 0 to 10 and caps it at 1000.
69 Xapian::docid hits_per_page = 0;
// From the MINHITS parameter.
70 Xapian::docid min_hits = 0;
72 // percentage cut-off
73 int threshold = 0;
// Allocated by main() only when the SORT parameter lists more than one key.
75 Xapian::MultiValueKeyMaker* sort_keymaker = NULL;
76 Xapian::valueno sort_key = Xapian::BAD_VALUENO; // Don't sort.
// Direction of a single-key sort; toggled by a non-zero SORTREVERSE.
77 bool reverse_sort = true;
// From the SORTAFTER parameter.
78 bool sort_after = false;
// From the DOCIDORDER parameter.
79 Xapian::Enquire::docid_order docid_order = Xapian::Enquire::ASCENDING;
// Value slot from the COLLAPSE parameter; only meaningful when `collapse` is
// true.
81 Xapian::valueno collapse_key = 0;
82 bool collapse = false;
84 static string
85 map_dbname_to_dir(const string &database_name)
87 return database_dir + database_name;
90 static void
91 add_database(const string& this_dbname)
93 if (!dbname.empty()) dbname += '/';
94 dbname += this_dbname;
96 Xapian::Database this_db(map_dbname_to_dir(this_dbname));
97 db.add_database(this_db);
99 size_t this_db_size = this_db.size();
100 size_t db_size = db.size();
101 size_t i = 0;
102 while (subdbs.size() != db_size) {
103 subdbs.emplace_back(this_dbname, i++, this_db_size);
107 // Get database(s) to search.
108 template<typename IT>
109 void
110 parse_db_params(const pair<IT, IT>& dbs)
112 dbname.resize(0);
113 // Only add a repeated db once.
114 set<string> seen;
115 for (auto i = dbs.first; i != dbs.second; ++i) {
116 const string& v = i->second;
117 if (v.empty()) continue;
118 size_t p = 0, q;
119 while (true) {
120 q = v.find('/', p);
121 string s(v, p, q - p);
122 if (!s.empty() && seen.find(s) == seen.end()) {
123 add_database(s);
124 seen.insert(s);
126 if (q == string::npos) break;
127 p = q + 1;
132 #define FILTER_CODE \
133 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_-"
135 template<typename T>
136 static void
137 filters_encode_uint(T v)
139 do {
140 if (v >= 64)
141 filters += ' ';
142 filters += FILTER_CODE[v & 63];
143 v >>= 6;
144 } while (v);
147 static void
148 filters_append(const string& bterm, const string* prev)
150 auto reuse = prev ? common_prefix_length(*prev, bterm) : 0u;
151 if (prev)
152 filters_encode_uint(reuse);
153 filters_encode_uint(bterm.size() - reuse);
154 filters.append(bterm, reuse);
156 auto e = bterm.find(filter_sep);
157 if (usual(e == string::npos)) {
158 old_filters += bterm;
159 } else {
160 // For old_filters, we don't try to reuse part of the previous term,
161 // and if a filter contains filter_sep then we double it to escape.
162 // Each filter must start with an alnum (checked before we get called)
163 // and the value after the last filter is the default op, which is
164 // encoded as a non-alnum so filter_sep followed by something other
165 // than filter_sep must be separating filters.
166 string::size_type b = 0;
167 while (e != string::npos) {
168 old_filters.append(bterm, b, e + 1 - b);
169 b = e;
170 e = bterm.find(filter_sep, b + 1);
172 old_filters.append(bterm, b, string::npos);
174 old_filters += filter_sep;
177 int main(int argc, char *argv[])
178 try {
180 // Check for SERVER_PROTOCOL=INCLUDED, which is set when we're being
181 // included in a page via a server-side include directive. In this
182 // case we suppress sending a Content-Type: header.
183 const char* p = getenv("SERVER_PROTOCOL");
184 if (p && strcmp(p, "INCLUDED") == 0) {
185 suppress_http_headers = true;
189 read_config_file();
191 option["flag_default"] = "true";
193 // set default thousands and decimal separators: e.g. "16,729 hits" "1.4K"
194 option["decimal"] = ".";
195 option["thousand"] = ",";
197 // set the default stemming language
198 option["stemmer"] = DEFAULT_STEM_LANGUAGE;
200 // FIXME: set cout to linebuffered not stdout. Or just flush regularly...
201 // setvbuf(stdout, NULL, _IOLBF, 0);
203 const char * method = getenv("REQUEST_METHOD");
204 if (method == NULL) {
205 if (argc > 1 && (argv[1][0] != '-' || strchr(argv[1], '='))) {
206 // omega 'P=information retrieval' DB=papers
207 // check for a leading '-' on the first arg so "omega --version",
208 // "omega --help", and similar take the next branch
209 decode_argv(argv + 1);
210 } else {
211 // Seems we're running from the command line so give version
212 // and allow a query to be entered for testing
213 cout << PROGRAM_NAME " - " PACKAGE " " VERSION "\n";
214 if (argc > 1) exit(0);
215 cout << "Enter NAME=VALUE lines, end with blank line\n";
216 decode_test();
218 } else {
219 if (*method == 'P')
220 decode_post();
221 else
222 decode_get();
225 try {
226 parse_db_params(cgi_params.equal_range("DB"));
227 if (dbname.empty()) {
228 add_database(default_db);
230 enquire = new Xapian::Enquire(db);
231 } catch (const Xapian::Error &) {
232 enquire = NULL;
233 db = Xapian::Database();
236 hits_per_page = 0;
237 auto val = cgi_params.find("HITSPERPAGE");
238 if (val != cgi_params.end()) {
239 if (!parse_unsigned(val->second.c_str(), hits_per_page)) {
240 throw "HITSPERPAGE parameter must be >= 0";
243 if (hits_per_page == 0) {
244 hits_per_page = 10;
245 } else if (hits_per_page > 1000) {
246 hits_per_page = 1000;
249 val = cgi_params.find("DEFAULTOP");
250 if (val != cgi_params.end()) {
251 const string & v = val->second;
252 if (v == "OR" || v == "or")
253 default_op = Xapian::Query::OP_OR;
256 val = cgi_params.find("FMT");
257 if (val != cgi_params.end()) {
258 const string & v = val->second;
259 if (!v.empty()) fmtname = v;
261 if (fmtname.empty())
262 fmtname = default_template;
264 auto ml = cgi_params.equal_range("MORELIKE");
265 if (enquire && ml.first != ml.second) {
266 Xapian::RSet tmprset;
267 for (auto i = ml.first; i != ml.second; ++i) {
268 const string& v = i->second;
269 Xapian::docid docid = atol(v.c_str());
270 if (docid == 0) {
271 // Assume it's MORELIKE=Quid1138 and that Quid1138 is a UID
272 // from an external source - we just find the correspond docid.
273 Xapian::PostingIterator p = db.postlist_begin(v);
274 if (p != db.postlist_end(v)) docid = *p;
276 if (docid != 0) {
277 tmprset.add_document(docid);
281 if (!tmprset.empty()) {
282 OmegaExpandDecider decider(db);
283 set_expansion_scheme(*enquire, option);
284 Xapian::ESet eset(enquire->get_eset(40, tmprset, &decider));
285 string morelike_query;
286 for (auto&& term : eset) {
287 if (!morelike_query.empty()) {
288 if (default_op == Xapian::Query::OP_OR) {
289 morelike_query += ' ';
290 } else {
291 morelike_query += " OR ";
294 morelike_query += pretty_term(term);
296 add_query_string(string(), morelike_query);
298 } else {
299 // add expand/topterms terms if appropriate
300 string expand_terms;
301 if (cgi_params.find("ADD") != cgi_params.end()) {
302 auto g = cgi_params.equal_range("X");
303 for (auto i = g.first; i != g.second; ++i) {
304 const string & v = i->second;
305 if (!v.empty()) {
306 if (!expand_terms.empty())
307 expand_terms += ' ';
308 expand_terms += v;
313 // collect the unprefixed prob fields
314 auto g = cgi_params.equal_range("P");
315 for (auto i = g.first; i != g.second; ++i) {
316 const string & v = i->second;
317 if (!v.empty()) {
318 // If there are expand terms, append them to the first
319 // non-empty P parameter.
320 if (!expand_terms.empty()) {
321 string q = v;
322 q += ' ';
323 q += expand_terms;
324 add_query_string(string(), q);
325 expand_terms = string();
326 } else {
327 add_query_string(string(), v);
332 if (!expand_terms.empty()) {
333 add_query_string(string(), expand_terms);
337 auto begin = cgi_params.lower_bound("P.");
338 auto end = cgi_params.lower_bound("P/"); // '/' is '.' + 1.
339 for (auto i = begin; i != end; ++i) {
340 const string & v = i->second;
341 if (!v.empty()) {
342 string pfx(i->first, 2, string::npos);
343 add_query_string(pfx, v);
347 // set any boolean filters
348 auto g = cgi_params.equal_range("B");
349 if (g.first != g.second) {
350 vector<string> filter_v;
351 for (auto i = g.first; i != g.second; ++i) {
352 const string & v = i->second;
353 // we'll definitely get empty B fields from "-ALL-" options
354 if (!v.empty() && C_isalnum(v[0])) {
355 add_bterm(v);
356 filter_v.push_back(v);
359 sort(filter_v.begin(), filter_v.end());
360 const string* prev = NULL;
361 for (const string& bterm : filter_v) {
362 filters_append(bterm, prev);
363 prev = &bterm;
367 // Current filters format:
369 // [<encoded length><boolean filter term>]*
370 // ['!'[<encoded length><negated boolean filter term>]*]?
371 // ['.'<collapse key>]?
372 // ['$'<encoded date range slot (omitted for term-based)>?
373 // ['!'<date start>]?
374 // ['.'<date end>]?
375 // ['~'<date span>]?
376 // [['-'?<sort key>[['-'|'+']<sort key>]+]|<sort key>|]?
377 // <encoded integer of default_op, docid_order, sort_after, sort_reverse>
379 // (filter terms in ascending byte sorted order, and with second and
380 // subsequent actually stored as <reuse character><tail>)
382 // old_filters format:
384 // [<boolean filter term with any '~' escaped to '~~'>'~']*
385 // ['!'<negated boolean filter term with any '~' escaped to '~~'>'~']*
386 // ['$'<date range slot>'$'<date start>'$'<date end>'$'<date span>]*
387 // ['.'|'-'] ; default_op AND vs OR
388 // <date start>'~'<date end>'~'<date span>['~'<date value slot>]?
389 // ['~'<collapse key>]? ; present if <collapse key> non-empty or
390 // ; previous element present
391 // ['D'|'X']? ; 'D' for docid_order DESCENDING; 'X' for DONT_CARE.
392 // [['-'?<sort key>[['-'|'+']<sort key>]+]|<sort key>]? ['R'|'F'|'f']?
394 // (filter terms in ascending byte sorted order)
396 // set any negated boolean filters
397 g = cgi_params.equal_range("N");
398 if (g.first != g.second) {
399 vector<string> filter_v;
400 for (auto i = g.first; i != g.second; ++i) {
401 const string & v = i->second;
402 // we'll definitely get empty N fields from "-ALL-" options
403 if (!v.empty() && C_isalnum(v[0])) {
404 add_nterm(v);
405 filter_v.push_back(v);
408 if (!filter_v.empty()) {
409 filters += '!';
410 sort(filter_v.begin(), filter_v.end());
411 const string* prev = NULL;
412 for (const string& nterm : filter_v) {
413 old_filters += '!';
414 filters_append(nterm, prev);
415 prev = &nterm;
420 // collapsing
421 val = cgi_params.find("COLLAPSE");
422 if (val != cgi_params.end()) {
423 const string& v = val->second;
424 if (!v.empty()) {
425 if (!parse_unsigned(val->second.c_str(), collapse_key)) {
426 throw "COLLAPSE parameter must be >= 0";
428 collapse = true;
429 filters += '.';
430 filters_encode_uint(collapse_key);
434 // date range filters
435 struct date_range {
436 string start, end, span;
438 map<Xapian::valueno, date_range> date_ranges;
439 begin = cgi_params.lower_bound("START.");
440 end = cgi_params.lower_bound("START/"); // '/' is '.' + 1.
441 for (auto i = begin; i != end; ++i) {
442 const string & v = i->second;
443 if (!v.empty()) {
444 Xapian::valueno slot;
445 if (!parse_unsigned(i->first.c_str() +
446 CONST_STRLEN("START."), slot)) {
447 throw "START slot value must be >= 0";
449 date_ranges[slot].start = v;
452 begin = cgi_params.lower_bound("END.");
453 end = cgi_params.lower_bound("END/"); // '/' is '.' + 1.
454 for (auto i = begin; i != end; ++i) {
455 const string & v = i->second;
456 if (!v.empty()) {
457 Xapian::valueno slot;
458 if (!parse_unsigned(i->first.c_str() +
459 CONST_STRLEN("END."), slot)) {
460 throw "END slot value must be >= 0";
462 date_ranges[slot].end = v;
465 begin = cgi_params.lower_bound("SPAN.");
466 end = cgi_params.lower_bound("SPAN/"); // '/' is '.' + 1.
467 for (auto i = begin; i != end; ++i) {
468 const string & v = i->second;
469 if (!v.empty()) {
470 Xapian::valueno slot;
471 if (!parse_unsigned(i->first.c_str() +
472 CONST_STRLEN("SPAN."), slot)) {
473 throw "SPAN slot value must be >= 0";
475 date_ranges[slot].span = v;
479 string date_start, date_end, date_span;
480 val = cgi_params.find("START");
481 if (val != cgi_params.end()) {
482 date_start = val->second;
484 val = cgi_params.find("END");
485 if (val != cgi_params.end()) {
486 date_end = val->second;
488 val = cgi_params.find("SPAN");
489 if (val != cgi_params.end()) {
490 date_span = val->second;
492 val = cgi_params.find("DATEVALUE");
493 Xapian::valueno date_value_slot = Xapian::BAD_VALUENO;
494 if (val != cgi_params.end() &&
495 !parse_unsigned(val->second.c_str(), date_value_slot)) {
496 throw "DATEVALUE slot must be >= 0";
498 if (date_value_slot != Xapian::BAD_VALUENO ||
499 !date_start.empty() ||
500 !date_end.empty() ||
501 !date_span.empty()) {
502 // Process DATEVALUE=n and associated values unless we saw START.n=...
503 // or END.n=... or SPAN.n=...
504 date_ranges.emplace(date_value_slot,
505 date_range{date_start, date_end, date_span});
507 for (auto i : date_ranges) {
508 auto slot = i.first;
509 auto r = i.second;
510 add_date_filter(r.start, r.end, r.span, slot);
511 filters += '$';
512 if (slot != Xapian::BAD_VALUENO) {
513 filters_encode_uint(slot);
514 if (slot != date_value_slot) {
515 old_filters += '$';
516 old_filters += str(slot);
517 old_filters += '$';
518 old_filters += r.start;
519 old_filters += '$';
520 old_filters += r.end;
521 old_filters += '$';
522 old_filters += r.span;
525 if (!r.start.empty()) {
526 filters += '!';
527 filters += r.start;
529 if (!r.end.empty()) {
530 filters += '.';
531 filters += r.end;
533 if (!r.span.empty()) {
534 filters += '~';
535 filters += r.span;
539 old_filters += (default_op == Xapian::Query::OP_AND ? '.' : '-');
540 old_filters += date_start;
541 old_filters += filter_sep;
542 old_filters += date_end;
543 old_filters += filter_sep;
544 old_filters += date_span;
545 if (date_value_slot != Xapian::BAD_VALUENO) {
546 // This means we'll force the first page when reloading or changing
547 // page starting from existing URLs upon upgrade to 1.4.1, but the
548 // exact same existing URL could be for a search without the date
549 // filter where we want to force the first page, so there's an inherent
550 // ambiguity there. Forcing first page in this case seems the least
551 // problematic side-effect.
552 old_filters += filter_sep;
553 old_filters += str(date_value_slot);
556 // Percentage relevance cut-off
557 val = cgi_params.find("THRESHOLD");
558 if (val != cgi_params.end()) {
559 unsigned int temp;
560 if (val->second[0] == '-') {
561 if (!parse_unsigned(val->second.c_str() + 1, temp)) {
562 throw "THRESHOLD parameter must be an integer";
564 threshold = 0;
565 } else if (!parse_unsigned(val->second.c_str(), temp)) {
566 throw "THRESHOLD parameter must be an integer";
568 if (temp > 100) {
569 threshold = 100;
570 } else {
571 threshold = temp;
575 if (collapse) {
576 old_filters += filter_sep;
577 old_filters += str(collapse_key);
578 } else if (date_value_slot != Xapian::BAD_VALUENO) {
579 // We need to either omit filter_sep for both or neither, or else the
580 // encoding is ambiguous.
581 old_filters += filter_sep;
584 // docid order
585 val = cgi_params.find("DOCIDORDER");
586 if (val != cgi_params.end()) {
587 const string & v = val->second;
588 if (!v.empty()) {
589 char ch = v[0];
590 if (ch == 'D') {
591 docid_order = Xapian::Enquire::DESCENDING;
592 old_filters += 'D';
593 } else if (ch != 'A') {
594 docid_order = Xapian::Enquire::DONT_CARE;
595 } else {
596 // This was a bug in the 1.4.x filter encoding (we should have
597 // added nothing here and 'X' in the `ch != 'A'` case), but the
598 // current "DONT_CARE" implementation actually always results
599 // in ascending docid order so it wasn't worth breaking
600 // compatibility in a stable release series to fix.
601 old_filters += 'X';
606 // sorting
607 val = cgi_params.find("SORT");
608 if (val != cgi_params.end()) {
609 const char * base = val->second.c_str();
610 const char * p = base;
611 do {
612 bool rev = (*p != '+');
613 if (*p == '-' || *p == '+') {
614 ++p;
616 if (!C_isdigit(*p)) {
617 // Invalid.
618 break;
620 errno = 0;
621 char * q;
622 Xapian::valueno slot = strtoul(p, &q, 10);
623 p = q;
624 if (errno != 0) {
625 // Invalid.
626 break;
629 if (sort_key != Xapian::BAD_VALUENO) {
630 // Multiple sort keys specified, so we need a KeyMaker.
632 // Omit leading '+'.
633 if (reverse_sort) {
634 filters += '-';
635 old_filters += '-';
637 filters_encode_uint(sort_key);
638 old_filters += str(sort_key);
640 sort_keymaker = new Xapian::MultiValueKeyMaker;
641 sort_keymaker->add_value(sort_key, !reverse_sort);
642 sort_key = Xapian::BAD_VALUENO;
643 reverse_sort = true;
646 if (sort_keymaker) {
647 filters += (rev ? '-' : '+');
648 old_filters += (rev ? '-' : '+');
649 filters_encode_uint(slot);
650 old_filters += str(slot);
651 sort_keymaker->add_value(slot, !rev);
652 } else {
653 sort_key = slot;
654 reverse_sort = rev;
656 while (C_isspace(*p) || *p == ',') ++p;
657 } while (*p);
659 val = cgi_params.find("SORTREVERSE");
660 if (val != cgi_params.end()) {
661 unsigned int temp;
662 if (!parse_unsigned(val->second.c_str(), temp)) {
663 throw "SORTREVERSE parameter must be >= 0";
665 if (temp != 0) {
666 reverse_sort = !reverse_sort;
669 val = cgi_params.find("SORTAFTER");
670 if (val != cgi_params.end()) {
671 unsigned int temp;
672 if (!parse_unsigned(val->second.c_str(), temp)) {
673 throw "SORTAFTER parameter must be >= 0";
675 sort_after = bool(temp);
678 // Add the sorting related options to filters too.
679 if (!sort_keymaker) {
680 filters_encode_uint(sort_key);
681 old_filters += str(sort_key);
683 if (sort_after) {
684 if (reverse_sort) {
685 old_filters += 'R';
686 } else {
687 old_filters += 'F';
689 } else {
690 if (!reverse_sort) {
691 old_filters += 'f';
697 // Encode default_op, docid_order, reverse_sort and sort_after together
698 // in a single character.
699 unsigned v = 0;
700 switch (default_op) {
701 case Xapian::Query::OP_AND:
702 break;
703 case Xapian::Query::OP_OR:
704 v = 0x01 * 12;
705 break;
706 default:
707 // Additional supported value should encode as:
708 // 0x02 * 12
709 // 0x03 * 12
710 // ...
711 break;
713 v |= 0x04 * static_cast<unsigned>(docid_order);
714 if (reverse_sort) v |= 0x01;
715 if (sort_after) v |= 0x02;
716 filters_encode_uint(v);
719 // min_hits (fill mset past topdoc+(hits_per_page+1) to
720 // topdoc+max(hits_per_page+1,min_hits)
721 val = cgi_params.find("MINHITS");
722 if (val != cgi_params.end()) {
723 if (!parse_unsigned(val->second.c_str(), min_hits)) {
724 throw "MINHITS parameter must be >= 0";
728 parse_omegascript();
729 } catch (const Xapian::Error &e) {
730 if (!set_content_type && !suppress_http_headers)
731 cout << "Content-Type: text/html\n\n";
732 cout << "Exception: " << html_escape(e.get_description()) << endl;
733 } catch (const std::exception &e) {
734 if (!set_content_type && !suppress_http_headers)
735 cout << "Content-Type: text/html\n\n";
736 cout << "Exception: std::exception " << html_escape(e.what()) << endl;
737 } catch (const string &s) {
738 if (!set_content_type && !suppress_http_headers)
739 cout << "Content-Type: text/html\n\n";
740 cout << "Exception: " << html_escape(s) << endl;
741 } catch (const char *s) {
742 if (!set_content_type && !suppress_http_headers)
743 cout << "Content-Type: text/html\n\n";
744 cout << "Exception: " << html_escape(s) << endl;
745 } catch (...) {
746 if (!set_content_type && !suppress_http_headers)
747 cout << "Content-Type: text/html\n\n";
748 cout << "Caught unknown exception" << endl;