2 * @brief Main module for omega (example CGI frontend for Xapian)
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002-2024 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
36 #include "safefcntl.h"
37 #include "safeunistd.h"
44 #include "stringutils.h"
// Stemming language that main() installs as the default "stemmer" option.
50 static const char DEFAULT_STEM_LANGUAGE
[] = "english";
52 // A character which doesn't require URL encoding, and isn't likely to appear
// filter_sep is appended to old_filters as a separator (see filters_append()
// and main()).
54 const char filter_sep
= '~';
// Enquire object; main() allocates it with new once the database(s) have been
// added, and tests it for null before handling MORELIKE.
56 Xapian::Enquire
* enquire
;
// Named option values; main() seeds defaults such as "flag_default",
// "decimal", "thousand" and "stemmer".
59 map
<string
, string
> option
;
// The exception handlers in main() emit a Content-Type header only when both
// set_content_type and suppress_http_headers are false.
61 bool set_content_type
= false;
// Set to true by main() when SERVER_PROTOCOL is "INCLUDED".
63 bool suppress_http_headers
= false;
// Encoded filter state accumulated by filters_append() and main(): filters
// holds the current format and old_filters the older format (both formats
// are described in comments inside main()).
67 string filters
, old_filters
;
// Parsed from the HITSPERPAGE parameter; main() caps it at 1000.
69 Xapian::docid hits_per_page
= 0;
// Parsed from the MINHITS parameter.
70 Xapian::docid min_hits
= 0;
// Allocated by main() when the SORT parameter specifies multiple sort keys;
// NULL otherwise.
75 Xapian::MultiValueKeyMaker
* sort_keymaker
= NULL
;
76 Xapian::valueno sort_key
= Xapian::BAD_VALUENO
; // Don't sort.
// Sort direction flag; main() may invert it while handling SORTREVERSE.
77 bool reverse_sort
= true;
// Set from the SORTAFTER parameter.
78 bool sort_after
= false;
// Set from the DOCIDORDER parameter.
79 Xapian::Enquire::docid_order docid_order
= Xapian::Enquire::ASCENDING
;
// Parsed from the COLLAPSE parameter.
81 Xapian::valueno collapse_key
= 0;
82 bool collapse
= false;
// Return database_dir with database_name appended.
85 map_dbname_to_dir(const string
&database_name
)
87 return database_dir
+ database_name
;
// Add the database named this_dbname to the set being searched: append the
// name to dbname, open it via map_dbname_to_dir() and add it to db, then
// extend subdbs until it has db.size() entries.
91 add_database(const string
& this_dbname
)
// Successive names in dbname are separated by '/'.
93 if (!dbname
.empty()) dbname
+= '/';
94 dbname
+= this_dbname
;
96 Xapian::Database
this_db(map_dbname_to_dir(this_dbname
));
97 db
.add_database(this_db
);
99 size_t this_db_size
= this_db
.size();
100 size_t db_size
= db
.size();
// Each new subdbs entry is constructed from the name, a running index i and
// the size of the database just opened.
// NOTE(review): the declaration of i is not visible in this view.
102 while (subdbs
.size() != db_size
) {
103 subdbs
.emplace_back(this_dbname
, i
++, this_db_size
);
107 // Get database(s) to search.
// dbs is an iterator range; each element's ->second is one parameter value.
// NOTE(review): the lines which compute p and q, declare seen, and act on a
// newly seen name are not visible in this view.
108 template<typename IT
>
110 parse_db_params(const pair
<IT
, IT
>& dbs
)
113 // Only add a repeated db once.
115 for (auto i
= dbs
.first
; i
!= dbs
.second
; ++i
) {
116 const string
& v
= i
->second
;
// Empty values are skipped.
117 if (v
.empty()) continue;
// s is the piece of v starting at offset p with length q - p.
121 string
s(v
, p
, q
- p
);
// Only act on s if it's non-empty and not already present in seen.
122 if (!s
.empty() && seen
.find(s
) == seen
.end()) {
// Stop processing this value once q is npos.
126 if (q
== string::npos
) break;
// 64-character alphabet; filters_encode_uint() indexes it with (v & 63).
132 #define FILTER_CODE \
133 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_-"
// Append an encoding of v to filters using characters from FILTER_CODE.
// NOTE(review): only one statement of the body is visible in this view, so
// how values of 64 or more are handled can't be confirmed from here.
137 filters_encode_uint(T v
)
// Emit the FILTER_CODE character for the low 6 bits of v.
142 filters
+= FILTER_CODE
[v
& 63];
// Append the filter term bterm to both filters and old_filters.
//
// prev, if non-NULL, points to the previous term; the length of the prefix
// it shares with bterm is encoded in filters rather than repeating those
// bytes.
148 filters_append(const string
& bterm
, const string
* prev
)
// Number of leading bytes shared with the previous term (0 if there is none).
150 auto reuse
= prev
? common_prefix_length(*prev
, bterm
) : 0u;
// filters gets: <reuse length><length of remainder><remainder of bterm>.
152 filters_encode_uint(reuse
);
153 filters_encode_uint(bterm
.size() - reuse
);
154 filters
.append(bterm
, reuse
);
// Look for filter_sep in the term, since it needs escaping in old_filters.
156 auto e
= bterm
.find(filter_sep
);
157 if (usual(e
== string::npos
)) {
// Usual case: no filter_sep in the term, so it is appended unchanged.
158 old_filters
+= bterm
;
160 // For old_filters, we don't try to reuse part of the previous term,
161 // and if a filter contains filter_sep then we double it to escape.
162 // Each filter must start with an alnum (checked before we get called)
163 // and the value after the last filter is the default op, which is
164 // encoded as a non-alnum so filter_sep followed by something other
165 // than filter_sep must be separating filters.
166 string::size_type b
= 0;
167 while (e
!= string::npos
) {
// Copy bterm[b, e] - i.e. up to and including the filter_sep at e.
// NOTE(review): the line which updates b is not visible in this view.
168 old_filters
.append(bterm
, b
, e
+ 1 - b
);
170 e
= bterm
.find(filter_sep
, b
+ 1);
// Copy whatever follows the last filter_sep found.
172 old_filters
.append(bterm
, b
, string::npos
);
// Terminate this term in old_filters.
174 old_filters
+= filter_sep
;
// Program entry point: obtain the CGI parameters, open the database(s), and
// translate the parameters into query strings, filters and sort settings,
// recording the filter state in filters and old_filters as it goes.
// Exceptions are reported by the handlers at the end of the function.
// NOTE(review): many lines of this function are not visible in this view;
// the comments added here describe only what the visible lines show.
177 int main(int argc
, char *argv
[])
180 // Check for SERVER_PROTOCOL=INCLUDED, which is set when we're being
181 // included in a page via a server-side include directive. In this
182 // case we suppress sending a Content-Type: header.
183 const char* p
= getenv("SERVER_PROTOCOL");
184 if (p
&& strcmp(p
, "INCLUDED") == 0) {
185 suppress_http_headers
= true;
191 option
["flag_default"] = "true";
193 // set default thousands and decimal separators: e.g. "16,729 hits" "1.4K"
194 option
["decimal"] = ".";
195 option
["thousand"] = ",";
197 // set the default stemming language
198 option
["stemmer"] = DEFAULT_STEM_LANGUAGE
;
200 // FIXME: set cout to linebuffered not stdout. Or just flush regularly...
201 // setvbuf(stdout, NULL, _IOLBF, 0);
// REQUEST_METHOD being unset means we aren't being run as a CGI.
203 const char * method
= getenv("REQUEST_METHOD");
204 if (method
== NULL
) {
205 if (argc
> 1 && (argv
[1][0] != '-' || strchr(argv
[1], '='))) {
206 // omega 'P=information retrieval' DB=papers
207 // check for a leading '-' on the first arg so "omega --version",
208 // "omega --help", and similar take the next branch
209 decode_argv(argv
+ 1);
211 // Seems we're running from the command line so give version
212 // and allow a query to be entered for testing
213 cout
<< PROGRAM_NAME
" - " PACKAGE
" " VERSION
"\n";
214 if (argc
> 1) exit(0);
215 cout
<< "Enter NAME=VALUE lines, end with blank line\n";
// Open the databases named by the DB parameter(s), falling back to
// default_db if none were added.
226 parse_db_params(cgi_params
.equal_range("DB"));
227 if (dbname
.empty()) {
228 add_database(default_db
);
230 enquire
= new Xapian::Enquire(db
);
// On a Xapian error here, db is reset to a default-constructed Database.
231 } catch (const Xapian::Error
&) {
233 db
= Xapian::Database();
// HITSPERPAGE must parse as an unsigned integer.
237 auto val
= cgi_params
.find("HITSPERPAGE");
238 if (val
!= cgi_params
.end()) {
239 if (!parse_unsigned(val
->second
.c_str(), hits_per_page
)) {
240 throw "HITSPERPAGE parameter must be >= 0";
// Zero is handled specially (not visible in this view); values above 1000
// are clamped to 1000.
243 if (hits_per_page
== 0) {
245 } else if (hits_per_page
> 1000) {
246 hits_per_page
= 1000;
// DEFAULTOP of "OR" or "or" selects OP_OR as the default operator.
249 val
= cgi_params
.find("DEFAULTOP");
250 if (val
!= cgi_params
.end()) {
251 const string
& v
= val
->second
;
252 if (v
== "OR" || v
== "or")
253 default_op
= Xapian::Query::OP_OR
;
// A non-empty FMT value is stored in fmtname.
256 val
= cgi_params
.find("FMT");
257 if (val
!= cgi_params
.end()) {
258 const string
& v
= val
->second
;
259 if (!v
.empty()) fmtname
= v
;
262 fmtname
= default_template
;
// MORELIKE: collect the specified documents in an RSet, then build a query
// string from the terms of the resulting ESet.
264 auto ml
= cgi_params
.equal_range("MORELIKE");
265 if (enquire
&& ml
.first
!= ml
.second
) {
266 Xapian::RSet tmprset
;
267 for (auto i
= ml
.first
; i
!= ml
.second
; ++i
) {
268 const string
& v
= i
->second
;
// First interpret the value as a numeric docid.
269 Xapian::docid docid
= atol(v
.c_str());
271 // Assume it's MORELIKE=Quid1138 and that Quid1138 is a UID
272 // from an external source - we just find the corresponding docid.
273 Xapian::PostingIterator p
= db
.postlist_begin(v
);
274 if (p
!= db
.postlist_end(v
)) docid
= *p
;
277 tmprset
.add_document(docid
);
281 if (!tmprset
.empty()) {
282 OmegaExpandDecider
decider(db
);
283 set_expansion_scheme(*enquire
, option
);
284 Xapian::ESet
eset(enquire
->get_eset(40, tmprset
, &decider
));
285 string morelike_query
;
286 for (auto&& term
: eset
) {
// Join terms with a space if the default operator is OR, otherwise with an
// explicit " OR ".
287 if (!morelike_query
.empty()) {
288 if (default_op
== Xapian::Query::OP_OR
) {
289 morelike_query
+= ' ';
291 morelike_query
+= " OR ";
294 morelike_query
+= pretty_term(term
);
296 add_query_string(string(), morelike_query
);
299 // add expand/topterms terms if appropriate
301 if (cgi_params
.find("ADD") != cgi_params
.end()) {
302 auto g
= cgi_params
.equal_range("X");
303 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
304 const string
& v
= i
->second
;
306 if (!expand_terms
.empty())
313 // collect the unprefixed prob fields
314 auto g
= cgi_params
.equal_range("P");
315 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
316 const string
& v
= i
->second
;
318 // If there are expand terms, append them to the first
319 // non-empty P parameter.
320 if (!expand_terms
.empty()) {
324 add_query_string(string(), q
);
325 expand_terms
= string();
327 add_query_string(string(), v
);
// Any expand terms still present at this point are added as a query string
// of their own.
332 if (!expand_terms
.empty()) {
333 add_query_string(string(), expand_terms
);
// Prefixed prob fields: parameters named "P.<prefix>".  The half-open range
// ["P.", "P/") covers exactly the keys starting with "P.".
337 auto begin
= cgi_params
.lower_bound("P.");
338 auto end
= cgi_params
.lower_bound("P/"); // '/' is '.' + 1.
339 for (auto i
= begin
; i
!= end
; ++i
) {
340 const string
& v
= i
->second
;
// The prefix is the parameter name with the leading "P." removed.
342 string
pfx(i
->first
, 2, string::npos
);
343 add_query_string(pfx
, v
);
347 // set any boolean filters
348 auto g
= cgi_params
.equal_range("B");
349 if (g
.first
!= g
.second
) {
350 vector
<string
> filter_v
;
351 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
352 const string
& v
= i
->second
;
353 // we'll definitely get empty B fields from "-ALL-" options
354 if (!v
.empty() && C_isalnum(v
[0])) {
356 filter_v
.push_back(v
);
// Sort so that the terms are recorded in ascending byte order, as the filter
// formats described below require.
359 sort(filter_v
.begin(), filter_v
.end());
360 const string
* prev
= NULL
;
361 for (const string
& bterm
: filter_v
) {
362 filters_append(bterm
, prev
);
367 // Current filters format:
369 // [<encoded length><boolean filter term>]*
370 // ['!'[<encoded length><negated boolean filter term>]*]?
371 // ['.'<collapse key>]?
372 // ['$'<encoded date range slot (omitted for term-based)>?
373 // ['!'<date start>]?
376 // [['-'?<sort key>[['-'|'+']<sort key>]+]|<sort key>|]?
377 // <encoded integer of default_op, docid_order, sort_after, sort_reverse>
379 // (filter terms in ascending byte sorted order, and with second and
380 // subsequent actually stored as <reuse character><tail>)
382 // old_filters format:
384 // [<boolean filter term with any '~' escaped to '~~'>'~']*
385 // ['!'<negated boolean filter term with any '~' escaped to '~~'>'~']*
386 // ['$'<date range slot>'$'<date start>'$'<date end>'$'<date span>]*
387 // ['.'|'-'] ; default_op AND vs OR
388 // <date start>'~'<date end>'~'<date span>['~'<date value slot>]?
389 // ['~'<collapse key>]? ; present if <collapse key> non-empty or
390 // ; previous element present
391 // ['D'|'X']? ; 'D' for docid_order DESCENDING; 'X' for DONT_CARE.
392 // [['-'?<sort key>[['-'|'+']<sort key>]+]|<sort key>]? ['R'|'F'|'f']?
394 // (filter terms in ascending byte sorted order)
396 // set any negated boolean filters
397 g
= cgi_params
.equal_range("N");
398 if (g
.first
!= g
.second
) {
399 vector
<string
> filter_v
;
400 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
401 const string
& v
= i
->second
;
402 // we'll definitely get empty N fields from "-ALL-" options
403 if (!v
.empty() && C_isalnum(v
[0])) {
405 filter_v
.push_back(v
);
408 if (!filter_v
.empty()) {
410 sort(filter_v
.begin(), filter_v
.end());
411 const string
* prev
= NULL
;
412 for (const string
& nterm
: filter_v
) {
414 filters_append(nterm
, prev
);
// COLLAPSE must parse as an unsigned integer; the key is also recorded in
// filters.
421 val
= cgi_params
.find("COLLAPSE");
422 if (val
!= cgi_params
.end()) {
423 const string
& v
= val
->second
;
425 if (!parse_unsigned(val
->second
.c_str(), collapse_key
)) {
426 throw "COLLAPSE parameter must be >= 0";
430 filters_encode_uint(collapse_key
);
434 // date range filters
436 string start
, end
, span
;
// Per-slot date ranges gathered from START.n, END.n and SPAN.n parameters,
// keyed by the slot number n parsed from the parameter name.
438 map
<Xapian::valueno
, date_range
> date_ranges
;
439 begin
= cgi_params
.lower_bound("START.");
440 end
= cgi_params
.lower_bound("START/"); // '/' is '.' + 1.
441 for (auto i
= begin
; i
!= end
; ++i
) {
442 const string
& v
= i
->second
;
444 Xapian::valueno slot
;
445 if (!parse_unsigned(i
->first
.c_str() +
446 CONST_STRLEN("START."), slot
)) {
447 throw "START slot value must be >= 0";
449 date_ranges
[slot
].start
= v
;
452 begin
= cgi_params
.lower_bound("END.");
453 end
= cgi_params
.lower_bound("END/"); // '/' is '.' + 1.
454 for (auto i
= begin
; i
!= end
; ++i
) {
455 const string
& v
= i
->second
;
457 Xapian::valueno slot
;
458 if (!parse_unsigned(i
->first
.c_str() +
459 CONST_STRLEN("END."), slot
)) {
460 throw "END slot value must be >= 0";
462 date_ranges
[slot
].end
= v
;
465 begin
= cgi_params
.lower_bound("SPAN.");
466 end
= cgi_params
.lower_bound("SPAN/"); // '/' is '.' + 1.
467 for (auto i
= begin
; i
!= end
; ++i
) {
468 const string
& v
= i
->second
;
470 Xapian::valueno slot
;
471 if (!parse_unsigned(i
->first
.c_str() +
472 CONST_STRLEN("SPAN."), slot
)) {
473 throw "SPAN slot value must be >= 0";
475 date_ranges
[slot
].span
= v
;
// Unsuffixed START, END and SPAN parameters, plus the optional DATEVALUE
// slot they apply to.
479 string date_start
, date_end
, date_span
;
480 val
= cgi_params
.find("START");
481 if (val
!= cgi_params
.end()) {
482 date_start
= val
->second
;
484 val
= cgi_params
.find("END");
485 if (val
!= cgi_params
.end()) {
486 date_end
= val
->second
;
488 val
= cgi_params
.find("SPAN");
489 if (val
!= cgi_params
.end()) {
490 date_span
= val
->second
;
492 val
= cgi_params
.find("DATEVALUE");
493 Xapian::valueno date_value_slot
= Xapian::BAD_VALUENO
;
494 if (val
!= cgi_params
.end() &&
495 !parse_unsigned(val
->second
.c_str(), date_value_slot
)) {
496 throw "DATEVALUE slot must be >= 0";
498 if (date_value_slot
!= Xapian::BAD_VALUENO
||
499 !date_start
.empty() ||
501 !date_span
.empty()) {
502 // Process DATEVALUE=n and associated values unless we saw START.n=...
503 // or END.n=... or SPAN.n=...
// (emplace() leaves an existing entry for this slot untouched.)
504 date_ranges
.emplace(date_value_slot
,
505 date_range
{date_start
, date_end
, date_span
});
// Apply each date range and record it in filters and old_filters.
// NOTE(review): `auto i` copies each map entry (including its strings);
// `const auto&` would avoid that copy.
507 for (auto i
: date_ranges
) {
510 add_date_filter(r
.start
, r
.end
, r
.span
, slot
);
512 if (slot
!= Xapian::BAD_VALUENO
) {
513 filters_encode_uint(slot
);
514 if (slot
!= date_value_slot
) {
516 old_filters
+= str(slot
);
518 old_filters
+= r
.start
;
520 old_filters
+= r
.end
;
522 old_filters
+= r
.span
;
525 if (!r
.start
.empty()) {
529 if (!r
.end
.empty()) {
533 if (!r
.span
.empty()) {
// old_filters: the default operator ('.' for AND, '-' otherwise) followed by
// the unsuffixed date start, end and span separated by filter_sep.
539 old_filters
+= (default_op
== Xapian::Query::OP_AND
? '.' : '-');
540 old_filters
+= date_start
;
541 old_filters
+= filter_sep
;
542 old_filters
+= date_end
;
543 old_filters
+= filter_sep
;
544 old_filters
+= date_span
;
545 if (date_value_slot
!= Xapian::BAD_VALUENO
) {
546 // This means we'll force the first page when reloading or changing
547 // page starting from existing URLs upon upgrade to 1.4.1, but the
548 // exact same existing URL could be for a search without the date
549 // filter where we want to force the first page, so there's an inherent
550 // ambiguity there. Forcing first page in this case seems the least
551 // problematic side-effect.
552 old_filters
+= filter_sep
;
553 old_filters
+= str(date_value_slot
);
556 // Percentage relevance cut-off
557 val
= cgi_params
.find("THRESHOLD");
558 if (val
!= cgi_params
.end()) {
// A leading '-' is skipped before the digits are parsed.
560 if (val
->second
[0] == '-') {
561 if (!parse_unsigned(val
->second
.c_str() + 1, temp
)) {
562 throw "THRESHOLD parameter must be an integer";
565 } else if (!parse_unsigned(val
->second
.c_str(), temp
)) {
566 throw "THRESHOLD parameter must be an integer";
// Record the collapse key in old_filters.
// NOTE(review): the condition guarding this branch is not visible in this
// view.
576 old_filters
+= filter_sep
;
577 old_filters
+= str(collapse_key
);
578 } else if (date_value_slot
!= Xapian::BAD_VALUENO
) {
579 // We need to either omit filter_sep for both or neither, or else the
580 // encoding is ambiguous.
581 old_filters
+= filter_sep
;
// DOCIDORDER selects the docid ordering.
// NOTE(review): how ch is derived from v is not visible in this view.
585 val
= cgi_params
.find("DOCIDORDER");
586 if (val
!= cgi_params
.end()) {
587 const string
& v
= val
->second
;
591 docid_order
= Xapian::Enquire::DESCENDING
;
593 } else if (ch
!= 'A') {
594 docid_order
= Xapian::Enquire::DONT_CARE
;
596 // This was a bug in the 1.4.x filter encoding (we should have
597 // added nothing here and 'X' in the `ch != 'A'` case), but the
598 // current "DONT_CARE" implementation actually always results
599 // in ascending docid order so it wasn't worth breaking
600 // compatibility in a stable release series to fix.
// SORT: parse one or more slot numbers, each optionally preceded by '+' or
// '-'.
607 val
= cgi_params
.find("SORT");
608 if (val
!= cgi_params
.end()) {
609 const char * base
= val
->second
.c_str();
610 const char * p
= base
;
// rev is true unless the key is explicitly prefixed with '+'.
612 bool rev
= (*p
!= '+');
613 if (*p
== '-' || *p
== '+') {
616 if (!C_isdigit(*p
)) {
622 Xapian::valueno slot
= strtoul(p
, &q
, 10);
629 if (sort_key
!= Xapian::BAD_VALUENO
) {
630 // Multiple sort keys specified, so we need a KeyMaker.
637 filters_encode_uint(sort_key
);
638 old_filters
+= str(sort_key
);
// Move the sort key seen so far into a newly allocated KeyMaker.
640 sort_keymaker
= new Xapian::MultiValueKeyMaker
;
641 sort_keymaker
->add_value(sort_key
, !reverse_sort
);
642 sort_key
= Xapian::BAD_VALUENO
;
// Record this key's direction and slot in both filter encodings, and add it
// to the KeyMaker.
647 filters
+= (rev
? '-' : '+');
648 old_filters
+= (rev
? '-' : '+');
649 filters_encode_uint(slot
);
650 old_filters
+= str(slot
);
651 sort_keymaker
->add_value(slot
, !rev
);
// Skip whitespace and commas.
656 while (C_isspace(*p
) || *p
== ',') ++p
;
// SORTREVERSE must parse as an unsigned integer.
// NOTE(review): the condition under which reverse_sort is inverted is not
// visible in this view.
659 val
= cgi_params
.find("SORTREVERSE");
660 if (val
!= cgi_params
.end()) {
662 if (!parse_unsigned(val
->second
.c_str(), temp
)) {
663 throw "SORTREVERSE parameter must be >= 0";
666 reverse_sort
= !reverse_sort
;
// SORTAFTER: any non-zero value sets sort_after.
669 val
= cgi_params
.find("SORTAFTER");
670 if (val
!= cgi_params
.end()) {
672 if (!parse_unsigned(val
->second
.c_str(), temp
)) {
673 throw "SORTAFTER parameter must be >= 0";
675 sort_after
= bool(temp
);
678 // Add the sorting related options to filters too.
679 if (!sort_keymaker
) {
680 filters_encode_uint(sort_key
);
681 old_filters
+= str(sort_key
);
697 // Encode default_op, docid_order, reverse_sort and sort_after together
698 // in a single character.
700 switch (default_op
) {
701 case Xapian::Query::OP_AND
:
703 case Xapian::Query::OP_OR
:
707 // Additional supported value should encode as:
// Bit 0 is reverse_sort, bit 1 is sort_after, and docid_order is multiplied
// by 0x04.
713 v
|= 0x04 * static_cast<unsigned>(docid_order
);
714 if (reverse_sort
) v
|= 0x01;
715 if (sort_after
) v
|= 0x02;
716 filters_encode_uint(v
);
719 // min_hits (fill mset past topdoc+(hits_per_page+1) to
720 // topdoc+max(hits_per_page+1,min_hits))
721 val
= cgi_params
.find("MINHITS");
722 if (val
!= cgi_params
.end()) {
723 if (!parse_unsigned(val
->second
.c_str(), min_hits
)) {
724 throw "MINHITS parameter must be >= 0";
// Exception handlers: each emits a Content-Type header unless
// set_content_type or suppress_http_headers is set, then writes a
// description of the error to cout (passed through html_escape() where
// there is a message).
729 } catch (const Xapian::Error
&e
) {
730 if (!set_content_type
&& !suppress_http_headers
)
731 cout
<< "Content-Type: text/html\n\n";
732 cout
<< "Exception: " << html_escape(e
.get_description()) << endl
;
733 } catch (const std::exception
&e
) {
734 if (!set_content_type
&& !suppress_http_headers
)
735 cout
<< "Content-Type: text/html\n\n";
736 cout
<< "Exception: std::exception " << html_escape(e
.what()) << endl
;
737 } catch (const string
&s
) {
738 if (!set_content_type
&& !suppress_http_headers
)
739 cout
<< "Content-Type: text/html\n\n";
740 cout
<< "Exception: " << html_escape(s
) << endl
;
741 } catch (const char *s
) {
742 if (!set_content_type
&& !suppress_http_headers
)
743 cout
<< "Content-Type: text/html\n\n";
744 cout
<< "Exception: " << html_escape(s
) << endl
;
746 if (!set_content_type
&& !suppress_http_headers
)
747 cout
<< "Content-Type: text/html\n\n";
748 cout
<< "Caught unknown exception" << endl
;