2 * @brief Main module for omega (example CGI frontend for Xapian)
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2014,2015,2016,2018 Olly Betts
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
36 #include "safefcntl.h"
37 #include "safeunistd.h"
44 #include "stringutils.h"
// Stemming language which main() stores in option["stemmer"] at startup.
49 static const char DEFAULT_STEM_LANGUAGE
[] = "english";
51 // A character which doesn't require URL encoding, and isn't likely to appear
// in filter terms.  main() appends it between the components of `filters`.
53 const char filter_sep
= '~';
55 // What we used for filter_sep in Omega < 1.3.4.
// main() uses it as the separator when building `old_filters`.
56 const char filter_sep_old
= '-';
// Allocated by main() with `new Xapian::Enquire(db)`; main() checks it for
// null before handling MORELIKE.
58 Xapian::Enquire
* enquire
;
// Named string options.  main() sets defaults for "flag_default", "decimal",
// "thousand" and "stemmer".
61 map
<string
, string
> option
;
// Checked (together with suppress_http_headers) before main()'s exception
// handlers emit a "Content-Type: text/html" header.
63 bool set_content_type
= false;
// Set by main() when SERVER_PROTOCOL=INCLUDED (server-side include).
65 bool suppress_http_headers
= false;
// Encodings of the filter/sort settings, built up by main() using filter_sep
// and filter_sep_old respectively.  If old_filters ends up empty, main()
// copies filters into it.
69 string filters
, old_filters
;
// Set from the HITSPERPAGE parameter (main() caps it at 1000).
71 Xapian::docid hits_per_page
= 0;
// Set from the MINHITS parameter.
72 Xapian::docid min_hits
= 0;
// Allocated by main() only when SORT specifies more than one key.
77 Xapian::MultiValueKeyMaker
* sort_keymaker
= NULL
;
78 Xapian::valueno sort_key
= Xapian::BAD_VALUENO
; // Don't sort.
// Toggled by main() when SORTREVERSE is non-zero.
79 bool reverse_sort
= true;
// Set from the SORTAFTER parameter (non-zero means true).
80 bool sort_after
= false;
// Changed by main() according to the DOCIDORDER parameter.
81 Xapian::Enquire::docid_order docid_order
= Xapian::Enquire::ASCENDING
;
// Set from the COLLAPSE parameter.
83 Xapian::valueno collapse_key
= 0;
// NOTE(review): presumably set when COLLAPSE is given - the assignment is not
// visible in this view; main() tests it when finishing the `filters` encoding.
84 bool collapse
= false;
// Map a database name to the path used to open it, by appending the name to
// database_dir.
//
// @param database_name  Name of the database.
// @return database_dir followed directly by database_name.
//
// NOTE(review): no separator is inserted, so database_dir presumably already
// ends with one - confirm.  The name isn't vetted here; verify that callers
// reject names such as ".." when the name comes from a request parameter.
87 map_dbname_to_dir(const string
&database_name
)
89 return database_dir
+ database_name
;
// Open the database called this_dbname and add it to the global `db`.
//
// Also appends this_dbname to the '/'-separated list in `dbname`, and appends
// entries to `subdbs` until it has as many entries as db.size().
//
// @param this_dbname  Name of the database (mapped to a path by
//                     map_dbname_to_dir()).
93 add_database(const string
& this_dbname
)
// Keep dbname as a '/'-separated list of the names added so far.
95 if (!dbname
.empty()) dbname
+= '/';
96 dbname
+= this_dbname
;
98 Xapian::Database
this_db(map_dbname_to_dir(this_dbname
));
99 db
.add_database(this_db
);
101 size_t this_db_size
= this_db
.size();
102 size_t db_size
= db
.size();
// One subdbs entry is appended per iteration until subdbs matches db.size().
// NOTE(review): the declaration and initial value of `i` are not visible in
// this view.
104 while (subdbs
.size() != db_size
) {
105 subdbs
.emplace_back(this_dbname
, i
++, this_db_size
);
109 // Get database(s) to search.
//
// @param dbs  Iterator range (first, second) over parameter entries; the
//             `second` member of each entry is the parameter value.
//
// NOTE(review): the declarations of `seen`, `p` and `q` and the body of the
// `if` below are not visible in this view, so the comments here describe only
// the statements shown.
110 template<typename IT
>
112 parse_db_params(const pair
<IT
, IT
>& dbs
)
115 // Only add a repeated db once.
117 for (auto i
= dbs
.first
; i
!= dbs
.second
; ++i
) {
118 const string
& v
= i
->second
;
// Empty parameter values are skipped.
119 if (v
.empty()) continue;
// s is the piece of v starting at offset p with length q - p.
123 string
s(v
, p
, q
- p
);
// Only act on non-empty names which aren't already in `seen`.
124 if (!s
.empty() && seen
.find(s
) == seen
.end()) {
// q == npos means there was no further piece in this value.
128 if (q
== string::npos
) break;
// Program entry point.
//
// From the statements visible here, main():
//  - sets suppress_http_headers when SERVER_PROTOCOL=INCLUDED;
//  - fills in default entries in `option`;
//  - when REQUEST_METHOD is unset, either decodes parameters from argv or
//    prints a version banner and prompts for NAME=VALUE lines;
//  - opens the database(s) named by the DB parameter and creates `enquire`;
//  - reads the parameters HITSPERPAGE, DEFAULTOP, FMT, MORELIKE, ADD/X, P,
//    P.<prefix>, B, N, START/END/SPAN (with and without a ".<slot>" suffix),
//    DATEVALUE, THRESHOLD, COLLAPSE, DOCIDORDER, SORT, SORTREVERSE, SORTAFTER
//    and MINHITS, building the query and the `filters`/`old_filters` strings;
//  - reports any exception as text on cout.
//
// @param argc  Number of command line arguments.
// @param argv  Command line arguments.
//
// NOTE(review): a number of lines of this function (braces, some conditions
// and declarations) are not visible in this view; comments below describe
// only the statements shown.
134 int main(int argc
, char *argv
[])
137 // Check for SERVER_PROTOCOL=INCLUDED, which is set when we're being
138 // included in a page via a server-side include directive. In this
139 // case we suppress sending a Content-Type: header.
140 const char* p
= getenv("SERVER_PROTOCOL");
141 if (p
&& strcmp(p
, "INCLUDED") == 0) {
142 suppress_http_headers
= true;
// Populate `option` with defaults.
148 option
["flag_default"] = "true";
150 // set default thousands and decimal separators: e.g. "16,729 hits" "1.4K"
151 option
["decimal"] = ".";
152 option
["thousand"] = ",";
154 // set the default stemming language
155 option
["stemmer"] = DEFAULT_STEM_LANGUAGE
;
157 // FIXME: set cout to linebuffered not stdout. Or just flush regularly...
158 // setvbuf(stdout, NULL, _IOLBF, 0);
// No REQUEST_METHOD in the environment is treated as being run from the
// command line.
160 const char * method
= getenv("REQUEST_METHOD");
161 if (method
== NULL
) {
162 if (argc
> 1 && (argv
[1][0] != '-' || strchr(argv
[1], '='))) {
163 // omega 'P=information retrieval' DB=papers
164 // check for a leading '-' on the first arg so "omega --version",
165 // "omega --help", and similar take the next branch
166 decode_argv(argv
+ 1);
168 // Seems we're running from the command line so give version
169 // and allow a query to be entered for testing
170 cout
<< PROGRAM_NAME
" - " PACKAGE
" " VERSION
"\n";
171 if (argc
> 1) exit(0);
172 cout
<< "Enter NAME=VALUE lines, end with blank line\n";
// Open each database named by a DB parameter, falling back to default_db if
// that leaves dbname empty, then create the Enquire object.
183 parse_db_params(cgi_params
.equal_range("DB"));
184 if (dbname
.empty()) {
185 add_database(default_db
);
187 enquire
= new Xapian::Enquire(db
);
// A Xapian::Error raised above is caught here and db is reset to an empty
// Database.  NOTE(review): the rest of this handler is not visible here.
188 } catch (const Xapian::Error
&) {
190 db
= Xapian::Database();
// HITSPERPAGE: capped at 1000.  NOTE(review): the statement executed for the
// zero case is not visible here.
194 auto val
= cgi_params
.find("HITSPERPAGE");
195 if (val
!= cgi_params
.end()) hits_per_page
= atol(val
->second
.c_str());
196 if (hits_per_page
== 0) {
198 } else if (hits_per_page
> 1000) {
199 hits_per_page
= 1000;
// DEFAULTOP: "OR" or "or" selects OP_OR as the default operator.
202 val
= cgi_params
.find("DEFAULTOP");
203 if (val
!= cgi_params
.end()) {
204 const string
& v
= val
->second
;
205 if (v
== "OR" || v
== "or")
206 default_op
= Xapian::Query::OP_OR
;
// FMT: name of the format to use; an empty value is ignored.
209 val
= cgi_params
.find("FMT");
210 if (val
!= cgi_params
.end()) {
211 const string
& v
= val
->second
;
212 if (!v
.empty()) fmtname
= v
;
// NOTE(review): the condition guarding this fallback to default_template is
// not visible here.
215 fmtname
= default_template
;
// MORELIKE: collect the given documents into an RSet and turn the resulting
// expand terms into a query string.  Skipped if `enquire` is null.
217 auto ml
= cgi_params
.equal_range("MORELIKE");
218 if (enquire
&& ml
.first
!= ml
.second
) {
219 Xapian::RSet tmprset
;
220 for (auto i
= ml
.first
; i
!= ml
.second
; ++i
) {
221 const string
& v
= i
->second
;
// Try the value as a numeric docid first.
222 Xapian::docid docid
= atol(v
.c_str());
224 // Assume it's MORELIKE=Quid1138 and that Quid1138 is a UID
225 // from an external source - we just find the corresponding docid.
226 Xapian::PostingIterator p
= db
.postlist_begin(v
);
227 if (p
!= db
.postlist_end(v
)) docid
= *p
;
230 tmprset
.add_document(docid
);
234 if (!tmprset
.empty()) {
// Ask for up to 40 expand terms, filtered by an OmegaExpandDecider.
235 OmegaExpandDecider
decider(db
);
236 set_expansion_scheme(*enquire
, option
);
237 Xapian::ESet
eset(enquire
->get_eset(40, tmprset
, &decider
));
238 string morelike_query
;
239 for (auto&& term
: eset
) {
240 if (!morelike_query
.empty()) {
// Terms are joined with a space when the default operator is OR; the " OR "
// separator below is presumably the else branch (its `else` line is not
// visible here).
241 if (default_op
== Xapian::Query::OP_OR
) {
242 morelike_query
+= ' ';
244 morelike_query
+= " OR ";
247 morelike_query
+= pretty_term(term
);
249 add_query_string(string(), morelike_query
);
252 // add expand/topterms terms if appropriate
// X parameters are only consulted when an ADD parameter is present.
254 if (cgi_params
.find("ADD") != cgi_params
.end()) {
255 auto g
= cgi_params
.equal_range("X");
256 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
257 const string
& v
= i
->second
;
259 if (!expand_terms
.empty())
266 // collect the unprefixed prob fields
267 auto g
= cgi_params
.equal_range("P");
268 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
269 const string
& v
= i
->second
;
271 // If there are expand terms, append them to the first
272 // non-empty P parameter.
273 if (!expand_terms
.empty()) {
277 add_query_string(string(), q
);
// Clear expand_terms so they are only added once.
278 expand_terms
= string();
280 add_query_string(string(), v
);
// Any expand terms still pending are added as a query string of their own.
285 if (!expand_terms
.empty()) {
286 add_query_string(string(), expand_terms
);
// Prefixed prob fields: the range from lower_bound("P.") to lower_bound("P/")
// covers every parameter whose name starts with "P.".
290 auto begin
= cgi_params
.lower_bound("P.");
291 auto end
= cgi_params
.lower_bound("P/"); // '/' is '.' + 1.
292 for (auto i
= begin
; i
!= end
; ++i
) {
293 const string
& v
= i
->second
;
// The prefix is the part of the parameter name after the leading "P.".
295 string
pfx(i
->first
, 2, string::npos
);
296 add_query_string(pfx
, v
);
300 // set any boolean filters
301 auto g
= cgi_params
.equal_range("B");
302 if (g
.first
!= g
.second
) {
303 vector
<string
> filter_v
;
304 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
305 const string
& v
= i
->second
;
306 // we'll definitely get empty B fields from "-ALL-" options
307 if (!v
.empty() && C_isalnum(v
[0])) {
309 filter_v
.push_back(v
);
// The filters are appended to `filters`/`old_filters` in sorted order.
312 sort(filter_v
.begin(), filter_v
.end());
313 vector
<string
>::const_iterator j
;
314 for (j
= filter_v
.begin(); j
!= filter_v
.end(); ++j
) {
315 const string
& bterm
= *j
;
316 string::size_type e
= bterm
.find(filter_sep
);
317 if (usual(e
== string::npos
)) {
320 // If a filter contains filter_sep then double it to escape.
321 // Each filter must start with an alnum (checked above) and
322 // the value after the last filter is the default op, which
323 // is encoded as a non-alnum so filter_sep followed by
324 // something other than filter_sep must be separating filters.
325 string::size_type b
= 0;
326 while (e
!= string::npos
) {
// Copy up to and including the filter_sep found at offset e.
327 filters
.append(bterm
, b
, e
+ 1 - b
);
329 e
= bterm
.find(filter_sep
, b
+ 1);
// Copy whatever follows the last filter_sep.
331 filters
.append(bterm
, b
, string::npos
);
333 filters
+= filter_sep
;
334 old_filters
+= bterm
;
335 old_filters
+= filter_sep_old
;
339 // set any negated boolean filters
340 g
= cgi_params
.equal_range("N");
341 if (g
.first
!= g
.second
) {
342 vector
<string
> filter_v
;
343 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
344 const string
& v
= i
->second
;
345 // we'll definitely get empty N fields from "-ALL-" options
346 if (!v
.empty() && C_isalnum(v
[0])) {
348 filter_v
.push_back(v
);
// As for B filters, the negated filters are appended in sorted order.
351 sort(filter_v
.begin(), filter_v
.end());
352 vector
<string
>::const_iterator j
;
353 for (j
= filter_v
.begin(); j
!= filter_v
.end(); ++j
) {
354 const string
& nterm
= *j
;
355 string::size_type e
= nterm
.find(filter_sep
);
357 if (usual(e
== string::npos
)) {
360 // If a filter contains filter_sep then double it to escape.
361 // Each filter must start with an alnum (checked above) and
362 // the value after the last filter is the default op, which
363 // is encoded as a non-alnum so filter_sep followed by
364 // something other than filter_sep must be separating filters.
365 string::size_type b
= 0;
366 while (e
!= string::npos
) {
367 filters
.append(nterm
, b
, e
+ 1 - b
);
369 e
= nterm
.find(filter_sep
, b
+ 1);
371 filters
.append(nterm
, b
, string::npos
);
373 filters
+= filter_sep
;
374 // old_filters predates 'N' terms, so if there are 'N' terms this
375 // is definitely a different query.
380 // date range filters
// NOTE(review): presumably the members of the date_range type used below
// (r.start, r.end, r.span) - the enclosing declaration is not visible here.
382 string start
, end
, span
;
// Per-slot date ranges, keyed by the value slot parsed from the name of each
// START.<slot>, END.<slot> and SPAN.<slot> parameter.
384 map
<Xapian::valueno
, date_range
> date_ranges
;
385 begin
= cgi_params
.lower_bound("START.");
386 end
= cgi_params
.lower_bound("START/"); // '/' is '.' + 1.
387 for (auto i
= begin
; i
!= end
; ++i
) {
388 const string
& v
= i
->second
;
// The slot number is parsed from the text following "START.".
390 Xapian::valueno slot
= atoi(i
->first
.c_str() +
391 CONST_STRLEN("START."));
392 date_ranges
[slot
].start
= v
;
395 begin
= cgi_params
.lower_bound("END.");
396 end
= cgi_params
.lower_bound("END/"); // '/' is '.' + 1.
397 for (auto i
= begin
; i
!= end
; ++i
) {
398 const string
& v
= i
->second
;
400 Xapian::valueno slot
= atoi(i
->first
.c_str() +
401 CONST_STRLEN("END."));
402 date_ranges
[slot
].end
= v
;
405 begin
= cgi_params
.lower_bound("SPAN.");
406 end
= cgi_params
.lower_bound("SPAN/"); // '/' is '.' + 1.
407 for (auto i
= begin
; i
!= end
; ++i
) {
408 const string
& v
= i
->second
;
410 Xapian::valueno slot
= atoi(i
->first
.c_str() +
411 CONST_STRLEN("SPAN."));
412 date_ranges
[slot
].span
= v
;
415 if (!date_ranges
.empty()) {
416 // old_filters predates START.N, END.N and SPAN.N so use of any of
417 // these means this is definitely a different query.
// NOTE(review): `auto i` copies each map entry; `slot` and `r` are set up in
// lines not visible here.
420 for (auto i
: date_ranges
) {
423 add_date_filter(r
.start
, r
.end
, r
.span
, slot
);
425 filters
+= str(slot
);
// Unsuffixed START, END, SPAN and DATEVALUE parameters.
434 string date_start
, date_end
, date_span
;
435 val
= cgi_params
.find("START");
436 if (val
!= cgi_params
.end()) date_start
= val
->second
;
437 val
= cgi_params
.find("END");
438 if (val
!= cgi_params
.end()) date_end
= val
->second
;
439 val
= cgi_params
.find("SPAN");
440 if (val
!= cgi_params
.end()) date_span
= val
->second
;
441 val
= cgi_params
.find("DATEVALUE");
// BAD_VALUENO is used when no DATEVALUE parameter was given.
442 Xapian::valueno date_value_slot
= Xapian::BAD_VALUENO
;
443 if (val
!= cgi_params
.end()) date_value_slot
= string_to_int(val
->second
);
444 add_date_filter(date_start
, date_end
, date_span
, date_value_slot
);
446 // If more default_op values are supported, encode them as non-alnums
447 // other than filter_sep, '!' or '$'.
// The default operator is encoded as '.' for OP_AND and '-' otherwise.
448 filters
+= (default_op
== Xapian::Query::OP_AND
? '.' : '-');
449 filters
+= date_start
;
450 filters
+= filter_sep
;
452 filters
+= filter_sep
;
453 filters
+= date_span
;
454 if (date_value_slot
!= Xapian::BAD_VALUENO
) {
455 // This means we'll force the first page when reloading or changing
456 // page starting from existing URLs upon upgrade to 1.4.1, but the
457 // exact same existing URL could be for a search without the date
458 // filter where we want to force the first page, so there's an inherent
459 // ambiguity there. Forcing first page in this case seems the least
460 // problematic side-effect.
461 filters
+= filter_sep
;
462 filters
+= str(date_value_slot
);
// The old encoding is only extended if something was already added to it.
465 if (!old_filters
.empty()) {
466 old_filters
+= date_start
;
467 old_filters
+= filter_sep_old
;
468 old_filters
+= date_end
;
469 old_filters
+= filter_sep_old
;
470 old_filters
+= date_span
;
// The old encoding uses 'A' for OP_AND and 'O' otherwise.
471 old_filters
+= (default_op
== Xapian::Query::OP_AND
? 'A' : 'O');
474 // Percentage relevance cut-off
475 val
= cgi_params
.find("THRESHOLD");
476 if (val
!= cgi_params
.end()) {
477 threshold
= atoi(val
->second
.c_str());
// Clamp to the range 0 to 100.
478 if (threshold
< 0) threshold
= 0;
479 if (threshold
> 100) threshold
= 100;
// COLLAPSE: value slot to collapse on, also recorded in both encodings.
483 val
= cgi_params
.find("COLLAPSE");
484 if (val
!= cgi_params
.end()) {
485 const string
& v
= val
->second
;
487 collapse_key
= atoi(v
.c_str());
489 filters
+= filter_sep
;
490 filters
+= str(collapse_key
);
491 if (!old_filters
.empty()) {
492 old_filters
+= filter_sep_old
;
493 old_filters
+= str(collapse_key
);
497 if (!collapse
&& date_value_slot
!= Xapian::BAD_VALUENO
) {
498 // We need to either omit filter_sep for both or neither, or else the
499 // encoding is ambiguous.
500 filters
+= filter_sep
;
// DOCIDORDER: selects DESCENDING or DONT_CARE instead of the default.
// NOTE(review): the derivation of `ch` from v is not visible here.
504 val
= cgi_params
.find("DOCIDORDER");
505 if (val
!= cgi_params
.end()) {
506 const string
& v
= val
->second
;
510 docid_order
= Xapian::Enquire::DESCENDING
;
512 if (!old_filters
.empty()) old_filters
+= 'D';
513 } else if (ch
!= 'A') {
514 docid_order
= Xapian::Enquire::DONT_CARE
;
516 // This is a bug (should add nothing here and 'X' in the (ch !=
517 // 'A') case), but the current "DONT_CARE" implementation
518 // actually always results in ascending docid order so it's not
519 // worth breaking compatibility to fix - let's just do it next
520 // time we change the encoding $filters uses.
522 if (!old_filters
.empty()) old_filters
+= 'X';
// SORT: one or more value slot numbers, separated by whitespace and/or
// commas, each optionally preceded by '+' or '-'.
528 val
= cgi_params
.find("SORT");
529 if (val
!= cgi_params
.end()) {
530 const char * base
= val
->second
.c_str();
531 const char * p
= base
;
// Reverse order unless the key is explicitly prefixed with '+'.
533 bool rev
= (*p
!= '+');
534 if (*p
== '-' || *p
== '+') {
535 // old_filters predates support for direction in SORT, so if
536 // there's a direction specified this is definitely a different
// query.
541 if (!C_isdigit(*p
)) {
// Parse the slot number; q receives the position after it.
547 Xapian::valueno slot
= strtoul(p
, &q
, 10);
554 if (sort_key
!= Xapian::BAD_VALUENO
) {
555 // Multiple sort keys specified, so we need a KeyMaker.
// Record the first key in `filters`, then move it into a newly allocated
// MultiValueKeyMaker and clear sort_key.
558 if (reverse_sort
) filters
+= '-';
559 filters
+= str(sort_key
);
561 sort_keymaker
= new Xapian::MultiValueKeyMaker
;
562 sort_keymaker
->add_value(sort_key
, !reverse_sort
);
563 sort_key
= Xapian::BAD_VALUENO
;
565 // old_filters predates multiple sort keys, so if there are
566 // multiple sort keys this is definitely a different query.
// Record this key, with an explicit direction, and add it to the KeyMaker.
571 filters
+= (rev
? '-' : '+');
572 filters
+= str(slot
);
573 sort_keymaker
->add_value(slot
, !rev
);
// Skip the separators before the next key.
578 while (C_isspace(*p
) || *p
== ',') ++p
;
// SORTREVERSE: a non-zero value flips reverse_sort.
581 val
= cgi_params
.find("SORTREVERSE");
582 if (val
!= cgi_params
.end() && atoi(val
->second
.c_str()) != 0) {
583 reverse_sort
= !reverse_sort
;
// SORTAFTER: a non-zero value sets sort_after.
586 val
= cgi_params
.find("SORTAFTER");
587 if (val
!= cgi_params
.end()) {
588 sort_after
= (atoi(val
->second
.c_str()) != 0);
591 // Add the sorting related options to filters too.
593 // Note: old_filters really does encode a reversed sort as 'F' and a
594 // non-reversed sort as 'R' or 'r'.
596 // filters has them the other way around for sanity (except in
597 // development snapshot 1.3.4, which was when the new filter encoding
// was introduced).
// NOTE(review): the conditions selecting between the 'F', 'R' and 'r' cases
// below are not visible here.
599 if (!sort_keymaker
) filters
+= str(sort_key
);
600 if (!old_filters
.empty()) old_filters
+= str(sort_key
);
604 if (!old_filters
.empty()) old_filters
+= 'F';
607 if (!old_filters
.empty()) old_filters
+= 'R';
612 if (!old_filters
.empty()) old_filters
+= 'r';
// If nothing was added to old_filters, use the new encoding for it too.
617 if (old_filters
.empty()) old_filters
= filters
;
619 // min_hits (fill mset past topdoc+(hits_per_page+1) to
620 // topdoc+max(hits_per_page+1,min_hits))
621 val
= cgi_params
.find("MINHITS");
622 if (val
!= cgi_params
.end()) {
623 min_hits
= atol(val
->second
.c_str());
// Exception handlers: each reports the failure as text on cout, preceded by a
// "Content-Type: text/html" header unless set_content_type or
// suppress_http_headers is set.  Message text is passed through html_escape().
627 } catch (const Xapian::Error
&e
) {
628 if (!set_content_type
&& !suppress_http_headers
)
629 cout
<< "Content-Type: text/html\n\n";
630 cout
<< "Exception: " << html_escape(e
.get_description()) << endl
;
631 } catch (const std::exception
&e
) {
632 if (!set_content_type
&& !suppress_http_headers
)
633 cout
<< "Content-Type: text/html\n\n";
634 cout
<< "Exception: std::exception " << html_escape(e
.what()) << endl
;
635 } catch (const string
&s
) {
636 if (!set_content_type
&& !suppress_http_headers
)
637 cout
<< "Content-Type: text/html\n\n";
638 cout
<< "Exception: " << html_escape(s
) << endl
;
639 } catch (const char *s
) {
640 if (!set_content_type
&& !suppress_http_headers
)
641 cout
<< "Content-Type: text/html\n\n";
642 cout
<< "Exception: " << html_escape(s
) << endl
;
// NOTE(review): presumably the body of a catch-all handler - its `catch` line
// is not visible here.
644 if (!set_content_type
&& !suppress_http_headers
)
645 cout
<< "Content-Type: text/html\n\n";
646 cout
<< "Caught unknown exception" << endl
;