Index gzip-compressed SVG files
[xapian.git] / xapian-applications / omega / query.cc
blob46bb19fc855bdff2729fc26b0a3b59d41fbdc45a
1 /** @file
2 * @brief query executor for omega
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002 Intercede 1749 Ltd
8 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021 Olly Betts
9 * Copyright 2008 Thomas Viehmann
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the
14 * License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
24 * USA
27 #include <config.h>
29 #include <algorithm>
30 #include <iostream>
31 #include <map>
32 #include <set>
33 #include <unordered_map>
34 #include <unordered_set>
35 #include <vector>
37 #include <cassert>
38 #include <cctype>
39 #include <cerrno>
40 #include <stdio.h>
41 #include <cstdlib>
42 #include <cstring>
43 #include "strcasecmp.h"
44 #include <ctime>
46 #include "safeunistd.h"
47 #include <sys/types.h>
48 #include "safesysstat.h"
49 #include "safefcntl.h"
51 #include "realtime.h"
53 #include <cdb.h>
55 #include "csvescape.h"
56 #include "date.h"
57 #include "datevalue.h"
58 #include "fields.h"
59 #include "jsonescape.h"
60 #include "utils.h"
61 #include "omega.h"
62 #include "query.h"
63 #include "cgiparam.h"
64 #include "loadfile.h"
65 #include "sample.h"
66 #include "sort.h"
67 #include "str.h"
68 #include "stringutils.h"
69 #include "transform.h"
70 #include "urldecode.h"
71 #include "urlencode.h"
72 #include "unixperm.h"
73 #include "values.h"
74 #include "weight.h"
75 #include "expand.h"
77 #include <xapian.h>
79 using namespace std;
81 using Xapian::Utf8Iterator;
83 using Xapian::Unicode::is_wordchar;
85 #ifndef SNPRINTF
86 #include <cstdarg>
/** Bounded formatted output which treats truncation as a fatal error.
 *
 *  @param str     Destination buffer.
 *  @param size    Size of @a str in bytes (including room for the NUL).
 *  @param format  printf-style format string; further arguments follow.
 *
 *  @return The number of characters written, excluding the terminating NUL.
 *
 *  Calls abort() if the formatted output doesn't fit in @a size bytes, if
 *  formatting fails, or if @a size is zero.
 */
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    // A zero-sized buffer can't even hold the terminating NUL.
    if (size == 0)
        abort();
    va_list ap;
    va_start(ap, format);
    // Use the bounded formatter so an oversized result is detected without
    // first writing beyond the end of the buffer.
    const int res = vsnprintf(str, size, format, ap);
    va_end(ap);
    if (res < 0 || size_t(res) >= size)
        abort(); /* Overflowed! */
    return res;
}
100 #else
101 #define my_snprintf SNPRINTF
102 #endif
104 /// Map shard to DB parameter value and stats to allow docid mapping.
105 vector<SubDB> subdbs;
107 static bool query_parsed = false;
108 static bool done_query = false;
109 static Xapian::docid last = 0;
110 static Xapian::docid topdoc = 0;
112 static Xapian::MSet mset;
113 static Xapian::RSet rset;
115 static map<Xapian::docid, bool> ticked;
117 static void ensure_query_parsed();
118 static void ensure_match();
120 static Xapian::Query query;
121 //static string url_query_string;
122 Xapian::Query::op default_op = Xapian::Query::OP_AND; // default matching mode
124 // Maintain an explicit date_filter_set flag - date_filter.empty() will also
125 // be true if a date filter is specified which simplifies to
126 // Query::MatchNothing at construction time.
127 static bool date_filter_set = false;
128 static Xapian::Query date_filter;
130 static Xapian::QueryParser qp;
131 static Xapian::NumberRangeProcessor * size_rp = NULL;
132 static Xapian::Stem *stemmer = NULL;
134 static string eval_file(const string& fmtfile, bool* p_not_found = nullptr);
136 static set<string> termset;
138 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
139 static map<string, string> termprefix_to_userprefix;
141 static string queryterms;
143 static string error_msg;
145 static double secs = -1;
147 static const char DEFAULT_LOG_ENTRY[] =
148 "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
149 "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
150 "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
151 "$dbname\t"
152 "$query\t"
153 "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
155 class MyStopper : public Xapian::Stopper {
156 public:
157 bool operator()(const string &t) const {
158 switch (t[0]) {
159 case 'a':
160 return (t == "a" || t == "about" || t == "an" || t == "and" ||
161 t == "are" || t == "as" || t == "at");
162 case 'b':
163 return (t == "be" || t == "by");
164 case 'e':
165 return (t == "en");
166 case 'f':
167 return (t == "for" || t == "from");
168 case 'h':
169 return (t == "how");
170 case 'i':
171 return (t == "i" || t == "in" || t == "is" || t == "it");
172 case 'o':
173 return (t == "of" || t == "on" || t == "or");
174 case 't':
175 return (t == "that" || t == "the" || t == "this" || t == "to");
176 case 'w':
177 return (t == "was" || t == "what" || t == "when" ||
178 t == "where" || t == "which" || t == "who" ||
179 t == "why" || t == "will" || t == "with");
180 case 'y':
181 return (t == "you" || t == "your");
182 default:
183 return false;
188 static size_t
189 prefix_from_term(string* prefix, const string& term)
191 if (!term.empty()) {
192 if (term[0] == 'X') {
193 const string::const_iterator begin = term.begin();
194 string::const_iterator i = begin + 1;
195 while (i != term.end() && C_isupper(*i))
196 ++i;
197 if (prefix)
198 prefix->assign(begin, i);
199 if (i != term.end() && *i == ':')
200 ++i;
201 return i - begin;
204 if (C_isupper(term[0])) {
205 if (prefix)
206 *prefix = term[0];
207 return 1;
211 if (prefix)
212 prefix->resize(0);
213 return 0;
// Reject any name containing ".." - used for format names, log file names,
// etc, since otherwise people could open a format "../../etc/passwd" or
// similar.
// FIXME: make this check more exact ("foo..bar" is safe)
// FIXME: log when this check fails
static bool
vet_filename(const string &filename)
{
    const auto both_dots = [](char a, char b) {
        return a == '.' && b == '.';
    };
    // The name is acceptable only if no two adjacent characters are dots.
    return adjacent_find(filename.begin(), filename.end(), both_dots) ==
           filename.end();
}
227 // Heuristics:
228 // * If any terms have been removed, it's a "fresh query" so we discard any
229 // relevance judgements
230 // * If all previous terms are there but more have been added then we keep
231 // the relevance judgements, but return the first page of hits
233 // NEW_QUERY entirely new query
234 // SAME_QUERY unchanged query
235 // EXTENDED_QUERY new query, but based on the old one
236 // BAD_QUERY parse error (message in error_msg)
237 typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
239 static multimap<string, string> query_strings;
241 void
242 add_query_string(const string& prefix, const string& s)
244 string query_string = s;
245 // Strip leading and trailing whitespace from query_string.
246 trim(query_string);
247 if (!query_string.empty())
248 query_strings.insert(make_pair(prefix, query_string));
251 static unsigned
252 read_qp_flags(const string & opt_pfx, unsigned f)
254 map<string, string>::const_iterator i = option.lower_bound(opt_pfx);
255 for (; i != option.end() && startswith(i->first, opt_pfx); ++i) {
256 unsigned mask = 0;
257 const char * s = i->first.c_str() + opt_pfx.size();
258 switch (s[0]) {
259 case 'a':
260 // Note that the ``Xapian::QueryParser::FLAG_ACCUMULATE`` flag
261 // is or-ed in below because it's needed for ``$stoplist`` and
262 // ``$unstem`` to work correctly, and so is deliberately not
263 // available to specify here.
264 if (strcmp(s, "auto_multiword_synonyms") == 0) {
265 mask = Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
266 break;
268 if (strcmp(s, "auto_synonyms") == 0) {
269 mask = Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
270 break;
272 break;
273 case 'b':
274 if (strcmp(s, "boolean") == 0) {
275 mask = Xapian::QueryParser::FLAG_BOOLEAN;
276 break;
278 if (strcmp(s, "boolean_any_case") == 0) {
279 mask = Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
280 break;
282 break;
283 case 'c':
284 if (strcmp(s, "cjk_ngram") == 0) {
285 mask = Xapian::QueryParser::FLAG_CJK_NGRAM;
286 break;
288 break;
289 case 'd':
290 if (strcmp(s, "default") == 0) {
291 mask = Xapian::QueryParser::FLAG_DEFAULT;
292 break;
294 break;
295 case 'l':
296 if (strcmp(s, "lovehate") == 0) {
297 mask = Xapian::QueryParser::FLAG_LOVEHATE;
298 break;
300 break;
301 case 'n':
302 if (strcmp(s, "no_positions") == 0) {
303 mask = Xapian::QueryParser::FLAG_NO_POSITIONS;
304 break;
306 break;
307 case 'p':
308 if (strcmp(s, "partial") == 0) {
309 mask = Xapian::QueryParser::FLAG_PARTIAL;
310 break;
312 if (strcmp(s, "phrase") == 0) {
313 mask = Xapian::QueryParser::FLAG_PHRASE;
314 break;
316 if (strcmp(s, "pure_not") == 0) {
317 mask = Xapian::QueryParser::FLAG_PURE_NOT;
318 break;
320 break;
321 case 's':
322 if (strcmp(s, "spelling_correction") == 0) {
323 mask = Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
324 break;
326 if (strcmp(s, "synonym") == 0) {
327 mask = Xapian::QueryParser::FLAG_SYNONYM;
328 break;
330 break;
331 case 'w':
332 if (strcmp(s, "wildcard") == 0) {
333 mask = Xapian::QueryParser::FLAG_WILDCARD;
334 break;
336 break;
339 if (i->second.empty()) {
340 f &= ~mask;
341 } else {
342 f |= mask;
345 // Always enable FLAG_ACCUMULATE so that $stoplist and $unstem report
346 // values accumulated over all query strings parsed as part of a query, not
347 // just the last one parsed.
348 return f | Xapian::QueryParser::FLAG_ACCUMULATE;
351 static querytype
352 parse_queries(const string& oldp)
354 // Parse the query string.
355 auto opt_it = option.find("stem_strategy");
356 if (opt_it != option.end()) {
357 if (opt_it->second == "all") {
358 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
359 } else if (opt_it->second == "all_z") {
360 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL_Z);
361 } else if (opt_it->second == "none") {
362 qp.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
363 } else if (opt_it->second == "some") {
364 qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
365 } else if (opt_it->second == "some_full_pos") {
366 qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME_FULL_POS);
368 } else {
369 opt_it = option.find("stem_all");
370 if (opt_it != option.end() && opt_it->second == "true") {
371 qp.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
374 qp.set_stopper((new MyStopper())->release());
375 qp.set_default_op(default_op);
376 qp.set_database(db);
377 // FIXME: provide a custom RP which handles size:10..20K, etc.
378 if (!size_rp)
379 size_rp = new Xapian::NumberRangeProcessor(VALUE_SIZE, "size:");
380 qp.add_rangeprocessor(size_rp);
381 map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
382 for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
383 string user_prefix(pfx->first, 7);
384 const string & term_pfx_list = pfx->second;
385 string::size_type i = 0;
386 do {
387 string::size_type i0 = i;
388 i = term_pfx_list.find('\t', i);
389 const string & term_pfx = term_pfx_list.substr(i0, i - i0);
390 qp.add_prefix(user_prefix, term_pfx);
391 // std::map::insert() won't overwrite an existing entry, so we'll
392 // prefer the first user_prefix for which a particular term prefix
393 // is specified.
394 termprefix_to_userprefix.insert(make_pair(term_pfx, user_prefix));
395 } while (++i);
397 pfx = option.lower_bound("boolprefix,");
398 for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
399 string user_prefix(pfx->first, 11, string::npos);
400 auto it = option.find("nonexclusiveprefix," + pfx->second);
401 bool exclusive = (it == option.end() || it->second.empty());
402 qp.add_boolean_prefix(user_prefix, pfx->second, exclusive);
403 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
406 try {
407 unsigned default_flags = read_qp_flags("flag_", 0);
408 if (option["spelling"] == "true")
409 default_flags |= qp.FLAG_SPELLING_CORRECTION;
411 vector<Xapian::Query> queries;
412 queries.reserve(query_strings.size());
414 for (auto& j : query_strings) {
415 const string& prefix = j.first;
416 const string& query_string = j.second;
418 // Choose the stemmer to use for this input.
419 string stemlang = option[prefix + ":stemmer"];
420 if (stemlang.empty())
421 stemlang = option["stemmer"];
422 qp.set_stemmer(Xapian::Stem(stemlang));
424 // Work out the flags to use for this input.
425 unsigned f = read_qp_flags(prefix + ":flag_", default_flags);
427 Xapian::Query q = qp.parse_query(query_string, f, prefix);
428 if (!q.empty())
429 queries.push_back(q);
431 query = Xapian::Query(query.OP_AND, queries.begin(), queries.end());
432 } catch (Xapian::QueryParserError &e) {
433 error_msg = e.get_msg();
434 return BAD_QUERY;
437 Xapian::termcount n_new_terms = 0;
438 for (Xapian::TermIterator i = query.get_terms_begin();
439 i != query.get_terms_end(); ++i) {
440 if (termset.find(*i) == termset.end()) {
441 termset.insert(*i);
442 if (!queryterms.empty()) queryterms += '\t';
443 queryterms += *i;
445 n_new_terms++;
448 // Check new query against the previous one
449 if (oldp.empty()) {
450 // If oldp was empty that means there were no parsed query terms
451 // before, so if there are now this is a new query.
452 return n_new_terms ? NEW_QUERY : SAME_QUERY;
455 // The terms in oldp are separated by tabs.
456 const char oldp_separator = '\t';
457 size_t n_old_terms = count(oldp.begin(), oldp.end(), oldp_separator) + 1;
459 // short-cut: if the new query has fewer terms, it must be a new one
460 if (n_new_terms < n_old_terms) return NEW_QUERY;
462 const char *term = oldp.c_str();
463 const char *pend;
464 while ((pend = strchr(term, oldp_separator)) != NULL) {
465 if (termset.find(string(term, pend - term)) == termset.end())
466 return NEW_QUERY;
467 term = pend + 1;
469 if (*term) {
470 if (termset.find(string(term)) == termset.end())
471 return NEW_QUERY;
474 // Use termset.size() rather than n_new_terms so we correctly handle
475 // the case when the query has repeated terms.
476 // This works wrongly in the case when the user extends the query
477 // by adding a term already in it, but that's unlikely and the behaviour
478 // isn't too bad (we just don't reset page 1). We also mishandle a few
479 // other obscure cases e.g. adding quotes to turn a query into a phrase.
480 if (termset.size() > n_old_terms) return EXTENDED_QUERY;
481 return SAME_QUERY;
484 static multimap<string, string> filter_map;
485 static set<string> neg_filters;
487 void add_bterm(const string &term) {
488 string prefix;
489 if (prefix_from_term(&prefix, term) > 0)
490 filter_map.insert(multimap<string, string>::value_type(prefix, term));
493 void add_nterm(const string &term) {
494 if (!term.empty())
495 neg_filters.insert(term);
498 void
499 add_date_filter(const string& date_start,
500 const string& date_end,
501 const string& date_span,
502 Xapian::valueno date_value_slot)
504 if (date_start.empty() && date_end.empty() && date_span.empty())
505 return;
507 Xapian::Query q;
508 if (date_value_slot != Xapian::BAD_VALUENO) {
509 // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
510 // latter the sort order just works correctly between different
511 // precisions).
512 bool as_time_t =
513 db.get_value_lower_bound(date_value_slot).size() == 4 &&
514 db.get_value_upper_bound(date_value_slot).size() == 4;
515 q = date_value_range(as_time_t, date_value_slot,
516 date_start, date_end,
517 date_span);
518 } else {
519 q = date_range_filter(date_start, date_end, date_span);
520 q |= Xapian::Query("Dlatest");
523 if (date_filter_set) {
524 date_filter &= q;
525 } else {
526 date_filter_set = true;
527 date_filter = q;
531 static void
532 run_query()
534 string scheme;
535 bool force_boolean = false;
536 if (!filter_map.empty()) {
537 // OR together filters with the same prefix (or AND for non-exclusive
538 // prefixes), then AND together the resultant groups.
539 vector<Xapian::Query> filter_vec;
540 vector<string> same_vec;
541 string current;
542 for (auto i = filter_map.begin(); ; ++i) {
543 bool over = (i == filter_map.end());
544 if (over || i->first != current) {
545 switch (same_vec.size()) {
546 case 0:
547 break;
548 case 1:
549 filter_vec.push_back(Xapian::Query(same_vec[0]));
550 break;
551 default: {
552 Xapian::Query::op op = Xapian::Query::OP_OR;
553 auto it = option.find("nonexclusiveprefix," + current);
554 if (it != option.end() && !it->second.empty()) {
555 op = Xapian::Query::OP_AND;
557 filter_vec.push_back(Xapian::Query(op,
558 same_vec.begin(),
559 same_vec.end()));
560 break;
563 same_vec.clear();
564 if (over) break;
565 current = i->first;
567 same_vec.push_back(i->second);
570 Xapian::Query filter(Xapian::Query::OP_AND,
571 filter_vec.begin(), filter_vec.end());
573 if (query.empty()) {
574 // If no query strings were provided then promote the filters
575 // to be THE query - filtering an empty query will give no
576 // matches.
577 std::swap(query, filter);
578 auto&& it = option.find("weightingpurefilter");
579 if (it != option.end() && !it->second.empty()) {
580 scheme = it->second;
581 } else {
582 force_boolean = true;
584 } else {
585 query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
589 if (date_filter_set) {
590 // If no query strings were provided then promote the daterange
591 // filter to be THE query instead of filtering an empty query.
592 if (query.empty()) {
593 query = date_filter;
594 force_boolean = true;
595 } else {
596 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
600 if (!neg_filters.empty()) {
601 // OR together all negated filters.
602 Xapian::Query filter(Xapian::Query::OP_OR,
603 neg_filters.begin(), neg_filters.end());
605 if (query.empty() && !date_filter_set) {
606 // If we only have a negative filter for the query, use MatchAll as
607 // the query to apply the filters to.
608 query = Xapian::Query::MatchAll;
609 force_boolean = true;
611 query = Xapian::Query(Xapian::Query::OP_AND_NOT, query, filter);
614 if (!enquire || !error_msg.empty()) return;
616 if (!force_boolean && scheme.empty()) {
617 auto&& it = option.find("weighting");
618 if (it != option.end()) scheme = it->second;
620 set_weighting_scheme(*enquire, scheme, force_boolean);
622 enquire->set_cutoff(threshold);
624 if (sort_keymaker) {
625 if (sort_after) {
626 enquire->set_sort_by_relevance_then_key(sort_keymaker,
627 reverse_sort);
628 } else {
629 enquire->set_sort_by_key_then_relevance(sort_keymaker,
630 reverse_sort);
632 } else if (sort_key != Xapian::BAD_VALUENO) {
633 if (sort_after) {
634 enquire->set_sort_by_relevance_then_value(sort_key, reverse_sort);
635 } else {
636 enquire->set_sort_by_value_then_relevance(sort_key, reverse_sort);
640 enquire->set_docid_order(docid_order);
642 if (collapse) {
643 enquire->set_collapse_key(collapse_key);
646 if (!query.empty()) {
647 #if 0
648 // FIXME: If we start doing permissions checks based on $REMOTE_USER
649 // we're going to break some existing setups if users upgrade. We
650 // probably want a way to set this from OmegaScript.
651 const char * remote_user = getenv("REMOTE_USER");
652 if (remote_user)
653 apply_unix_permissions(query, remote_user);
654 #endif
656 enquire->set_query(query);
657 // We could use the value of topdoc as first parameter, but we
658 // need to know the first few items in the mset to fake a
659 // relevance set for topterms.
661 // If min_hits isn't set, check at least one extra result so we
662 // know if we've reached the end of the matches or not - then we
663 // can avoid offering a "next" button which leads to an empty page.
664 mset = enquire->get_mset(0, topdoc + hits_per_page,
665 topdoc + max(hits_per_page + 1, min_hits),
666 &rset);
/// Return @a str with the characters <, >, & and " replaced by the HTML
/// entities &lt;, &gt;, &amp; and &quot;.  Other characters are copied
/// unchanged.
string
html_escape(const string &str)
{
    string escaped;
    for (const char ch : str) {
        const char* entity = nullptr;
        if (ch == '<') {
            entity = "&lt;";
        } else if (ch == '>') {
            entity = "&gt;";
        } else if (ch == '&') {
            entity = "&amp;";
        } else if (ch == '"') {
            entity = "&quot;";
        }

        if (entity) {
            escaped += entity;
        } else {
            escaped += ch;
        }
    }
    return escaped;
}
/// Return @a str with everything from each '<' up to and including the next
/// '>' removed.  A '>' outside a tag is dropped too, and an unterminated
/// tag swallows the rest of the string.
static string
html_strip(const string &str)
{
    string kept;
    bool in_tag = false;
    for (const char ch : str) {
        if (ch == '<') {
            in_tag = true;
        } else if (ch == '>') {
            in_tag = false;
        } else if (!in_tag) {
            kept += ch;
        }
    }
    return kept;
}
/** Maps each word in a tab-separated list to the index of its first
 *  occurrence, counting distinct words only.
 *
 *  The map is held in static members and is only rebuilt when the list
 *  passed differs from the one last used.
 */
class WordList {
    static string prev_list;
    static unordered_map<string, int> word_to_occurrence;
  public:
    /// Build the word map from tab-separated @a list.
    void build_word_map(const string& list) {
        // Don't build map again if passed list of terms is same as before.
        if (list == prev_list) return;
        word_to_occurrence.clear();
        int next_index = 0;
        string::size_type start = 0;
        for (;;) {
            const string::size_type tab = list.find('\t', start);
            // For the last word there's no tab, and a length of npos makes
            // substr() take the remainder of the list.
            const string::size_type len =
                (tab == string::npos) ? string::npos : tab - start;
            // Only a word which is newly inserted uses up an index.
            if (word_to_occurrence.emplace(list.substr(start, len),
                                           next_index).second)
                ++next_index;
            if (tab == string::npos) break;
            start = tab + 1;
        }
        prev_list = list;
    }

    /// Return the index for @a word, or -1 if it isn't in the list.
    int word_in_list(const string& word) {
        const auto found = word_to_occurrence.find(word);
        return found == word_to_occurrence.end() ? -1 : found->second;
    }
};

string WordList::prev_list;
unordered_map<string, int> WordList::word_to_occurrence;
752 // Not a character in an identifier
753 static inline bool
754 p_notid(unsigned int c)
756 return !C_isalnum(c) && c != '_';
759 // Not a character in an HTML tag name
760 static inline bool
761 p_nottag(unsigned int c)
763 return !C_isalnum(c) && c != '.' && c != '-';
766 // FIXME: shares algorithm with indextext.cc!
767 static string
768 html_highlight(const string &s, const string &list,
769 const string &bra, const string &ket)
771 if (!stemmer) {
772 stemmer = new Xapian::Stem(option["stemmer"]);
775 string res;
777 Utf8Iterator j(s);
778 const Utf8Iterator s_end;
779 while (true) {
780 Utf8Iterator first = j;
781 while (first != s_end && !is_wordchar(*first)) ++first;
782 if (first == s_end) break;
783 Utf8Iterator term_end;
784 string term;
785 string word;
786 const char *l = j.raw();
787 if (*first < 128 && C_isupper(*first)) {
788 j = first;
789 Xapian::Unicode::append_utf8(term, *j);
790 while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
791 Xapian::Unicode::append_utf8(term, *j);
793 if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
794 term.resize(0);
796 term_end = j;
798 if (term.empty()) {
799 j = first;
800 while (is_wordchar(*j)) {
801 Xapian::Unicode::append_utf8(term, *j);
802 ++j;
803 if (j == s_end) break;
804 if (*j == '&' || *j == '\'') {
805 Utf8Iterator next = j;
806 ++next;
807 if (next == s_end || !is_wordchar(*next)) break;
808 term += *j;
809 j = next;
812 term_end = j;
813 if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
814 string::size_type len = term.length();
815 if (*j == '#') {
816 term += '#';
817 do { ++j; } while (j != s_end && *j == '#');
818 } else {
819 while (j != s_end && (*j == '+' || *j == '-')) {
820 Xapian::Unicode::append_utf8(term, *j);
821 ++j;
824 if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
825 term.resize(len);
826 } else {
827 term_end = j;
831 j = term_end;
832 term = Xapian::Unicode::tolower(term);
833 WordList w;
834 w.build_word_map(list);
835 int match = w.word_in_list(term);
836 if (match == -1) {
837 string stem = "Z";
838 stem += (*stemmer)(term);
839 match = w.word_in_list(stem);
841 if (match >= 0) {
842 res += html_escape(string(l, first.raw() - l));
843 if (!bra.empty()) {
844 res += bra;
845 } else {
846 static const char * colours[] = {
847 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
848 "990000", "009900", "996600", "006699", "990099"
850 size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
851 const char * bg = colours[idx];
852 if (strchr(bg, 'f')) {
853 res += "<b style=\"color:black;background-color:#";
854 } else {
855 res += "<b style=\"color:white;background-color:#";
857 res += bg;
858 res += "\">";
860 word.assign(first.raw(), j.raw() - first.raw());
861 res += html_escape(word);
862 if (!bra.empty()) {
863 res += ket;
864 } else {
865 res += "</b>";
867 } else {
868 res += html_escape(string(l, j.raw() - l));
871 if (j != s_end) res += html_escape(string(j.raw(), j.left()));
872 return res;
875 #if 0
876 static void
877 print_query_string(const char *after)
879 if (after && strncmp(after, "&B=", 3) == 0) {
880 char prefix = after[3];
881 string::size_type start = 0, amp = 0;
882 while (true) {
883 amp = url_query_string.find('&', amp);
884 if (amp == string::npos) {
885 cout << url_query_string.substr(start);
886 return;
888 amp++;
889 while (url_query_string[amp] == 'B' &&
890 url_query_string[amp + 1] == '=' &&
891 url_query_string[amp + 2] == prefix) {
892 cout << url_query_string.substr(start, amp - start - 1);
893 start = url_query_string.find('&', amp + 3);
894 if (start == string::npos) return;
895 amp = start + 1;
899 cout << url_query_string;
901 #endif
903 class CachedFields : private Fields {
904 Xapian::docid did_cached = 0;
906 public:
907 CachedFields() {}
909 const string& get_field(Xapian::docid did, const string& name) {
910 if (did != did_cached) {
911 did_cached = did;
912 auto it = option.find("fieldnames");
913 Fields::parse_fields(db.get_document(did).get_data(),
914 it == option.end() ? nullptr : &it->second);
916 return Fields::get_field(name);
920 static CachedFields fields;
921 static Xapian::docid q0;
922 static Xapian::doccount hit_no;
923 static int percent;
924 static double weight;
925 static Xapian::doccount collapsed;
927 static string print_caption(const string& fmt, vector<string>& param);
/// Tag identifying each OmegaScript command.
///
/// The T() macro builds func_tab entries by pasting the command name onto
/// "CMD_", so each enumerator must be named CMD_ followed by the command
/// name.
enum tagval {
CMD_,
CMD_add,
CMD_addfilter,
CMD_allterms,
CMD_and,
CMD_base64,
CMD_cgi,
CMD_cgilist,
CMD_cgiparams,
CMD_chr,
CMD_collapsed,
CMD_cond,
CMD_contains,
CMD_csv,
CMD_date,
CMD_dbname,
CMD_dbsize,
CMD_def,
CMD_defaultop,
CMD_div,
CMD_emptydocs,
CMD_env,
CMD_eq,
CMD_error,
CMD_field,
CMD_filesize,
CMD_filters,
CMD_filterterms,
CMD_find,
CMD_fmt,
CMD_foreach,
CMD_freq,
CMD_ge,
CMD_gt,
CMD_highlight,
CMD_hit,
CMD_hitlist,
CMD_hitsperpage,
CMD_hostname,
CMD_html,
CMD_htmlstrip,
CMD_httpheader,
CMD_id,
CMD_if,
CMD_include,
CMD_json,
CMD_jsonarray,
CMD_jsonbool,
CMD_jsonobject,
CMD_keys,
CMD_last,
CMD_lastpage,
CMD_le,
CMD_length,
CMD_list,
CMD_log,
CMD_lookup,
CMD_lower,
CMD_lt,
CMD_map,
CMD_match,
CMD_max,
CMD_min,
CMD_mod,
CMD_msize,
CMD_msizeexact,
CMD_msizelower,
CMD_msizeupper,
CMD_mul,
CMD_muldiv,
CMD_ne,
CMD_nice,
CMD_not,
CMD_now,
CMD_opt,
CMD_or,
CMD_ord,
CMD_pack,
CMD_percentage,
CMD_prettyterm,
CMD_prettyurl,
CMD_query,
CMD_querydescription,
CMD_queryterms,
CMD_range,
CMD_record,
CMD_relevant,
CMD_relevants,
CMD_score,
CMD_set,
CMD_seterror,
CMD_setmap,
CMD_setrelevant,
CMD_slice,
CMD_snippet,
CMD_sort,
CMD_split,
CMD_stoplist,
CMD_sub,
CMD_subdb,
CMD_subid,
CMD_substr,
CMD_suggestion,
CMD_switch,
CMD_termprefix,
CMD_terms,
CMD_thispage,
CMD_time,
CMD_topdoc,
CMD_topterms,
CMD_transform,
CMD_truncate,
CMD_uniq,
CMD_unique,
CMD_unpack,
CMD_unprefix,
CMD_unstem,
CMD_upper,
CMD_url,
CMD_value,
CMD_version,
CMD_weight,
CMD_MACRO // special tag for macro evaluation
};
/// Attributes of an OmegaScript command.
struct func_attrib {
    /// The command's tag (a CMD_* value).
    int tag;
    /// Minimum number of arguments.
    int minargs;
    /// Maximum number of arguments.
    int maxargs;
    /// Value from the "evalargs" column of the command table.
    int evalargs;
    /// Value from the "ensure" column of the command table.
    char ensure;
};

/// Build a func_desc initialiser for command F with the given attributes.
#define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}

/// A command name together with its attributes.
struct func_desc {
    const char *name;
    func_attrib a;
};
1067 #define N (-1)
1068 #define M 'M'
1069 #define Q 'Q'
1070 // NB when adding a new command which ensures M or Q, update the list in
1071 // docs/omegascript.rst
1072 static const struct func_desc func_tab[] = {
1073 //name minargs maxargs evalargs ensure
1074 {"",{CMD_, N, N, 0, 0}},// commented out code
1075 T(add, 0, N, N, 0), // add a list of numbers
1076 T(addfilter, 1, 2, N, 0), // add filter term
1077 T(allterms, 0, 1, N, 0), // list of all terms matching document
1078 T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
1079 T(base64, 1, 1, N, 0), // base64 encode
1080 T(cgi, 1, 1, N, 0), // return cgi parameter value
1081 T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
1082 T(cgiparams, 0, 0, N, 0), // return list of cgi parameter names
1083 T(chr, 1, 1, N, 0), // return UTF-8 for given Unicode codepoint
1084 T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
1085 T(cond, 2, N, 0, 0), // cascaded conditionals
1086 T(contains, 2, 2, N, 0), // return position of substring, or empty string
1087 T(csv, 1, 2, N, 0), // CSV string escaping
1088 T(date, 1, 2, N, 0), // convert time_t to strftime format
1089 // (default: YYYY-MM-DD)
1090 T(dbname, 0, 0, N, 0), // database name
1091 T(dbsize, 0, 0, N, 0), // database size (# of documents)
1092 T(def, 2, 2, 1, 0), // define a macro
1093 T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
1094 T(div, 2, 2, N, 0), // integer divide
1095 T(emptydocs, 0, 1, N, 0), // list of empty documents
1096 T(env, 1, 1, N, 0), // environment variable
1097 T(eq, 2, 2, N, 0), // test equality
1098 T(error, 0, 0, N, 0), // error message
1099 T(field, 1, 2, N, 0), // lookup field in record
1100 T(filesize, 1, 1, N, 0), // pretty printed filesize
1101 T(filters, 0, 0, N, 0), // serialisation of current filters
1102 T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
1103 T(find, 2, 2, N, 0), // find entry in list
1104 T(fmt, 0, 0, N, 0), // name of current format
1105 T(foreach, 2, 2, 1, 0), // evaluate something for every entry in a list
1106 T(freq, 1, 1, N, 0), // frequency of a term
1107 T(ge, 2, 2, N, 0), // test >=
1108 T(gt, 2, 2, N, 0), // test >
1109 T(highlight, 2, 4, N, 0), // html escape and highlight words from list
1110 T(hit, 0, 0, N, 0), // hit number of current mset entry (0-based)
1111 T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
1112 T(hitsperpage, 0, 0, N, 0), // hits per page
1113 T(hostname, 1, 1, N, 0), // extract hostname from URL
1114 T(html, 1, 1, N, 0), // html escape string (<>&")
1115 T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
1116 T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
1117 T(id, 0, 0, N, 0), // docid of current doc
1118 T(if, 1, 3, 1, 0), // conditional
1119 T(include, 1, 2, 1, 0), // include another file
1120 T(json, 1, 1, N, 0), // JSON string escaping
1121 T(jsonarray, 1, 2, 1, 0), // Format list as a JSON array
1122 T(jsonbool, 1, 1, 1, 0), // Format list as a JSON bool
1123 T(jsonobject, 1, 3, 1, 0), // Format map as JSON object
1124 T(keys, 1, 1, N, 0), // list of keys from a map
1125 T(last, 0, 0, N, M), // hit number one beyond end of current page
1126 T(lastpage, 0, 0, N, M), // number of last hit page
1127 T(le, 2, 2, N, 0), // test <=
1128 T(length, 1, 1, N, 0), // length of list
1129 T(list, 2, 5, N, 0), // pretty print list
1130 T(log, 1, 2, 1, 0), // create a log entry
1131 T(lookup, 2, 2, N, 0), // lookup in named cdb file
1132 T(lower, 1, 1, N, 0), // convert string to lower case
1133 T(lt, 2, 2, N, 0), // test <
1134 T(map, 2, 2, 1, 0), // map a list into another list
1135 T(match, 2, 3, N, 0), // regex match
1136 T(max, 1, N, N, 0), // maximum of a list of values
1137 T(min, 1, N, N, 0), // minimum of a list of values
1138 T(mod, 2, 2, N, 0), // integer modulus
1139 T(msize, 0, 0, N, M), // number of matches (estimated)
1140 T(msizeexact, 0, 0, N, M), // is $msize exact?
1141 T(msizelower, 0, 0, N, M), // number of matches (lower bound)
1142 T(msizeupper, 0, 0, N, M), // number of matches (upper bound)
1143 T(mul, 2, N, N, 0), // multiply a list of numbers
1144 T(muldiv, 3, 3, N, 0), // calculate A*B/C
1145 T(ne, 2, 2, N, 0), // test not equal
1146 T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
1147 T(not, 1, 1, N, 0), // logical not
1148 T(now, 0, 0, N, 0), // current date/time as a time_t
1149 T(opt, 1, 2, N, 0), // lookup an option value
1150 T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
1151 T(ord, 1, 1, N, 0), // return codepoint for first character of UTF-8 string
1152 T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
1153 T(percentage, 0, 0, N, 0), // percentage score of current hit
1154 T(prettyterm, 1, 1, N, Q), // pretty print term name
1155 T(prettyurl, 1, 1, N, 0), // pretty version of URL
1156 T(query, 0, 1, N, Q), // query
1157 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
1158 T(queryterms, 0, 0, N, Q), // list of query terms
1159 T(range, 2, 2, N, 0), // return list of values between start and end
1160 T(record, 0, 1, N, 0), // record contents of document
1161 T(relevant, 0, 1, N, Q), // is document relevant?
1162 T(relevants, 0, 0, N, Q), // return list of relevant documents
1163 T(score, 0, 0, N, 0), // score (0-10) of current hit
1164 T(set, 2, 2, N, 0), // set option value
1165 T(seterror, 1, 1, N, 0), // set error_msg, setting it early stops query execution
1166 T(setmap, 1, N, N, 0), // set map of option values
1167 T(setrelevant, 1, 1, N, Q), // set rset
1168 T(slice, 2, 2, N, 0), // slice a list using a second list
1169 T(snippet, 1, 2, N, M), // generate snippet from text
1170 T(sort, 1, 2, N, 0), // alpha sort a list
1171 T(split, 1, 2, N, 0), // split a string to give a list
1172 T(stoplist, 0, 0, N, Q), // return list of stopped terms
1173 T(sub, 2, 2, N, 0), // subtract
1174 T(subdb, 0, 1, N, 0), // name of subdb docid is in
1175 T(subid, 0, 1, N, 0), // docid in the subdb#
1176 T(substr, 2, 3, N, 0), // substring
1177 T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
1178 T(switch, 3, N, 1, 0), // return value paired with first matching case, or optional default
1179 T(termprefix, 1, 1, N, 0), // get any prefix from a term
1180 T(terms, 0, 1, N, M), // list of matching terms
1181 T(thispage, 0, 0, N, M), // page number of current page
1182 T(time, 0, 0, N, M), // how long the match took (in seconds)
1183 T(topdoc, 0, 0, N, M), // first document on current page of hit list
1184 // (counting from 0)
1185 T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
1186 // (default 16)
1187 T(transform, 3, 4, N, 0), // transform with a regexp
1188 T(truncate, 2, 4, N, 0), // truncate after a word
1189 T(uniq, 1, 1, N, 0), // remove duplicates from a sorted list
1190 T(unique, 1, 1, N, 0), // remove duplicates from any list
1191 T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
1192 T(unprefix, 1, 1, N, 0), // remove any prefix from a term
1193 T(unstem, 1, 1, N, Q), // return list of terms from the parsed query
1194 // which stemmed to this term
1195 T(upper, 1, 1, N, 0), // convert string to upper case
1196 T(url, 1, 1, N, 0), // url encode argument
1197 T(value, 1, 2, N, 0), // return document value
1198 T(version, 0, 0, N, 0), // omega version string
1199 T(weight, 0, 0, N, 0), // weight of the current hit
1200 { NULL,{0, 0, 0, 0, 0}}
1203 #undef T // Leaving T defined screws up Sun's C++ compiler!
// Text of the user-defined macros created by $def, in definition order.
// $def assigns each new macro the tag CMD_MACRO + macros.size() and then
// appends its second argument here, so entry i belongs to tag CMD_MACRO + i.
1205 static vector<string> macros;
1207 // Call write() repeatedly until all data is written or we get a
1208 // non-recoverable error.
1209 static ssize_t
1210 write_all(int fd, const char * buf, size_t count)
1212 while (count) {
1213 ssize_t r = write(fd, buf, count);
1214 if (rare(r < 0)) {
1215 if (errno == EINTR) continue;
1216 return r;
1218 buf += r;
1219 count -= r;
1221 return 0;
// eval() and foreach() call each other, so eval() must be declared before
// foreach() is defined.
static std::string eval(const std::string& fmt,
                        std::vector<std::string>& param);

/** Evaluate a pattern once for every entry of a tab-separated list.
 *
 *  This is the shared implementation of $foreach{} and $map{}.
 *
 *  @param list   Entries separated by tab characters.  An empty string is
 *                processed as a single empty entry.
 *  @param pat    Text passed to eval() for each entry.
 *  @param param  Parameter vector handed to eval().  param[0] is set to the
 *                current entry for each evaluation, and its original value
 *                is put back before returning.
 *  @param sep    If not '\0', this character is added between the results
 *                for consecutive entries ($map{} passes '\t'; $foreach{}
 *                uses the default).
 *
 *  @return  The results of the evaluations, concatenated in list order.
 */
static std::string
foreach(const std::string& list,
        const std::string& pat,
        std::vector<std::string>& param,
        char sep = '\0')
{
    // param[0] is borrowed to carry the current entry, so stash the caller's
    // value for the duration.
    std::string caller_arg0 = std::move(param[0]);
    std::string output;
    std::string::size_type begin = 0;
    for (;;) {
        const std::string::size_type tab = list.find('\t', begin);
        // If there are no more tabs then tab is npos and the count passed
        // here is larger than what is left, so assign() takes the remainder.
        param[0].assign(list, begin, tab - begin);
        output += eval(pat, param);
        if (tab == std::string::npos) break;
        if (sep != '\0') output += sep;
        begin = tab + 1;
    }
    param[0] = std::move(caller_arg0);
    return output;
}
1248 static string
1249 eval(const string& fmt, vector<string>& param)
1251 static map<string, const struct func_attrib *> func_map;
1252 if (func_map.empty()) {
1253 for (auto p = func_tab; p->name != NULL; ++p) {
1254 func_map[string(p->name)] = &(p->a);
1257 string res;
1258 string::size_type p = 0, q;
1259 while ((q = fmt.find('$', p)) != string::npos) try {
1260 res.append(fmt, p, q - p);
1261 string::size_type code_start = q; // note down for error reporting
1262 q++;
1263 if (q >= fmt.size()) break;
1264 unsigned char ch = fmt[q];
1265 switch (ch) {
1266 // Magic sequences:
1267 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
1268 case '$':
1269 res += '$';
1270 p = q + 1;
1271 continue;
1272 case '(':
1273 res += '{';
1274 p = q + 1;
1275 continue;
1276 case ')':
1277 res += '}';
1278 p = q + 1;
1279 continue;
1280 case '.':
1281 res += ',';
1282 p = q + 1;
1283 continue;
1284 case '_':
1285 ch = '0';
1286 // FALL THRU
1287 case '1': case '2': case '3': case '4': case '5':
1288 case '6': case '7': case '8': case '9':
1289 ch -= '0';
1290 if (ch < param.size()) res += param[ch];
1291 p = q + 1;
1292 continue;
1293 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1294 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1295 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1296 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1297 case 'y': case 'z':
1298 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1299 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1300 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1301 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1302 case 'Y': case 'Z':
1303 case '{':
1304 break;
1305 default:
1306 string msg = "Unknown $ code in: $";
1307 msg.append(fmt, q, string::npos);
1308 throw msg;
1310 p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1311 string var(fmt, q, p - q);
1312 map<string, const struct func_attrib *>::const_iterator func;
1313 func = func_map.find(var);
1314 if (func == func_map.end()) {
1315 throw "Unknown function '" + var + "'";
1317 vector<string> args;
1318 if (fmt[p] == '{') {
1319 q = p + 1;
1320 int nest = 1;
1321 while (true) {
1322 p = fmt.find_first_of(",{}", p + 1);
1323 if (p == string::npos)
1324 throw "missing } in " + fmt.substr(code_start);
1325 if (fmt[p] == '{') {
1326 ++nest;
1327 } else {
1328 if (nest == 1) {
1329 // should we split the args
1330 if (func->second->minargs != N) {
1331 args.push_back(fmt.substr(q, p - q));
1332 q = p + 1;
1335 if (fmt[p] == '}' && --nest == 0) break;
1338 if (func->second->minargs == N)
1339 args.push_back(fmt.substr(q, p - q));
1340 ++p;
1343 if (func->second->minargs != N) {
1344 if (int(args.size()) < func->second->minargs)
1345 throw "too few arguments to $" + var;
1346 if (func->second->maxargs != N &&
1347 int(args.size()) > func->second->maxargs)
1348 throw "too many arguments to $" + var;
1350 vector<string>::size_type n;
1351 if (func->second->evalargs != N)
1352 n = func->second->evalargs;
1353 else
1354 n = args.size();
1356 for (vector<string>::size_type j = 0; j < n; ++j)
1357 args[j] = eval(args[j], param);
1359 if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1360 ensure_query_parsed();
1361 if (func->second->ensure == 'M') ensure_match();
1362 string value;
1363 switch (func->second->tag) {
1364 case CMD_:
1365 break;
1366 case CMD_add: {
1367 int total = 0;
1368 for (auto&& arg : args)
1369 total += string_to_int(arg);
1370 value = str(total);
1371 break;
1373 case CMD_addfilter:
1374 if (args.size() == 1 || args[1].empty() || args[1] == "B") {
1375 add_bterm(args[0]);
1376 } else if (args[1] == "N") {
1377 add_nterm(args[0]);
1378 } else {
1379 string msg = "Invalid $addfilter type '";
1380 msg += args[1];
1381 msg += "'";
1382 throw msg;
1384 break;
1385 case CMD_allterms: {
1386 // list of all terms indexing document
1387 Xapian::docid id = q0;
1388 if (!args.empty()) id = string_to_int(args[0]);
1389 for (Xapian::TermIterator term = db.termlist_begin(id);
1390 term != db.termlist_end(id); ++term) {
1391 value += *term;
1392 value += '\t';
1395 if (!value.empty()) value.erase(value.size() - 1);
1396 break;
1398 case CMD_and: {
1399 value = "true";
1400 for (auto&& arg : args) {
1401 if (eval(arg, param).empty()) {
1402 value.resize(0);
1403 break;
1406 break;
1408 case CMD_base64: {
1409 const static char encode[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef"
1410 "ghijklmnopqrstuvwxyz0123456789+/";
1411 const char pad = '=';
1412 const string& input = args[0];
1413 value.reserve((input.size() + 2) / 3 * 4);
1414 auto it = input.begin();
1415 auto n = input.size() / 3;
1416 while (n--) {
1417 uint32_t v = uint8_t(*it++);
1418 v = (v << 8) | uint8_t(*it++);
1419 v = (v << 8) | uint8_t(*it++);
1420 value += encode[v >> 18];
1421 value += encode[(v >> 12) & 63];
1422 value += encode[(v >> 6) & 63];
1423 value += encode[v & 63];
1425 switch (input.size() % 3) {
1426 case 2: {
1427 uint32_t v = uint8_t(*it++);
1428 v = (v << 8) | uint8_t(*it++);
1429 value += encode[v >> 10];
1430 value += encode[(v >> 4) & 63];
1431 value += encode[(v << 2) & 63];
1432 value += pad;
1433 break;
1435 case 1: {
1436 uint32_t v = uint8_t(*it++);
1437 value += encode[v >> 2];
1438 value += encode[(v << 4) & 63];
1439 value += pad;
1440 value += pad;
1441 break;
1444 break;
1446 case CMD_cgi: {
1447 auto i = cgi_params.find(args[0]);
1448 if (i != cgi_params.end()) value = i->second;
1449 break;
1451 case CMD_cgilist: {
1452 auto g = cgi_params.equal_range(args[0]);
1453 for (auto i = g.first; i != g.second; ++i) {
1454 value += i->second;
1455 value += '\t';
1457 if (!value.empty()) value.erase(value.size() - 1);
1458 break;
1460 case CMD_cgiparams: {
1461 const string* prev = NULL;
1462 for (auto&& i : cgi_params) {
1463 if (prev && i.first == *prev) continue;
1464 value += i.first;
1465 value += '\t';
1466 prev = &i.first;
1468 if (!value.empty()) value.erase(value.size() - 1);
1469 break;
1471 case CMD_chr:
1472 Xapian::Unicode::append_utf8(value, string_to_int(args[0]));
1473 break;
1474 case CMD_collapsed: {
1475 value = str(collapsed);
1476 break;
1478 case CMD_cond:
1479 for (size_t i = 0; i < args.size(); i += 2) {
1480 if (i == args.size() - 1) {
1481 // Handle optional "else" value.
1482 value = eval(args[i], param);
1483 break;
1485 if (!eval(args[i], param).empty()) {
1486 value = eval(args[i + 1], param);
1487 break;
1490 break;
1491 case CMD_contains: {
1492 size_t pos = args[1].find(args[0]);
1493 if (pos != string::npos) {
1494 value = str(pos);
1496 break;
1498 case CMD_csv:
1499 value = args[0];
1500 if (args.size() > 1 && !args[1].empty()) {
1501 csv_escape_always(value);
1502 } else {
1503 csv_escape(value);
1505 break;
1506 case CMD_date:
1507 value = args[0];
1508 if (!value.empty()) {
1509 char buf[64] = "";
1510 time_t date = string_to_int(value);
1511 if (date != static_cast<time_t>(-1)) {
1512 struct tm *then;
1513 then = gmtime(&date);
1514 string date_fmt = "%Y-%m-%d";
1515 if (args.size() > 1) date_fmt = eval(args[1], param);
1516 strftime(buf, sizeof buf, date_fmt.c_str(), then);
1518 value = buf;
1520 break;
1521 case CMD_dbname:
1522 value = dbname;
1523 break;
1524 case CMD_dbsize: {
1525 static Xapian::doccount dbsize;
1526 if (!dbsize) dbsize = db.get_doccount();
1527 value = str(dbsize);
1528 break;
1530 case CMD_def: {
1531 func_attrib *fa = new func_attrib;
1532 fa->tag = CMD_MACRO + macros.size();
1533 fa->minargs = 0;
1534 fa->maxargs = 9;
1535 fa->evalargs = N; // FIXME: or 0?
1536 fa->ensure = 0;
1538 macros.push_back(args[1]);
1539 func_map[args[0]] = fa;
1540 break;
1542 case CMD_defaultop:
1543 if (default_op == Xapian::Query::OP_AND) {
1544 value = "and";
1545 } else {
1546 value = "or";
1548 break;
1549 case CMD_div: {
1550 int denom = string_to_int(args[1]);
1551 if (denom == 0) {
1552 value = "divide by 0";
1553 } else {
1554 value = str(string_to_int(args[0]) / denom);
1556 break;
1558 case CMD_emptydocs: {
1559 string t;
1560 if (!args.empty())
1561 t = args[0];
1562 Xapian::PostingIterator i;
1563 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1564 if (i.get_doclength() != 0) continue;
1565 if (!value.empty()) value += '\t';
1566 value += str(*i);
1568 break;
1570 case CMD_env: {
1571 char *env = getenv(args[0].c_str());
1572 if (env != NULL) value = env;
1573 break;
1575 case CMD_eq:
1576 if (args[0] == args[1]) value = "true";
1577 break;
1578 case CMD_error:
1579 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1580 error_msg = "Database '" + dbname + "' couldn't be opened";
1582 value = error_msg;
1583 break;
1584 case CMD_field: {
1585 Xapian::docid did = q0;
1586 if (args.size() > 1) did = string_to_int(args[1]);
1587 value = fields.get_field(did, args[0]);
1588 break;
1590 case CMD_filesize: {
1591 // FIXME: rounding? i18n?
1592 int size = string_to_int(args[0]);
1593 int intpart = size;
1594 int fraction = -1;
1595 const char * format = 0;
1596 if (size < 0) {
1597 // Negative size -> empty result.
1598 } else if (size == 1) {
1599 format = "%d byte";
1600 } else if (size < 1024) {
1601 format = "%d bytes";
1602 } else {
1603 if (size < 1024 * 1024) {
1604 format = "%d.%cK";
1605 } else {
1606 size /= 1024;
1607 if (size < 1024 * 1024) {
1608 format = "%d.%cM";
1609 } else {
1610 size /= 1024;
1611 format = "%d.%cG";
1614 intpart = unsigned(size) / 1024;
1615 fraction = unsigned(size) % 1024;
1617 if (format) {
1618 char buf[200];
1619 int len;
1620 if (fraction == -1) {
1621 len = my_snprintf(buf, sizeof(buf), format, intpart);
1622 } else {
1623 fraction = (fraction * 10 / 1024) + '0';
1624 len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1626 if (len < 0 || unsigned(len) > sizeof(buf)) len = sizeof(buf);
1627 value.assign(buf, len);
1629 break;
1631 case CMD_filters:
1632 value = filters;
1633 break;
1634 case CMD_filterterms: {
1635 Xapian::TermIterator term = db.allterms_begin();
1636 term.skip_to(args[0]);
1637 while (term != db.allterms_end()) {
1638 string t = *term;
1639 if (!startswith(t, args[0])) break;
1640 value += t;
1641 value += '\t';
1642 ++term;
1645 if (!value.empty()) value.erase(value.size() - 1);
1646 break;
1648 case CMD_find: {
1649 string l = args[0], s = args[1];
1650 string::size_type i = 0, j = 0;
1651 size_t count = 0;
1652 while (j != l.size()) {
1653 j = l.find('\t', i);
1654 if (j == string::npos) j = l.size();
1655 if (j - i == s.length()) {
1656 if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1657 value = str(count);
1658 break;
1661 ++count;
1662 i = j + 1;
1664 break;
1666 case CMD_fmt:
1667 value = fmtname;
1668 break;
1669 case CMD_foreach:
1670 if (!args[0].empty()) {
1671 value = foreach(args[0], args[1], param);
1673 break;
1674 case CMD_freq: {
1675 const string& term = args[0];
1676 Xapian::doccount termfreq = 0;
1677 if (done_query) {
1678 try {
1679 termfreq = mset.get_termfreq(term);
1680 } catch (const Xapian::InvalidOperationError&) {
1681 // In 1.4.x and earlier, InvalidOperationError is
1682 // thrown if the MSet is empty and not associated with
1683 // an Enquire object. In 1.5.0 and later, a termfreq
1684 // of 0 is returned for this case.
1687 if (termfreq == 0) {
1688 // We want $freq to work before the match is run, and we
1689 // don't want using it to force the match to run.
1690 termfreq = db.get_termfreq(term);
1692 value = str(termfreq);
1693 break;
1695 case CMD_ge:
1696 if (string_to_int(args[0]) >= string_to_int(args[1]))
1697 value = "true";
1698 break;
1699 case CMD_gt:
1700 if (string_to_int(args[0]) > string_to_int(args[1]))
1701 value = "true";
1702 break;
1703 case CMD_highlight: {
1704 string bra, ket;
1705 if (args.size() > 2) {
1706 bra = args[2];
1707 if (args.size() > 3) {
1708 ket = args[3];
1709 } else {
1710 string::const_iterator i;
1711 i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1712 ket = "</";
1713 ket.append(bra, 1, i - bra.begin() - 1);
1714 ket += '>';
1718 value = html_highlight(args[0], args[1], bra, ket);
1719 break;
1721 case CMD_hit:
1722 // 0-based mset index
1723 value = str(hit_no);
1724 break;
1725 case CMD_hitlist: {
1726 #if 0
1727 url_query_string = "?DB=";
1728 url_query_string += dbname;
1729 for (auto& j : query_strings) {
1730 if (j.first.empty()) {
1731 url_query_string += "&P=";
1732 } else {
1733 url_query_string += "&P."
1734 url_query_string += j.first;
1735 url_query_string += '=';
1737 const char *q = j.second.c_str();
1738 int ch;
1739 while ((ch = *q++) != '\0') {
1740 switch (ch) {
1741 case '+':
1742 url_query_string += "%2b";
1743 break;
1744 case '"':
1745 url_query_string += "%22";
1746 break;
1747 case '%':
1748 url_query_string += "%25";
1749 break;
1750 case '&':
1751 url_query_string += "%26";
1752 break;
1753 case ' ':
1754 ch = '+';
1755 /* fall through */
1756 default:
1757 url_query_string += ch;
1761 // add any boolean terms
1762 for (auto i = filter_map.begin(); i != filter_map.end(); ++i) {
1763 url_query_string += "&B=";
1764 url_query_string += i->second;
1766 #endif
1767 auto save_hit_no = hit_no;
1768 for (hit_no = topdoc; hit_no < last; ++hit_no)
1769 value += print_caption(args[0], param);
1770 hit_no = save_hit_no;
1771 break;
1773 case CMD_hitsperpage:
1774 value = str(hits_per_page);
1775 break;
1776 case CMD_hostname: {
1777 value = args[0];
1778 // remove URL scheme and/or path
1779 string::size_type i = value.find("://");
1780 if (i == string::npos) i = 0; else i += 3;
1781 value = value.substr(i, value.find('/', i) - i);
1782 // remove user@ or user:password@
1783 i = value.find('@');
1784 if (i != string::npos) value.erase(0, i + 1);
1785 // remove :port
1786 i = value.find(':');
1787 if (i != string::npos) value.resize(i);
1788 break;
1790 case CMD_html:
1791 value = html_escape(args[0]);
1792 break;
1793 case CMD_htmlstrip:
1794 value = html_strip(args[0]);
1795 break;
1796 case CMD_httpheader:
1797 if (!suppress_http_headers) {
1798 cout << args[0] << ": " << args[1] << endl;
1799 if (!set_content_type && args[0].length() == 12 &&
1800 strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1801 set_content_type = true;
1804 break;
1805 case CMD_id:
1806 // document id
1807 value = str(q0);
1808 break;
1809 case CMD_if:
1810 if (args.size() > 1 && !args[0].empty())
1811 value = eval(args[1], param);
1812 else if (args.size() > 2)
1813 value = eval(args[2], param);
1814 break;
1815 case CMD_include: {
1816 if (args.size() == 1) {
1817 value = eval_file(args[0]);
1818 } else {
1819 bool fallback = false;
1820 value = eval_file(args[0], &fallback);
1821 if (fallback) {
1822 value = eval(args[1], param);
1825 break;
1827 case CMD_json:
1828 value = args[0];
1829 json_escape(value);
1830 break;
1831 case CMD_jsonarray: {
1832 const string & l = args[0];
1833 string::size_type i = 0, j;
1834 if (l.empty()) {
1835 value = "[]";
1836 break;
1838 vector<string> new_args(1);
1839 value = "[";
1840 while (true) {
1841 j = l.find('\t', i);
1842 string elt(l, i, j - i);
1843 if (args.size() == 1) {
1844 value += '"';
1845 json_escape(elt);
1846 value += elt;
1847 value += '"';
1848 } else {
1849 new_args[0] = std::move(elt);
1850 value += eval(args[1], new_args);
1852 if (j == string::npos) break;
1853 value += ',';
1854 i = j + 1;
1856 value += ']';
1857 break;
1859 case CMD_jsonbool:
1860 value = args[0].empty() ? "false" : "true";
1861 break;
1862 case CMD_jsonobject: {
1863 vector<string> new_args;
1864 new_args.push_back(string());
1866 class map_range {
1867 typedef map<string, string>::const_iterator iterator;
1868 iterator b, e;
1870 public:
1871 map_range(iterator b_, iterator e_) : b(b_), e(e_) {}
1873 iterator begin() const { return b; }
1874 iterator end() const { return e; }
1877 string prefix = args[0] + ',';
1878 auto b = option.lower_bound(prefix);
1879 ++prefix.back();
1880 auto e = option.lower_bound(prefix);
1881 value = to_json(map_range(b, e),
1882 [&](const string& k) {
1883 string key(k, prefix.size());
1884 if (args.size() > 1 && !args[1].empty()) {
1885 new_args[0] = std::move(key);
1886 key = eval(args[1], new_args);
1888 return key;
1890 [&](const string& v) {
1891 if (args.size() > 2 && !args[2].empty()) {
1892 new_args[0] = v;
1893 return eval(args[2], new_args);
1895 string r(1, '"');
1896 string elt = v;
1897 json_escape(elt);
1898 r += elt;
1899 r += '"';
1900 return r;
1902 break;
1904 case CMD_keys: {
1905 string prefix = args[0] + ',';
1906 auto i = option.lower_bound(prefix);
1907 for (; i != option.end() && startswith(i->first, prefix); ++i) {
1908 const string& key = i->first;
1909 if (!value.empty()) value += '\t';
1910 value.append(key, prefix.size(), string::npos);
1912 break;
1914 case CMD_last:
1915 value = str(last);
1916 break;
1917 case CMD_lastpage: {
1918 int l = mset.get_matches_estimated();
1919 if (l > 0) l = (l - 1) / hits_per_page + 1;
1920 value = str(l);
1921 break;
1923 case CMD_le:
1924 if (string_to_int(args[0]) <= string_to_int(args[1]))
1925 value = "true";
1926 break;
1927 case CMD_length:
1928 if (args[0].empty()) {
1929 value = "0";
1930 } else {
1931 size_t length = count(args[0].begin(), args[0].end(), '\t');
1932 value = str(length + 1);
1934 break;
1935 case CMD_list: {
1936 if (!args[0].empty()) {
1937 string pre, inter, interlast, post;
1938 switch (args.size()) {
1939 case 2:
1940 inter = interlast = args[1];
1941 break;
1942 case 3:
1943 inter = args[1];
1944 interlast = args[2];
1945 break;
1946 case 4:
1947 pre = args[1];
1948 inter = interlast = args[2];
1949 post = args[3];
1950 break;
1951 case 5:
1952 pre = args[1];
1953 inter = args[2];
1954 interlast = args[3];
1955 post = args[4];
1956 break;
1958 value += pre;
1959 string list = args[0];
1960 string::size_type split = 0, split2;
1961 while ((split2 = list.find('\t', split)) != string::npos) {
1962 if (split) value += inter;
1963 value.append(list, split, split2 - split);
1964 split = split2 + 1;
1966 if (split) value += interlast;
1967 value.append(list, split, string::npos);
1968 value += post;
1970 break;
1972 case CMD_log: {
1973 if (!vet_filename(args[0])) break;
1974 string logfile = log_dir + args[0];
1975 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1976 if (fd == -1) break;
1977 vector<string> noargs;
1978 noargs.resize(1);
1979 string line;
1980 if (args.size() > 1) {
1981 line = args[1];
1982 } else {
1983 line = DEFAULT_LOG_ENTRY;
1985 line = eval(line, noargs);
1986 line += '\n';
1987 (void)write_all(fd, line.data(), line.length());
1988 close(fd);
1989 break;
1991 case CMD_lookup: {
1992 if (!vet_filename(args[0])) break;
1993 string cdbfile = cdb_dir + args[0];
1994 int fd = open(cdbfile.c_str(), O_RDONLY);
1995 if (fd == -1) break;
1997 struct cdb cdb;
1998 if (cdb_init(&cdb, fd) < 0) {
1999 close(fd);
2000 break;
2003 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
2004 size_t datalen = cdb_datalen(&cdb);
2005 const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
2006 if (dat) {
2007 value.assign(static_cast<const char *>(dat), datalen);
2011 cdb_free(&cdb);
2012 close(fd); // FIXME: cache fds?
2013 break;
2015 case CMD_lower:
2016 value = Xapian::Unicode::tolower(args[0]);
2017 break;
2018 case CMD_lt:
2019 if (string_to_int(args[0]) < string_to_int(args[1]))
2020 value = "true";
2021 break;
2022 case CMD_map:
2023 if (!args[0].empty()) {
2024 value = foreach(args[0], args[1], param, '\t');
2026 break;
2027 case CMD_match:
2028 omegascript_match(value, args);
2029 break;
2030 case CMD_max: {
2031 vector<string>::const_iterator i = args.begin();
2032 int val = string_to_int(*i++);
2033 for (; i != args.end(); ++i) {
2034 int x = string_to_int(*i);
2035 if (x > val) val = x;
2037 value = str(val);
2038 break;
2040 case CMD_min: {
2041 vector<string>::const_iterator i = args.begin();
2042 int val = string_to_int(*i++);
2043 for (; i != args.end(); ++i) {
2044 int x = string_to_int(*i);
2045 if (x < val) val = x;
2047 value = str(val);
2048 break;
2050 case CMD_mod: {
2051 int denom = string_to_int(args[1]);
2052 if (denom == 0) {
2053 value = "divide by 0";
2054 } else {
2055 value = str(string_to_int(args[0]) % denom);
2057 break;
2059 case CMD_msize:
2060 // Estimated number of matches.
2061 value = str(mset.get_matches_estimated());
2062 break;
2063 case CMD_msizeexact:
2064 // Is msize exact?
2065 if (mset.get_matches_lower_bound()
2066 == mset.get_matches_upper_bound())
2067 value = "true";
2068 break;
2069 case CMD_msizelower:
2070 // Lower bound on number of matches.
2071 value = str(mset.get_matches_lower_bound());
2072 break;
2073 case CMD_msizeupper:
2074 // Upper bound on number of matches.
2075 value = str(mset.get_matches_upper_bound());
2076 break;
2077 case CMD_mul: {
2078 vector<string>::const_iterator i = args.begin();
2079 int total = string_to_int(*i++);
2080 while (i != args.end())
2081 total *= string_to_int(*i++);
2082 value = str(total);
2083 break;
2085 case CMD_muldiv: {
2086 int denom = string_to_int(args[2]);
2087 if (denom == 0) {
2088 value = "divide by 0";
2089 } else {
2090 int num = string_to_int(args[0]) * string_to_int(args[1]);
2091 value = str(num / denom);
2093 break;
2095 case CMD_ne:
2096 if (args[0] != args[1]) value = "true";
2097 break;
2098 case CMD_nice: {
2099 string::const_iterator i = args[0].begin();
2100 int len = args[0].length();
2101 while (len) {
2102 value += *i++;
2103 if (--len && len % 3 == 0) value += option["thousand"];
2105 break;
2107 case CMD_not:
2108 if (args[0].empty()) value = "true";
2109 break;
2110 case CMD_now:
2111 value = str(static_cast<unsigned long>(time(NULL)));
2112 break;
2113 case CMD_opt:
2114 if (args.size() == 2) {
2115 value = option[args[0] + "," + args[1]];
2116 } else {
2117 value = option[args[0]];
2119 break;
2120 case CMD_or: {
2121 for (auto&& arg : args) {
2122 value = eval(arg, param);
2123 if (!value.empty()) break;
2125 break;
2127 case CMD_ord: {
2128 if (!args[0].empty()) {
2129 Utf8Iterator it(args[0]);
2130 value = str(*it);
2132 break;
2134 case CMD_pack:
2135 value = int_to_binary_string(string_to_int(args[0]));
2136 break;
2137 case CMD_percentage:
2138 // percentage score
2139 value = str(percent);
2140 break;
2141 case CMD_prettyterm:
2142 value = pretty_term(args[0]);
2143 break;
2144 case CMD_prettyurl:
2145 value = args[0];
2146 url_prettify(value);
2147 break;
2148 case CMD_query: {
2149 auto r = query_strings.equal_range(args.empty() ?
2150 string() : args[0]);
2151 for (auto j = r.first; j != r.second; ++j) {
2152 if (!value.empty()) value += '\t';
2153 const string & s = j->second;
2154 size_t start = 0, tab;
2155 while ((tab = s.find('\t', start)) != string::npos) {
2156 value.append(s, start, tab - start);
2157 value += ' ';
2158 start = tab + 1;
2160 value.append(s, start, string::npos);
2162 break;
2164 case CMD_querydescription:
2165 value = query.get_description();
2166 break;
2167 case CMD_queryterms:
2168 value = queryterms;
2169 break;
2170 case CMD_range: {
2171 int start = string_to_int(args[0]);
2172 int end = string_to_int(args[1]);
2173 while (start <= end) {
2174 value += str(start);
2175 if (start < end) value += '\t';
2176 start++;
2178 break;
2180 case CMD_record: {
2181 Xapian::docid id = q0;
2182 if (!args.empty()) id = string_to_int(args[0]);
2183 value = db.get_document(id).get_data();
2184 break;
2186 case CMD_relevant: {
2187 // document id if relevant; empty otherwise
2188 Xapian::docid id = q0;
2189 if (!args.empty()) id = string_to_int(args[0]);
2190 auto i = ticked.find(id);
2191 if (i != ticked.end()) {
2192 i->second = false; // icky side-effect
2193 value = str(id);
2195 break;
2197 case CMD_relevants: {
2198 for (auto i : ticked) {
2199 if (i.second) {
2200 value += str(i.first);
2201 value += '\t';
2204 if (!value.empty()) value.erase(value.size() - 1);
2205 break;
2207 case CMD_score:
2208 // Score (0 to 10)
2209 value = str(percent / 10);
2210 break;
2211 case CMD_set:
2212 option[args[0]] = args[1];
2213 break;
2214 case CMD_seterror:
2215 error_msg = args[0];
2216 break;
2217 case CMD_setmap: {
2218 string base = args[0] + ',';
2219 if (args.size() % 2 != 1)
2220 throw string("$setmap requires an odd number of arguments");
2221 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
2222 option[base + args[i]] = args[i + 1];
2224 break;
2226 case CMD_setrelevant: {
2227 string::size_type i = 0, j;
2228 while (true) {
2229 j = args[0].find_first_not_of("0123456789", i);
2230 Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
2231 if (id) {
2232 rset.add_document(id);
2233 ticked[id] = true;
2235 if (j == string::npos) break;
2236 i = j + 1;
2238 break;
2240 case CMD_slice: {
2241 string list = args[0], pos = args[1];
2242 vector<string> items;
2243 string::size_type i = 0, j;
2244 while (true) {
2245 j = list.find('\t', i);
2246 items.push_back(list.substr(i, j - i));
2247 if (j == string::npos) break;
2248 i = j + 1;
2250 i = 0;
2251 bool have_added = false;
2252 while (true) {
2253 j = pos.find('\t', i);
2254 int item = string_to_int(pos.substr(i, j - i));
2255 if (item >= 0 && size_t(item) < items.size()) {
2256 if (have_added) value += '\t';
2257 value += items[item];
2258 have_added = true;
2260 if (j == string::npos) break;
2261 i = j + 1;
2263 break;
2265 case CMD_snippet: {
2266 size_t length = 200;
2267 if (args.size() > 1) {
2268 length = string_to_int(args[1]);
2270 if (!stemmer)
2271 stemmer = new Xapian::Stem(option["stemmer"]);
2272 // FIXME: Allow start and end highlight and omit to be specified.
2273 value = mset.snippet(args[0], length, *stemmer,
2274 mset.SNIPPET_BACKGROUND_MODEL|mset.SNIPPET_EXHAUSTIVE,
2275 "<strong>", "</strong>", "...");
2276 break;
2278 case CMD_sort:
2279 omegascript_sort(args, value);
2280 break;
2281 case CMD_split: {
2282 string split;
2283 if (args.size() == 1) {
2284 split = " ";
2285 value = args[0];
2286 } else {
2287 split = args[0];
2288 value = args[1];
2290 string::size_type i = 0;
2291 while (true) {
2292 if (split.empty()) {
2293 ++i;
2294 if (i >= value.size()) break;
2295 } else {
2296 i = value.find(split, i);
2297 if (i == string::npos) break;
2299 value.replace(i, split.size(), 1, '\t');
2300 ++i;
2302 break;
2304 case CMD_stoplist: {
2305 Xapian::TermIterator i = qp.stoplist_begin();
2306 Xapian::TermIterator end = qp.stoplist_end();
2307 while (i != end) {
2308 if (!value.empty()) value += '\t';
2309 value += *i;
2310 ++i;
2312 break;
2314 case CMD_sub:
2315 value = str(string_to_int(args[0]) - string_to_int(args[1]));
2316 break;
2317 case CMD_subdb: {
2318 Xapian::docid id = q0;
2319 if (args.size() > 0) id = string_to_int(args[0]);
2320 value = subdbs[(id - 1) % subdbs.size()].get_name();
2321 break;
2323 case CMD_subid: {
2324 Xapian::docid id = q0;
2325 if (args.size() > 0) id = string_to_int(args[0]);
2326 // This is the docid in the single shard.
2327 Xapian::docid shard_did = (id - 1) / subdbs.size() + 1;
2328 // We now need to map this back to the docid in the collection
2329 // of shards specified by the DB parameter value which $subdb
2330 // returns.
2331 const SubDB& subdb = subdbs[(id - 1) % subdbs.size()];
2332 value = str(subdb.map_docid(shard_did));
2333 break;
2335 case CMD_substr: {
2336 int start = string_to_int(args[1]);
2337 if (start < 0) {
2338 if (static_cast<size_t>(-start) >= args[0].size()) {
2339 start = 0;
2340 } else {
2341 start = static_cast<int>(args[0].size()) + start;
2343 } else {
2344 if (static_cast<size_t>(start) >= args[0].size()) break;
2346 size_t len = string::npos;
2347 if (args.size() > 2) {
2348 int int_len = string_to_int(args[2]);
2349 if (int_len >= 0) {
2350 len = size_t(int_len);
2351 } else {
2352 len = args[0].size() - start;
2353 if (static_cast<size_t>(-int_len) >= len) {
2354 len = 0;
2355 } else {
2356 len -= static_cast<size_t>(-int_len);
2360 value.assign(args[0], start, len);
2361 break;
2363 case CMD_suggestion:
2364 value = qp.get_corrected_query_string();
2365 break;
2366 case CMD_switch: {
2367 const string& val = args[0];
2368 for (size_t i = 1; i < args.size(); i += 2) {
2369 if (i == args.size() - 1) {
2370 // Handle optional "else" value.
2371 value = eval(args[i], param);
2372 break;
2374 if (val == eval(args[i], param)) {
2375 value = eval(args[i + 1], param);
2376 break;
2379 break;
2381 case CMD_termprefix:
2382 (void)prefix_from_term(&value, args[0]);
2383 break;
2384 case CMD_terms: {
2385 // list of matching terms
2386 if (!enquire) break;
2387 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
2388 if (args.empty()) {
2389 while (term != enquire->get_matching_terms_end(q0)) {
2390 // check term was in the typed query so we ignore
2391 // boolean filter terms
2392 const string & t = *term;
2393 if (termset.find(t) != termset.end()) {
2394 value += t;
2395 value += '\t';
2397 ++term;
2399 } else {
2400 // Return matching terms with specified prefix. We can't
2401 // use skip_to() as the terms aren't ordered by termname.
2402 const string & pfx = args[0];
2403 while (term != enquire->get_matching_terms_end(q0)) {
2404 const string & t = *term;
2405 if (startswith(t, pfx)) {
2406 value += t;
2407 value += '\t';
2409 ++term;
2413 if (!value.empty()) value.erase(value.size() - 1);
2414 break;
2416 case CMD_thispage:
2417 value = str(topdoc / hits_per_page + 1);
2418 break;
2419 case CMD_time:
2420 if (secs >= 0) {
2421 char buf[64];
2422 my_snprintf(buf, sizeof(buf), "%.6f", secs);
2423 // MSVC's snprintf omits the zero byte if the string if
2424 // sizeof(buf) long.
2425 buf[sizeof(buf) - 1] = '\0';
2426 value = buf;
2428 break;
2429 case CMD_topdoc:
2430 // first document on current page of hit list (counting from 0)
2431 value = str(topdoc);
2432 break;
2433 case CMD_topterms:
2434 if (enquire) {
2435 int howmany = 16;
2436 if (!args.empty()) howmany = string_to_int(args[0]);
2437 if (howmany < 0) howmany = 0;
2439 // List of expand terms
2440 Xapian::ESet eset;
2441 OmegaExpandDecider decider(db, &termset);
2443 if (!rset.empty()) {
2444 set_expansion_scheme(*enquire, option);
2445 eset = enquire->get_eset(howmany * 2, rset, &decider);
2446 } else if (mset.size()) {
2447 // invent an rset
2448 Xapian::RSet tmp;
2450 int c = 5;
2451 // FIXME: what if mset does not start at first match?
2452 for (Xapian::docid did : mset) {
2453 tmp.add_document(did);
2454 if (--c == 0) break;
2457 set_expansion_scheme(*enquire, option);
2458 eset = enquire->get_eset(howmany * 2, tmp, &decider);
2461 // Don't show more than one word with the same stem.
2462 set<string> stems;
2463 Xapian::ESetIterator i;
2464 for (i = eset.begin(); i != eset.end(); ++i) {
2465 string term(*i);
2466 string stem = (*stemmer)(term);
2467 if (stems.find(stem) != stems.end()) continue;
2468 stems.insert(stem);
2469 value += term;
2470 value += '\t';
2471 if (--howmany == 0) break;
2473 if (!value.empty()) value.erase(value.size() - 1);
2475 break;
2476 case CMD_transform:
2477 omegascript_transform(value, args);
2478 break;
2479 case CMD_truncate:
2480 value = generate_sample(args[0],
2481 string_to_int(args[1]),
2482 args.size() > 2 ? args[2] : string(),
2483 args.size() > 3 ? args[3] : string());
2484 break;
2485 case CMD_uniq: {
2486 const string &list = args[0];
2487 if (list.empty()) break;
2488 string::size_type split = 0, split2;
2489 string prev;
2490 do {
2491 split2 = list.find('\t', split);
2492 string item(list, split, split2 - split);
2493 if (split == 0) {
2494 value = item;
2495 } else if (item != prev) {
2496 value += '\t';
2497 value += item;
2499 prev = item;
2500 split = split2 + 1;
2501 } while (split2 != string::npos);
2502 break;
2504 case CMD_unique: {
2505 unordered_set<string> seen;
2506 const string &list = args[0];
2507 if (list.empty()) break;
2508 string::size_type split = 0, split2;
2509 do {
2510 split2 = list.find('\t', split);
2511 string item(list, split, split2 - split);
2512 if (seen.insert(item).second) {
2513 if (split != 0)
2514 value += '\t';
2515 value += item;
2517 split = split2 + 1;
2518 } while (split2 != string::npos);
2519 break;
2521 case CMD_unpack:
2522 value = str(binary_string_to_int(args[0]));
2523 break;
2524 case CMD_unprefix: {
2525 size_t prefix_len = prefix_from_term(NULL, args[0]);
2526 value.assign(args[0], prefix_len, string::npos);
2527 break;
2529 case CMD_unstem: {
2530 const string &term = args[0];
2531 Xapian::TermIterator i = qp.unstem_begin(term);
2532 Xapian::TermIterator end = qp.unstem_end(term);
2533 while (i != end) {
2534 if (!value.empty()) value += '\t';
2535 value += *i;
2536 ++i;
2538 break;
2540 case CMD_upper:
2541 value = Xapian::Unicode::toupper(args[0]);
2542 break;
2543 case CMD_url:
2544 url_encode(value, args[0]);
2545 break;
2546 case CMD_value: {
2547 Xapian::docid id = q0;
2548 Xapian::valueno value_no = string_to_int(args[0]);
2549 if (args.size() > 1) id = string_to_int(args[1]);
2550 value = db.get_document(id).get_value(value_no);
2551 break;
2553 case CMD_version:
2554 value = PACKAGE_STRING;
2555 break;
2556 case CMD_weight:
2557 value = double_to_string(weight);
2558 break;
2559 default: {
2560 args.insert(args.begin(), param[0]);
2561 int macro_no = func->second->tag - CMD_MACRO;
2562 assert(macro_no >= 0 && unsigned(macro_no) < macros.size());
2563 // throw "Unknown function '" + var + "'";
2564 value = eval(macros[macro_no], args);
2565 break;
2568 res += value;
2569 } catch (const Xapian::Error & e) {
2570 // FIXME: this means we only see the most recent error in $error
2571 // - is that the best approach?
2572 error_msg = e.get_description();
2575 res.append(fmt, p, string::npos);
2576 return res;
2579 static string
2580 eval_file(const string& fmtfile, bool* p_not_found)
2582 // Use -1 to indicate vet_filename() failed.
2583 int eno = -1;
2584 if (vet_filename(fmtfile)) {
2585 string file = template_dir + fmtfile;
2586 string fmt;
2587 errno = 0;
2588 if (load_file(file, fmt)) {
2589 vector<string> noargs;
2590 noargs.resize(1);
2591 return eval(fmt, noargs);
2593 eno = errno;
2596 if (p_not_found) {
2597 *p_not_found = true;
2598 return string();
2601 // FIXME: report why!
2602 string msg = string("Couldn't read format template '") + fmtfile + '\'';
2603 if (eno) {
2604 msg += " (";
2605 msg += (eno < 0 ? "name contains '..'" : strerror(eno));
2606 msg += ')';
2608 throw msg;
2611 extern string
2612 pretty_term(string term)
2614 // Just leave empty strings and single characters alone.
2615 if (term.length() <= 1) return term;
2617 // Assume unprefixed terms are unstemmed.
2618 if (!C_isupper(term[0])) return term;
2620 // Handle stemmed terms.
2621 bool stemmed = (term[0] == 'Z');
2622 if (stemmed) {
2623 // First of all, check if a term in the query stemmed to this one.
2624 Xapian::TermIterator u = qp.unstem_begin(term);
2625 // There might be multiple words with the same stem, but we only want
2626 // one so just take the first.
2627 if (u != qp.unstem_end(term)) return *u;
2629 // Remove the 'Z'.
2630 term.erase(0, 1);
2633 bool add_quotes = false;
2635 // Check if the term has a prefix.
2636 if (C_isupper(term[0])) {
2637 // See if we have this prefix in the termprefix_to_userprefix map. If
2638 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2639 string prefix;
2640 size_t prefix_len = prefix_from_term(&prefix, term);
2642 map<string, string>::const_iterator i;
2643 i = termprefix_to_userprefix.find(prefix);
2644 if (i != termprefix_to_userprefix.end()) {
2645 string user_prefix = i->second;
2646 user_prefix += ':';
2647 term.replace(0, prefix_len, user_prefix);
2648 } else {
2649 // We don't have a prefix mapping for this, so just set a flag to
2650 // add quotes around the term.
2651 add_quotes = true;
2655 if (stemmed) term += '.';
2657 if (add_quotes) {
2658 term.insert(0, "\"");
2659 term.append("\"");
2662 return term;
2665 static string
2666 print_caption(const string& fmt, vector<string>& param)
2668 q0 = *(mset[hit_no]);
2670 weight = mset[hit_no].get_weight();
2671 percent = mset.convert_to_percent(mset[hit_no]);
2672 collapsed = mset[hit_no].get_collapse_count();
2674 return eval(fmt, param);
2677 void
2678 parse_omegascript()
2680 try {
2681 string output = eval_file(fmtname);
2682 if (!set_content_type && !suppress_http_headers) {
2683 cout << "Content-Type: text/html" << endl;
2684 set_content_type = true;
2686 if (!suppress_http_headers) cout << endl;
2687 cout << output;
2688 } catch (...) {
2689 // Ensure the headers have been output so that any exception gets
2690 // reported rather than giving a server error.
2691 if (!set_content_type && !suppress_http_headers) {
2692 cout << "Content-Type: text/html" << endl;
2693 set_content_type = true;
2695 if (!suppress_http_headers) cout << endl;
2696 throw;
2700 static void
2701 ensure_query_parsed()
2703 if (query_parsed) return;
2704 query_parsed = true;
2706 // Should we discard the existing R-set recorded in R CGI parameters?
2707 bool discard_rset = false;
2709 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2710 // CGI parameters)?
2711 bool force_first_page = false;
2713 string v;
2714 // get list of terms from previous iteration of query
2715 auto val = cgi_params.find("xP");
2716 if (val != cgi_params.end()) {
2717 v = val->second;
2718 // If xP given, default to discarding any RSet and forcing the first
2719 // page of results. If the query is the same, or an extension of
2720 // the previous query, we adjust these again below.
2721 discard_rset = true;
2722 force_first_page = true;
2724 querytype result = parse_queries(v);
2725 switch (result) {
2726 case BAD_QUERY:
2727 break;
2728 case NEW_QUERY:
2729 break;
2730 case SAME_QUERY:
2731 case EXTENDED_QUERY:
2732 // If we've changed database, force the first page of hits
2733 // and discard the R-set (since the docids will have changed)
2734 val = cgi_params.find("xDB");
2735 if (val != cgi_params.end() && val->second != dbname) break;
2736 if (result == SAME_QUERY && force_first_page) {
2737 val = cgi_params.find("xFILTERS");
2738 if (val != cgi_params.end() && val->second != filters &&
2739 val->second != old_filters) {
2740 // Filters have changed since last query.
2741 } else {
2742 force_first_page = false;
2745 discard_rset = false;
2746 break;
2749 if (!force_first_page) {
2750 // Work out which mset element is the first hit we want
2751 // to display
2752 val = cgi_params.find("TOPDOC");
2753 if (val != cgi_params.end()) {
2754 topdoc = atol(val->second.c_str());
2757 // Handle next, previous, and page links
2758 if (cgi_params.find(">") != cgi_params.end()) {
2759 topdoc += hits_per_page;
2760 } else if (cgi_params.find("<") != cgi_params.end()) {
2761 if (topdoc >= hits_per_page)
2762 topdoc -= hits_per_page;
2763 else
2764 topdoc = 0;
2765 } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2766 (val = cgi_params.find("#")) != cgi_params.end()) {
2767 long page = atol(val->second.c_str());
2768 // Do something sensible for page 0 (we count pages from 1).
2769 if (page == 0) page = 1;
2770 topdoc = (page - 1) * hits_per_page;
2773 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2774 // Normally we snap TOPDOC like this so that things work nicely if
2775 // HITSPERPAGE is in a <select> or on radio buttons. If we're
2776 // postprocessing the output of omega and want variable sized pages,
2777 // this is unhelpful.
2778 bool raw_search = false;
2779 val = cgi_params.find("RAWSEARCH");
2780 if (val != cgi_params.end()) {
2781 raw_search = bool(atol(val->second.c_str()));
2784 if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2787 if (!discard_rset) {
2788 // put documents marked as relevant into the rset
2789 auto g = cgi_params.equal_range("R");
2790 for (auto i = g.first; i != g.second; ++i) {
2791 const string & value = i->second;
2792 for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2793 while (value[j] == '.') ++j;
2794 Xapian::docid d = atoi(value.c_str() + j);
2795 if (d) {
2796 rset.add_document(d);
2797 ticked[d] = true;
2804 // run query if we haven't already
2805 static void
2806 ensure_match()
2808 if (done_query) return;
2810 secs = RealTime::now();
2811 run_query();
2812 if (secs != -1)
2813 secs = RealTime::now() - secs;
2815 done_query = true;
2816 last = mset.get_matches_lower_bound();
2817 if (last == 0) {
2818 // Otherwise topdoc ends up being -6 if it's non-zero!
2819 topdoc = 0;
2820 } else {
2821 if (topdoc >= last)
2822 topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2823 // last is the count of documents up to the end of the current page
2824 // (as returned by $last)
2825 if (topdoc + hits_per_page < last)
2826 last = topdoc + hits_per_page;
2830 // OmegaExpandDecider methods.
2832 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2833 set<string> * querytermset)
2834 : db(db_)
2836 // We'll want the stemmer for testing matches anyway.
2837 if (!stemmer)
2838 stemmer = new Xapian::Stem(option["stemmer"]);
2839 if (querytermset) {
2840 set<string>::const_iterator i;
2841 for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2842 string term(*i);
2843 if (term.empty()) continue;
2845 unsigned char ch = term[0];
2846 bool stemmed = (ch == 'Z');
2847 if (stemmed) {
2848 term.erase(0, 1);
2849 if (term.empty()) continue;
2850 ch = term[0];
2853 if (C_isupper(ch)) {
2854 size_t prefix_len = prefix_from_term(NULL, term);
2855 term.erase(0, prefix_len);
2858 if (!stemmed) term = (*stemmer)(term);
2860 exclude_stems.insert(term);
2865 bool
2866 OmegaExpandDecider::operator()(const string & term) const
2868 unsigned char ch = term[0];
2870 // Reject terms with a prefix.
2871 if (C_isupper(ch)) return false;
2874 MyStopper stopper;
2875 // Don't suggest stopwords.
2876 if (stopper(term)) return false;
2879 // Reject small numbers.
2880 if (term.size() < 4 && C_isdigit(ch)) return false;
2882 // Reject terms containing a space.
2883 if (term.find(' ') != string::npos) return false;
2885 // Skip terms with stems in the exclude_stems set, to avoid suggesting
2886 // terms which are already in the query in some form.
2887 string stem = (*stemmer)(term);
2888 if (exclude_stems.find(stem) != exclude_stems.end())
2889 return false;
2891 // Ignore terms that only occur once (hapaxes) since they aren't
2892 // useful for finding related documents - they only occur in a
2893 // document that's already been marked as relevant.
2894 // FIXME: add an expand option to ignore terms where
2895 // termfreq == rtermfreq.
2896 if (db.get_termfreq(term) <= 1) return false;
2898 return true;