2 * @brief query executor for omega
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 James Aylett
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002 Intercede 1749 Ltd
8 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2013,2014,2015,2016,2017,2018,2019,2020,2021 Olly Betts
9 * Copyright 2008 Thomas Viehmann
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public License as
13 * published by the Free Software Foundation; either version 2 of the
14 * License, or (at your option) any later version.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
34 #include <unordered_map>
35 #include <unordered_set>
44 #include "strcasecmp.h"
47 #include "safeunistd.h"
48 #include <sys/types.h>
49 #include "safesysstat.h"
50 #include "safefcntl.h"
56 #include "csvescape.h"
58 #include "datevalue.h"
60 #include "jsonescape.h"
69 #include "stringutils.h"
70 #include "transform.h"
71 #include "urldecode.h"
72 #include "urlencode.h"
83 using Xapian::Utf8Iterator
;
85 using Xapian::Unicode::is_wordchar
;
87 /// Map shard to DB parameter value and stats to allow docid mapping.
90 static bool query_parsed
= false;
91 static bool done_query
= false;
92 static Xapian::docid last
= 0;
93 static Xapian::docid topdoc
= 0;
95 static Xapian::MSet mset
;
96 static Xapian::RSet rset
;
98 static map
<Xapian::docid
, bool> ticked
;
100 static void ensure_query_parsed();
101 static void ensure_match();
103 static Xapian::Query query
;
104 //static string url_query_string;
105 Xapian::Query::op default_op
= Xapian::Query::OP_AND
; // default matching mode
107 // Maintain an explicit date_filter_set flag - date_filter.empty() will also
108 // be true if a date filter is specified which simplifies to
109 // Query::MatchNothing at construction time.
110 static bool date_filter_set
= false;
111 static Xapian::Query date_filter
;
113 static Xapian::QueryParser qp
;
114 static Xapian::NumberRangeProcessor
* size_rp
= NULL
;
115 static Xapian::Stem
*stemmer
= NULL
;
117 static string
eval_file(const string
& fmtfile
, bool* p_not_found
= nullptr);
119 static set
<string
> termset
;
121 // Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
122 static map
<string
, string
> termprefix_to_userprefix
;
124 static string queryterms
;
126 static string error_msg
;
128 static double secs
= -1;
130 static const char DEFAULT_LOG_ENTRY
[] =
131 "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
132 "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
133 "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
136 "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
138 class MyStopper
: public Xapian::Stopper
{
140 bool operator()(const string
& t
) const override
{
143 return (t
== "a" || t
== "about" || t
== "an" || t
== "and" ||
144 t
== "are" || t
== "as" || t
== "at");
146 return (t
== "be" || t
== "by");
150 return (t
== "for" || t
== "from");
154 return (t
== "i" || t
== "in" || t
== "is" || t
== "it");
156 return (t
== "of" || t
== "on" || t
== "or");
158 return (t
== "that" || t
== "the" || t
== "this" || t
== "to");
160 return (t
== "was" || t
== "what" || t
== "when" ||
161 t
== "where" || t
== "which" || t
== "who" ||
162 t
== "why" || t
== "will" || t
== "with");
164 return (t
== "you" || t
== "your");
172 prefix_from_term(string
* prefix
, const string
& term
)
175 if (term
[0] == 'X') {
176 const string::const_iterator begin
= term
.begin();
177 string::const_iterator i
= begin
+ 1;
178 while (i
!= term
.end() && C_isupper(*i
))
181 prefix
->assign(begin
, i
);
182 if (i
!= term
.end() && *i
== ':')
187 if (C_isupper(term
[0])) {
199 // Don't allow ".." in format names, log file names, etc as this would allow
200 // people to open a format "../../etc/passwd" or similar.
201 // FIXME: make this check more exact ("foo..bar" is safe)
202 // FIXME: log when this check fails
/** Check that a filename contains no ".." sequence.
 *
 *  @param filename  Name to check (format name, log file name, etc).
 *
 *  @return true if ".." does not occur anywhere in @a filename, false
 *	    otherwise.  The test is a plain substring search, so a name
 *	    such as "foo..bar" is rejected too.
 */
static bool
vet_filename(const string &filename)
{
    return filename.find("..") == string::npos;
}
211 // * If any terms have been removed, it's a "fresh query" so we discard any
212 // relevance judgements
213 // * If all previous terms are there but more have been added then we keep
214 // the relevance judgements, but return the first page of hits
216 // NEW_QUERY entirely new query
217 // SAME_QUERY unchanged query
218 // EXTENDED_QUERY new query, but based on the old one
219 // BAD_QUERY parse error (message in error_msg)
220 typedef enum { NEW_QUERY
, SAME_QUERY
, EXTENDED_QUERY
, BAD_QUERY
} querytype
;
222 static multimap
<string
, string
> query_strings
;
225 add_query_string(const string
& prefix
, const string
& s
)
227 string query_string
= s
;
228 // Strip leading and trailing whitespace from query_string.
230 if (!query_string
.empty())
231 query_strings
.insert(make_pair(prefix
, query_string
));
235 read_qp_flags(const string
& opt_pfx
, unsigned f
)
237 map
<string
, string
>::const_iterator i
= option
.lower_bound(opt_pfx
);
238 for (; i
!= option
.end() && startswith(i
->first
, opt_pfx
); ++i
) {
240 const char * s
= i
->first
.c_str() + opt_pfx
.size();
243 // Note that the ``Xapian::QueryParser::FLAG_ACCUMULATE`` flag
244 // is or-ed in below because it's needed for ``$stoplist`` and
245 // ``$unstem`` to work correctly, and so is deliberately not
246 // available to specify here.
247 if (strcmp(s
, "auto_multiword_synonyms") == 0) {
248 mask
= Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS
;
251 if (strcmp(s
, "auto_synonyms") == 0) {
252 mask
= Xapian::QueryParser::FLAG_AUTO_SYNONYMS
;
257 if (strcmp(s
, "boolean") == 0) {
258 mask
= Xapian::QueryParser::FLAG_BOOLEAN
;
261 if (strcmp(s
, "boolean_any_case") == 0) {
262 mask
= Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE
;
267 if (strcmp(s
, "cjk_ngram") == 0) {
268 mask
= Xapian::QueryParser::FLAG_CJK_NGRAM
;
273 if (strcmp(s
, "default") == 0) {
274 mask
= Xapian::QueryParser::FLAG_DEFAULT
;
279 if (strcmp(s
, "fuzzy") == 0) {
280 mask
= Xapian::QueryParser::FLAG_FUZZY
;
285 if (strcmp(s
, "lovehate") == 0) {
286 mask
= Xapian::QueryParser::FLAG_LOVEHATE
;
291 if (strcmp(s
, "no_positions") == 0) {
292 mask
= Xapian::QueryParser::FLAG_NO_POSITIONS
;
295 if (strcmp(s
, "ngrams") == 0) {
296 mask
= Xapian::QueryParser::FLAG_NGRAMS
;
301 if (strcmp(s
, "partial") == 0) {
302 mask
= Xapian::QueryParser::FLAG_PARTIAL
;
305 if (strcmp(s
, "phrase") == 0) {
306 mask
= Xapian::QueryParser::FLAG_PHRASE
;
309 if (strcmp(s
, "pure_not") == 0) {
310 mask
= Xapian::QueryParser::FLAG_PURE_NOT
;
315 if (strcmp(s
, "spelling_correction") == 0) {
316 mask
= Xapian::QueryParser::FLAG_SPELLING_CORRECTION
;
319 if (strcmp(s
, "synonym") == 0) {
320 mask
= Xapian::QueryParser::FLAG_SYNONYM
;
325 if (strcmp(s
, "wildcard") == 0) {
326 mask
= Xapian::QueryParser::FLAG_WILDCARD
;
329 #if XAPIAN_AT_LEAST(1,5,0)
330 if (strcmp(s
, "wildcard_glob") == 0) {
331 mask
= Xapian::QueryParser::FLAG_WILDCARD_GLOB
;
334 if (strcmp(s
, "wildcard_multi") == 0) {
335 mask
= Xapian::QueryParser::FLAG_WILDCARD_MULTI
;
338 if (strcmp(s
, "wildcard_single") == 0) {
339 mask
= Xapian::QueryParser::FLAG_WILDCARD_SINGLE
;
342 if (strcmp(s
, "word_breaks") == 0) {
343 mask
= Xapian::QueryParser::FLAG_WORD_BREAKS
;
350 if (i
->second
.empty()) {
356 // Always enable FLAG_ACCUMULATE so that $stoplist and $unstem report
357 // values accumulated over all query strings parsed as part of a query, not
358 // just the last one parsed.
359 return f
| Xapian::QueryParser::FLAG_ACCUMULATE
;
363 parse_queries(const string
& oldp
)
365 // Parse the query string.
366 auto opt_it
= option
.find("stem_strategy");
367 if (opt_it
!= option
.end()) {
368 if (opt_it
->second
== "all") {
369 qp
.set_stemming_strategy(Xapian::QueryParser::STEM_ALL
);
370 } else if (opt_it
->second
== "all_z") {
371 qp
.set_stemming_strategy(Xapian::QueryParser::STEM_ALL_Z
);
372 } else if (opt_it
->second
== "none") {
373 qp
.set_stemming_strategy(Xapian::QueryParser::STEM_NONE
);
374 } else if (opt_it
->second
== "some") {
375 qp
.set_stemming_strategy(Xapian::QueryParser::STEM_SOME
);
376 } else if (opt_it
->second
== "some_full_pos") {
377 qp
.set_stemming_strategy(Xapian::QueryParser::STEM_SOME_FULL_POS
);
380 opt_it
= option
.find("stem_all");
381 if (opt_it
!= option
.end() && opt_it
->second
== "true") {
382 qp
.set_stemming_strategy(Xapian::QueryParser::STEM_ALL
);
385 qp
.set_stopper((new MyStopper())->release());
386 qp
.set_default_op(default_op
);
388 // FIXME: provide a custom RP which handles size:10..20K, etc.
390 size_rp
= new Xapian::NumberRangeProcessor(VALUE_SIZE
, "size:");
391 qp
.add_rangeprocessor(size_rp
);
392 map
<string
, string
>::const_iterator pfx
= option
.lower_bound("prefix,");
393 for (; pfx
!= option
.end() && startswith(pfx
->first
, "prefix,"); ++pfx
) {
394 string
user_prefix(pfx
->first
, 7);
395 const string
& term_pfx_list
= pfx
->second
;
396 string::size_type i
= 0;
398 string::size_type i0
= i
;
399 i
= term_pfx_list
.find('\t', i
);
400 const string
& term_pfx
= term_pfx_list
.substr(i0
, i
- i0
);
401 qp
.add_prefix(user_prefix
, term_pfx
);
402 // std::map::insert() won't overwrite an existing entry, so we'll
403 // prefer the first user_prefix for which a particular term prefix is specified.
405 termprefix_to_userprefix
.insert(make_pair(term_pfx
, user_prefix
));
406 } while (UNSIGNED_OVERFLOW_OK(++i
));
408 pfx
= option
.lower_bound("boolprefix,");
409 for (; pfx
!= option
.end() && startswith(pfx
->first
, "boolprefix,"); ++pfx
) {
410 string
user_prefix(pfx
->first
, 11, string::npos
);
411 auto it
= option
.find("nonexclusiveprefix," + pfx
->second
);
412 bool exclusive
= (it
== option
.end() || it
->second
.empty());
413 qp
.add_boolean_prefix(user_prefix
, pfx
->second
, exclusive
);
414 termprefix_to_userprefix
.insert(make_pair(pfx
->second
, user_prefix
));
418 unsigned default_flags
= read_qp_flags("flag_", 0);
420 vector
<Xapian::Query
> queries
;
421 queries
.reserve(query_strings
.size());
423 for (auto& j
: query_strings
) {
424 const string
& prefix
= j
.first
;
425 const string
& query_string
= j
.second
;
427 // Choose the stemmer to use for this input.
428 string stemlang
= option
[prefix
+ ":stemmer"];
429 if (stemlang
.empty())
430 stemlang
= option
["stemmer"];
431 qp
.set_stemmer(Xapian::Stem(stemlang
));
433 // Work out the flags to use for this input.
434 unsigned f
= read_qp_flags(prefix
+ ":flag_", default_flags
);
436 Xapian::Query q
= qp
.parse_query(query_string
, f
, prefix
);
438 queries
.push_back(q
);
441 Xapian::Query::op intra_query_op
= Xapian::Query::OP_AND
;
442 if (queries
.size() > 1) {
443 // Determine operator to use to combine multiple P and P.<prefix>
444 // parameters. Note that we only need to bother if there are two
445 // or more query strings, since for one or none the operator
446 // specified isn't actually used.
447 opt_it
= option
.find("intra_query_op");
448 if (opt_it
!= option
.end()) {
449 const string
& v
= opt_it
->second
;
450 if (v
== "OR" || v
== "or") {
451 intra_query_op
= Xapian::Query::OP_OR
;
455 query
= Xapian::Query(intra_query_op
, queries
.begin(), queries
.end());
456 } catch (Xapian::QueryParserError
&e
) {
457 error_msg
= e
.get_msg();
461 Xapian::termcount n_new_terms
= 0;
462 for (Xapian::TermIterator i
= query
.get_terms_begin();
463 i
!= query
.get_terms_end(); ++i
) {
464 if (termset
.find(*i
) == termset
.end()) {
466 if (!queryterms
.empty()) queryterms
+= '\t';
472 // Check new query against the previous one
474 // If oldp was empty that means there were no parsed query terms
475 // before, so if there are now this is a new query.
476 return n_new_terms
? NEW_QUERY
: SAME_QUERY
;
479 // The terms in oldp are separated by tabs.
480 const char oldp_separator
= '\t';
481 size_t n_old_terms
= count(oldp
.begin(), oldp
.end(), oldp_separator
) + 1;
483 // short-cut: if the new query has fewer terms, it must be a new one
484 if (n_new_terms
< n_old_terms
) return NEW_QUERY
;
486 const char *term
= oldp
.c_str();
488 while ((pend
= strchr(term
, oldp_separator
)) != NULL
) {
489 if (termset
.find(string(term
, pend
- term
)) == termset
.end())
494 if (termset
.find(string(term
)) == termset
.end())
498 // Use termset.size() rather than n_new_terms so we correctly handle
499 // the case when the query has repeated terms.
500 // This works wrongly in the case when the user extends the query
501 // by adding a term already in it, but that's unlikely and the behaviour
502 // isn't too bad (we just don't reset page 1). We also mishandle a few
503 // other obscure cases e.g. adding quotes to turn a query into a phrase.
504 if (termset
.size() > n_old_terms
) return EXTENDED_QUERY
;
508 static multimap
<string
, string
> filter_map
;
509 static set
<string
> neg_filters
;
511 void add_bterm(const string
&term
) {
513 if (prefix_from_term(&prefix
, term
) > 0)
514 filter_map
.insert(multimap
<string
, string
>::value_type(prefix
, term
));
517 void add_nterm(const string
&term
) {
519 neg_filters
.insert(term
);
523 add_date_filter(const string
& date_start
,
524 const string
& date_end
,
525 const string
& date_span
,
526 Xapian::valueno date_value_slot
)
528 if (date_start
.empty() && date_end
.empty() && date_span
.empty())
532 if (date_value_slot
!= Xapian::BAD_VALUENO
) {
533 // The values can be a time_t in 4 bytes, or YYYYMMDD... (with the
534 // latter the sort order just works correctly between different precisions).
537 db
.get_value_lower_bound(date_value_slot
).size() == 4 &&
538 db
.get_value_upper_bound(date_value_slot
).size() == 4;
539 q
= date_value_range(as_time_t
, date_value_slot
,
540 date_start
, date_end
,
543 q
= date_range_filter(date_start
, date_end
, date_span
);
544 q
|= Xapian::Query("Dlatest");
547 if (date_filter_set
) {
550 date_filter_set
= true;
559 bool force_boolean
= false;
560 if (!filter_map
.empty()) {
561 // OR together filters with the same prefix (or AND for non-exclusive
562 // prefixes), then AND together the resultant groups.
563 vector
<Xapian::Query
> filter_vec
;
564 vector
<string
> same_vec
;
566 for (auto i
= filter_map
.begin(); ; ++i
) {
567 bool over
= (i
== filter_map
.end());
568 if (over
|| i
->first
!= current
) {
569 switch (same_vec
.size()) {
573 filter_vec
.push_back(Xapian::Query(same_vec
[0]));
576 Xapian::Query::op op
= Xapian::Query::OP_OR
;
577 auto it
= option
.find("nonexclusiveprefix," + current
);
578 if (it
!= option
.end() && !it
->second
.empty()) {
579 op
= Xapian::Query::OP_AND
;
581 filter_vec
.push_back(Xapian::Query(op
,
591 same_vec
.push_back(i
->second
);
594 Xapian::Query
filter(Xapian::Query::OP_AND
,
595 filter_vec
.begin(), filter_vec
.end());
598 // If no query strings were provided then promote the filters
599 // to be THE query - filtering an empty query will give no matches.
601 std::swap(query
, filter
);
602 auto&& it
= option
.find("weightingpurefilter");
603 if (it
!= option
.end() && !it
->second
.empty()) {
606 force_boolean
= true;
609 query
= Xapian::Query(Xapian::Query::OP_FILTER
, query
, filter
);
613 if (date_filter_set
) {
614 // If no query strings were provided then promote the daterange
615 // filter to be THE query instead of filtering an empty query.
618 force_boolean
= true;
620 query
= Xapian::Query(Xapian::Query::OP_FILTER
, query
, date_filter
);
624 if (!neg_filters
.empty()) {
625 // OR together all negated filters.
626 Xapian::Query
filter(Xapian::Query::OP_OR
,
627 neg_filters
.begin(), neg_filters
.end());
629 if (query
.empty() && !date_filter_set
) {
630 // If we only have a negative filter for the query, use MatchAll as
631 // the query to apply the filters to.
632 query
= Xapian::Query::MatchAll
;
633 force_boolean
= true;
635 query
= Xapian::Query(Xapian::Query::OP_AND_NOT
, query
, filter
);
638 if (!enquire
|| !error_msg
.empty()) return;
640 if (!force_boolean
&& scheme
.empty()) {
641 auto&& it
= option
.find("weighting");
642 if (it
!= option
.end()) scheme
= it
->second
;
644 set_weighting_scheme(*enquire
, scheme
, force_boolean
);
646 enquire
->set_cutoff(threshold
);
650 enquire
->set_sort_by_relevance_then_key(sort_keymaker
,
653 enquire
->set_sort_by_key_then_relevance(sort_keymaker
,
656 } else if (sort_key
!= Xapian::BAD_VALUENO
) {
658 enquire
->set_sort_by_relevance_then_value(sort_key
, reverse_sort
);
660 enquire
->set_sort_by_value_then_relevance(sort_key
, reverse_sort
);
664 enquire
->set_docid_order(docid_order
);
667 enquire
->set_collapse_key(collapse_key
);
670 if (!query
.empty()) {
672 // FIXME: If we start doing permissions checks based on $REMOTE_USER
673 // we're going to break some existing setups if users upgrade. We
674 // probably want a way to set this from OmegaScript.
675 const char * remote_user
= getenv("REMOTE_USER");
677 apply_unix_permissions(query
, remote_user
);
680 enquire
->set_query(query
);
681 // We could use the value of topdoc as first parameter, but we
682 // need to know the first few items in the mset to fake a
683 // relevance set for topterms.
685 // If min_hits isn't set, check at least one extra result so we
686 // know if we've reached the end of the matches or not - then we
687 // can avoid offering a "next" button which leads to an empty page.
688 mset
= enquire
->get_mset(0, topdoc
+ hits_per_page
,
689 topdoc
+ max(hits_per_page
+ 1, min_hits
),
695 html_escape(const string
&str
)
698 string::size_type p
= 0;
699 while (p
< str
.size()) {
722 html_strip(const string
&str
)
725 string::size_type p
= 0;
727 while (p
< str
.size()) {
737 if (!skip
) res
+= ch
;
744 static string prev_list
;
745 static unordered_map
<string
, int> word_to_occurrence
;
747 void build_word_map(const string
& list
) {
748 // Don't build map again if passed list of terms is same as before.
749 if (prev_list
== list
) return;
750 word_to_occurrence
.clear();
751 string::size_type split
= 0, split2
;
754 while ((split2
= list
.find('\t', split
)) != string::npos
) {
755 word
= list
.substr(split
, split2
- split
);
756 if (word_to_occurrence
.emplace(make_pair(word
, word_index
)).second
)
760 word
= list
.substr(split
, list
.size() - split
);
761 if (word_to_occurrence
.emplace(make_pair(word
, word_index
)).second
)
766 int word_in_list(const string
& word
) {
767 auto it
= word_to_occurrence
.find(word
);
768 if (it
== word_to_occurrence
.end()) return -1;
773 string
WordList::prev_list
;
774 unordered_map
<string
, int> WordList::word_to_occurrence
;
776 // Not a character in an identifier
778 p_notid(unsigned int c
)
780 return !C_isalnum(c
) && c
!= '_';
783 // Not a character in an HTML tag name
785 p_nottag(unsigned int c
)
787 return !C_isalnum(c
) && c
!= '.' && c
!= '-';
790 // FIXME: shares algorithm with indextext.cc!
792 html_highlight(const string
&s
, const string
&list
,
793 const string
&bra
, const string
&ket
)
796 stemmer
= new Xapian::Stem(option
["stemmer"]);
802 const Utf8Iterator s_end
;
804 Utf8Iterator first
= j
;
805 while (first
!= s_end
&& !is_wordchar(*first
)) ++first
;
806 if (first
== s_end
) break;
807 Utf8Iterator term_end
;
810 const char *l
= j
.raw();
811 if (*first
< 128 && C_isupper(*first
)) {
813 Xapian::Unicode::append_utf8(term
, *j
);
814 while (++j
!= s_end
&& *j
== '.' && ++j
!= s_end
&& *j
< 128 && C_isupper(*j
)) {
815 Xapian::Unicode::append_utf8(term
, *j
);
817 if (term
.length() < 2 || (j
!= s_end
&& is_wordchar(*j
))) {
824 while (is_wordchar(*j
)) {
825 Xapian::Unicode::append_utf8(term
, *j
);
827 if (j
== s_end
) break;
828 if (*j
== '&' || *j
== '\'') {
829 Utf8Iterator next
= j
;
831 if (next
== s_end
|| !is_wordchar(*next
)) break;
837 if (j
!= s_end
&& (*j
== '+' || *j
== '-' || *j
== '#')) {
838 string::size_type len
= term
.length();
841 do { ++j
; } while (j
!= s_end
&& *j
== '#');
843 while (j
!= s_end
&& (*j
== '+' || *j
== '-')) {
844 Xapian::Unicode::append_utf8(term
, *j
);
848 if (term
.size() - len
> 3 || (j
!= s_end
&& is_wordchar(*j
))) {
856 term
= Xapian::Unicode::tolower(term
);
858 w
.build_word_map(list
);
859 int match
= w
.word_in_list(term
);
862 stem
+= (*stemmer
)(term
);
863 match
= w
.word_in_list(stem
);
866 res
+= html_escape(string(l
, first
.raw() - l
));
870 static const char * colours
[] = {
871 "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
872 "990000", "009900", "996600", "006699", "990099"
874 size_t idx
= match
% (sizeof(colours
) / sizeof(colours
[0]));
875 const char * bg
= colours
[idx
];
876 if (strchr(bg
, 'f')) {
877 res
+= "<b style=\"color:black;background-color:#";
879 res
+= "<b style=\"color:white;background-color:#";
884 word
.assign(first
.raw(), j
.raw() - first
.raw());
885 res
+= html_escape(word
);
892 res
+= html_escape(string(l
, j
.raw() - l
));
895 if (j
!= s_end
) res
+= html_escape(string(j
.raw(), j
.left()));
901 print_query_string(const char *after
)
903 if (after
&& strncmp(after
, "&B=", 3) == 0) {
904 char prefix
= after
[3];
905 string::size_type start
= 0, amp
= 0;
907 amp
= url_query_string
.find('&', amp
);
908 if (amp
== string::npos
) {
909 cout
<< url_query_string
.substr(start
);
913 while (url_query_string
[amp
] == 'B' &&
914 url_query_string
[amp
+ 1] == '=' &&
915 url_query_string
[amp
+ 2] == prefix
) {
916 cout
<< url_query_string
.substr(start
, amp
- start
- 1);
917 start
= url_query_string
.find('&', amp
+ 3);
918 if (start
== string::npos
) return;
923 cout
<< url_query_string
;
927 class CachedFields
: private Fields
{
928 Xapian::docid did_cached
= 0;
933 const string
& get_field(Xapian::docid did
, const string
& name
) {
934 if (did
!= did_cached
) {
936 auto it
= option
.find("fieldnames");
937 Fields::parse_fields(db
.get_document(did
).get_data(),
938 it
== option
.end() ? nullptr : &it
->second
);
940 return Fields::get_field(name
);
944 static CachedFields fields
;
945 static Xapian::docid q0
;
946 static Xapian::doccount hit_no
;
948 static double weight
;
949 static Xapian::doccount collapsed
;
951 static string
print_caption(const string
& fmt
, vector
<string
>& param
);
1038 CMD_querydescription
,
1053 CMD_sortableunserialise
,
1079 CMD_valuelowerbound
,
1080 CMD_valueupperbound
,
1083 CMD_MACRO
// special tag for macro evaluation
1086 struct func_attrib
{
1088 int minargs
, maxargs
, evalargs
;
1092 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
1095 struct func_attrib a
;
1101 // NB when adding a new command which ensures M or Q, update the list in
1102 // docs/omegascript.rst
1103 static const struct func_desc func_tab
[] = {
1104 //name minargs maxargs evalargs ensure
1105 {"",{CMD_
, N
, N
, 0, 0}},// commented out code
1106 T(add
, 0, N
, N
, 0), // add a list of numbers
1107 T(addfilter
, 1, 2, N
, 0), // add filter term
1108 T(allterms
, 0, 1, N
, 0), // list of all terms matching document
1109 T(and, 1, N
, 0, 0), // logical shortcutting and of a list of values
1110 T(base64
, 1, 1, N
, 0), // base64 encode
1111 T(cgi
, 1, 1, N
, 0), // return cgi parameter value
1112 T(cgilist
, 1, 1, N
, 0), // return list of values for cgi parameter
1113 T(cgiparams
, 0, 0, N
, 0), // return list of cgi parameter names
1114 T(chr
, 1, 1, N
, 0), // return UTF-8 for given Unicode codepoint
1115 T(collapsed
, 0, 0, N
, 0), // return number of hits collapsed into this
1116 T(cond
, 2, N
, 0, 0), // cascaded conditionals
1117 T(contains
, 2, 2, N
, 0), // return position of substring, or empty string
1118 T(csv
, 1, 2, N
, 0), // CSV string escaping
1119 T(date
, 1, 2, N
, 0), // convert time_t to strftime format
1120 // (default: YYYY-MM-DD)
1121 T(dbname
, 0, 0, N
, 0), // database name
1122 T(dbsize
, 0, 0, N
, 0), // database size (# of documents)
1123 T(def
, 2, 2, 1, 0), // define a macro
1124 T(defaultop
, 0, 0, N
, 0), // default operator: "and" or "or"
1125 T(div
, 2, 2, N
, 0), // integer divide
1126 T(emptydocs
, 0, 1, N
, 0), // list of empty documents
1127 T(env
, 1, 1, N
, 0), // environment variable
1128 T(eq
, 2, 2, N
, 0), // test equality
1129 T(error
, 0, 0, N
, 0), // error message
1130 T(field
, 1, 2, N
, 0), // lookup field in record
1131 T(filesize
, 1, 1, N
, 0), // pretty printed filesize
1132 T(filters
, 0, 1, N
, 0), // serialisation of current filters
1133 T(filterterms
, 1, 1, N
, 0), // list of terms with a given prefix
1134 T(find
, 2, 2, N
, 0), // find entry in list
1135 T(fmt
, 0, 0, N
, 0), // name of current format
1136 T(foreach
, 2, 2, 1, 0), // evaluate something for every entry in a list
1137 T(freq
, 1, 1, N
, 0), // frequency of a term
1138 T(ge
, 2, 2, N
, 0), // test >=
1139 T(gt
, 2, 2, N
, 0), // test >
1140 T(hash
, 2, 2, N
, 0), // hash a string using the specified hash function
1141 T(highlight
, 2, 4, N
, 0), // html escape and highlight words from list
1142 T(hit
, 0, 0, N
, 0), // hit number of current mset entry (0-based)
1143 T(hitlist
, 1, 1, 0, M
), // display hitlist using format in argument
1144 T(hitsperpage
, 0, 0, N
, 0), // hits per page
1145 T(hostname
, 1, 1, N
, 0), // extract hostname from URL
1146 T(html
, 1, 1, N
, 0), // html escape string (<>&")
1147 T(htmlstrip
, 1, 1, N
, 0), // html strip tags string (s/<[^>]*>?//g)
1148 T(httpheader
, 2, 2, N
, 0), // arbitrary HTTP header
1149 T(id
, 0, 0, N
, 0), // docid of current doc
1150 T(if, 1, 3, 1, 0), // conditional
1151 T(include
, 1, 2, 1, 0), // include another file
1152 T(json
, 1, 1, N
, 0), // JSON string escaping
1153 T(jsonarray
, 1, 2, 1, 0), // Format list as a JSON array
1154 T(jsonbool
, 1, 1, 1, 0), // Format list as a JSON bool
1155 T(jsonobject
, 1, 3, 1, 0), // Format map as JSON object
1156 T(jsonobject2
, 2, 4, 2, 0), // Format 2 lists as JSON object
1157 T(keys
, 1, 1, N
, 0), // list of keys from a map
1158 T(last
, 0, 0, N
, M
), // hit number one beyond end of current page
1159 T(lastpage
, 0, 0, N
, M
), // number of last hit page
1160 T(le
, 2, 2, N
, 0), // test <=
1161 T(length
, 1, 1, N
, 0), // length of list
1162 T(list
, 2, 5, N
, 0), // pretty print list
1163 T(log
, 1, 2, 1, 0), // create a log entry
1164 T(lookup
, 2, 2, N
, 0), // lookup in named cdb file
1165 T(lower
, 1, 1, N
, 0), // convert string to lower case
1166 T(lt
, 2, 2, N
, 0), // test <
1167 T(map
, 2, 2, 1, 0), // map a list into another list
1168 T(match
, 2, 3, N
, 0), // regex match
1169 T(max
, 1, N
, N
, 0), // maximum of a list of values
1170 T(min
, 1, N
, N
, 0), // minimum of a list of values
1171 T(mod
, 2, 2, N
, 0), // integer modulus
1172 T(msize
, 0, 0, N
, M
), // number of matches (estimated)
1173 T(msizeexact
, 0, 0, N
, M
), // is $msize exact?
1174 T(msizelower
, 0, 0, N
, M
), // number of matches (lower bound)
1175 T(msizeupper
, 0, 0, N
, M
), // number of matches (upper bound)
1176 T(mul
, 2, N
, N
, 0), // multiply a list of numbers
1177 T(muldiv
, 3, 3, N
, 0), // calculate A*B/C
1178 T(ne
, 2, 2, N
, 0), // test not equal
1179 T(nice
, 1, 1, N
, 0), // pretty print integer (with thousands sep)
1180 T(not, 1, 1, N
, 0), // logical not
1181 T(now
, 0, 0, N
, 0), // current date/time as a time_t
1182 T(opt
, 1, 2, N
, 0), // lookup an option value
1183 T(or, 1, N
, 0, 0), // logical shortcutting or of a list of values
1184 T(ord
, 1, 1, N
, 0), // return codepoint for first character of UTF-8 string
1185 T(pack
, 1, 1, N
, 0), // convert a number to a 4 byte big endian binary string
1186 T(percentage
, 0, 0, N
, 0), // percentage score of current hit
1187 T(prettyterm
, 1, 1, N
, Q
), // pretty print term name
1188 T(prettyurl
, 1, 1, N
, 0), // pretty version of URL
1189 T(query
, 0, 1, N
, Q
), // query
1190 T(querydescription
,0, 0, N
, M
), // query.get_description() (run_query() adds filters so M)
1191 T(queryterms
, 0, 0, N
, Q
), // list of query terms
1192 T(random
, 1, 1, N
, 0), // return a random number
1193 T(range
, 2, 2, N
, 0), // return list of values between start and end
1194 T(record
, 0, 1, N
, 0), // record contents of document
1195 T(relevant
, 0, 1, N
, Q
), // is document relevant?
1196 T(relevants
, 0, 0, N
, Q
), // return list of relevant documents
1197 T(score
, 0, 0, N
, 0), // score (0-10) of current hit
1198 T(set
, 2, 2, N
, 0), // set option value
1199 T(seterror
, 1, 1, N
, 0), // set error_msg, setting it early stops query execution
1200 T(setmap
, 1, N
, N
, 0), // set map of option values
1201 T(setrelevant
, 1, 1, N
, Q
), // set rset
1202 T(slice
, 2, 2, N
, 0), // slice a list using a second list
1203 T(snippet
, 1, 6, N
, M
), // generate snippet from text
1204 T(sort
, 1, 2, N
, 0), // alpha sort a list
1205 T(sortableunserialise
,
1206 1, 1, N
, 0), // decode with Xapian::sortable_unserialise
1207 T(split
, 1, 2, N
, 0), // split a string to give a list
1208 T(srandom
, 1, 1, N
, 0), // seed for random number
1209 T(stoplist
, 0, 0, N
, Q
), // return list of stopped terms
1210 T(sub
, 2, 2, N
, 0), // subtract
1211 T(subdb
, 0, 1, N
, 0), // name of subdb docid is in
1212 T(subid
, 0, 1, N
, 0), // docid in the subdb#
1213 T(substr
, 2, 3, N
, 0), // substring
1214 T(suggestion
, 0, 0, N
, Q
), // misspelled word correction suggestion
1215 T(switch, 3, N
, 1, 0), // return the value for the first matching case (or the default)
1216 T(termprefix
, 1, 1, N
, 0), // get any prefix from a term
1217 T(terms
, 0, 1, N
, M
), // list of matching terms
1218 T(thispage
, 0, 0, N
, M
), // page number of current page
1219 T(time
, 0, 0, N
, M
), // how long the match took (in seconds)
1220 T(topdoc
, 0, 0, N
, M
), // first document on current page of hit list
1221 // (counting from 0)
1222 T(topterms
, 0, 1, N
, M
), // list of up to N top relevance feedback terms
1224 T(transform
, 3, 4, N
, 0), // transform with a regexp
1225 T(truncate
, 2, 4, N
, 0), // truncate after a word
1226 T(uniq
, 1, 1, N
, 0), // remove duplicates from a sorted list
1227 T(unique
, 1, 1, N
, 0), // remove duplicates from any list
1228 T(unpack
, 1, 1, N
, 0), // convert 4 byte big endian binary string to a number
1229 T(unprefix
, 1, 1, N
, 0), // remove any prefix from a term
1230 T(unstem
, 1, 1, N
, Q
), // return list of terms from the parsed query
1231 // which stemmed to this term
1232 T(upper
, 1, 1, N
, 0), // convert string to upper case
1233 T(url
, 1, 1, N
, 0), // url encode argument
1234 T(value
, 1, 2, N
, 0), // return document value
1235 T(valuelowerbound
, 1, 1, N
, 0), // return value slot lower bound
1236 T(valueupperbound
, 1, 1, N
, 0), // return value slot upper bound
1237 T(version
, 0, 0, N
, 0), // omega version string
1238 T(weight
, 0, 0, N
, 0), // weight of the current hit
1239 { NULL
,{0, 0, 0, 0, 0}}
1242 #undef T // Leaving T defined screws up Sun's C++ compiler!
1244 static vector
<string
> macros
;
1246 // Call write() repeatedly until all data is written or we get a
1247 // non-recoverable error.
1249 write_all(int fd
, const char * buf
, size_t count
)
1252 ssize_t r
= write(fd
, buf
, count
);
1254 if (errno
== EINTR
) continue;
1263 // mersenne twister for RNG
1265 static bool seed_set
= false;
1267 static string
eval(const string
& fmt
, vector
<string
>& param
);
1269 /** Implements $foreach{} and $map{}. */
1271 foreach(const string
& list
,
1273 vector
<string
>& param
,
1277 string saved_arg0
= std::move(param
[0]);
1278 string::size_type i
= 0, j
;
1280 j
= list
.find('\t', i
);
1281 param
[0].assign(list
, i
, j
- i
);
1282 result
+= eval(pat
, param
);
1283 if (j
== string::npos
) break;
1284 if (sep
) result
+= sep
;
1287 param
[0] = std::move(saved_arg0
);
// Expand the OmegaScript template text in fmt, accumulating the output in
// res.  Text outside $-codes is copied through unchanged; $<digit> inserts an
// entry of param; $name and $name{...} invoke the function registered under
// that name in func_map.
1292 eval(const string
& fmt
, vector
<string
>& param
)
// Name -> attributes lookup table, populated on the first call from the
// NULL-terminated func_tab array.
1294 static map
<string
, const struct func_attrib
*> func_map
;
1295 if (func_map
.empty()) {
1296 for (auto p
= func_tab
; p
->name
!= NULL
; ++p
) {
1297 func_map
[string(p
->name
)] = &(p
->a
);
// p marks the start of the text not yet copied to res; q is the position of
// the '$' currently being processed.
1301 string::size_type p
= 0, q
;
1302 while ((q
= fmt
.find('$', p
)) != string::npos
) try {
// Copy the literal text preceding this '$'.
1303 res
.append(fmt
, p
, q
- p
);
1304 string::size_type code_start
= q
; // note down for error reporting
1306 if (q
>= fmt
.size()) break;
1307 unsigned char ch
= fmt
[q
];
1310 // '$$' -> '$', '$(' -> '{', '$)' -> '}', '$.' -> ','
// Digits select a positional parameter; an index at or beyond param.size()
// expands to nothing.  (The conversion of ch from a character to an index is
// on a line not shown in this excerpt.)
1330 case '1': case '2': case '3': case '4': case '5':
1331 case '6': case '7': case '8': case '9':
1333 if (ch
< param
.size()) res
+= param
[ch
];
// A letter starts a function name.
1336 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1337 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1338 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1339 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1341 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1342 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1343 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1344 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
// Build the message for an unrecognised $ code, quoting the rest of fmt from
// the offending character onwards.
1349 string msg
= "Unknown $ code in: $";
1350 msg
.append(fmt
, q
, string::npos
);
// The function name extends up to the first character for which p_notid
// returns true.
1353 p
= find_if(fmt
.begin() + q
, fmt
.end(), p_notid
) - fmt
.begin();
1354 string
var(fmt
, q
, p
- q
);
1355 map
<string
, const struct func_attrib
*>::const_iterator func
;
1356 func
= func_map
.find(var
);
1357 if (func
== func_map
.end()) {
// Note: the operand types make this (and the similar throws below) throw a
// std::string rather than an exception class.
1358 throw "Unknown function '" + var
+ "'";
1360 vector
<string
> args
;
// An optional brace-delimited argument list follows the name.
1361 if (fmt
[p
] == '{') {
// Advance to the next ',' or brace; running off the end of fmt means the
// closing '}' is missing.
1365 p
= fmt
.find_first_of(",{}", p
+ 1);
1366 if (p
== string::npos
)
1367 throw "missing } in " + fmt
.substr(code_start
);
1368 if (fmt
[p
] == '{') {
1372 // should we split the args
// minargs == N marks a function whose argument text is kept as a single
// unsplit argument (pushed once at the closing brace, below).
1373 if (func
->second
->minargs
!= N
) {
1374 args
.push_back(fmt
.substr(q
, p
- q
));
// nest is the brace depth; the list ends at the '}' that brings it to zero.
1378 if (fmt
[p
] == '}' && --nest
== 0) break;
1381 if (func
->second
->minargs
== N
)
1382 args
.push_back(fmt
.substr(q
, p
- q
));
// Arity checks (skipped when minargs == N); maxargs == N means there is no
// upper limit.
1386 if (func
->second
->minargs
!= N
) {
1387 if (int(args
.size()) < func
->second
->minargs
)
1388 throw "too few arguments to $" + var
;
1389 if (func
->second
->maxargs
!= N
&&
1390 int(args
.size()) > func
->second
->maxargs
)
1391 throw "too many arguments to $" + var
;
// n is the number of leading arguments that are expanded here, before the
// function itself runs; later arguments are left unexpanded for the function
// to evaluate itself if it wants to.  The value used when evalargs == N is
// set on a line not shown in this excerpt.
1393 vector
<string
>::size_type n
;
1394 if (func
->second
->evalargs
!= N
)
1395 n
= func
->second
->evalargs
;
1399 for (vector
<string
>::size_type j
= 0; j
< n
; ++j
)
1400 args
[j
] = eval(args
[j
], param
);
// ensure == 'Q' requires the query to have been parsed; ensure == 'M'
// requires the query to be parsed and then the match to have been run.
1402 if (func
->second
->ensure
== 'Q' || func
->second
->ensure
== 'M')
1403 ensure_query_parsed();
1404 if (func
->second
->ensure
== 'M') ensure_match();
// Dispatch on the function's tag.
1406 switch (func
->second
->tag
) {
1411 for (auto&& arg
: args
)
1412 total
+= string_to_int(arg
);
1417 if (args
.size() == 1 || args
[1].empty() || args
[1] == "B") {
1419 } else if (args
[1] == "N") {
1422 string msg
= "Invalid $addfilter type '";
1428 case CMD_allterms
: {
1429 // list of all terms indexing document
1430 Xapian::docid id
= q0
;
1431 if (!args
.empty() &&
1432 (!parse_unsigned(args
[0].c_str(), id
) || id
== 0)) {
1433 throw "Document id for command allterms should be > 0";
1435 for (Xapian::TermIterator term
= db
.termlist_begin(id
);
1436 term
!= db
.termlist_end(id
); ++term
) {
1441 if (!value
.empty()) value
.erase(value
.size() - 1);
1446 for (auto&& arg
: args
) {
1447 if (eval(arg
, param
).empty()) {
// Base64-encode args[0] into value.  encode is the 64-symbol alphabet
// (A-Z, a-z, 0-9, '+', '/') and pad is the '=' filler character.
1455 const static char encode
[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef"
1456 "ghijklmnopqrstuvwxyz0123456789+/";
1457 const char pad
= '=';
1458 const string
& input
= args
[0];
// The output has 4 characters for every 3 input bytes, rounded up.
1459 value
.reserve((input
.size() + 2) / 3 * 4);
1460 auto it
= input
.begin();
// n is the number of complete 3-byte groups.
1461 auto n
= input
.size() / 3;
// Pack three bytes into 24 bits and emit them as four 6-bit symbols, most
// significant first.
1463 uint32_t v
= uint8_t(*it
++);
1464 v
= (v
<< 8) | uint8_t(*it
++);
1465 v
= (v
<< 8) | uint8_t(*it
++);
1466 value
+= encode
[v
>> 18];
1467 value
+= encode
[(v
>> 12) & 63];
1468 value
+= encode
[(v
>> 6) & 63];
1469 value
+= encode
[v
& 63];
// Handle the one or two bytes left over after the complete groups.
1471 switch (input
.size() % 3) {
// Two bytes left: 16 bits become three symbols, the last carrying the low
// 4 bits shifted up by 2.
1473 uint32_t v
= uint8_t(*it
++);
1474 v
= (v
<< 8) | uint8_t(*it
++);
1475 value
+= encode
[v
>> 10];
1476 value
+= encode
[(v
>> 4) & 63];
1477 value
+= encode
[(v
<< 2) & 63];
// One byte left: 8 bits become two symbols, the second carrying the low
// 2 bits shifted up by 4.
1482 uint32_t v
= uint8_t(*it
++);
1483 value
+= encode
[v
>> 2];
1484 value
+= encode
[(v
<< 4) & 63];
1493 auto i
= cgi_params
.find(args
[0]);
1494 if (i
!= cgi_params
.end()) value
= i
->second
;
1498 auto g
= cgi_params
.equal_range(args
[0]);
1499 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
1503 if (!value
.empty()) value
.erase(value
.size() - 1);
1506 case CMD_cgiparams
: {
1507 const string
* prev
= NULL
;
1508 for (auto&& i
: cgi_params
) {
1509 if (prev
&& i
.first
== *prev
) continue;
1514 if (!value
.empty()) value
.erase(value
.size() - 1);
1518 unsigned int codepoint
;
1519 if (!parse_unsigned(args
[0].c_str(), codepoint
)) {
1520 throw "Unicode codepoint for command chr should be >= 0";
1522 Xapian::Unicode::append_utf8(value
, codepoint
);
1525 case CMD_collapsed
: {
1526 value
= str(collapsed
);
1530 for (size_t i
= 0; i
< args
.size(); i
+= 2) {
1531 if (i
== args
.size() - 1) {
1532 // Handle optional "else" value.
1533 value
= eval(args
[i
], param
);
1536 if (!eval(args
[i
], param
).empty()) {
1537 value
= eval(args
[i
+ 1], param
);
1542 case CMD_contains
: {
1543 size_t pos
= args
[1].find(args
[0]);
1544 if (pos
!= string::npos
) {
1551 if (args
.size() > 1 && !args
[1].empty()) {
1552 csv_escape_always(value
);
1559 if (!value
.empty()) {
1562 if (!parse_signed(value
.c_str(), date
)) {
1563 throw "Date (in secs) for command date should "
1566 if (date
!= static_cast<time_t>(-1)) {
1568 then
= gmtime(&date
);
1569 string date_fmt
= "%Y-%m-%d";
1570 if (args
.size() > 1) date_fmt
= eval(args
[1], param
);
1571 strftime(buf
, sizeof buf
, date_fmt
.c_str(), then
);
1580 static Xapian::doccount dbsize
;
1581 if (!dbsize
) dbsize
= db
.get_doccount();
1582 value
= str(dbsize
);
1586 func_attrib
*fa
= new func_attrib
;
1587 fa
->tag
= CMD_MACRO
+ macros
.size();
1590 fa
->evalargs
= N
; // FIXME: or 0?
1593 macros
.push_back(args
[1]);
1594 func_map
[args
[0]] = fa
;
1598 if (default_op
== Xapian::Query::OP_AND
) {
1605 int denom
= string_to_int(args
[1]);
1607 value
= "divide by 0";
1609 value
= str(string_to_int(args
[0]) / denom
);
1613 case CMD_emptydocs
: {
1617 Xapian::PostingIterator i
;
1618 for (i
= db
.postlist_begin(t
); i
!= db
.postlist_end(t
); ++i
) {
1619 if (i
.get_doclength() != 0) continue;
1620 if (!value
.empty()) value
+= '\t';
1626 char *env
= getenv(args
[0].c_str());
1627 if (env
!= NULL
) value
= env
;
1631 if (args
[0] == args
[1]) value
= "true";
1634 if (error_msg
.empty() && enquire
== NULL
&& !dbname
.empty()) {
1635 error_msg
= "Database '" + dbname
+ "' couldn't be opened";
1640 Xapian::docid did
= q0
;
1641 if (args
.size() > 1 &&
1642 (!parse_unsigned(args
[1].c_str(), did
) || did
== 0)) {
1643 throw "Document id for command field should be > 0";
1645 value
= fields
.get_field(did
, args
[0]);
// Format the byte count in args[0] as human-readable text in value.  An empty
// argument gives an empty result; a non-integer argument is an error.
1648 case CMD_filesize
: {
1649 if (args
[0].empty()) break;
1650 // FIXME: rounding? i18n?
1652 if (!parse_signed(args
[0].c_str(), size
)) {
1653 throw "Filesize must be an integer";
// format is chosen by the size bands below (several of the assignments are on
// lines not shown in this excerpt).
1657 const char * format
= 0;
1659 // Negative size -> empty result.
1660 } else if (size
== 1) {
1662 } else if (size
< 1024) {
1663 format
= "%d bytes";
1665 if (size
< 1024 * 1024) {
1669 if (size
< 1024 * 1024) {
// Split into a whole number of 1024-byte units and the remainder.
1676 intpart
= unsigned(size
) / 1024;
1677 fraction
= unsigned(size
) % 1024;
// fraction == -1 means the chosen format takes only the integer part.
1682 if (fraction
== -1) {
1683 len
= snprintf(buf
, sizeof(buf
), format
, intpart
);
// Convert the remainder (out of 1024) to a single tenths digit, expressed as
// its character code, and pass it as the format's second argument.
1685 fraction
= (fraction
* 10 / 1024) + '0';
1686 len
= snprintf(buf
, sizeof(buf
), format
, intpart
, fraction
);
// If snprintf reported an error (negative) or a length larger than the
// buffer, use sizeof(buf) as the length instead so assign() stays in bounds.
1688 if (len
< 0 || unsigned(len
) > sizeof(buf
)) len
= sizeof(buf
);
1689 value
.assign(buf
, len
);
1694 value
= args
.size() ? old_filters
: filters
;
1696 case CMD_filterterms
: {
1697 Xapian::TermIterator term
= db
.allterms_begin();
1698 term
.skip_to(args
[0]);
1699 while (term
!= db
.allterms_end()) {
1701 if (!startswith(t
, args
[0])) break;
1707 if (!value
.empty()) value
.erase(value
.size() - 1);
1711 string l
= args
[0], s
= args
[1];
1712 string::size_type i
= 0, j
= 0;
1714 while (j
!= l
.size()) {
1715 j
= l
.find('\t', i
);
1716 if (j
== string::npos
) j
= l
.size();
1717 if (j
- i
== s
.length()) {
1718 if (memcmp(s
.data(), l
.data() + i
, j
- i
) == 0) {
1732 if (!args
[0].empty()) {
1733 value
= foreach(args
[0], args
[1], param
);
1737 const string
& term
= args
[0];
1738 Xapian::doccount termfreq
= 0;
1740 termfreq
= mset
.get_termfreq(term
);
1742 if (termfreq
== 0) {
1743 // We want $freq to work before the match is run, and we
1744 // don't want using it to force the match to run.
1745 termfreq
= db
.get_termfreq(term
);
1747 value
= str(termfreq
);
1751 if (string_to_int(args
[0]) >= string_to_int(args
[1]))
1755 if (string_to_int(args
[0]) > string_to_int(args
[1]))
1759 const string
& data
= args
[0];
1760 const string
& hash
= args
[1];
1761 if (hash
== "md5") {
1763 md5_string(data
, md5
);
1764 value
.reserve(md5
.size() * 2);
1765 for (unsigned char byte
: md5
) {
1766 value
+= "0123456789abcdef"[byte
>> 4];
1767 value
+= "0123456789abcdef"[byte
& 0x0f];
1770 throw "Unknown hash function: " + hash
;
1774 case CMD_highlight
: {
1776 if (args
.size() > 2) {
1778 if (args
.size() > 3) {
1781 string::const_iterator i
;
1782 i
= find_if(bra
.begin() + 2, bra
.end(), p_nottag
);
1784 ket
.append(bra
, 1, i
- bra
.begin() - 1);
1789 value
= html_highlight(args
[0], args
[1], bra
, ket
);
1793 // 0-based mset index
1794 value
= str(hit_no
);
1798 url_query_string
= "?DB=";
1799 url_query_string
+= dbname
;
1800 for (auto& j
: query_strings
) {
1801 if (j
.first
.empty()) {
1802 url_query_string
+= "&P=";
1804 url_query_string
+= "&P."
1805 url_query_string
+= j
.first
;
1806 url_query_string
+= '=';
1808 const char *q
= j
.second
.c_str();
1810 while ((ch
= *q
++) != '\0') {
1813 url_query_string
+= "%2b";
1816 url_query_string
+= "%22";
1819 url_query_string
+= "%25";
1822 url_query_string
+= "%26";
1828 url_query_string
+= ch
;
1832 // add any boolean terms
1833 for (auto i
= filter_map
.begin(); i
!= filter_map
.end(); ++i
) {
1834 url_query_string
+= "&B=";
1835 url_query_string
+= i
->second
;
1838 auto save_hit_no
= hit_no
;
1839 for (hit_no
= topdoc
; hit_no
< last
; ++hit_no
)
1840 value
+= print_caption(args
[0], param
);
1841 hit_no
= save_hit_no
;
1844 case CMD_hitsperpage
:
1845 value
= str(hits_per_page
);
// Reduce the URL held in value to just its host part, by successively
// stripping the scheme, path, user info and anything after a ':'.
1847 case CMD_hostname
: {
1849 // remove URL scheme and/or path
1850 string::size_type i
= value
.find("://");
// With no "://" the host is taken to start at offset 0; otherwise it starts
// just past the "://".
1851 if (i
== string::npos
) i
= 0; else i
+= 3;
// If there is no '/' after the start, find() returns npos and substr() takes
// the rest of the string.
1852 value
= value
.substr(i
, value
.find('/', i
) - i
);
1853 // remove user@ or user:password@
1854 i
= value
.find('@');
1855 if (i
!= string::npos
) value
.erase(0, i
+ 1);
// Truncate at the first remaining ':' (e.g. a trailing port number).
1857 i
= value
.find(':');
1858 if (i
!= string::npos
) value
.resize(i
);
1862 value
= html_escape(args
[0]);
1865 value
= html_strip(args
[0]);
1867 case CMD_httpheader
:
1868 if (!suppress_http_headers
) {
1869 cout
<< args
[0] << ": " << args
[1] << endl
;
1870 if (!set_content_type
&& args
[0].length() == 12 &&
1871 strcasecmp(args
[0].c_str(), "Content-Type") == 0) {
1872 set_content_type
= true;
1881 if (args
.size() > 1 && !args
[0].empty())
1882 value
= eval(args
[1], param
);
1883 else if (args
.size() > 2)
1884 value
= eval(args
[2], param
);
1887 if (args
.size() == 1) {
1888 value
= eval_file(args
[0]);
1890 bool fallback
= false;
1891 value
= eval_file(args
[0], &fallback
);
1893 value
= eval(args
[1], param
);
1902 case CMD_jsonarray
: {
1903 const string
& l
= args
[0];
1904 string::size_type i
= 0, j
;
1909 vector
<string
> new_args(1);
1912 j
= l
.find('\t', i
);
1913 string
elt(l
, i
, j
- i
);
1914 if (args
.size() == 1) {
1920 new_args
[0] = std::move(elt
);
1921 value
+= eval(args
[1], new_args
);
1923 if (j
== string::npos
) break;
1931 value
= args
[0].empty() ? "false" : "true";
1933 case CMD_jsonobject
: {
1934 vector
<string
> new_args(1);
1937 typedef map
<string
, string
>::const_iterator iterator
;
1941 map_range(iterator b_
, iterator e_
) : b(b_
), e(e_
) {}
1943 iterator
begin() const { return b
; }
1944 iterator
end() const { return e
; }
1947 string prefix
= args
[0] + ',';
1948 auto b
= option
.lower_bound(prefix
);
1950 auto e
= option
.lower_bound(prefix
);
1951 value
= to_json(map_range(b
, e
),
1952 [&](const string
& k
) {
1953 string
key(k
, prefix
.size());
1954 if (args
.size() > 1 && !args
[1].empty()) {
1955 new_args
[0] = std::move(key
);
1956 key
= eval(args
[1], new_args
);
1960 [&](const string
& v
) {
1961 if (args
.size() > 2 && !args
[2].empty()) {
1963 return eval(args
[2], new_args
);
1974 case CMD_jsonobject2
: {
1975 vector
<string
> new_args(1);
1977 static string dummy
;
1981 const string
& values
;
1986 const string
& values
;
1987 string::size_type ki
= 0;
1988 string::size_type kj
;
1989 string::size_type vi
= 0;
1990 string::size_type vj
;
1994 : keys(dummy
), values(dummy
),
1995 ki(string::npos
), vi(string::npos
) {}
1997 iterator(const string
& k
, const string
& v
)
1998 : keys(k
), values(v
) {
1999 if (keys
.empty() && values
.empty()) {
2000 // Don't treat this as: { "": "" }
2001 ki
= kj
= vi
= vj
= string::npos
;
2003 kj
= keys
.find('\t');
2004 vj
= values
.find('\t');
2008 pair
<string
, string
> operator*() const {
2009 return {keys
.substr(ki
, kj
- ki
),
2010 values
.substr(vi
, vj
- vi
)};
2013 iterator
& operator++() {
2015 if (ki
!= string::npos
) {
2017 kj
= keys
.find('\t', ki
);
2020 if (vi
!= string::npos
) {
2022 vj
= values
.find('\t', vi
);
2024 if ((ki
== string::npos
) !=
2025 (vi
== string::npos
)) {
2026 throw "$jsonobject2: Different number of keys "
2032 iterator
operator++(int) {
2038 bool operator==(const iterator
& o
) const {
2039 return ki
== o
.ki
&& vi
== o
.vi
;
2042 bool operator!=(const iterator
& o
) const {
2043 return !(*this == o
);
2047 list_range(const string
& k
, const string
& v
)
2048 : keys(k
), values(v
) { }
2050 iterator
begin() const { return iterator(keys
, values
); }
2051 iterator
end() const { return iterator(); }
2054 value
= to_json(list_range(args
[0], args
[1]),
2055 [&](const string
& k
) {
2057 if (args
.size() > 2 && !args
[2].empty()) {
2058 new_args
[0] = std::move(key
);
2059 key
= eval(args
[2], new_args
);
2063 [&](const string
& v
) {
2064 if (args
.size() > 3 && !args
[3].empty()) {
2066 return eval(args
[3], new_args
);
2078 string prefix
= args
[0] + ',';
2079 auto i
= option
.lower_bound(prefix
);
2080 for (; i
!= option
.end() && startswith(i
->first
, prefix
); ++i
) {
2081 const string
& key
= i
->first
;
2082 if (!value
.empty()) value
+= '\t';
2083 value
.append(key
, prefix
.size(), string::npos
);
2090 case CMD_lastpage
: {
2091 int l
= mset
.get_matches_estimated();
2092 if (l
> 0) l
= (l
- 1) / hits_per_page
+ 1;
2097 if (string_to_int(args
[0]) <= string_to_int(args
[1]))
2101 if (args
[0].empty()) {
2104 size_t length
= count(args
[0].begin(), args
[0].end(), '\t');
2105 value
= str(length
+ 1);
2109 if (!args
[0].empty()) {
2110 string pre
, inter
, interlast
, post
;
2111 switch (args
.size()) {
2113 inter
= interlast
= args
[1];
2117 interlast
= args
[2];
2121 inter
= interlast
= args
[2];
2127 interlast
= args
[3];
2132 string list
= args
[0];
2133 string::size_type split
= 0, split2
;
2134 while ((split2
= list
.find('\t', split
)) != string::npos
) {
2135 if (split
) value
+= inter
;
2136 value
.append(list
, split
, split2
- split
);
2139 if (split
) value
+= interlast
;
2140 value
.append(list
, split
, string::npos
);
2146 if (!vet_filename(args
[0])) {
2147 value
= "filename can't contain \"..\"";
2150 string logfile
= log_dir
+ args
[0];
2151 int fd
= open(logfile
.c_str(), O_CREAT
|O_APPEND
|O_WRONLY
, 0644);
2153 value
= "open failed: ";
2154 value
+= strerror(errno
);
2157 vector
<string
> noargs
;
2160 if (args
.size() > 1) {
2163 line
= DEFAULT_LOG_ENTRY
;
2165 line
= eval(line
, noargs
);
2167 if (write_all(fd
, line
.data(), line
.length()) < 0) {
2168 value
= "write failed: ";
2169 value
+= strerror(errno
);
2175 if (!vet_filename(args
[0])) break;
2176 string cdbfile
= cdb_dir
+ args
[0];
2177 int fd
= open(cdbfile
.c_str(), O_RDONLY
);
2178 if (fd
== -1) break;
2181 if (cdb_init(&cdb
, fd
) < 0) {
2186 if (cdb_find(&cdb
, args
[1].data(), args
[1].length()) > 0) {
2187 size_t datalen
= cdb_datalen(&cdb
);
2188 const void *dat
= cdb_get(&cdb
, datalen
, cdb_datapos(&cdb
));
2190 value
.assign(static_cast<const char *>(dat
), datalen
);
2195 close(fd
); // FIXME: cache fds?
2199 value
= Xapian::Unicode::tolower(args
[0]);
2202 if (string_to_int(args
[0]) < string_to_int(args
[1]))
2206 if (!args
[0].empty()) {
2207 value
= foreach(args
[0], args
[1], param
, '\t');
2211 omegascript_match(value
, args
);
2214 vector
<string
>::const_iterator i
= args
.begin();
2215 int val
= string_to_int(*i
++);
2216 for (; i
!= args
.end(); ++i
) {
2217 int x
= string_to_int(*i
);
2218 if (x
> val
) val
= x
;
2224 vector
<string
>::const_iterator i
= args
.begin();
2225 int val
= string_to_int(*i
++);
2226 for (; i
!= args
.end(); ++i
) {
2227 int x
= string_to_int(*i
);
2228 if (x
< val
) val
= x
;
2234 int denom
= string_to_int(args
[1]);
2236 value
= "divide by 0";
2238 value
= str(string_to_int(args
[0]) % denom
);
2243 // Estimated number of matches.
2244 value
= str(mset
.get_matches_estimated());
2246 case CMD_msizeexact
:
2248 if (mset
.get_matches_lower_bound()
2249 == mset
.get_matches_upper_bound())
2252 case CMD_msizelower
:
2253 // Lower bound on number of matches.
2254 value
= str(mset
.get_matches_lower_bound());
2256 case CMD_msizeupper
:
2257 // Upper bound on number of matches.
2258 value
= str(mset
.get_matches_upper_bound());
2261 vector
<string
>::const_iterator i
= args
.begin();
2262 int total
= string_to_int(*i
++);
2263 while (i
!= args
.end())
2264 total
*= string_to_int(*i
++);
2269 int denom
= string_to_int(args
[2]);
2271 value
= "divide by 0";
2273 int num
= string_to_int(args
[0]) * string_to_int(args
[1]);
2274 value
= str(num
/ denom
);
2279 if (args
[0] != args
[1]) value
= "true";
2282 string::const_iterator i
= args
[0].begin();
2283 int len
= args
[0].length();
2286 if (--len
&& len
% 3 == 0) value
+= option
["thousand"];
2291 if (args
[0].empty()) value
= "true";
2294 value
= str(static_cast<unsigned long>(time(NULL
)));
2297 if (args
.size() == 2) {
2298 value
= option
[args
[0] + "," + args
[1]];
2300 value
= option
[args
[0]];
2304 for (auto&& arg
: args
) {
2305 value
= eval(arg
, param
);
2306 if (!value
.empty()) break;
2311 if (!args
[0].empty()) {
2312 Utf8Iterator
it(args
[0]);
2319 if (!parse_signed(args
[0].c_str(), number
)) {
2320 throw "NUMBER parameter for pack command "
2321 "must be an integer";
2323 value
= int_to_binary_string(number
);
2326 case CMD_percentage
:
2328 value
= str(percent
);
2330 case CMD_prettyterm
:
2331 value
= pretty_term(args
[0]);
2335 url_prettify(value
);
2338 auto r
= query_strings
.equal_range(args
.empty() ?
2339 string() : args
[0]);
2340 for (auto j
= r
.first
; j
!= r
.second
; ++j
) {
2341 if (!value
.empty()) value
+= '\t';
2342 const string
& s
= j
->second
;
2343 size_t start
= 0, tab
;
2344 while ((tab
= s
.find('\t', start
)) != string::npos
) {
2345 value
.append(s
, start
, tab
- start
);
2349 value
.append(s
, start
, string::npos
);
2353 case CMD_querydescription
:
2354 value
= query
.get_description();
2356 case CMD_queryterms
:
2365 uniform_int_distribution
<int>
2366 distr(0, string_to_int(args
[0]));
2367 value
= str(distr(rng
));
2372 if (!parse_signed(args
[0].c_str(), start
)) {
2373 throw "Start value for range command "
2374 "must be an integer";
2376 if (!parse_signed(args
[1].c_str(), end
)) {
2377 throw "End value for range command "
2378 "must be an integer";
2380 while (start
<= end
) {
2381 value
+= str(start
);
2382 if (start
< end
) value
+= '\t';
2388 Xapian::docid id
= q0
;
2389 if (!args
.empty() &&
2390 (!parse_unsigned(args
[0].c_str(), id
) || id
== 0)) {
2391 throw "Document id for command record should be > 0";
2393 value
= db
.get_document(id
).get_data();
2396 case CMD_relevant
: {
2397 // document id if relevant; empty otherwise
2398 Xapian::docid id
= q0
;
2399 if (!args
.empty() &&
2400 (!parse_unsigned(args
[0].c_str(), id
) || id
== 0)) {
2401 throw "Document id for command relevant should be > 0";
2403 auto i
= ticked
.find(id
);
2404 if (i
!= ticked
.end()) {
2405 i
->second
= false; // icky side-effect
2410 case CMD_relevants
: {
2411 for (auto i
: ticked
) {
2413 value
+= str(i
.first
);
2417 if (!value
.empty()) value
.erase(value
.size() - 1);
2422 value
= str(percent
/ 10);
2425 option
[args
[0]] = args
[1];
2428 error_msg
= args
[0];
2431 string base
= args
[0] + ',';
2432 if (args
.size() % 2 != 1)
2433 throw string("$setmap requires an odd number of arguments");
2434 for (unsigned int i
= 1; i
+ 1 < args
.size(); i
+= 2) {
2435 option
[base
+ args
[i
]] = args
[i
+ 1];
2439 case CMD_setrelevant
: {
2440 string::size_type i
= 0, j
;
2442 j
= args
[0].find_first_not_of("0123456789", i
);
2443 Xapian::docid id
= atoi(args
[0].substr(i
, j
- i
).c_str());
2445 rset
.add_document(id
);
2448 if (j
== string::npos
) break;
2454 string list
= args
[0], pos
= args
[1];
2455 vector
<string
> items
;
2456 string::size_type i
= 0, j
;
2458 j
= list
.find('\t', i
);
2459 items
.push_back(list
.substr(i
, j
- i
));
2460 if (j
== string::npos
) break;
2464 bool have_added
= false;
2466 j
= pos
.find('\t', i
);
2467 int item
= string_to_int(pos
.substr(i
, j
- i
));
2468 if (item
>= 0 && size_t(item
) < items
.size()) {
2469 if (have_added
) value
+= '\t';
2470 value
+= items
[item
];
2473 if (j
== string::npos
) break;
2479 size_t length
= 200;
2480 if (args
.size() > 1 && !args
[1].empty()) {
2481 if (!parse_unsigned(args
[1].c_str(), length
)) {
2482 throw "Snippet length must be >= 0";
2485 unsigned flags
= mset
.SNIPPET_BACKGROUND_MODEL
|
2486 mset
.SNIPPET_EXHAUSTIVE
;
2487 if (args
.size() > 2 && !args
[2].empty()) {
2489 const string
& s
= args
[2];
2492 size_t j
= s
.find('|', i
);
2493 string
flag(s
, i
, j
- i
);
2494 for (char& c
: flag
) {
2497 if (startswith(flag
, "snippet_")) {
2498 flag
.erase(0, CONST_STRLEN("snippet_"));
2500 if (flag
== "background_model") {
2501 flags
|= mset
.SNIPPET_BACKGROUND_MODEL
;
2502 } else if (flag
== "cjk_ngram") {
2503 flags
|= mset
.SNIPPET_CJK_NGRAM
;
2504 } else if (flag
== "empty_without_match") {
2505 flags
|= mset
.SNIPPET_EMPTY_WITHOUT_MATCH
;
2506 } else if (flag
== "exhaustive") {
2507 flags
|= mset
.SNIPPET_EXHAUSTIVE
;
2508 } else if (flag
== "ngrams") {
2509 flags
|= mset
.SNIPPET_NGRAMS
;
2510 } else if (flag
== "word_breaks") {
2511 flags
|= mset
.SNIPPET_WORD_BREAKS
;
2513 throw "Unknown $snippet flag '" + flag
+ "'";
2515 if (j
== string::npos
) break;
2519 string bra
, ket
, gap
;
2520 if (args
.size() > 3) {
2525 if (args
.size() > 4) {
2530 if (args
.size() > 5) {
2536 stemmer
= new Xapian::Stem(option
["stemmer"]);
2537 value
= mset
.snippet(args
[0], length
, *stemmer
, flags
,
2542 omegascript_sort(args
, value
);
2544 case CMD_sortableunserialise
:
2545 // FIXME: This uses printf %f - maybe we want more than 6
2546 // decimal places in some cases though...
2547 value
= double_to_string(Xapian::sortable_unserialise(args
[0]));
2551 if (args
.size() == 1) {
2558 string::size_type i
= 0;
2560 if (split
.empty()) {
2562 if (i
>= value
.size()) break;
2564 i
= value
.find(split
, i
);
2565 if (i
== string::npos
) break;
2567 value
.replace(i
, split
.size(), 1, '\t');
2573 int seed
= string_to_int(args
[0]);
2578 case CMD_stoplist
: {
2579 Xapian::TermIterator i
= qp
.stoplist_begin();
2580 Xapian::TermIterator end
= qp
.stoplist_end();
2582 if (!value
.empty()) value
+= '\t';
2589 value
= str(string_to_int(args
[0]) - string_to_int(args
[1]));
2592 Xapian::docid id
= q0
;
2593 if (args
.size() > 0 &&
2594 (!parse_unsigned(args
[0].c_str(), id
) || id
== 0)) {
2595 throw "Document id of the subdb command should be > 0";
2597 value
= subdbs
[(id
- 1) % subdbs
.size()].get_name();
2601 Xapian::docid id
= q0
;
2602 if (args
.size() > 0 &&
2603 (!parse_unsigned(args
[0].c_str(), id
) || id
== 0)) {
2604 throw "Document id of the subid command should be > 0";
2606 // This is the docid in the single shard.
2607 Xapian::docid shard_did
= (id
- 1) / subdbs
.size() + 1;
2608 // We now need to map this back to the docid in the collection
2609 // of shards specified by the DB parameter value which $subdb
2611 const SubDB
& subdb
= subdbs
[(id
- 1) % subdbs
.size()];
2612 value
= str(subdb
.map_docid(shard_did
));
2617 if (!parse_signed(args
[1].c_str(), start
)) {
2618 throw "Start value for substr command "
2619 "must be an integer";
2622 if (static_cast<size_t>(-start
) >= args
[0].size()) {
2625 start
= static_cast<int>(args
[0].size()) + start
;
2628 if (static_cast<size_t>(start
) >= args
[0].size()) break;
2630 size_t len
= string::npos
;
2631 if (args
.size() > 2) {
2633 if (!parse_signed(args
[2].c_str(), int_len
)) {
2634 throw "Length value for substr command "
2635 "must be an integer";
2638 len
= size_t(int_len
);
2640 len
= args
[0].size() - start
;
2641 if (static_cast<size_t>(-int_len
) >= len
) {
2644 len
-= static_cast<size_t>(-int_len
);
2648 value
.assign(args
[0], start
, len
);
2651 case CMD_suggestion
:
2652 value
= qp
.get_corrected_query_string();
2655 const string
& val
= args
[0];
2656 for (size_t i
= 1; i
< args
.size(); i
+= 2) {
2657 if (i
== args
.size() - 1) {
2658 // Handle optional "else" value.
2659 value
= eval(args
[i
], param
);
2662 if (val
== eval(args
[i
], param
)) {
2663 value
= eval(args
[i
+ 1], param
);
2669 case CMD_termprefix
:
2670 (void)prefix_from_term(&value
, args
[0]);
2673 // list of matching terms
2674 if (!enquire
) break;
2675 Xapian::TermIterator term
= enquire
->get_matching_terms_begin(q0
);
2677 while (term
!= enquire
->get_matching_terms_end(q0
)) {
2678 // check term was in the typed query so we ignore
2679 // boolean filter terms
2680 const string
& t
= *term
;
2681 if (termset
.find(t
) != termset
.end()) {
2688 // Return matching terms with specified prefix. We can't
2689 // use skip_to() as the terms aren't ordered by termname.
2690 const string
& pfx
= args
[0];
2691 while (term
!= enquire
->get_matching_terms_end(q0
)) {
2692 const string
& t
= *term
;
2693 if (startswith(t
, pfx
)) {
2701 if (!value
.empty()) value
.erase(value
.size() - 1);
2705 value
= str(topdoc
/ hits_per_page
+ 1);
2710 snprintf(buf
, sizeof(buf
), "%.6f", secs
);
2711 // MSVC's snprintf omits the zero byte if the string is
2712 // sizeof(buf) long.
2713 buf
[sizeof(buf
) - 1] = '\0';
2718 // first document on current page of hit list (counting from 0)
2719 value
= str(topdoc
);
2724 if (!args
.empty()) {
2725 if (!parse_signed(args
[0].c_str(), howmany
)) {
2726 throw "Number of terms for command "
2727 "topterms must be an integer";
2730 if (howmany
< 0) howmany
= 0;
2731 // List of expand terms
2733 OmegaExpandDecider
decider(db
, &termset
);
2735 if (!rset
.empty()) {
2736 set_expansion_scheme(*enquire
, option
);
2737 eset
= enquire
->get_eset(howmany
* 2, rset
, &decider
);
2738 } else if (mset
.size()) {
2743 // FIXME: what if mset does not start at first match?
2744 for (Xapian::docid did
: mset
) {
2745 tmp
.add_document(did
);
2746 if (--c
== 0) break;
2749 set_expansion_scheme(*enquire
, option
);
2750 eset
= enquire
->get_eset(howmany
* 2, tmp
, &decider
);
2753 // Don't show more than one word with the same stem.
2755 Xapian::ESetIterator i
;
2756 for (i
= eset
.begin(); i
!= eset
.end(); ++i
) {
2758 string stem
= (*stemmer
)(term
);
2759 if (stems
.find(stem
) != stems
.end()) continue;
2763 if (--howmany
== 0) break;
2765 if (!value
.empty()) value
.erase(value
.size() - 1);
2769 omegascript_transform(value
, args
);
2771 case CMD_truncate
: {
2772 unsigned int length
;
2773 if (!parse_unsigned(args
[1].c_str(), length
)) {
2774 throw "Length for truncate command must be >= 0";
2776 value
= generate_sample(args
[0],
2778 args
.size() > 2 ? args
[2] : string(),
2779 args
.size() > 3 ? args
[3] : string());
2783 const string
&list
= args
[0];
2784 if (list
.empty()) break;
2785 string::size_type split
= 0, split2
;
2788 split2
= list
.find('\t', split
);
2789 string
item(list
, split
, split2
- split
);
2792 } else if (item
!= prev
) {
2797 split
= UNSIGNED_OVERFLOW_OK(split2
+ 1);
2798 } while (split2
!= string::npos
);
2802 unordered_set
<string
> seen
;
2803 const string
&list
= args
[0];
2804 if (list
.empty()) break;
2805 string::size_type split
= 0, split2
;
2807 split2
= list
.find('\t', split
);
2808 string
item(list
, split
, split2
- split
);
2809 if (seen
.insert(item
).second
) {
2814 split
= UNSIGNED_OVERFLOW_OK(split2
+ 1);
2815 } while (split2
!= string::npos
);
2819 value
= str(binary_string_to_int(args
[0]));
2821 case CMD_unprefix
: {
2822 size_t prefix_len
= prefix_from_term(NULL
, args
[0]);
2823 value
.assign(args
[0], prefix_len
, string::npos
);
2827 const string
&term
= args
[0];
2828 Xapian::TermIterator i
= qp
.unstem_begin(term
);
2829 Xapian::TermIterator end
= qp
.unstem_end(term
);
2831 if (!value
.empty()) value
+= '\t';
2838 value
= Xapian::Unicode::toupper(args
[0]);
2841 url_encode(value
, args
[0]);
2844 Xapian::docid id
= q0
;
2845 Xapian::valueno slot
;
2846 if (!parse_unsigned(args
[0].c_str(), slot
)) {
2847 throw "Value slot number should be >= 0";
2849 if (args
.size() > 1 &&
2850 (!parse_unsigned(args
[1].c_str(), id
) || id
== 0)) {
2851 throw "Document id for value command must be > 0";
2853 value
= db
.get_document(id
).get_value(slot
);
2856 case CMD_valuelowerbound
: {
2857 Xapian::valueno slot
;
2858 if (!parse_unsigned(args
[0].c_str(), slot
)) {
2859 throw "Value slot number should be >= 0";
2861 value
= db
.get_value_lower_bound(slot
);
2864 case CMD_valueupperbound
: {
2865 Xapian::valueno slot
;
2866 if (!parse_unsigned(args
[0].c_str(), slot
)) {
2867 throw "Value slot number should be >= 0";
2869 value
= db
.get_value_upper_bound(slot
);
2873 value
= PACKAGE_STRING
;
2876 value
= double_to_string(weight
);
// User-defined macro invocation.  The caller's param[0] is prepended so that
// the call's own arguments become param[1], param[2], ... while the macro
// body is expanded.
2879 args
.insert(args
.begin(), param
[0]);
// Tags from CMD_MACRO upwards index into the macros vector.
2880 int macro_no
= func
->second
->tag
- CMD_MACRO
;
2881 assert(macro_no
>= 0 && unsigned(macro_no
) < macros
.size());
2882 // throw "Unknown function '" + var + "'";
2883 value
= eval(macros
[macro_no
], args
);
// A Xapian error raised while handling a $ code is recorded in error_msg,
// replacing any message stored earlier.
2888 } catch (const Xapian::Error
& e
) {
2889 // FIXME: this means we only see the most recent error in $error
2890 // - is that the best approach?
2891 error_msg
= e
.get_description();
// Copy whatever text follows the last '$' code.
2894 res
.append(fmt
, p
, string::npos
);
// Load the template named fmtfile from template_dir and return its expansion
// by eval() (with an empty parameter list).  Names rejected by vet_filename()
// are never opened.
//
// p_not_found: set to true on a failure path whose exact condition is on
// lines not shown in this excerpt.  NOTE(review): presumably this lets a
// caller that supplied a fallback detect a missing file - confirm.
2899 eval_file(const string
& fmtfile
, bool* p_not_found
)
2901 // Use -1 to indicate vet_filename() failed.
2903 if (vet_filename(fmtfile
)) {
2904 string file
= template_dir
+ fmtfile
;
2907 if (load_file(file
, fmt
)) {
2908 vector
<string
> noargs
;
2910 return eval(fmt
, noargs
);
2916 *p_not_found
= true;
2920 // FIXME: report why!
2921 string msg
= string("Couldn't read format template '") + fmtfile
+ '\'';
// A negative eno is the "filename was vetoed" sentinel; otherwise eno is
// passed to strerror() to describe the failure.
2924 msg
+= (eno
< 0 ? "name contains '..'" : strerror(eno
));
// Turn an index term into a form more suitable for showing to a user: prefer
// a word from the parsed query which stemmed to it, otherwise map a known
// term prefix back to its user-visible prefix.  term is taken by value and
// modified in place.
2931 pretty_term(string term
)
2933 // Just leave empty strings and single characters alone.
2934 if (term
.length() <= 1) return term
;
2936 // Assume unprefixed terms are unstemmed.
2937 if (!C_isupper(term
[0])) return term
;
2939 // Handle stemmed terms.
2940 bool stemmed
= (term
[0] == 'Z');
2942 // First of all, check if a term in the query stemmed to this one.
2943 Xapian::TermIterator u
= qp
.unstem_begin(term
);
2944 // There might be multiple words with the same stem, but we only want
2945 // one so just take the first.
2946 if (u
!= qp
.unstem_end(term
)) return *u
;
2952 bool add_quotes
= false;
2954 // Check if the term has a prefix.
2955 if (C_isupper(term
[0])) {
2956 // See if we have this prefix in the termprefix_to_userprefix map. If
2957 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2959 size_t prefix_len
= prefix_from_term(&prefix
, term
);
2961 map
<string
, string
>::const_iterator i
;
2962 i
= termprefix_to_userprefix
.find(prefix
);
2963 if (i
!= termprefix_to_userprefix
.end()) {
2964 string user_prefix
= i
->second
;
// Swap the leading term prefix for the user-visible prefix.
2966 term
.replace(0, prefix_len
, user_prefix
);
2968 // We don't have a prefix mapping for this, so just set a flag to
2969 // add quotes around the term.
// A term flagged as stemmed which reaches this point gets a trailing '.'.
2974 if (stemmed
) term
+= '.';
// Prepend an opening double quote.  NOTE(review): the guard on this statement
// and the matching closing quote are on lines not shown in this excerpt -
// presumably conditional on add_quotes.
2977 term
.insert(0, "\"");
// Evaluate the format `fmt` for the hit at index `hit_no` of `mset`.
//
// fmt    OmegaScript text handed to eval().
// param  Argument vector handed to eval() unchanged.
//
// Returns whatever eval(fmt, param) returns.
//
// Before evaluating, the per-hit variables `q0`, `weight`, `percent` and
// `collapsed` (all declared outside this excerpt) are refreshed from the
// MSet entry for `hit_no`.
2985 print_caption(const string
& fmt
, vector
<string
>& param
)
// Dereferencing the MSet entry yields the value stored in `q0` (the document
// id, per the Xapian MSetIterator API).
2987 q0
= *(mset
[hit_no
]);
2989 weight
= mset
[hit_no
].get_weight();
// The percentage is computed by the MSet itself from the same entry.
2990 percent
= mset
.convert_to_percent(mset
[hit_no
]);
2991 collapsed
= mset
[hit_no
].get_collapse_count();
2993 return eval(fmt
, param
);
// NOTE(review): the signature of the enclosing function is not visible in
// this excerpt; the comments below describe only the statements shown.
//
// Evaluate the template named by `fmtname`, then make sure the HTTP header
// block has been emitted unless `suppress_http_headers` is set.
// eval_file() is called with a single argument here.
3000 string output
= eval_file(fmtname
);
// Emit a default Content-Type only if none has been set yet, and record that
// one has now been sent.
3001 if (!set_content_type
&& !suppress_http_headers
) {
3002 cout
<< "Content-Type: text/html" << endl
;
3003 set_content_type
= true;
// Blank line written after the header line(s) when headers are not suppressed.
3005 if (!suppress_http_headers
) cout
<< endl
;
3008 // Ensure the headers have been output so that any exception gets
3009 // reported rather than giving a server error.
// Same header logic as above. NOTE(review): presumably this copy sits in an
// exception handler, given the comment - confirm against the full function.
3010 if (!set_content_type
&& !suppress_http_headers
) {
3011 cout
<< "Content-Type: text/html" << endl
;
3012 set_content_type
= true;
3014 if (!suppress_http_headers
) cout
<< endl
;
// Parse the query-related CGI parameters, at most once per run.
//
// The `query_parsed` flag makes every call after the first a no-op.
//
// Visible effects:
//  - decides whether the relevance set from the R parameters is kept
//    (`discard_rset`) and whether paging parameters are honoured
//    (`force_first_page`), based on xP, xDB and xFILTERS;
//  - computes `topdoc` from TOPDOC, the ">" / "<" links and the "[" / "#"
//    page parameters, optionally snapping it to a multiple of
//    `hits_per_page`;
//  - adds the document ids listed in the R parameters to `rset`.
//
// Throws a `const char*` message when TOPDOC, the page parameter, RAWSEARCH
// or an R document id is malformed.
//
// NOTE(review): the switch header, some branches and the declarations of
// `v`, `temp` and `d` are not visible in this excerpt.
3020 ensure_query_parsed()
// Run-once guard.
3022 if (query_parsed
) return;
3023 query_parsed
= true;
3025 // Should we discard the existing R-set recorded in R CGI parameters?
3026 bool discard_rset
= false;
3028 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
3030 bool force_first_page
= false;
3033 // get list of terms from previous iteration of query
3034 auto val
= cgi_params
.find("xP");
3035 if (val
!= cgi_params
.end()) {
3037 // If xP given, default to discarding any RSet and forcing the first
3038 // page of results. If the query is the same, or an extension of
3039 // the previous query, we adjust these again below.
3040 discard_rset
= true;
3041 force_first_page
= true;
// parse_queries() classifies the new query relative to the previous one.
3043 querytype result
= parse_queries(v
);
3050 case EXTENDED_QUERY
:
3051 // If we've changed database, force the first page of hits
3052 // and discard the R-set (since the docids will have changed)
3053 val
= cgi_params
.find("xDB");
// Leaving the switch here keeps both flags at their "true" defaults.
3054 if (val
!= cgi_params
.end() && val
->second
!= dbname
) break;
3055 if (result
== SAME_QUERY
&& force_first_page
) {
3056 val
= cgi_params
.find("xFILTERS");
// The recorded filter string is compared against both `filters` and
// `old_filters`.
3057 if (val
!= cgi_params
.end() && val
->second
!= filters
&&
3058 val
->second
!= old_filters
) {
3059 // Filters have changed since last query.
3061 force_first_page
= false;
3064 discard_rset
= false;
3068 if (!force_first_page
) {
3069 // Work out which mset element is the first hit we want
3071 val
= cgi_params
.find("TOPDOC");
3072 if (val
!= cgi_params
.end()) {
// parse_unsigned() writes straight into `topdoc`; failure is reported to the
// user via the thrown message.
3073 if (!parse_unsigned(val
->second
.c_str(), topdoc
)) {
3074 throw "TOPDOC parameter must be >= 0";
3078 // Handle next, previous, and page links
3079 if (cgi_params
.find(">") != cgi_params
.end()) {
3080 topdoc
+= hits_per_page
;
3081 } else if (cgi_params
.find("<") != cgi_params
.end()) {
// Only step back when that cannot take `topdoc` below zero.
3082 if (topdoc
>= hits_per_page
)
3083 topdoc
-= hits_per_page
;
// "[" is looked up first; "#" is only consulted when "[" is absent.
3086 } else if ((val
= cgi_params
.find("[")) != cgi_params
.end() ||
3087 (val
= cgi_params
.find("#")) != cgi_params
.end()) {
// Only the first character is checked for being a digit; atol() below then
// converts the leading run of digits.
3088 if (!C_isdigit(val
->second
[0])) {
3089 throw "Page parameter must be >= 0";
3091 long page
= atol(val
->second
.c_str());
3092 // Do something sensible for page 0 (we count pages from 1).
3093 if (page
== 0) page
= 1;
3094 topdoc
= (page
- 1) * hits_per_page
;
3097 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
3098 // Normally we snap TOPDOC like this so that things work nicely if
3099 // HITSPERPAGE is in a <select> or on radio buttons. If we're
3100 // postprocessing the output of omega and want variable sized pages,
3101 // this is unhelpful.
3102 bool raw_search
= false;
3103 val
= cgi_params
.find("RAWSEARCH");
3104 if (val
!= cgi_params
.end()) {
3106 if (!parse_unsigned(val
->second
.c_str(), temp
)) {
3107 throw "RAWSEARCH parameter must be >= 0";
// Any non-zero value enables raw search.
3109 raw_search
= bool(temp
);
// Integer division rounds `topdoc` down to the start of its page.
3112 if (!raw_search
) topdoc
= (topdoc
/ hits_per_page
) * hits_per_page
;
3115 if (!discard_rset
) {
3116 // put documents marked as relevant into the rset
// There may be several R parameters, and each value may hold several
// '.'-separated document ids.
3117 auto g
= cgi_params
.equal_range("R");
3118 for (auto i
= g
.first
; i
!= g
.second
; ++i
) {
3119 const string
& value
= i
->second
;
// `j` advances to the next '.' each iteration; find() returning npos ends
// the loop via the `j < value.size()` test.
3120 for (size_t j
= 0; j
< value
.size(); j
= value
.find('.', j
)) {
// Skip the separator(s) so `j` indexes the start of the next id.
3121 while (value
[j
] == '.') ++j
;
3123 if (!parse_unsigned(value
.c_str() + j
, d
) || d
== 0) {
3124 throw "Document id for 'R' parameter must be > 0";
3127 rset
.add_document(d
);
// NOTE(review): the signature for this body is not visible in this excerpt;
// presumably it is the ensure_match() forward-declared at the top of the
// file - confirm.
//
// Runs the match at most once (guarded by `done_query`), records the elapsed
// time in `secs`, and derives `last` and `topdoc` from the resulting `mset`.
3135 // run query if we haven't already
3139 if (done_query
) return;
// `secs` first holds the start time and is then replaced by the elapsed time
// for whatever runs between the two RealTime::now() calls (not visible here).
3141 secs
= RealTime::now();
3144 secs
= RealTime::now() - secs
;
// Start from the MSet's lower bound on the number of matches.
3147 last
= mset
.get_matches_lower_bound();
3149 // Otherwise topdoc ends up being -6 if it's non-zero!
// Move `topdoc` to the first hit of the page containing hit number `last`.
// NOTE(review): the condition guarding this assignment is not visible here.
3153 topdoc
= ((last
- 1) / hits_per_page
) * hits_per_page
;
3154 // last is the count of documents up to the end of the current page
3155 // (as returned by $last)
// Clamp `last` to the end of the current page when more matches follow it.
3156 if (topdoc
+ hits_per_page
< last
)
3157 last
= topdoc
+ hits_per_page
;
3161 // OmegaExpandDecider methods.
// Construct an expand decider.
//
// db_           Database reference. NOTE(review): presumably stored in the
//               `db` member via an initialiser list not visible here.
// querytermset  Terms of the current query; each one is reduced to a stem
//               and recorded in `exclude_stems`.
//               NOTE(review): the pointer is dereferenced in the loop below;
//               whether a null check precedes it is not visible - confirm.
//
// The stemmer is created from option["stemmer"] and stored in the `stemmer`
// member as a raw pointer. NOTE(review): ownership/cleanup is not visible in
// this excerpt.
3163 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database
& db_
,
3164 set
<string
> * querytermset
)
3167 // We'll want the stemmer for testing matches anyway.
3169 stemmer
= new Xapian::Stem(option
["stemmer"]);
3171 set
<string
>::const_iterator i
;
3172 for (i
= querytermset
->begin(); i
!= querytermset
->end(); ++i
) {
// NOTE(review): `term` is declared on a line not visible here - presumably a
// modifiable copy of `*i`.
3174 if (term
.empty()) continue;
// The first character is captured before the term is modified below.
3176 unsigned char ch
= term
[0];
3177 bool stemmed
= (ch
== 'Z');
// Second emptiness check. NOTE(review): presumably follows removal of the
// 'Z' marker on lines not visible here.
3180 if (term
.empty()) continue;
// An upper-case first character marks a term prefix, which is stripped so
// that only the word itself remains.
3184 if (C_isupper(ch
)) {
3185 size_t prefix_len
= prefix_from_term(NULL
, term
);
3186 term
.erase(0, prefix_len
);
// Terms not already marked as stemmed are stemmed now, so that
// `exclude_stems` holds stems only.
3189 if (!stemmed
) term
= (*stemmer
)(term
);
3191 exclude_stems
.insert(term
);
3197 OmegaExpandDecider::operator()(const string
& term
) const
3199 unsigned char ch
= term
[0];
3201 // Reject terms with a prefix.
3202 if (C_isupper(ch
)) return false;
3206 // Don't suggest stopwords.
3207 if (stopper(term
)) return false;
3210 // Reject small numbers.
3211 if (term
.size() < 4 && C_isdigit(ch
)) return false;
3213 // Reject terms containing a space.
3214 if (term
.find(' ') != string::npos
) return false;
3216 // Skip terms with stems in the exclude_stems set, to avoid suggesting
3217 // terms which are already in the query in some form.
3218 string stem
= (*stemmer
)(term
);
3219 if (exclude_stems
.find(stem
) != exclude_stems
.end())
3222 // Ignore terms that only occur once (hapaxes) since they aren't
3223 // useful for finding related documents - they only occur in a
3224 // document that's already been marked as relevant.
3225 // FIXME: add an expand option to ignore terms where
3226 // termfreq == rtermfreq.
3227 if (db
.get_termfreq(term
) <= 1) return false;