2 * Copyright 2004 The Apache Software Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 JAVA_UNICODE_ESCAPE=true;
20 USER_CHAR_STREAM=true;
23 PARSER_BEGIN(QueryParser)
25 package org.apache.lucene.queryParser;
27 import java.util.Vector;
31 import org.apache.lucene.index.Term;
32 import org.apache.lucene.analysis.*;
33 import org.apache.lucene.document.*;
34 import org.apache.lucene.search.*;
37 * This class is generated by JavaCC. The only method that clients should need
38 * to call is <a href="#parse">parse()</a>.
40 * The syntax for query strings is as follows:
41 * A Query is a series of clauses.
42 * A clause may be prefixed by:
44 * <li> a plus (<code>+</code>) or a minus (<code>-</code>) sign, indicating
45 * that the clause is required or prohibited respectively; or
46 * <li> a term followed by a colon, indicating the field to be searched.
47 * This enables one to construct queries which search multiple fields.
50 * A clause may be either:
52 * <li> a term, indicating all the documents that contain this term; or
53 * <li> a nested query, enclosed in parentheses. Note that this may be used
54 * with a <code>+</code>/<code>-</code> prefix to require any of a set of
58 * Thus, in BNF, the query grammar is:
60 * Query ::= ( Clause )*
61 * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
65 * Examples of appropriately formatted queries can be found in the <a
66 * href="http://jakarta.apache.org/lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java">test cases</a>.
70 * @author Peter Halacsy
71 * @author Tatu Saloranta
74 public class QueryParser {
// Conjunction codes: Conjunction() yields CONJ_AND / CONJ_OR, the first clause
// of a query is added with CONJ_NONE, and addClause() branches on the value.
76 private static final int CONJ_NONE = 0;
77 private static final int CONJ_AND = 1;
78 private static final int CONJ_OR = 2;
// Modifier codes: MOD_REQ is produced for <PLUS>, MOD_NOT for <MINUS> and <NOT>;
// addClause() turns them into the required / prohibited flags of a clause.
80 private static final int MOD_NONE = 0;
81 private static final int MOD_NOT = 10;
82 private static final int MOD_REQ = 11;
// Public values for the implicit operator (see setOperator / getOperator).
84 public static final int DEFAULT_OPERATOR_OR = 0;
85 public static final int DEFAULT_OPERATOR_AND = 1;
87 /** The actual operator that parser uses to combine query terms */
88 private int operator = DEFAULT_OPERATOR_OR;
91 * Whether terms of wildcard and prefix queries are to be automatically
92 * lower-cased or not. Default is <code>true</code>.
94 boolean lowercaseWildcardTerms = true;
// Locale passed to DateFormat.getDateInstance() in getRangeQuery(); it starts
// out as the JVM default locale and can be replaced through setLocale().
99 Locale locale = Locale.getDefault();
101 /** Parses a query string, returning a {@link org.apache.lucene.search.Query}.
102 * @param query the query string to be parsed.
103 * @param field the default field for query terms.
104 * @param analyzer used to find terms in the query text.
105 * @throws ParseException if the parsing fails
// Static convenience entry point: creates a throw-away QueryParser for the
// given default field and analyzer and delegates to the instance parse(String).
107 static public Query parse(String query, String field, Analyzer analyzer)
108 throws ParseException {
109 QueryParser parser = new QueryParser(field, analyzer);
110 return parser.parse(query);
113 /** Constructs a query parser.
114 * @param f the default field for query terms.
115 * @param a used to find terms in the query text.
117 public QueryParser(String f, Analyzer a) {
// Chains to the char-stream constructor with an empty reader; the text that is
// actually parsed is installed later through ReInit() in parse(String).
// NOTE(review): f and a are not used on the lines visible in this view --
// presumably they are stored in fields on elided lines; confirm.
118 this(new FastCharStream(new StringReader("")));
123 /** Parses a query string, returning a
124 * <a href="lucene.search.Query.html">Query</a>.
125 * @param query the query string to be parsed.
126 * @throws ParseException if the parsing fails
128 public Query parse(String query) throws ParseException {
// Re-point the parser at the new query text before parsing.
129 ReInit(new FastCharStream(new StringReader(query)));
// NOTE(review): the guarded statement(s) for the handlers below are elided in
// this view -- presumably the call into the Query production; confirm.
// Tokenizer errors are reported to callers as ParseException; only the
// message text is carried over, the TokenMgrError itself is not attached.
133 catch (TokenMgrError tme) {
134 throw new ParseException(tme.getMessage());
// Exceeding BooleanQuery's clause limit is likewise surfaced as a ParseException.
136 catch (BooleanQuery.TooManyClauses tmc) {
137 throw new ParseException("Too many boolean clauses");
142 * Sets the default slop for phrases. If zero, then exact phrase matches
143 * are required. Default value is zero.
// Stores the slop that getFieldQuery(String, Analyzer, String) applies to the
// PhraseQuery it builds for multi-token input.
145 public void setPhraseSlop(int phraseSlop) {
146 this.phraseSlop = phraseSlop;
150 * Gets the default slop for phrases.
// NOTE(review): the getter body is elided in this view -- presumably it simply
// returns the phraseSlop field; confirm.
152 public int getPhraseSlop() {
157 * Sets the boolean operator of the QueryParser.
158 * In classic mode (<code>DEFAULT_OPERATOR_OR</code>) terms without any modifiers
159 * are considered optional: for example <code>capital of Hungary</code> is equal to
160 * <code>capital OR of OR Hungary</code>.<br/>
161 * In <code>DEFAULT_OPERATOR_AND</code> terms are considered to be in conjunction: the
162 * above mentioned query is parsed as <code>capital AND of AND Hungary</code>
// The value is stored without validation; addClause() compares it against
// DEFAULT_OPERATOR_OR and DEFAULT_OPERATOR_AND.
164 public void setOperator(int operator) {
165 this.operator = operator;
169 * Gets implicit operator setting, which will be either DEFAULT_OPERATOR_AND
170 * or DEFAULT_OPERATOR_OR.
// NOTE(review): the getter body is elided in this view -- presumably it returns
// the operator field; confirm.
172 public int getOperator() {
// Enables or disables the lower-casing that getWildcardQuery() and
// getPrefixQuery() apply to their term text.
176 public void setLowercaseWildcardTerms(boolean lowercaseWildcardTerms) {
177 this.lowercaseWildcardTerms = lowercaseWildcardTerms;
// Returns the current lower-casing flag.
180 public boolean getLowercaseWildcardTerms() {
181 return lowercaseWildcardTerms;
185 * Set locale used by date range parsing.
// Replaces the locale that getRangeQuery() hands to DateFormat.getDateInstance().
187 public void setLocale(Locale locale) {
188 this.locale = locale;
192 * Returns current locale, allowing access by subclasses.
// NOTE(review): the getter body is elided in this view -- presumably it returns
// the locale field; confirm.
194 public Locale getLocale() {
// Appends q to clauses as a BooleanClause whose required / prohibited flags are
// derived from the conjunction that introduced it (conj), its modifier (mods)
// and the parser's default operator. Several lines of this method are elided
// in this view, so the notes below cover only what is visible.
198 protected void addClause(Vector clauses, int conj, int mods, Query q) {
199 boolean required, prohibited;
201 // If this term is introduced by AND, make the preceding term required,
202 // unless it's already prohibited
203 if (conj == CONJ_AND) {
// NOTE(review): elementAt(size()-1) needs at least one clause already in the
// vector. The comment further down says a null query may be passed in (term
// filtered away by the analyzer); confirm this line cannot be reached with an
// empty vector when the preceding term produced no clause.
204 BooleanClause c = (BooleanClause) clauses.elementAt(clauses.size()-1);
209 if (operator == DEFAULT_OPERATOR_AND && conj == CONJ_OR) {
210 // If this term is introduced by OR, make the preceding term optional,
211 // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b)
212 // notice if the input is a OR b, first term is parsed as required; without
213 // this modification a OR b would be parsed as +a OR b
// NOTE(review): same non-empty-vector assumption as in the AND case above.
214 BooleanClause c = (BooleanClause) clauses.elementAt(clauses.size()-1);
219 // We might have been passed a null query; the term might have been
220 // filtered away by the analyzer.
224 if (operator == DEFAULT_OPERATOR_OR) {
225 // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
226 // introduced by NOT or -; make sure not to set both.
227 prohibited = (mods == MOD_NOT);
228 required = (mods == MOD_REQ);
229 if (conj == CONJ_AND && !prohibited) {
233 // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED
234 // if not PROHIBITED and not introduced by OR
235 prohibited = (mods == MOD_NOT);
236 required = (!prohibited && conj != CONJ_OR);
// The new clause is always appended at the end of the vector.
238 clauses.addElement(new BooleanClause(q, required, prohibited));
242 * @exception ParseException throw in overridden method to disallow
// Runs queryText through the analyzer for the given field and turns the
// resulting tokens into a query: one token becomes a TermQuery, several become
// a PhraseQuery carrying the parser's default phraseSlop. The token-reading
// loop and the zero-token branch are elided in this view.
244 protected Query getFieldQuery(String field,
246 String queryText) throws ParseException {
247 // Use the analyzer to get all the tokens, and then build a TermQuery,
248 // PhraseQuery, or nothing based on the term count
250 TokenStream source = analyzer.tokenStream(field,
251 new StringReader(queryText));
// v collects the term text of every token, in stream order.
252 Vector v = new Vector();
253 org.apache.lucene.analysis.Token t;
// NOTE(review): the bodies of both IOException handlers are elided here;
// confirm how read/close failures on the token stream are treated.
259 catch (IOException e) {
264 v.addElement(t.termText());
269 catch (IOException e) {
275 else if (v.size() == 1)
276 return new TermQuery(new Term(field, (String) v.elementAt(0)));
// More than one token: build a phrase from the terms in their original order.
278 PhraseQuery q = new PhraseQuery();
279 q.setSlop(phraseSlop);
280 for (int i=0; i<v.size(); i++) {
281 q.add(new Term(field, (String) v.elementAt(i)));
288 * Base implementation delegates to {@link #getFieldQuery(String,Analyzer,String)}.
289 * This method may be overridden, for example, to return
290 * a SpanNearQuery instead of a PhraseQuery.
292 * @exception ParseException throw in overridden method to disallow
// Slop-aware variant: builds the query with the three-argument overload and,
// only when the result is a PhraseQuery, replaces its slop with the given
// value. Any other query type is left as built.
294 protected Query getFieldQuery(String field,
297 int slop) throws ParseException {
298 Query query = getFieldQuery(field, analyzer, queryText);
300 if (query instanceof PhraseQuery) {
301 ((PhraseQuery) query).setSlop(slop);
308 * @exception ParseException throw in overridden method to disallow
// Builds a RangeQuery over [part1, part2] for the field. Both bounds are first
// tried as dates in DateFormat.SHORT style for the configured locale; if both
// parse, they are replaced by their DateField string form.
310 protected Query getRangeQuery(String field,
314 boolean inclusive) throws ParseException
317 DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale);
// Both bounds are parsed before either is overwritten, so a failure on the
// second bound leaves the first one unconverted as well.
319 Date d1 = df.parse(part1);
320 Date d2 = df.parse(part2);
321 part1 = DateField.dateToString(d1);
322 part2 = DateField.dateToString(d2);
// Deliberate best-effort: any failure above is ignored and the bounds are
// used as the plain strings that were passed in.
324 catch (Exception e) { }
326 return new RangeQuery(new Term(field, part1),
327 new Term(field, part2),
332 * Factory method for generating query, given a set of clauses.
333 * By default creates a boolean query composed of clauses passed in.
335 * Can be overridden by extending classes, to modify query being
338 * @param clauses Vector that contains {@link BooleanClause} instances
341 * @return Resulting {@link Query} object.
342 * @exception ParseException throw in overridden method to disallow
// Default factory: copies every BooleanClause from the vector, in order, into
// a fresh BooleanQuery. Subclasses may override to alter the combined query.
344 protected Query getBooleanQuery(Vector clauses) throws ParseException
346 BooleanQuery query = new BooleanQuery();
347 for (int i = 0; i < clauses.size(); i++) {
348 query.add((BooleanClause)clauses.elementAt(i));
354 * Factory method for generating a query. Called when parser
355 * parses an input term token that contains one or more wildcard
356 * characters (? and *), but is not a prefix term token (one
357 * that has just a single * character at the end)
359 * Depending on settings, a wildcard term may be lower-cased
360 * automatically. It will not go through the default Analyzer,
361 * however, since normal Analyzers are unlikely to work properly
362 * with wildcard templates.
364 * Can be overridden by extending classes, to provide custom handling for
365 * wildcard queries, which may be necessary due to missing analyzer calls.
367 * @param field Name of the field query will use.
368 * @param termStr Term token that contains one or more wild card
369 * characters (? or *), but is not simple prefix term
371 * @return Resulting {@link Query} built for the term
372 * @exception ParseException throw in overridden method to disallow
// Wraps termStr (wildcards included) in a WildcardQuery on the given field,
// lower-casing it first when lowercaseWildcardTerms is set. The analyzer is
// not consulted.
374 protected Query getWildcardQuery(String field, String termStr) throws ParseException
376 if (lowercaseWildcardTerms) {
// NOTE(review): toLowerCase() is called without a Locale argument, so the
// parser's own locale field is not used for the case mapping.
377 termStr = termStr.toLowerCase();
379 Term t = new Term(field, termStr);
380 return new WildcardQuery(t);
384 * Factory method for generating a query (similar to
385 * {@link #getWildcardQuery}). Called when parser parses an input term
386 * token that uses prefix notation; that is, contains a single '*' wildcard
387 * character as its last character. Since this is a special case
388 * of generic wildcard term, and such a query can be optimized easily,
389 * this usually results in a different query object.
391 * Depending on settings, a prefix term may be lower-cased
392 * automatically. It will not go through the default Analyzer,
393 * however, since normal Analyzers are unlikely to work properly
394 * with wildcard templates.
396 * Can be overridden by extending classes, to provide custom handling for
397 * wild card queries, which may be necessary due to missing analyzer calls.
399 * @param field Name of the field query will use.
400 * @param termStr Term token to use for building term for the query
401 * (<b>without</b> trailing '*' character!)
403 * @return Resulting {@link Query} built for the term
404 * @exception ParseException throw in overridden method to disallow
// Wraps termStr (passed without its trailing '*') in a PrefixQuery on the given
// field, lower-casing it first when lowercaseWildcardTerms is set. The analyzer
// is not consulted.
406 protected Query getPrefixQuery(String field, String termStr) throws ParseException
408 if (lowercaseWildcardTerms) {
// NOTE(review): toLowerCase() is called without a Locale argument, so the
// parser's own locale field is not used for the case mapping.
409 termStr = termStr.toLowerCase();
411 Term t = new Term(field, termStr);
412 return new PrefixQuery(t);
416 * Factory method for generating a query (similar to
417 * {@link #getWildcardQuery}). Called when parser parses
418 * an input term token that has the fuzzy suffix (~) appended.
420 * @param field Name of the field query will use.
421 * @param termStr Term token to use for building term for the query
423 * @return Resulting {@link Query} built for the term
424 * @exception ParseException throw in overridden method to disallow
// Wraps termStr unchanged in a FuzzyQuery on the given field; unlike the
// wildcard and prefix factories, no lower-casing is applied here.
426 protected Query getFuzzyQuery(String field, String termStr) throws ParseException
428 Term t = new Term(field, termStr);
429 return new FuzzyQuery(t);
433 * Returns a String where the escape char has been
434 * removed, or kept only once if there was a double escape.
436 private String discardEscapeChar(String input) {
437 char[] caSource = input.toCharArray();
438 char[] caDest = new char[caSource.length];
440 for (int i = 0; i < caSource.length; i++) {
441 if ((caSource[i] != '\\') || (i > 0 && caSource[i-1] == '\\')) {
442 caDest[j++]=caSource[i];
445 return new String(caDest, 0, j);
// Command-line smoke test: parses args[0] with SimpleAnalyzer against the
// default field "field" and prints the resulting query.
// NOTE(review): args[0] is read without a length check, so running with no
// argument fails with ArrayIndexOutOfBoundsException.
448 public static void main(String[] args) throws Exception {
449 QueryParser qp = new QueryParser("field",
450 new org.apache.lucene.analysis.SimpleAnalyzer());
451 Query q = qp.parse(args[0]);
452 System.out.println(q.toString("field"));
456 PARSER_END(QueryParser)
458 /* ***************** */
459 /* Token Definitions */
460 /* ***************** */
// Private (#-prefixed) helper expressions; they are only referenced from other
// token definitions and never produced as tokens themselves.
// _NUM_CHAR: a single decimal digit.
463 <#_NUM_CHAR: ["0"-"9"] >
// _ESCAPED_CHAR: a backslash followed by one of the query-syntax characters.
464 | <#_ESCAPED_CHAR: "\\" [ "\\", "+", "-", "!", "(", ")", ":", "^",
465 "[", "]", "\"", "{", "}", "~", "*", "?" ] >
// _TERM_START_CHAR: any character that is neither whitespace nor a
// query-syntax character, or an escaped syntax character.
466 | <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "+", "-", "!", "(", ")", ":", "^",
467 "[", "]", "\"", "{", "}", "~", "*", "?" ]
468 | <_ESCAPED_CHAR> ) >
// _TERM_CHAR: additionally allows unescaped "-" and "+" after the first character.
469 | <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) >
470 | <#_WHITESPACE: ( " " | "\t" | "\n" | "\r") >
// Skip rule shared by the DEFAULT, RangeIn and RangeEx lexical states.
// NOTE(review): the skipped expression itself is elided in this view --
// presumably <_WHITESPACE>; confirm.
473 <DEFAULT, RangeIn, RangeEx> SKIP : {
477 // OG: to support prefix queries:
478 // http://nagoya.apache.org/bugzilla/show_bug.cgi?id=12137
480 // | <WILDTERM: <_TERM_START_CHAR>
481 // (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
484 // | <WILDTERM: (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
// Token alternatives visible for the default lexical state (the enclosing
// TOKEN header and a few alternatives are elided in this view).
// Boolean operators, each with a keyword and a symbolic spelling.
487 <AND: ("AND" | "&&") >
488 | <OR: ("OR" | "||") >
489 | <NOT: ("NOT" | "!") >
// "^" switches the lexer to the Boost state.
495 | <CARAT: "^" > : Boost
// A double-quoted run of one or more non-quote characters.
496 | <QUOTED: "\"" (~["\""])+ "\"">
497 | <TERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* >
// "~" followed by one or more digits.
499 | <SLOP: "~" (<_NUM_CHAR>)+ >
// A term ending in a single "*".
500 | <PREFIXTERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
// A term that may contain "*" or "?" after its first character.
501 | <WILDTERM: <_TERM_START_CHAR>
502 (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
// "[" and "{" switch the lexer to the inclusive / exclusive range states.
503 | <RANGEIN_START: "[" > : RangeIn
504 | <RANGEEX_START: "{" > : RangeEx
// NOTE(review): the lexical-state headers for the groups below are elided in
// this view; the grouping described here is inferred from the state switches
// on <CARAT>, <RANGEIN_START> and <RANGEEX_START> -- confirm.
// Boost value: digits with an optional fractional part; returns to DEFAULT.
508 <NUMBER: (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
// Inclusive range: "]" returns to DEFAULT; a bound is either a quoted string
// or a run of characters containing neither a space nor "]".
513 | <RANGEIN_END: "]"> : DEFAULT
514 | <RANGEIN_QUOTED: "\"" (~["\""])+ "\"">
515 | <RANGEIN_GOOP: (~[ " ", "]" ])+ >
// Exclusive range: same shape, terminated by "}".
520 | <RANGEEX_END: "}"> : DEFAULT
521 | <RANGEEX_QUOTED: "\"" (~["\""])+ "\"">
522 | <RANGEEX_GOOP: (~[ " ", "}" ])+ >
525 // * Query ::= ( Clause )*
526 // * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
// Maps an <AND> / <OR> token to the matching CONJ_* code.
// NOTE(review): the declaration of ret, the optional wrapper and the return
// are elided in this view.
528 int Conjunction() : {
533 <AND> { ret = CONJ_AND; }
534 | <OR> { ret = CONJ_OR; }
// NOTE(review): the header of this production is elided in this view --
// presumably the Modifiers() production called from Query(); confirm.
// <PLUS> marks a clause as required; <MINUS> and <NOT> both map to MOD_NOT.
544 <PLUS> { ret = MOD_REQ; }
545 | <MINUS> { ret = MOD_NOT; }
546 | <NOT> { ret = MOD_NOT; }
// Top-level production: parses a first clause, then any number of
// [conjunction] [modifier] clause groups, collecting them through addClause(),
// and finally combines them with getBooleanQuery().
551 Query Query(String field) :
553 Vector clauses = new Vector();
554 Query q, firstQuery=null;
558 mods=Modifiers() q=Clause(field)
// The first clause has no preceding conjunction.
560 addClause(clauses, CONJ_NONE, mods, q);
// NOTE(review): the statement guarded by this test is elided -- presumably it
// remembers q in firstQuery for the single-clause shortcut below; confirm.
561 if (mods == MOD_NONE)
565 conj=Conjunction() mods=Modifiers() q=Clause(field)
566 { addClause(clauses, conj, mods, q); }
// Single-clause shortcut; the statement it guards is elided in this view.
569 if (clauses.size() == 1 && firstQuery != null)
572 return getBooleanQuery(clauses);
// Parses one clause: an optional "field:" prefix followed by either a term or
// a parenthesised sub-query with an optional "^boost" suffix.
577 Query Clause(String field) : {
579 Token fieldToken=null, boost=null;
// An explicit field name overrides the inherited default for this clause;
// escape characters are stripped from it first.
584 fieldToken=<TERM> <COLON> {
585 field=discardEscapeChar(fieldToken.image);
591 | <LPAREN> q=Query(field) <RPAREN> (<CARAT> boost=<NUMBER>)?
// Boost defaults to 1.0; a boost image that cannot be converted is silently
// ignored by the catch below, leaving the default in place.
596 float f = (float)1.0;
598 f = Float.valueOf(boost.image).floatValue();
600 } catch (Exception ignored) { }
607 Query Term(String field) : {
608 Token term, boost=null, slop=null, goop1, goop2;
609 boolean prefix = false;
610 boolean wildcard = false;
611 boolean fuzzy = false;
612 boolean rangein = false;
619 | term=<PREFIXTERM> { prefix=true; }
620 | term=<WILDTERM> { wildcard=true; }
623 [ <FUZZY> { fuzzy=true; } ]
624 [ <CARAT> boost=<NUMBER> [ <FUZZY> { fuzzy=true; } ] ]
626 String termImage=discardEscapeChar(term.image);
628 q = getWildcardQuery(field, termImage);
630 q = getPrefixQuery(field,
631 discardEscapeChar(term.image.substring
632 (0, term.image.length()-1)));
634 q = getFuzzyQuery(field, termImage);
636 q = getFieldQuery(field, analyzer, termImage);
639 | ( <RANGEIN_START> ( goop1=<RANGEIN_GOOP>|goop1=<RANGEIN_QUOTED> )
640 [ <RANGEIN_TO> ] ( goop2=<RANGEIN_GOOP>|goop2=<RANGEIN_QUOTED> )
642 [ <CARAT> boost=<NUMBER> ]
644 if (goop1.kind == RANGEIN_QUOTED) {
645 goop1.image = goop1.image.substring(1, goop1.image.length()-1);
647 goop1.image = discardEscapeChar(goop1.image);
649 if (goop2.kind == RANGEIN_QUOTED) {
650 goop2.image = goop2.image.substring(1, goop2.image.length()-1);
652 goop2.image = discardEscapeChar(goop2.image);
654 q = getRangeQuery(field, analyzer, goop1.image, goop2.image, true);
656 | ( <RANGEEX_START> ( goop1=<RANGEEX_GOOP>|goop1=<RANGEEX_QUOTED> )
657 [ <RANGEEX_TO> ] ( goop2=<RANGEEX_GOOP>|goop2=<RANGEEX_QUOTED> )
659 [ <CARAT> boost=<NUMBER> ]
661 if (goop1.kind == RANGEEX_QUOTED) {
662 goop1.image = goop1.image.substring(1, goop1.image.length()-1);
664 goop1.image = discardEscapeChar(goop1.image);
666 if (goop2.kind == RANGEEX_QUOTED) {
667 goop2.image = goop2.image.substring(1, goop2.image.length()-1);
669 goop2.image = discardEscapeChar(goop2.image);
672 q = getRangeQuery(field, analyzer, goop1.image, goop2.image, false);
676 [ <CARAT> boost=<NUMBER> ]
682 s = Float.valueOf(slop.image.substring(1)).intValue();
684 catch (Exception ignored) { }
686 q = getFieldQuery(field, analyzer,
687 term.image.substring(1, term.image.length()-1),
693 float f = (float) 1.0;
695 f = Float.valueOf(boost.image).floatValue();
697 catch (Exception ignored) {
698 /* Should this be handled somehow? (defaults to "no boost", if
699 * boost number is invalid)
703 // avoid boosting null queries, such as those caused by stop words