"MDL-12304, fix double text"
[moodle-linuxchix.git] / lib / searchlib.php
blob4fee4b482419686f02f6775b91181c342c2f23e7
1 <?php // $Id$
3 require_once($CFG->libdir.'/lexer.php');
// Token type identifiers carried by every search_token.
// The values are numeric strings, not integers.

define('TOKEN_USER', '0');      // user:foo
define('TOKEN_META', '1');      // subject:foo
define('TOKEN_EXACT', '2');     // +foo
define('TOKEN_NEGATE', '3');    // -foo
define('TOKEN_STRING', '4');    // plain word or "quoted phrase"
define('TOKEN_USERID', '5');    // userid:foo
define('TOKEN_DATEFROM', '6');  // datefrom:foo
define('TOKEN_DATETO', '7');    // dateto:foo
define('TOKEN_INSTANCE', '8');  // instance:foo
// Class to hold token/value pairs after they're parsed.

class search_token {
    // Sanitized token text, as returned by sanitize().
    var $value;
    // Token type; callers in this file pass one of the TOKEN_* constants.
    var $type;

    /**
     * Store the token type and the sanitized token text.
     *
     * A real __construct() is required: a method that merely shares the
     * class name is no longer treated as a constructor by PHP 8, which
     * would leave both properties unset after "new search_token(...)".
     *
     * @param string $type  token type identifier
     * @param string $value raw text taken from the user's search string
     */
    function __construct($type, $value) {
        $this->type = $type;
        $this->value = $this->sanitize($value);
    }

    /**
     * Old-style constructor name, kept so that code calling it explicitly
     * keeps working. Simply delegates to __construct().
     */
    function search_token($type, $value) {
        $this->__construct($type, $value);
    }

    /**
     * Try to clean up user input to avoid potential security issues.
     *
     * Applies addslashes() and then htmlspecialchars() to the text. This is
     * a best-effort measure only; code that places the value into SQL in a
     * non-string context still has to validate it there.
     *
     * @param string $userstring raw user text
     * @return string cleaned text
     */
    function sanitize($userstring) {
        return htmlspecialchars(addslashes($userstring));
    }

    /**
     * @return string the sanitized token text
     */
    function getValue() {
        return $this->value;
    }

    /**
     * @return string the token type given at construction
     */
    function getType() {
        return $this->type;
    }
}
// This class does the heavy lifting of lexing the search string into tokens.
// Using a full-blown lexer is probably overkill for this application, but
// might be useful for other tasks.

class search_lexer extends Lexer {

    /**
     * Build the lexer and register every search-syntax state on it.
     *
     * Every state is entered from the base "accept" state when its entry
     * pattern matches and is left again when its exit pattern matches. The
     * state names are the same as the handler method names of search_parser.
     *
     * A real __construct() is required: a method that merely shares the
     * class name is no longer treated as a constructor by PHP 8, which
     * would leave the lexer without any patterns.
     *
     * @param object $parser parser whose state handlers receive the lexed text
     */
    function __construct(&$parser) {
        // Initialise the base lexer. The parent initialiser is invoked by its
        // method name, exactly as before, because lexer.php is not visible
        // from here - NOTE(review): switch to parent::__construct() once the
        // parent is confirmed to provide it.
        $this->Lexer($parser);

        // state name, entry pattern (seen while in "accept"), exit pattern.
        // Registration order is kept exactly as it always was; presumably
        // earlier patterns take precedence, so the catch-all plain word
        // pattern must stay last - TODO confirm against the Lexer class.
        $states = array(
            array('indatefrom',     'datefrom:\S+', '\s'),  // datefrom:foo
            array('indateto',       'dateto:\S+',   '\s'),  // dateto:foo
            array('ininstance',     'instance:\S+', '\s'),  // instance:foo
            array('inuserid',       'userid:\S+',   '\s'),  // userid:foo
            array('inusername',     'user:\S+',     '\s'),  // user:foo
            array('inmeta',         'subject:\S+',  '\s'),  // subject:foo
            array('inrequired',     '\+\S+',        '\s'),  // +foo
            array('inexcluded',     '\-\S+',        '\s'),  // -foo
            array('inquotedstring', '"[^"]+',       '"'),   // "foo bar"
            array('plainstring',    '\S+',          '\s'),  // any other word
        );

        foreach ($states as $state) {
            list($name, $entrypattern, $exitpattern) = $state;
            $this->addEntryPattern($entrypattern, 'accept', $name);
            $this->addExitPattern($exitpattern, $name);
        }
    }

    /**
     * Old-style constructor name, kept so that code calling it explicitly
     * keeps working. Simply delegates to __construct().
     */
    function search_lexer(&$parser) {
        $this->__construct($parser);
    }
}
// This class takes care of sticking the proper token type/value pairs into
// the parsed token array.
// Most functions in this class should only be called by the lexer, the
// one exception being get_parsed_array() which returns the result.

class search_parser {
    // Tokens emitted so far. Starts out as an empty array so that callers
    // can always count() / foreach the result, even when nothing was parsed.
    var $tokens = array();

    /**
     * This function is called by the code that's interested in the result
     * of the parse operation.
     *
     * @return array list of search_token objects, in the order emitted
     */
    function get_parsed_array() {
        return $this->tokens;
    }

    /**
     * Internal helper shared by the "keyword:value" states.
     *
     * Drops the first strlen($keyword) characters of $content, trims the
     * remainder and emits it as a token of the given type. Content that is
     * not longer than the keyword (a state exit, or a keyword without a
     * parameter) emits nothing. The content is not checked to actually
     * start with $keyword; only the keyword's length is used.
     *
     * @param string $type    token type to emit
     * @param string $keyword keyword including the trailing colon
     * @param string $content text handed over by the lexer
     * @return bool always true
     */
    function add_keyword_token($type, $keyword, $content) {
        $skip = strlen($keyword);
        if (strlen($content) > $skip) {
            $this->tokens[] = new search_token($type, trim(substr($content, $skip)));
        }
        return true;
    }

    /*
     * Functions below this are part of the state machine for the parse
     * operation and should not be called directly.
     */

    // Base state. No output emitted.
    function accept() {
        return true;
    }

    // State for handling datefrom:foo constructs. Potentially emits a token.
    function indatefrom($content) {
        return $this->add_keyword_token(TOKEN_DATEFROM, 'datefrom:', $content);
    }

    // State for handling dateto:foo constructs. Potentially emits a token.
    function indateto($content) {
        return $this->add_keyword_token(TOKEN_DATETO, 'dateto:', $content);
    }

    // State for handling instance:foo constructs. Potentially emits a token.
    function ininstance($content) {
        return $this->add_keyword_token(TOKEN_INSTANCE, 'instance:', $content);
    }

    // State for handling userid:foo constructs. Potentially emits a token.
    function inuserid($content) {
        return $this->add_keyword_token(TOKEN_USERID, 'userid:', $content);
    }

    // State for handling user:foo constructs. Potentially emits a token.
    function inusername($content) {
        return $this->add_keyword_token(TOKEN_USER, 'user:', $content);
    }

    // State for handling subject:foo constructs. Potentially emits a
    // TOKEN_META token.
    function inmeta($content) {
        return $this->add_keyword_token(TOKEN_META, 'subject:', $content);
    }

    // State entered when we've seen a required string (+foo). Potentially
    // emits a token.
    function inrequired($content) {
        if (strlen($content) >= 2) {
            // Strip off the + sign; the remainder is emitted untrimmed.
            $this->tokens[] = new search_token(TOKEN_EXACT, substr($content, 1));
        }
        return true;
    }

    // State entered when we've seen an excluded string (-foo). Potentially
    // emits a token.
    function inexcluded($content) {
        if (strlen($content) >= 2) {
            // Strip off the - sign; the remainder is emitted untrimmed.
            $this->tokens[] = new search_token(TOKEN_NEGATE, substr($content, 1));
        }
        return true;
    }

    // State entered when we've seen a quoted string. Potentially emits a token.
    function inquotedstring($content) {
        if (strlen($content) >= 2) {
            // Strip off the opening quote. Whitespace inside the quotes is
            // kept as typed.
            $this->tokens[] = new search_token(TOKEN_STRING, substr($content, 1));
        }
        return true;
    }

    // State entered when we've seen an ordinary, non-quoted word. Potentially
    // emits a token.
    function plainstring($content) {
        if (trim($content) !== '') {
            // Anything that is not pure whitespace (the state exit) is emitted as is.
            $this->tokens[] = new search_token(TOKEN_STRING, $content);
        }
        return true;
    }
}
// Primitive function to generate a SQL string from a parse tree
// using TEXT indexes. If searches aren't suitable to use TEXT
// this function calls the default search_generate_SQL() one.
//
// $parsetree should be a parse tree generated by a
// search_lexer/search_parser combination (a list of objects offering
// getType() and getValue()). An empty or null parse tree yields ''.
// $datafield and $metafield are the fields placed inside MATCH (...);
// $metafield is optional. The remaining fields are only passed on to
// search_generate_SQL().
//
// Returns a SQL condition string, or '' when there is nothing to search
// for or a token of unknown type is met.
function search_generate_text_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
                                  $userfirstnamefield, $userlastnamefield, $timefield, $instancefield) {
    global $CFG;

    /// Nothing parsed (empty array, or null from a parser that never emitted
    /// a token): no condition at all. Checked first so that count()/foreach
    /// are never applied to a non-array.
    if (empty($parsetree)) {
        return '';
    }

    /// Reasons to switch to standard SQL generation:
    ///  - only mysql is supported for now
    ///  - some languages don't have "word separators" and MySQL FULLTEXT
    ///    doesn't perform well with them
    $nonseparatedlangs = array('ja_utf8', 'th_utf8', 'zh_cn_utf8', 'zh_tw_utf8');
    if ($CFG->dbfamily != 'mysql' || in_array(current_language(), $nonseparatedlangs)) {
        return search_generate_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
                                   $userfirstnamefield, $userlastnamefield, $timefield, $instancefield);
    }

    /// Here we'll accumulate non-textual tokens
    $non_text_tokens = array();

    /// Boolean-mode search expressions for the data and the meta field
    $datasearch_clause = '';
    $metasearch_clause = '';

    foreach ($parsetree as $token) {
        $type = $token->getType();
        $value = $token->getValue();

        switch ($type) {
            case TOKEN_STRING:
                if (strstr($value, ' ')) {
                    /// If it's a multiword token, quote it
                    $datasearch_clause .= '"' . $value . '" ';
                } else {
                    /// Simple word token, search for it as prefix
                    $datasearch_clause .= '+' . $value . '* ';
                }
                break;
            case TOKEN_EXACT:
                /// token must be exactly as requested
                $datasearch_clause .= '+' . $value . ' ';
                break;
            case TOKEN_NEGATE:
                /// token must not exist as prefix
                $datasearch_clause .= '-' . $value . '* ';
                break;
            case TOKEN_META:
                /// token in metafield, search for it as prefix
                $metasearch_clause .= '+' . $value . '* ';
                break;
            case TOKEN_USER:
            case TOKEN_USERID:
            case TOKEN_INSTANCE:
            case TOKEN_DATETO:
            case TOKEN_DATEFROM:
                /// delegate to standard search
                $non_text_tokens[] = $token;
                break;
            default:
                return '';
        }
    }

    /// Conditions are collected here and joined with AND at the end
    $conditions = array();

    /// Text search on $datafield (and optionally $metafield); needs $datafield
    if (!empty($datasearch_clause) && !empty($datafield)) {
        $fields = $datafield;
        if (!empty($metafield)) {
            $fields .= ', ' . $metafield;
        }
        $conditions[] = 'MATCH (' . $fields . ') AGAINST (' . "'" . trim($datasearch_clause) . "' IN BOOLEAN MODE)";
    }

    /// Text search restricted to $metafield; needs $metafield
    if (!empty($metasearch_clause) && !empty($metafield)) {
        $conditions[] = 'MATCH (' . $metafield . ') AGAINST (' . "'" . trim($metasearch_clause) . "' IN BOOLEAN MODE)";
    }

    /// Finally the non-text conditions, generated by the standard search
    if (!empty($non_text_tokens)) {
        $SQLString = search_generate_SQL($non_text_tokens, $datafield, $metafield, $mainidfield, $useridfield,
                                         $userfirstnamefield, $userlastnamefield, $timefield, $instancefield);
        if (!empty($SQLString)) {
            $conditions[] = $SQLString;
        }
    }

    return implode(' AND ', $conditions);
}
// Primitive function to generate a SQL string from a parse tree.
// Parameters:
//
// $parsetree should be a parse tree generated by a
// search_lexer/search_parser combination (a list of objects offering
// getType() and getValue()). An empty or null parse tree yields ''.
// Other parameters are the database fields to search / compare against.
//
// Returns the conditions for all tokens joined with AND, or '' when there
// is nothing to search for or a token of unknown type is met.
function search_generate_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
                             $userfirstnamefield, $userlastnamefield, $timefield, $instancefield) {
    global $CFG;

    /// Nothing parsed (empty array, or null from a parser that never emitted
    /// a token): no condition at all.
    if (empty($parsetree)) {
        return '';
    }

    $LIKE = sql_ilike();
    $REGEXP = ($CFG->dbfamily == 'postgres') ? '~*' : 'REGEXP';

    /// One entry per token that produced a condition. Joining them at the
    /// end guarantees that a token producing nothing (TOKEN_META without a
    /// $metafield) can't leave a dangling or leading AND behind.
    $conditions = array();

    foreach ($parsetree as $token) {
        $type = $token->getType();
        $value = $token->getValue();

        /// Under Oracle and MSSQL, transform TOKEN searches into STRING searches and trim +- chars
        if ($CFG->dbfamily == 'oracle' || $CFG->dbfamily == 'mssql') {
            $value = trim($value, '+-');
            if ($type == TOKEN_EXACT) {
                $type = TOKEN_STRING;
            }
        }

        switch ($type) {
            case TOKEN_STRING:
                $conditions[] = "(($datafield $LIKE '%$value%') OR ($metafield $LIKE '%$value%') )";
                break;
            case TOKEN_EXACT:
                $conditions[] = "(($datafield $REGEXP '[[:<:]]".$value."[[:>:]]') OR ($metafield $REGEXP '[[:<:]]".$value."[[:>:]]'))";
                break;
            case TOKEN_META:
                if ($metafield != '') {
                    $conditions[] = "($metafield $LIKE '%$value%')";
                }
                break;
            case TOKEN_USER:
                $conditions[] = "(($mainidfield = $useridfield) AND (($userfirstnamefield $LIKE '%$value%') OR ($userlastnamefield $LIKE '%$value%')))";
                break;
            /// The four cases below place the value into SQL WITHOUT quotes,
            /// so it is forced to an integer: the token text comes straight
            /// from the user's search string and must not be able to inject SQL.
            case TOKEN_USERID:
                $conditions[] = "($useridfield = " . (int)$value . ")";
                break;
            case TOKEN_INSTANCE:
                $conditions[] = "($instancefield = " . (int)$value . ")";
                break;
            case TOKEN_DATETO:
                $conditions[] = "($timefield <= " . (int)$value . ")";
                break;
            case TOKEN_DATEFROM:
                $conditions[] = "($timefield >= " . (int)$value . ")";
                break;
            case TOKEN_NEGATE:
                $conditions[] = "(NOT (($datafield $LIKE '%$value%') OR ($metafield $LIKE '%$value%')))";
                break;
            default:
                return '';
        }
    }

    return implode(' AND ', $conditions);
}