lib/searchlib.php

   1 <?php  // $Id$
   2
   3 require_once($CFG->libdir.'/lexer.php');
   4
   5 // Constants for the various types of tokens
   6
   7 define("TOKEN_USER","0");
   8 define("TOKEN_META","1");
   9 define("TOKEN_EXACT","2");
  10 define("TOKEN_NEGATE","3");
  11 define("TOKEN_STRING","4");
  12 define("TOKEN_USERID","5");
  13 define("TOKEN_DATEFROM","6");
  14 define("TOKEN_DATETO","7");
  15 define("TOKEN_INSTANCE","8");
  16
  17 // Class to hold token/value pairs after they're parsed.
  18
  19 class search_token {
  20   var $value;
  21   var $type;
  22   function search_token($type,$value){
  23     $this->type = $type;
  24     $this->value = $this->sanitize($value);
  25
  26   }
  27
  28   // Try to clean up user input to avoid potential security issues.
  29   // Need to think about this some more.
  30
  31   function sanitize($userstring){
  32     return htmlspecialchars(addslashes($userstring));
  33   }
  34   function getValue(){
  35     return $this->value;
  36   }
  37   function getType(){
  38     return $this->type;
  39   }
  40 }
  41
  42
  43
  44 // This class does the heavy lifting of lexing the search string into tokens.
  45 // Using a full-blown lexer is probably overkill for this application, but
  46 // might be useful for other tasks.
  47
  48 class search_lexer extends Lexer{
  49
  50   function search_lexer(&$parser){
  51
  52     // Call parent constructor.
  53     $this->Lexer($parser);
  54
  55     //Set up the state machine and pattern matches for transitions.
  56
  57     // Patterns to handle strings  of the form datefrom:foo
  58
  59     // If we see the string datefrom: while in the base accept state, start
  60     // parsing a username and go to the indatefrom state.
  61     $this->addEntryPattern("datefrom:\S+","accept","indatefrom");
  62
  63     // Snarf everything into the username until we see whitespace, then exit
  64     // back to the base accept state.
  65     $this->addExitPattern("\s","indatefrom");
  66
  67
  68     // Patterns to handle strings  of the form dateto:foo
  69
  70     // If we see the string dateto: while in the base accept state, start
  71     // parsing a username and go to the indateto state.
  72     $this->addEntryPattern("dateto:\S+","accept","indateto");
  73
  74     // Snarf everything into the username until we see whitespace, then exit
  75     // back to the base accept state.
  76     $this->addExitPattern("\s","indateto");
  77
  78
  79     // Patterns to handle strings  of the form instance:foo
  80
  81     // If we see the string instance: while in the base accept state, start
  82     // parsing for instance number and go to the ininstance state.
  83     $this->addEntryPattern("instance:\S+","accept","ininstance");
  84
  85     // Snarf everything into the username until we see whitespace, then exit
  86     // back to the base accept state.
  87     $this->addExitPattern("\s","ininstance");
  88
  89
  90     // Patterns to handle strings  of the form userid:foo
  91
  92     // If we see the string userid: while in the base accept state, start
  93     // parsing a username and go to the inuserid state.
  94     $this->addEntryPattern("userid:\S+","accept","inuserid");
  95
  96     // Snarf everything into the username until we see whitespace, then exit
  97     // back to the base accept state.
  98     $this->addExitPattern("\s","inuserid");
  99
 100
 101     // Patterns to handle strings  of the form user:foo
 102
 103     // If we see the string user: while in the base accept state, start
 104     // parsing a username and go to the inusername state.
 105     $this->addEntryPattern("user:\S+","accept","inusername");
 106
 107     // Snarf everything into the username until we see whitespace, then exit
 108     // back to the base accept state.
 109     $this->addExitPattern("\s","inusername");
 110
 111
 112     // Patterns to handle strings  of the form meta:foo
 113
 114    // If we see the string meta: while in the base accept state, start
 115     // parsing a username and go to the inmeta state.
 116     $this->addEntryPattern("subject:\S+","accept","inmeta");
 117
 118     // Snarf everything into the meta token until we see whitespace, then exit
 119     // back to the base accept state.
 120     $this->addExitPattern("\s","inmeta");
 121
 122
 123     // Patterns to handle required exact match strings (+foo) .
 124
 125     // If we see a + sign  while in the base accept state, start
 126     // parsing an exact match string and enter the inrequired state
 127     $this->addEntryPattern("\+\S+","accept","inrequired");
 128     // When we see white space, exit back to accept state.
 129     $this->addExitPattern("\s","inrequired");
 130
 131     // Handle excluded strings (-foo)
 132
 133    // If we see a - sign  while in the base accept state, start
 134     // parsing an excluded string and enter the inexcluded state
 135     $this->addEntryPattern("\-\S+","accept","inexcluded");
 136     // When we see white space, exit back to accept state.
 137     $this->addExitPattern("\s","inexcluded");
 138
 139
 140     // Patterns to handle quoted strings.
 141
 142     // If we see a quote  while in the base accept state, start
 143     // parsing a quoted string and enter the inquotedstring state.
 144     // Grab everything until we see the closing quote.
 145
 146     $this->addEntryPattern("\"[^\"]+","accept","inquotedstring");
 147
 148     // When we see a closing quote, reenter the base accept state.
 149     $this->addExitPattern("\"","inquotedstring");
 150
 151     // Patterns to handle ordinary, nonquoted words.
 152
 153     // When we see non-whitespace, snarf everything into the nonquoted word
 154     // until we see whitespace again.
 155     $this->addEntryPattern("\S+","accept","plainstring");
 156
 157     // Once we see whitespace, reenter the base accept state.
 158     $this->addExitPattern("\s","plainstring");
 159
 160   }
 161 }
 162
 163
 164
 165
 166 // This class takes care of sticking the proper token type/value pairs into
 167 // the parsed token  array.
 168 // Most functions in this class should only be called by the lexer, the
 169 // one exception being getParseArray() which returns the result.
 170
 171 class search_parser {
 172     var $tokens;
 173
 174
 175     // This function is called by the code that's interested in the result of the parse operation.
 176     function get_parsed_array(){
 177         return $this->tokens;
 178     }
 179
 180     /*
 181      * Functions below this are part of the state machine for the parse
 182      * operation and should not be called directly.
 183      */
 184
 185     // Base state. No output emitted.
 186     function accept() {
 187         return true;
 188     }
 189
 190     // State for handling datefrom:foo constructs. Potentially emits a token.
 191     function indatefrom($content){
 192         if (strlen($content) < 10) { // State exit or missing parameter.
 193             return true;
 194         }
 195         // Strip off the datefrom: part and add the reminder to the parsed token array
 196         $param = trim(substr($content,9));
 197         $this->tokens[] = new search_token(TOKEN_DATEFROM,$param);
 198         return true;
 199     }
 200
 201     // State for handling dateto:foo constructs. Potentially emits a token.
 202     function indateto($content){
 203         if (strlen($content) < 8) { // State exit or missing parameter.
 204             return true;
 205         }
 206         // Strip off the dateto: part and add the reminder to the parsed token array
 207         $param = trim(substr($content,7));
 208         $this->tokens[] = new search_token(TOKEN_DATETO,$param);
 209         return true;
 210     }
 211
 212     // State for handling instance:foo constructs. Potentially emits a token.
 213     function ininstance($content){
 214         if (strlen($content) < 10) { // State exit or missing parameter.
 215             return true;
 216         }
 217         // Strip off the instance: part and add the reminder to the parsed token array
 218         $param = trim(substr($content,9));
 219         $this->tokens[] = new search_token(TOKEN_INSTANCE,$param);
 220         return true;
 221     }
 222
 223
 224     // State for handling userid:foo constructs. Potentially emits a token.
 225     function inuserid($content){
 226         if (strlen($content) < 8) { // State exit or missing parameter.
 227             return true;
 228         }
 229         // Strip off the userid: part and add the reminder to the parsed token array
 230         $param = trim(substr($content,7));
 231         $this->tokens[] = new search_token(TOKEN_USERID,$param);
 232         return true;
 233     }
 234
 235
 236     // State for handling user:foo constructs. Potentially emits a token.
 237     function inusername($content){
 238         if (strlen($content) < 6) { // State exit or missing parameter.
 239             return true;
 240         }
 241         // Strip off the user: part and add the reminder to the parsed token array
 242         $param = trim(substr($content,5));
 243         $this->tokens[] = new search_token(TOKEN_USER,$param);
 244         return true;
 245     }
 246
 247
 248     // State for handling meta:foo constructs. Potentially emits a token.
 249     function inmeta($content){
 250         if (strlen($content) < 9) { // Missing parameter.
 251             return true;
 252         }
 253         // Strip off the meta: part and add the reminder to the parsed token array.
 254         $param = trim(substr($content,8));
 255         $this->tokens[] = new search_token(TOKEN_META,$param);
 256         return true;
 257     }
 258
 259
 260     // State entered when we've seen a required string (+foo). Potentially
 261     // emits a token.
 262     function inrequired($content){
 263         if (strlen($content) < 2) { // State exit or missing parameter, don't emit.
 264             return true;
 265         }
 266         // Strip off the + sign and add the reminder to the parsed token array.
 267         $this->tokens[] = new search_token(TOKEN_EXACT,substr($content,1));
 268         return true;
 269     }
 270
 271     // State entered when we've seen an excluded string (-foo). Potentially
 272     // emits a token.
 273     function inexcluded($content){
 274         if (strlen($content) < 2) { // State exit or missing parameter.
 275             return true;
 276         }
 277         // Strip off the -sign and add the reminder to the parsed token array.
 278         $this->tokens[] = new search_token(TOKEN_NEGATE,substr($content,1));
 279         return true;
 280     }
 281
 282
 283     // State entered when we've seen a quoted string. Potentially emits a token.
 284     function inquotedstring($content){
 285         if (strlen($content) < 2) { // State exit or missing parameter.
 286             return true;
 287         }
 288         // Strip off the opening quote and add the reminder to the parsed token array.
 289         $this->tokens[] = new search_token(TOKEN_STRING,substr($content,1));
 290         return true;
 291     }
 292
 293     // State entered when we've seen an ordinary, non-quoted word. Potentially
 294     // emits a token.
 295     function plainstring($content){
 296         if (ctype_space($content)) { // State exit
 297             return true;
 298         }
 299         // Add the string to the parsed token array.
 300         $this->tokens[] = new search_token(TOKEN_STRING,$content);
 301         return true;
 302     }
 303 }
 304
 305 // Primitive function to generate a SQL string from a parse tree
 306 // using TEXT indexes. If searches aren't suitable to use TEXT
 307 // this function calls the default search_generate_SQL() one.
 308 //
 309 // $parsetree should be a parse tree generated by a
 310 // search_lexer/search_parser combination.
 311 // Other fields are database table names to search.
 312 function search_generate_text_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
 313                              $userfirstnamefield, $userlastnamefield, $timefield, $instancefield) {
 314     global $CFG;
 315
 316 /// First of all, search for reasons to switch to standard SQL generation
 317 /// Only mysql are supported for now
 318     if ($CFG->dbfamily != 'mysql') {
 319         return search_generate_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
 320                                    $userfirstnamefield, $userlastnamefield, $timefield, $instancefield);
 321     }
 322
 323 /// Some languages don't have "word separators" and MySQL FULLTEXT doesn't perform well with them, so
 324 /// switch to standard SQL search generation
 325     if ($CFG->dbfamily == 'mysql') {
 326         $nonseparatedlangs = array('ja_utf8', 'th_utf8', 'zh_cn_utf8', 'zh_tw_utf8');
 327         if (in_array(current_language(), $nonseparatedlangs)) {
 328             return search_generate_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
 329                                        $userfirstnamefield, $userlastnamefield, $timefield, $instancefield);
 330         }
 331     }
 332
 333 /// Here we'll acumulate non-textual tokens
 334     $non_text_tokens = array();
 335
 336     $ntokens = count($parsetree);
 337     if ($ntokens == 0) {
 338         return "";
 339     }
 340
 341     $SQLString = '';
 342     $text_sql_string = '';
 343
 344     $datasearch_clause = '';
 345     $metasearch_clause = '';
 346
 347     foreach ($parsetree as $token) {
 348
 349         $type = $token->getType();
 350         $value = $token->getValue();
 351
 352         switch($type){
 353             case TOKEN_STRING:
 354             /// If it's a multiword token, quote it
 355                 if (strstr($value, ' ')) {
 356                     $datasearch_clause .= '"' . $value . '" ';
 357             /// Simple word token, search for it as prefix
 358                 } else {
 359                     $datasearch_clause .= '+' . $value . '* ';
 360                 }
 361                 break;
 362             case TOKEN_EXACT:
 363             /// token must be exactly as requested
 364                 $datasearch_clause .= '+' . $value . ' ';
 365                 break;
 366             case TOKEN_NEGATE:
 367             /// token must not exist as prefix
 368                 $datasearch_clause .= '-' . $value . '* ';
 369                 break;
 370             case TOKEN_META:
 371             /// token in metafield, search for it as prefix
 372                 $metasearch_clause .= '+' . $value . '* ';
 373                 break;
 374             case TOKEN_USER:
 375             case TOKEN_USERID:
 376             case TOKEN_INSTANCE:
 377             case TOKEN_DATETO:
 378             case TOKEN_DATEFROM:
 379             /// delegate to standard search
 380                 $non_text_tokens[] = $token;
 381                 break;
 382             default:
 383                 return '';
 384         }
 385     }
 386
 387 /// Call to standard search for pending tokens
 388     if (!empty($non_text_tokens)) {
 389         $SQLString = search_generate_SQL($non_text_tokens, $datafield, $metafield, $mainidfield, $useridfield,
 390                                          $userfirstnamefield, $userlastnamefield, $timefield, $instancefield);
 391     }
 392 /// Build the final SQL clause
 393     if (!empty($datasearch_clause)) {
 394     /// Must have $datafield to search within
 395         if (!empty($datafield)) {
 396             $text_sql_string .= 'MATCH (' . $datafield;
 397         /// And optionally $metafield
 398             if (!empty($metafield)) {
 399                 $text_sql_string .= ', ' . $metafield;
 400             }
 401         /// Begin with the AGAINST clause
 402             $text_sql_string .= ') AGAINST (' . "'";
 403         /// Add the search terms
 404             $text_sql_string .= trim($datasearch_clause);
 405         /// Close AGAINST clause
 406             $text_sql_string .= "' IN BOOLEAN MODE)";
 407         }
 408     }
 409 /// Now add the metasearch_clause
 410     if (!empty($metasearch_clause)) {
 411     /// Must have $metafield to search within
 412         if (!empty($metafield)) {
 413         /// AND operator if needed
 414             if (!empty($text_sql_string)) {
 415                 $text_sql_string .= ' AND ';
 416             }
 417             $text_sql_string .= 'MATCH (' . $metafield;
 418         /// Begin with the AGAINST clause
 419             $text_sql_string .= ') AGAINST (' . "'";
 420         /// Add the search terms
 421             $text_sql_string .= trim($metasearch_clause);
 422         /// Close AGAINST clause
 423             $text_sql_string .= "' IN BOOLEAN MODE)";
 424         }
 425     }
 426 /// Finally add the non-text conditions
 427     if (!empty($SQLString)) {
 428     /// AND operator if needed
 429         if (!empty($text_sql_string)) {
 430             $text_sql_string .= ' AND ';
 431         }
 432         $text_sql_string .= $SQLString;
 433     }
 434
 435     return $text_sql_string;
 436 }
 437
 438 // Primitive function to generate a SQL string from a parse tree.
 439 // Parameters:
 440 //
 441 // $parsetree should be a parse tree generated by a
 442 // search_lexer/search_parser combination.
 443 // Other fields are database table names to search.
 444
 445 function search_generate_SQL($parsetree, $datafield, $metafield, $mainidfield, $useridfield,
 446                              $userfirstnamefield, $userlastnamefield, $timefield, $instancefield) {
 447     global $CFG;
 448
 449     $LIKE = sql_ilike();
 450     $NOTLIKE = 'NOT ' . $LIKE;
 451     if ($CFG->dbfamily == "postgres") {
 452         $REGEXP = "~*";
 453         $NOTREGEXP = "!~*";
 454     } else {
 455         $REGEXP = "REGEXP";
 456         $NOTREGEXP = "NOT REGEXP";
 457     }
 458
 459     $ntokens = count($parsetree);
 460     if ($ntokens == 0) {
 461         return "";
 462     }
 463
 464     $SQLString = '';
 465
 466     for ($i=0; $i<$ntokens; $i++){
 467         if ($i > 0) {// We have more than one clause, need to tack on AND
 468             $SQLString .= ' AND ';
 469         }
 470
 471         $type = $parsetree[$i]->getType();
 472         $value = $parsetree[$i]->getValue();
 473
 474     /// Under Oracle and MSSQL, transform TOKEN searches into STRING searches and trim +- chars
 475         if ($CFG->dbfamily == 'oracle' || $CFG->dbfamily == 'mssql') {
 476             $value = trim($value, '+-');
 477             if ($type == TOKEN_EXACT) {
 478                 $type = TOKEN_STRING;
 479             }
 480         }
 481
 482         switch($type){
 483             case TOKEN_STRING:
 484                 $SQLString .= "(($datafield $LIKE '%$value%') OR ($metafield $LIKE '%$value%') )";
 485                 break;
 486             case TOKEN_EXACT:
 487                 $SQLString .= "(($datafield $REGEXP '[[:<:]]".$value."[[:>:]]') OR ($metafield $REGEXP '[[:<:]]".$value."[[:>:]]'))";
 488                 break;
 489             case TOKEN_META:
 490                 if ($metafield != '') {
 491                     $SQLString .= "($metafield $LIKE '%$value%')";
 492                 }
 493                 break;
 494             case TOKEN_USER:
 495                 $SQLString .= "(($mainidfield = $useridfield) AND (($userfirstnamefield $LIKE '%$value%') OR ($userlastnamefield $LIKE '%$value%')))";
 496                 break;
 497             case TOKEN_USERID:
 498                 $SQLString .= "($useridfield = $value)";
 499                 break;
 500             case TOKEN_INSTANCE:
 501                 $SQLString .= "($instancefield = $value)";
 502                 break;
 503             case TOKEN_DATETO:
 504                 $SQLString .= "($timefield <= $value)";
 505                 break;
 506             case TOKEN_DATEFROM:
 507                 $SQLString .= "($timefield >= $value)";
 508                 break;
 509             case TOKEN_NEGATE:
 510                 $SQLString .= "(NOT (($datafield  $LIKE '%$value%') OR ($metafield  $LIKE '%$value%')))";
 511                 break;
 512             default:
 513                 return '';
 514
 515         }
 516     }
 517     return $SQLString;
 518 }
 519
 520
 521 ?>