Merge commit 'catalyst/MOODLE_19_STABLE' into mdl19-linuxchix
[moodle-linuxchix.git] / search / documents / physical_htm.php
blob76d6073dd6901bba5b934b95814894343a38ab96
1 <?php
/**
* Global Search Engine for Moodle
*
* @package search
* @category core
* @subpackage document_wrappers
* @author Valery Fremaux [valery.fremaux@club-internet.fr] > 1.8
* @date 2008/03/31
* @license http://www.gnu.org/copyleft/gpl.html GNU Public License
*
* This is a format handler for getting the text out of an HTML file
* so it can be indexed by the Lucene search engine.
*/
/**
* Extracts the indexable plain text of an HTML resource file.
*
* The file is read from {$CFG->dataroot}/{course}/{reference}. HTML comments are
* dropped, the content of every "keywords" and "description" meta tag is promoted
* to real text, all remaining tags are removed, and the result is converted to
* UTF-8 with HTML entities decoded. When $CFG->block_search_limit_index_body is
* set, the text is finally passed through shorten() with that limit.
*
* @param object $resource the resource record; its course and reference fields locate the file
* @uses CFG, USER
* @return string|null the text to index ('' when the file cannot be read), or
*         nothing (null) when the current user is not an administrator
*/
function get_text_for_indexing_htm(&$resource){
    global $CFG, $USER;

    // SECURITY : do not allow non admin execute anything on system !!
    if (!isadmin($USER->id)) return;

    // just get text; a missing or unreadable file yields no text instead of a cascade of warnings
    $path = "{$CFG->dataroot}/{$resource->course}/{$resource->reference}";
    $text = (is_file($path) && is_readable($path)) ? file_get_contents($path) : false;
    if ($text === false) return '';

    // comments go first: the generic tag filter below would otherwise eat the
    // "<!--" opener up to the first ">" and leak commented-out content into the index
    $uncommented = preg_replace('/<!--.*?-->/s', '', $text);
    if ($uncommented !== null) $text = $uncommented; // null means PCRE gave up; keep the raw text

    // extract keywords and description from every meta tag (not only the last one)
    // and put them back as real content for indexing
    $metatext = '';
    if (preg_match_all('/<meta\s+([^>]*)>/i', $text, $metatags)){
        foreach ($metatags[1] as $meta_attributes){
            if (preg_match('/name="(keywords|description)"/i', $meta_attributes)
                    && preg_match('/content="([^"]+)"/i', $meta_attributes, $content)){
                $metatext .= $content[1].' ';
            }
        }
    }
    $text = $metatext.$text;

    // brutally filters all html tags
    $text = preg_replace("/<[^>]*>/", '', $text);

    // convert to UTF-8 before decoding entities, so that the UTF-8 bytes produced
    // by the decoder are never mixed into text still in the source charset
    $text = mb_convert_encoding($text, 'UTF-8', 'AUTO');
    $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');

    if (!empty($CFG->block_search_limit_index_body)){
        $text = shorten($text, $CFG->block_search_limit_index_body);
    }
    return $text;
}