adding some strings
[moodle-linuxchix.git] / search / documents / physical_ppt.php
blobc9ceb55e0825314d8513ec6626a6cc1d20ad7f59
1 <?php
2 /**
3 * Global Search Engine for Moodle
4 * add-on 1.8+ : Valery Fremaux [valery.fremaux@club-internet.fr]
5 * 2007/08/02
7 * this is a format handler for getting text out of a proprietary binary format
8 * so it can be indexed by Lucene search engine
9 */
/**
* Extracts indexable text from a PowerPoint (.ppt) binary file.
*
* The implementation is a simple heuristic based on the ppt byte stream :
* - a text record is announced by the byte sequence 00 9F 0F 04, followed by a
*   15 bytes header;
* - the last 6 bytes of that header hold a 2 bytes record code (A8 0F, A0 0F or
*   AA 0F) followed by the little-endian encoded size of the text buffer;
* - A8 0F denotes ASCII text (local system monobyte encoding);
* - A0 0F denotes UTF-16 encoded text;
* - AA 0F denotes a non textual sequence, which is skipped.
*
* The text payload starts right after the header and spans the announced
* number of bytes. UTF-16 payloads are reduced to a single byte stream by
* dropping the NUL bytes, so only characters whose high byte is 00 survive
* intact.
*
* @uses $CFG->dataroot, $CFG->block_search_limit_index_body, $USER->id
* @param object $resource the resource record; its course and reference fields
*                         locate the file under $CFG->dataroot
* @return string|null the extracted text converted with mb_convert_encoding(),
*                     an empty string if the file cannot be read, or null when
*                     the current user is not an administrator
*/
function get_text_for_indexing_ppt(&$resource){
    global $CFG, $USER;

    // SECURITY : do not allow non admin execute anything on system !!
    if (!isadmin($USER->id)) return;

    $path = "{$CFG->dataroot}/{$resource->course}/{$resource->reference}";
    if (!is_readable($path)) {
        return '';
    }
    $text = file_get_contents($path);
    if ($text === false) {
        return '';
    }

    $size = strlen($text);
    $offset = 0;
    $fragments = array();

    // Scan the buffer in place: each match yields the 6 bytes (code + length) that
    // close the record header; the text payload starts right after them.
    while ($offset <= $size
            && preg_match('/\x00\x9F\x0F\x04.{9}(.{6})/s', $text, $matches, PREG_OFFSET_CAPTURE, $offset)){
        $payloadstart = $matches[1][1] + 6;

        // "n" : record code read as big-endian 16 bits, "V" : explicit little-endian 32 bits length
        $unpacked = unpack('ncode/Vlength', $matches[1][0]);
        $sequencecode = $unpacked['code'];

        // never read past the end of the buffer, whatever a corrupt header announces
        $length = max(0, min((int)$unpacked['length'], $size - $payloadstart));

        // local system encoding sequence
        if ($sequencecode == 0xA80F){
            $fragments[] = substr($text, $payloadstart, $length);
            $offset = $payloadstart + $length;
        }
        // denotes unicode encoded sequence
        elseif ($sequencecode == 0xA00F){
            $fragment = substr($text, $payloadstart, $length);
            $fragment = str_replace("\xA0\x00\x19\x20", "'", $fragment); // some quotes
            $fragment = str_replace("\x00", '', $fragment); // drop UTF-16 high bytes
            $fragments[] = $fragment;
            $offset = $payloadstart + $length;
        }
        // anything else is not text : resume the scan right after the header
        else{
            $offset = $payloadstart;
        }
    }

    $indextext = implode(' ', $fragments);

    // replacements are applied in order : remaining quotes, tabs removed, CR turned into LF
    $indextext = str_replace(array("\x19\x20", "\x09", "\x0D"), array("'", '', "\n"), $indextext);

    // the size limit applies to the extracted text, not to the raw binary file content
    if (!empty($CFG->block_search_limit_index_body)){
        $indextext = shorten($indextext, $CFG->block_search_limit_index_body);
    }

    $indextext = mb_convert_encoding($indextext, 'UTF8', 'auto');
    return $indextext;
}