Registered users can set their own language for the interface. See http://bugzilla...
[mediawiki.git] / includes / Tokenizer.php
blob84a2f06bcb44a6cf6552c78dd7d5bedbc3a651d7
1 <?php
2 /**
4 * @package MediaWiki
5 */
7 /**
9 * @package MediaWiki
11 class Tokenizer {
/* private */ var $mText, # Text to be processed by the tokenizer
	$mPos, # Current byte offset of the tokenizer within $mText
	$mTextLength, # Length of $mText in bytes, as reported by strlen()
	$mQueuedToken, # Tokens that were already found, but not
	               # returned yet.
	$linkPrefixExtension; # Result of $wgLang->linkPrefixExtension(), set by the
	                      # constructor; declared here so it is not created
	                      # as an undeclared (dynamic) property.
/**
 * Constructor
 *
 * Resets the read position, creates the empty token queue and caches the
 * content language's link-prefix setting. The text itself is not set here;
 * newFromString() assigns mText and mTextLength after construction.
 *
 * @access private
 */
function Tokenizer() {
	global $wgLang;

	$this->mPos = 0;
	// The queue is read everywhere as mQueuedToken (previewToken, nextToken).
	// It was previously initialised under the name mTokenQueue, which left
	// the real queue unset and made count()/array_unshift() operate on null.
	$this->mQueuedToken = array();
	$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
}
/**
 * Factory function: build a tokenizer for the given text.
 *
 * @param string $s Text to tokenize; its strlen() is cached in mTextLength
 * @return Tokenizer A new tokenizer positioned at the start of $s
 */
function newFromString( $s ) {
	$fname = 'Tokenizer::newFromString';
	wfProfileIn( $fname );

	$tokenizer = new Tokenizer();
	$tokenizer->mText = $s;
	$tokenizer->mTextLength = strlen( $tokenizer->mText );

	wfProfileOut( $fname );
	return $tokenizer;
}
/**
 * Peek at the next token without consuming it.
 *
 * If the queue is empty, the token is fetched with nextToken() (which does
 * advance the read position) and then put at the front of the queue, so the
 * following call to previewToken() or nextToken() yields the same token.
 *
 * @return mixed The upcoming token as returned by nextToken()
 */
function previewToken() {
	$fname = 'Tokenizer::previewToken';
	wfProfileIn( $fname );

	if ( count( $this->mQueuedToken ) == 0 ) {
		// Nothing pending: read a fresh token and park it at the head of the queue.
		$peeked = $this->nextToken();
		array_unshift( $this->mQueuedToken, $peeked );
	} else {
		// A token is already waiting; hand it out without removing it.
		$peeked = $this->mQueuedToken[0];
	}

	wfProfileOut( $fname );
	return $peeked;
}
/**
 * Get the next token.
 *
 * Returns a queued token if one is pending. Otherwise proceeds byte by byte
 * through the text, collecting plain text into a token of type 'text' until
 * a sequence needing special attention is found. That sequence is appended
 * to the queue as its own token (returned by the following call) and the
 * text collected so far is returned, possibly with an empty 'text'.
 *
 * Sequences currently recognised: "RFC ", "ISBN ", "[[[", "[[", "]]",
 * runs of apostrophes, a line break followed by "----" or by <h1>..<h6>,
 * a blank before "!", "?" or ":", a blank between two digits, blanks
 * inside guillemets, and "&lt;timeline&gt;".
 *
 * @todo The original note says handling of French blanks is not yet
 *       implemented; this method only emits tokens of type 'blank' and
 *       does not itself replace the blank.
 *
 * @return mixed Array with 'type' and 'text' keys (apostrophe tokens also
 *               carry 'pos'), or false once mPos is past the end of the text
 */
function nextToken() {
	$fname = 'Tokenizer::nextToken';
	wfProfileIn( $fname );

	if ( count( $this->mQueuedToken ) != 0 ) {
		// still one token from the last round around. Return that one first.
		$token = array_shift( $this->mQueuedToken );
	} else if ( $this->mPos > $this->mTextLength ) {
		// If no text is left, return 'false'.
		$token = false;
	} else {

		$token['text']='';
		$token['type']='text';

		while ( $this->mPos <= $this->mTextLength ) {
			// At mPos == mTextLength there is no byte left to read. Use the
			// empty string explicitly (it matches no case below) instead of
			// reading past the end of the string under error suppression.
			$ch = ( $this->mPos < $this->mTextLength ) ? $this->mText[$this->mPos] : '';
			switch ( $ch ) {
				case 'R': // for "RFC "
					if ( $this->continues('FC ') ) {
						$queueToken['type'] = $queueToken['text'] = 'RFC ';
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 3;
						break 2; // switch + while
					}
					break;
				case 'I': // for "ISBN "
					if ( $this->continues('SBN ') ) {
						$queueToken['type'] = $queueToken['text'] = 'ISBN ';
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 4;
						break 2; // switch + while
					}
					break;
				case '[': // for links "[["
					if ( $this->continues('[[') ) {
						$queueToken['type'] = '[[[';
						$queueToken['text'] = '';
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 3;
						break 2; // switch + while
					} else if ( $this->continues('[') ) {
						$queueToken['type'] = '[[';
						$queueToken['text'] = '';
						// Check for a "prefixed link", e.g. Al[[Khazar]]
						// Mostly for arabic wikipedia
						if ( $this->linkPrefixExtension ) {
							while ( $this->linkPrefixExtension
								&& ($len = strlen( $token['text'] ) ) > 0
								&& !ctype_space( $token['text'][$len-1] ) )
							{
								//prepend the character to the link's open tag
								$queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
								//remove character from the end of the text token
								$token['text'] = substr( $token['text'], 0, -1);
							}
						}
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case ']': // for end of links "]]"
					if ( $this->continues(']') ) {
						$queueToken['type'] = ']]';
						$queueToken['text'] = '';
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case "'": // for all kind of em's and strong's
					if ( $this->continues("'") ) {
						$queueToken['type'] = "'";
						$queueToken['text'] = '';
						// The token type grows by one apostrophe per
						// additional apostrophe in the run.
						while( ($this->mPos+1 < $this->mTextLength)
							&& $this->mText[$this->mPos+1] == "'" )
						{
							$queueToken['type'] .= "'";
							$queueToken['pos'] = $this->mPos;
							$this->mPos ++;
						}

						$this->mQueuedToken[] = $queueToken;
						$this->mPos ++;
						break 2; // switch + while
					}
					break;
				case "\n": // for block levels, actually, only "----" is handled.
				case "\r": // headings are detected to close any unbalanced em or strong tags in a section
					if ( $this->continues( '----' ) )
					{
						$queueToken['type'] = '----';
						$queueToken['text'] = '';
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 5;
						// Swallow any further dashes of the same rule.
						while ( $this->mPos<$this->mTextLength
							and $this->mText[$this->mPos] == '-' )
						{
							$this->mPos ++;
						}
						break 2;
					} else if (
						$this->continues( '<h' ) and (
							$this->continues( '<h1' ) or
							$this->continues( '<h2' ) or
							$this->continues( '<h3' ) or
							$this->continues( '<h4' ) or
							$this->continues( '<h5' ) or
							$this->continues( '<h6' )
						)
					) { // heading
						$queueToken['type'] = 'h';
						$queueToken['text'] = '';
						$this->mQueuedToken[] = $queueToken;
						$this->mPos ++;
						break 2; // switch + while
					}
					break;
				case '!': // French spacing rules have a space before exclamation
				case '?': // and question marks. Those have to become &nbsp;
				case ':': // And colons, Hashar says ...
					if ( $this->preceeded( ' ' ) )
					{
						// strip blank from Token
						$token['text'] = substr( $token['text'], 0, -1 );
						$queueToken['type'] = 'blank';
						$queueToken['text'] = ' '.$ch;
						$this->mQueuedToken[] = $queueToken;
						$this->mPos ++;
						break 2; // switch + while
					}
					break;
				case '0': // A space between two numbers is used to ease reading
				case '1': // of big numbers, e.g. 1 000 000. Those spaces need
				case '2': // to be unbreakable
				case '3':
				case '4':
				case '5':
				case '6':
				case '7':
				case '8':
				case '9':
					// Both look-ahead bytes (mPos+1 and mPos+2) must lie inside
					// the text, hence the strict comparison: with ">=" the
					// byte at mPos+2 could be one past the end.
					if ( ($this->mTextLength > $this->mPos + 2)
						&& ($this->mText[$this->mPos+1] == ' ')
						&& ctype_digit( $this->mText[$this->mPos+2] ) )
					{
						$queueToken['type'] = 'blank';
						$queueToken['text'] = $ch . ' ';
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case "\302": // first byte of UTF-8 Character Guillemet-left
					if ( $this->continues( "\253 ") ) // second byte and a blank
					{
						$queueToken['type'] = 'blank';
						$queueToken['text'] = "\302\253 ";
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 3;
						break 2; // switch + while
					}
					break;
				case "\273": //last byte of UTF-8 Character Guillemet-right
					if ( $this->preceeded( " \302" ) )
					{
						$queueToken['type'] = 'blank';
						$queueToken['text'] = " \302\273";
						// Drop the blank and the first guillemet byte that
						// were already collected into the text token.
						$token['text'] = substr( $token['text'], 0, -2 );
						$this->mQueuedToken[] = $queueToken;
						$this->mPos ++;
						break 2; // switch + while
					}
					break;
				case '&': //extensions like <timeline>, since HTML stripping has already been done,
					  //those look like &lt;timeline&gt;
					if ( $this->continues( 'lt;timeline&gt;' ) )
					{
						$queueToken['type'] = '<timeline>';
						$queueToken['text'] = '&lt;timeline&gt;';
						$this->mQueuedToken[] = $queueToken;
						$this->mPos += 16;
						break 2; // switch + while
					}
					break;

			} /* switch */
			// Ordinary byte: collect it into the current text token.
			$token['text'].=$ch;
			$this->mPos ++;
		} /* while */
	} /* if (nothing left in queue) */

	wfProfileOut( $fname );
	return $token;
}
/**
 * function continues
 *
 * checks whether the mText continues with $cont from mPos+1
 *
 * The bytes compared are mText[mPos+1] .. mText[mPos+strlen($cont)], so the
 * text must be strictly longer than mPos + strlen($cont). The previous guard
 * let the case "text ends exactly at mPos + strlen($cont)" through and then
 * read one byte past the end of the string.
 *
 * @access private
 * @param string $cont Byte sequence expected directly after the current byte
 * @return bool True if $cont follows the byte at mPos
 */
function continues( $cont ) {
	$len = strlen( $cont );
	if ( $len == 0 ) {
		// Nothing to compare; keep the historical answer for this case.
		return $this->mPos <= $this->mTextLength;
	}
	// If string is not long enough to contain $cont, return false
	if ( $this->mTextLength <= $this->mPos + $len ) {
		return false;
	}
	return substr( $this->mText, $this->mPos + 1, $len ) === $cont;
}
/**
 * function preceeded
 *
 * Tells whether the bytes immediately before position mPos in mText
 * are exactly $prec.
 *
 * @access private
 * @param string $prec Byte sequence expected directly before mPos
 * @return bool True if mText[mPos-strlen($prec) .. mPos-1] equals $prec
 */
function preceeded( $prec ) {
	$wanted = strlen( $prec );
	if ( $wanted <= $this->mPos ) {
		// Enough text lies before mPos to hold $prec: compare that slice.
		$before = substr( $this->mText, $this->mPos - $wanted, $wanted );
		return strcmp( $prec, $before ) == 0;
	}
	// $prec is longer than the text up to mPos
	return false;
}
/**
 * Read everything from the current position up to the next occurrence
 * of $border and move the position past it.
 *
 * @param string $border Delimiter to search for, starting at mPos
 * @return string Text between mPos and the delimiter, or '' when the
 *                delimiter does not occur (mPos is then left unchanged)
 */
function readAllUntil( $border ) {
	$borderAt = strpos( $this->mText, $border, $this->mPos );
	if ( $borderAt !== false ) {
		$chunk = substr( $this->mText, $this->mPos, $borderAt - $this->mPos );
		// NOTE(review): the new position is one byte beyond the end of the
		// delimiter, so the byte following $border is skipped - confirm
		// that this is intended.
		$this->mPos = $borderAt + strlen( $border ) + 1;
		return $chunk;
	}
	return '';
}