12 /* private */ var $mText, # Text to be processed by the tokenizer
13 $mPos, # Current offset of the tokenizer within $mText (used as a string index)
14 $mTextLength, # Length of $mText as reported by strlen()
15 $mQueuedToken; # Tokens already found but not yet handed out; previewToken()/nextToken() serve these first
/**
 * Constructor (PHP 4 style, named after the class).
 *
 * Resets the read position, creates the empty token queue and caches the
 * content language's link-prefix setting, which nextToken() consults when
 * it meets "[[".
 */
function Tokenizer() {
	global $wgLang;

	// Start at the beginning of the text.
	$this->mPos = 0;

	// The queue that previewToken() and nextToken() actually read is
	// $mQueuedToken. Only $mTokenQueue used to be initialised here, so the
	// real queue stayed unset and count()/array_shift()/array_unshift()
	// were handed null instead of an array.
	$this->mQueuedToken = array();

	// Old property name kept so that any outside code reading it still works.
	$this->mTokenQueue = array();

	$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
}
// Factory: prepares a tokenizer object ($t) for the string $s.
// @param string $s  the text that is going to be tokenized
// NOTE(review): the statements that create $t and the return statement are not
// part of this excerpt; $t is presumably a new Tokenizer holding $s - confirm.
33 function newFromString( $s ) {
34 $fname = 'Tokenizer::newFromString';
// wfProfileIn()/wfProfileOut() bracket the work under the label $fname
// (helpers defined elsewhere; presumably profiling hooks).
35 wfProfileIn( $fname );
// Store the length once; nextToken() and continues() compare mPos against it.
39 $t->mTextLength
= strlen( $s );
41 wfProfileOut( $fname );
47 * Return the next token, but do not increase the pointer. The next call
48 * to previewToken or nextToken will return the same token again.
49 * Actually, the pointer is increased, but the token is queued. The next
50 * call to previewToken or nextToken will check the queue and return
/**
 * Peek at the upcoming token without consuming it.
 *
 * When the queue is empty a token is fetched with nextToken() and pushed
 * back onto the front of the queue, so the following previewToken() or
 * nextToken() call hands out that same token again.
 *
 * @return mixed the token exactly as nextToken() produced it
 */
function previewToken() {
	$fname = 'Tokenizer::previewToken';
	wfProfileIn( $fname );

	if ( count( $this->mQueuedToken ) == 0 ) {
		// Nothing waiting: read ahead, then park the result at the head of the queue.
		$peeked = $this->nextToken();
		array_unshift( $this->mQueuedToken, $peeked );
	} else {
		// A token is already waiting; look at it but leave it in place.
		$peeked = $this->mQueuedToken[0];
	}

	wfProfileOut( $fname );
	return $peeked;
}
73 * proceeds character by character through the text, looking for characters needing
74 * special attention. Those are currently: I, R, ', [, ], newline, carriage return, !, ?, :, digits, the bytes \302 and \273 (guillemets) and &
76 * @todo handling of French blanks not yet implemented
// Produces the next token of the text.
// Tokens are arrays with a 'type' and a 'text' key (apostrophe runs also get 'pos').
// Order of business, as far as this excerpt shows:
//   1. a token waiting in mQueuedToken is handed out first;
//   2. if mPos is already beyond mTextLength there is nothing left ('false' per the comment below);
//   3. otherwise a 'text' token is started and the text is scanned byte by byte until a
//      special marker is met; the marker is appended to mQueuedToken as its own token and
//      "break 2" ends the scan.
// NOTE(review): several lines (position updates, closing braces, the return) are missing
// from this excerpt, so how mPos advances cannot be documented from here.
78 function nextToken() {
79 $fname = 'Tokenizer::nextToken';
80 wfProfileIn( $fname );
82 if ( count( $this->mQueuedToken
) != 0 ) {
83 // still one token from the last round around. Return that one first.
84 $token = array_shift( $this->mQueuedToken
);
85 } else if ( $this->mPos
> $this->mTextLength
) {
86 // If no text is left, return 'false'.
// Text remains: the token being built is a plain 'text' token.
91 $token['type']='text';
// The bound is "<=", so the last iteration indexes mText at mTextLength itself;
// assuming mTextLength equals strlen(mText) that is one past the final byte,
// which is why the read below is silenced with "@".
93 while ( $this->mPos
<= $this->mTextLength
) {
94 switch ( @$ch = $this->mText
[$this->mPos
] ) {
95 case 'R': // for "RFC "
// continues() compares the bytes after mPos, so 'R' + 'FC ' means "RFC " starts here.
96 if ( $this->continues('FC ') ) {
97 $queueToken['type'] = $queueToken['text'] = 'RFC ';
98 $this->mQueuedToken
[] = $queueToken;
100 break 2; // switch + while
103 case 'I': // for "ISBN "
104 if ( $this->continues('SBN ') ) {
105 $queueToken['type'] = $queueToken['text'] = 'ISBN ';
106 $this->mQueuedToken
[] = $queueToken;
108 break 2; // switch + while
111 case '[': // for links "[["
// Three opening brackets are tested before two, so "[[[" gets its own token type.
112 if ( $this->continues('[[') ) {
113 $queueToken['type'] = '[[[';
114 $queueToken['text'] = '';
115 $this->mQueuedToken
[] = $queueToken;
117 break 2; // switch + while
118 } else if ( $this->continues('[') ) {
119 $queueToken['type'] = '[[';
120 $queueToken['text'] = '';
121 // Check for a "prefixed link", e.g. Al[[Khazar]]
122 // Mostly for arabic wikipedia
123 if ( $this->linkPrefixExtension
) {
// Move trailing non-whitespace bytes from the pending text token into the
// link-open token, one byte per iteration, until whitespace or an empty text.
124 while ( $this->linkPrefixExtension
125 && ($len = strlen( $token['text'] ) ) > 0
126 && !ctype_space( $token['text'][$len-1] ) )
128 //prepend the character to the link's open tag
129 $queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
130 //remove character from the end of the text token
131 $token['text'] = substr( $token['text'], 0, -1);
134 $this->mQueuedToken
[] = $queueToken;
136 break 2; // switch + while
139 case ']': // for end of links "]]"
140 if ( $this->continues(']') ) {
141 $queueToken['type'] = ']]';
142 $queueToken['text'] = '';
143 $this->mQueuedToken
[] = $queueToken;
145 break 2; // switch + while
148 case "'": // for all kinds of em's and strong's
// At least two apostrophes in a row: the token type is the run of apostrophes
// itself, growing by one "'" for every further apostrophe that follows.
149 if ( $this->continues("'") ) {
150 $queueToken['type'] = "'";
151 $queueToken['text'] = '';
152 while( ($this->mPos+
1 < $this->mTextLength
)
153 && $this->mText
[$this->mPos+
1] == "'" )
155 $queueToken['type'] .= "'";
// Remember where the tokenizer stood for this apostrophe run.
156 $queueToken['pos'] = $this->mPos
;
160 $this->mQueuedToken
[] = $queueToken;
162 break 2; // switch + while
165 case "\n": // for block levels, actually, only "----" is handled.
166 case "\r": // headings are detected to close any unbalanced em or strong tags in a section
// A line break followed by "----" yields a '----' token.
167 if ( $this->continues( '----' ) )
169 $queueToken['type'] = '----';
170 $queueToken['text'] = '';
171 $this->mQueuedToken
[] = $queueToken;
// Loop over further '-' bytes (its body is not visible in this excerpt).
173 while ( $this->mPos
<$this->mTextLength
174 and $this->mText
[$this->mPos
] == '-' )
// A line break followed by <h1 ... <h6 yields an 'h' token; the cheap '<h'
// test comes first so the six specific tests only run when it matches.
180 $this->continues( '<h' ) and (
181 $this->continues( '<h1' ) or
182 $this->continues( '<h2' ) or
183 $this->continues( '<h3' ) or
184 $this->continues( '<h4' ) or
185 $this->continues( '<h5' ) or
186 $this->continues( '<h6' )
189 $queueToken['type'] = 'h';
190 $queueToken['text'] = '';
191 $this->mQueuedToken
[] = $queueToken;
193 break 2; // switch + while
196 case '!': // French spacing rules have a space before exclamation
197 case '?': // and question marks. Those have to become
198 case ':': // And colons, Hashar says ...
// Only when a blank directly precedes the punctuation mark: the blank is taken
// off the text token and re-emitted together with the mark as a 'blank' token.
199 if ( $this->preceeded( ' ' ) )
201 // strip blank from Token
202 $token['text'] = substr( $token['text'], 0, -1 );
203 $queueToken['type'] = 'blank';
204 $queueToken['text'] = ' '.$ch;
205 $this->mQueuedToken
[] = $queueToken;
207 break 2; // switch + while
210 case '0': // A space between two numbers is used to ease reading
211 case '1': // of big numbers, e.g. 1 000 000. Those spaces need
212 case '2': // to be unbreakable
// Pattern "digit, blank, digit": the current digit plus the blank become a 'blank' token.
220 if ( ($this->mTextLength
>= $this->mPos +
2)
221 && ($this->mText
[$this->mPos+
1] == ' ')
222 && ctype_digit( $this->mText
[$this->mPos+
2] ) )
224 $queueToken['type'] = 'blank';
225 $queueToken['text'] = $ch . ' ';
226 $this->mQueuedToken
[] = $queueToken;
228 break 2; // switch + while
231 case "\302": // first byte of UTF-8 Character Guillemet-left
232 if ( $this->continues( "\253 ") ) // second byte and a blank
234 $queueToken['type'] = 'blank';
235 $queueToken['text'] = "\302\253 ";
236 $this->mQueuedToken
[] = $queueToken;
238 break 2; // switch + while
241 case "\273": //last byte of UTF-8 Character Guillemet-right
// The blank and the lead byte \302 were already appended to the text token,
// hence the two bytes removed from its end below.
242 if ( $this->preceeded( " \302" ) )
244 $queueToken['type'] = 'blank';
245 $queueToken['text'] = " \302\273";
246 $token['text'] = substr( $token['text'], 0, -2 );
247 $this->mQueuedToken
[] = $queueToken;
249 break 2; // switch + while
252 case '&': //extensions like <timeline>; since HTML stripping has already been done,
253 //the tag arrives entity-escaped, i.e. it starts with "&" followed by "lt;timeline"
254 if ( $this->continues( 'lt;timeline>' ) )
256 $queueToken['type'] = '<timeline>';
257 $queueToken['text'] = '<timeline>';
258 $this->mQueuedToken
[] = $queueToken;
260 break 2; // switch + while
267 // echo $this->mPos . "<br>\n";
269 } /* if (nothing left in queue) */
271 wfProfileOut( $fname );
278 * checks whether the mText continues with $cont from mPos+1
/**
 * Tell whether the text directly after the current position equals $cont,
 * i.e. whether mText[mPos+1 .. mPos+strlen($cont)] is $cont. The byte at
 * mPos itself takes no part in the comparison.
 *
 * @param string $cont  expected continuation
 * @return bool true when the continuation matches, false otherwise
 */
function continues( $cont ) {
	$len = strlen( $cont );

	if ( $len == 0 ) {
		// Nothing to compare; keep the historical answer for this corner case.
		return $this->mTextLength >= $this->mPos;
	}

	// The last byte compared sits at index mPos + $len, and that index has to
	// be smaller than mTextLength. The former "<" test let the case
	// mPos + $len == mTextLength through and then read one byte past the end
	// of the string.
	if ( $this->mPos + $len >= $this->mTextLength ) {
		return false;
	}

	// One slice comparison instead of a byte loop that re-evaluated strlen()
	// on every iteration.
	return substr( $this->mText, $this->mPos + 1, $len ) === $cont;
}
297 * checks whether mText is preceded by $prec, i.e. whether $prec ends directly before position mPos
/**
 * Tell whether the bytes immediately in front of the current position
 * (mText[mPos-strlen($prec) .. mPos-1]) are exactly $prec.
 *
 * @param string $prec  text expected right before mPos
 * @return bool false as well when fewer than strlen($prec) bytes precede mPos
 */
function preceeded( $prec ) {
	$precLength = strlen( $prec );

	if ( $this->mPos >= $precLength ) {
		// Enough text in front of mPos: compare that slice with $prec.
		$before = substr( $this->mText, $this->mPos - $precLength, $precLength );
		return ( strcmp( $prec, $before ) == 0 );
	}

	// $prec is longer than everything read so far.
	return false;
}
// Cuts out the text between the current position and the next occurrence of $border.
// @param string $border  delimiter searched for from mPos onwards
// The slice goes into $ret (presumably returned on a line not shown here) and mPos is
// moved to $n + strlen( $border ) + 1, i.e. one position beyond the end of the delimiter.
312 function readAllUntil( $border ) {
313 $n = strpos( $this->mText
, $border, $this->mPos
);
// NOTE(review): strpos() yields false when $border does not occur. The two lines that
// follow it in the original are missing from this excerpt - confirm that they handle
// that case, because the arithmetic below would otherwise treat false as 0.
316 $ret = substr( $this->mText
, $this->mPos
, $n - $this->mPos
);
317 $this->mPos
= $n +
strlen( $border ) +
1;