# Tokenizer state (member declarations).
3 /* private */ var $mText, # Text to be processed by the tokenizer
4 $mPos, # current position of tokenizer in text
5 $mTextLength, # Length of $mText
6 $mQueuedToken; # Tokens that were already found, but not yet returned by nextToken()
# Presumably the PHP4-style constructor of the Tokenizer class (the class header
# is not part of this excerpt). It creates an empty token queue and caches the
# result of $wgLang->linkPrefixExtension() on the object.
# NOTE(review): this initializes mTokenQueue, but every other visible use of the
# queue reads/writes mQueuedToken -- one of the two names looks wrong; confirm.
# NOTE(review): $wgLang is used here but no `global $wgLang;` is visible in this
# excerpt -- verify it is declared on one of the missing lines.
9 /* private */ function Tokenizer()
14 $this->mTokenQueue
=array();
15 $this->linkPrefixExtension
= $wgLang->linkPrefixExtension();
# Prepares a tokenizer object $t for the string $s, profiled under the name
# "Tokenizer::newFromString". Of the setup, only the assignment of strlen($s)
# (a byte count) to $t->mTextLength is visible here; the creation of $t, the
# remaining assignments and the return statement are not part of this excerpt.
19 function newFromString( $s )
21 $fname = "Tokenizer::newFromString";
22 wfProfileIn( $fname );
26 $t->mTextLength
= strlen( $s );
28 wfProfileOut( $fname );
33 // Return the next token, but do not increase the pointer. The next call
34 // to previewToken or nextToken will return the same token again.
35 // Actually, the pointer is increased, but the token is queued. The next
36 // call to previewToken or nextToken will check the queue and return the queued token.
# Peeks at the upcoming token without consuming it, profiled under the name
# "Tokenizer::previewToken". The return statement is not part of this excerpt;
# presumably $token is returned.
38 function previewToken()
40 $fname = "Tokenizer::previewToken";
41 wfProfileIn( $fname );
# A token is already pending: read the head of the queue and leave it in place.
43 if ( count( $this->mQueuedToken
) != 0 ) {
44 // still one token from the last round around. Return that one first.
45 $token = $this->mQueuedToken
[0];
# Queue is empty: fetch a token via nextToken() and put it back at the FRONT of
# the queue (array_unshift), so it precedes any follow-up tokens that
# nextToken() appended to the queue while scanning.
47 $token = $this->nextToken();
48 array_unshift( $this->mQueuedToken
, $token );
51 wfProfileOut( $fname );
# Body of nextToken (per the profiling name below); the `function` header line
# is not part of this excerpt.
# Visible behavior: if the queue holds a token, that token is shifted off and
# used. Otherwise, if mPos is past mTextLength, no text is left. Otherwise a
# "text" token is started and mText is scanned from mPos; when a special
# sequence is recognized, a separate $queueToken describing it is appended to
# mQueuedToken and `break 2` leaves both the switch and the scanning loop.
# NOTE(review): the character list in the comment below is out of date -- the
# switch also has cases for !, ?, :, digit characters, "\302", "\273" and &.
# NOTE(review): the TODO below looks stale as well; the !, ?, : and guillemet
# cases emit "blank" tokens, which appears to be the French-spacing handling.
57 // proceeds character by character through the text, looking for characters needing
58 // special attention. Those are currently: I, R, ', [, ], newline
60 // TODO: handling of French blanks not yet implemented
63 $fname = "Tokenizer::nextToken";
64 wfProfileIn( $fname );
66 if ( count( $this->mQueuedToken
) != 0 ) {
67 // still one token from the last round around. Return that one first.
68 $token = array_shift( $this->mQueuedToken
);
69 } else if ( $this->mPos
> $this->mTextLength
) {
70 // If no text is left, return "false".
75 $token["type"]="text";
# The loop condition uses <=, so the last iteration reads one offset past the
# final byte (assuming mTextLength is strlen(mText)); the @ operator suppresses
# the resulting notice.
77 while ( $this->mPos
<= $this->mTextLength
) {
78 switch ( @$ch = $this->mText
[$this->mPos
] ) {
# "RFC ": the queued token uses the literal "RFC " as both its type and text.
79 case 'R': // for "RFC "
80 if ( $this->continues("FC ") ) {
81 $queueToken["type"] = $queueToken["text"] = "RFC ";
82 $this->mQueuedToken
[] = $queueToken;
84 break 2; // switch + while
# "ISBN ": same pattern as "RFC ".
87 case 'I': // for "ISBN "
88 if ( $this->continues("SBN ") ) {
89 $queueToken["type"] = $queueToken["text"] = "ISBN ";
90 $this->mQueuedToken
[] = $queueToken;
92 break 2; // switch + while
# Three opening brackets yield type "[[[", two yield type "[[". The longer
# match is tested first so that "[[[" is not reported as "[[".
95 case "[": // for links "[["
96 if ( $this->continues("[[") ) {
97 $queueToken["type"] = "[[[";
98 $queueToken["text"] = "";
99 $this->mQueuedToken
[] = $queueToken;
101 break 2; // switch + while
102 } else if ( $this->continues("[") ) {
103 $queueToken["type"] = "[[";
104 $queueToken["text"] = "";
105 // Check for a "prefixed link", e.g. Al[[Khazar]]
106 // Mostly for arabic wikipedia
# Moves the trailing run of non-whitespace characters from the end of the
# current text token to the front of the link token's text, one byte at a time.
# NOTE(review): the loop re-tests linkPrefixExtension although the enclosing
# `if` already did; the second test looks redundant.
107 if ( $this->linkPrefixExtension
) {
108 while ( $this->linkPrefixExtension
109 && ($len = strlen( $token["text"] ) ) > 0
110 && !ctype_space( $token["text"][$len-1] ) )
112 //prepend the character to the link's open tag
113 $queueToken["text"] = $token["text"][$len-1] . $queueToken["text"];
114 //remove character from the end of the text token
115 $token["text"] = substr( $token["text"], 0, -1);
118 $this->mQueuedToken
[] = $queueToken;
120 break 2; // switch + while
123 case "]": // for end of links "]]"
124 if ( $this->continues("]") ) {
125 $queueToken["type"] = "]]";
126 $queueToken["text"] = "";
127 $this->mQueuedToken
[] = $queueToken;
129 break 2; // switch + while
# Runs of apostrophes: the token type starts as "'" and gains one more "'" for
# each further apostrophe found, so the type string encodes the run length.
# The position at that point is recorded in $queueToken["pos"].
132 case "'": // for all kind of em's and strong's
133 if ( $this->continues("'") ) {
134 $queueToken["type"] = "'";
135 $queueToken["text"] = "";
136 while( ($this->mPos+
1 < $this->mTextLength
)
137 && $this->mText
[$this->mPos+
1] == "'" )
139 $queueToken["type"] .= "'";
140 $queueToken["pos"] = $this->mPos
;
144 $this->mQueuedToken
[] = $queueToken;
146 break 2; // switch + while
# Line starts: a "----" rule queues a "----" token (the loop that follows tests
# for further "-" characters; its body is not part of this excerpt), and an
# <h1>..<h6> opening tag queues an "h" token.
149 case "\n": // for block levels, actually, only "----" is handled.
150 case "\r": // headings are detected to close any unbalanced em or strong tags in a section
151 if ( $this->continues( "----" ) )
153 $queueToken["type"] = "----";
154 $queueToken["text"] = "";
155 $this->mQueuedToken
[] = $queueToken;
157 while ( $this->mPos
<$this->mTextLength
158 and $this->mText
[$this->mPos
] == "-" )
164 $this->continues( "<h" ) and (
165 $this->continues( "<h1" ) or
166 $this->continues( "<h2" ) or
167 $this->continues( "<h3" ) or
168 $this->continues( "<h4" ) or
169 $this->continues( "<h5" ) or
170 $this->continues( "<h6" )
173 $queueToken["type"] = "h";
174 $queueToken["text"] = "";
175 $this->mQueuedToken
[] = $queueToken;
177 break 2; // switch + while
# Punctuation preceded by a space: the space is removed from the text token and
# re-emitted, together with the punctuation character, as a "blank" token.
180 case "!": // French spacing rules have a space before exclamation
181 case "?": // and question marks. Those have to become
182 case ":": // And colons, Hashar says ...
183 if ( $this->preceeded( " " ) )
185 // strip blank from Token
186 $token["text"] = substr( $token["text"], 0, -1 );
187 $queueToken["type"] = "blank";
188 $queueToken["text"] = " {$ch}";
189 $this->mQueuedToken
[] = $queueToken;
191 break 2; // switch + while
# Digit, space, digit: the digit and the following space are emitted as a
# "blank" token.
# NOTE(review): the guard `mTextLength >= mPos + 2` still admits
# mPos + 2 == mTextLength, where mText[mPos+2] is one past the last byte if
# mTextLength is strlen(mText) -- confirm whether `>` was intended.
194 case "0": // A space between two numbers is used to ease reading
195 case "1": // of big numbers, e.g. 1 000 000. Those spaces need
196 case "2": // to be unbreakable
204 if ( ($this->mTextLength
>= $this->mPos +
2)
205 && ($this->mText
[$this->mPos+
1] == " ")
206 && ctype_digit( $this->mText
[$this->mPos+
2] ) )
208 $queueToken["type"] = "blank";
209 $queueToken["text"] = $ch . " ";
210 $this->mQueuedToken
[] = $queueToken;
212 break 2; // switch + while
# "\302\253" is the two-byte UTF-8 encoding of the left guillemet; matched here
# when it is followed by a space.
215 case "\302": // first byte of UTF-8 Character Guillemet-left
216 if ( $this->continues( "\253 ") ) // second byte and a blank
218 $queueToken["type"] = "blank";
219 $queueToken["text"] = "\302\253 ";
220 $this->mQueuedToken
[] = $queueToken;
222 break 2; // switch + while
# Right guillemet ("\302\273") preceded by a space: the two bytes already
# collected in the text token (the space and "\302") are removed from it.
225 case "\273": //last byte of UTF-8 Character Guillemet-right
226 if ( $this->preceeded( " \302" ) )
228 $queueToken["type"] = "blank";
229 $queueToken["text"] = " \302\273";
230 $token["text"] = substr( $token["text"], 0, -2 );
231 $this->mQueuedToken
[] = $queueToken;
233 break 2; // switch + while
# NOTE(review): after '&' this matches the literal "lt;timeline>". If the input
# is fully HTML-escaped, as the comment below suggests, the closing bracket
# would be expected as "&gt;" rather than ">" -- confirm against real input.
236 case "&": //extensions like <timeline>, since HTML stripping has already been done,
237 //those look like <timeline>
238 if ( $this->continues( "lt;timeline>" ) )
240 $queueToken["type"] = "<timeline>";
241 $queueToken["text"] = "<timeline>";
242 $this->mQueuedToken
[] = $queueToken;
244 break 2; // switch + while
251 // echo $this->mPos . "<br>\n";
253 } /* if (nothing left in queue) */
255 wfProfileOut( $fname );
# Compares $cont byte by byte against mText starting at offset mPos+1, i.e. the
# byte at mPos itself is NOT part of the comparison. The return statements are
# not part of this excerpt.
# NOTE(review): the comparison reads offsets mPos+1 .. mPos+strlen($cont), but
# the length guard only rejects mTextLength < mPos+strlen($cont). When the two
# are equal the last read is at offset mTextLength, one past the final byte if
# mTextLength is strlen(mText) -- confirm whether `<=` was intended.
259 // function continues
260 // checks whether the mText continues with $cont from mPos+1
261 /* private */ function continues( $cont )
263 // If string is not long enough to contain $cont, return false
264 if ( $this->mTextLength
< $this->mPos +
strlen( $cont ) )
266 for ( $i=0; $i < strlen( $cont ); $i++
)
268 if ( $this->mText
[$this->mPos+
1+
$i] != $cont[$i] )
# Returns true when the strlen($prec) bytes of mText that end immediately
# before offset mPos are exactly $prec (binary-safe comparison via strcmp).
# The byte at mPos itself is not inspected.
274 // function preceeded
275 // checks whether the mText is preceeded by $prec at position mPos
276 /* private */ function preceeded( $prec )
278 $len = strlen( $prec );
279 // if $prec is longer than the text up to mPos, return false
# Guard for too little text before mPos; the statement it controls is not part
# of this excerpt.
280 if ( $this->mPos
< $len )
282 return ( 0 == strcmp( $prec, substr($this->mText
, $this->mPos
-$len, $len) ) );
# Extracts the text between the current position and the next occurrence of
# $border (searched from mPos onward), then moves mPos past that occurrence.
# The return statement is not part of this excerpt; presumably $ret is returned.
285 function readAllUntil( $border )
# NOTE(review): strpos() yields false when $border does not occur; any handling
# of that case would be on lines missing from this excerpt -- verify it exists.
287 $n = strpos( $this->mText
, $border, $this->mPos
);
# $ret is everything from mPos up to, but not including, the border.
290 $ret = substr( $this->mText
, $this->mPos
, $n - $this->mPos
);
# NOTE(review): the new position is one byte beyond the end of $border
# ($n + strlen($border) would be the first byte after it) -- confirm that
# skipping the extra byte is intended.
291 $this->mPos
= $n +
strlen( $border ) +
1;