1 // -*- c-basic-offset: 2 -*-
3 * This file is part of the KDE libraries
4 * Copyright (C) 1999-2001,2004 Harri Porten (porten@kde.org)
5 * Copyright (C) 2003,2004 Apple Computer, Inc.
6 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
7 * Copyright (C) 2007 Sune Vuorela (debian@pusling.com)
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
33 #include <wtf/Vector.h>
36 // GCC cstring uses these automatically, but not all implementations do.
45 RegExp::UTF8SupportState
RegExp::utf8Support
= RegExp::Unknown
;
47 // JS regexps can contain Unicode escape sequences (\uxxxx) which
48 // are rather uncommon elsewhere. As our regexp libs don't understand
49 // them we do the unescaping ourselves internally.
50 // Also make sure to expand out any nulls as pcre_compile
51 // expects null termination..
52 static UString
sanitizePattern(const UString
&p
)
56 const char* const nil
= "\\x00";
57 if (p
.find("\\u") >= 0 || p
.find(KJS::UChar('\0')) >= 0) {
59 for (int i
= 0; i
< p
.size(); ++i
) {
63 // we only care about \u
65 // standard unicode escape sequence looks like \uxxxx but
66 // other browsers also accept less than 4 hex digits
69 for (j
= 0; j
< 4; ++j
) {
70 if (i
+ 1 < p
.size() && Lexer::isHexDigit(p
[i
+ 1].unicode())) {
71 u
= (u
<< 4) + Lexer::convertHex(p
[i
+ 1].unicode());
74 // sequence incomplete. restore index.
75 // TODO: cleaner way to propagate warning
76 fprintf(stderr
, "KJS: saw %d digit \\u sequence.\n", j
);
82 // sequence was incomplete. treat \u as u which IE always
83 // and FF sometimes does.
84 newPattern
.append(UString('u'));
89 // Make sure to encode 0, to avoid terminating the string
90 newPattern
+= UString(nil
);
103 // escape pattern characters have to remain escaped
104 newPattern
.append(UString('\\'));
105 // intentional fallthrough
107 newPattern
+= UString(&c
, 1);
113 newPattern
+= UString('\\');
114 newPattern
+= UString(&c
, 1);
119 newPattern
+= UString(nil
);
121 newPattern
+= UString(&c
, 1);
130 RegExp::RegExp(const UString
&p
, char flags
)
131 : _pat(p
), _flags(flags
), _valid(true), _numSubPatterns(0), _buffer(0), _originalPos(0)
133 #ifdef HAVE_PCREPOSIX
134 // Determine whether libpcre has unicode support if need be..
135 if (utf8Support
== Unknown
) {
137 pcre_config(PCRE_CONFIG_UTF8
, (void*)&supported
);
138 utf8Support
= supported
? Supported
: Unsupported
;
142 UString intern
= sanitizePattern(p
);
144 #ifdef HAVE_PCREPOSIX
147 // we are close but not 100% the same as Perl
148 #ifdef PCRE_JAVASCRIPT_COMPAT // introduced in PCRE 7.7
149 options
|= PCRE_JAVASCRIPT_COMPAT
;
152 // Note: the Global flag is already handled by RegExpProtoFunc::execute.
153 // FIXME: That last comment is dubious. Not all RegExps get run through RegExpProtoFunc::execute.
154 if (flags
& IgnoreCase
)
155 options
|= PCRE_CASELESS
;
156 if (flags
& Multiline
)
157 options
|= PCRE_MULTILINE
;
159 if (utf8Support
== Supported
)
160 options
|= (PCRE_UTF8
| PCRE_NO_UTF8_CHECK
);
162 const char *errorMessage
;
165 // Fill our buffer with an encoded version, whether utf-8, or,
166 // if PCRE is incapable, truncated.
167 prepareMatch(intern
);
168 _regex
= pcre_compile(_buffer
, options
, &errorMessage
, &errorOffset
, NULL
);
169 doneMatch(); //Cleanup buffers
172 fprintf(stderr
, "KJS: pcre_compile() failed with '%s'\n", errorMessage
);
178 #ifdef PCRE_INFO_CAPTURECOUNT
179 // Get number of subpatterns that will be returned.
180 pcre_fullinfo(_regex
, NULL
, PCRE_INFO_CAPTURECOUNT
, &_numSubPatterns
);
183 #else /* HAVE_PCREPOSIX */
187 regflags
|= REG_EXTENDED
;
190 if ( flags
& IgnoreCase
)
191 regflags
|= REG_ICASE
;
194 //NOTE: Multiline is not feasible with POSIX regex.
195 //if ( f & Multiline )
197 // Note: the Global flag is already handled by RegExpProtoFunc::execute
199 int errorCode
= regcomp(&_regex
, intern
.ascii(), regflags
);
200 if (errorCode
!= 0) {
202 char errorMessage
[80];
203 regerror(errorCode
, &_regex
, errorMessage
, sizeof errorMessage
);
204 fprintf(stderr
, "KJS: regcomp failed with '%s'\n", errorMessage
);
213 doneMatch(); // Be 100% sure buffers are freed
214 #ifdef HAVE_PCREPOSIX
217 /* TODO: is this really okay after an error ? */
222 void RegExp::prepareUtf8(const UString
& s
)
224 // Allocate a buffer big enough to hold all the characters plus \0
225 const int length
= s
.size();
226 _buffer
= new char[length
* 3 + 1];
228 // Also create buffer for positions. We need one extra character in there,
229 // even past the \0 since the non-empty handling may jump one past the end
230 _originalPos
= new int[length
* 3 + 2];
232 // Convert to runs of 8-bit characters, and generate indices
233 // Note that we do NOT combine surrogate pairs here, as
234 // regexps operate on them as separate characters
236 int *posOut
= _originalPos
;
237 const UChar
*d
= s
.data();
238 for (int i
= 0; i
!= length
; ++i
) {
239 unsigned short c
= d
[i
].unicode();
245 } else if (c
< 0x800) {
246 *p
++ = (char)((c
>> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
247 *p
++ = (char)((c
| 0x80) & 0xBF); // next 6 bits, with high bit set
250 *p
++ = (char)((c
>> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
251 *p
++ = (char)(((c
>> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
252 *p
++ = (char)((c
| 0x80) & 0xBF); // next 6 bits, with high bit set
256 while (sequenceLen
> 0) {
263 _bufferSize
= p
- _buffer
;
267 // Record positions for \0, and the fictional character after that.
269 *(posOut
+1) = length
+1;
272 void RegExp::prepareASCII (const UString
& s
)
276 // Best-effort attempt to get something done
277 // when we don't have utf 8 available -- use
278 // truncated version, and pray for the best
279 CString truncated
= s
.cstring();
280 _buffer
= new char[truncated
.size() + 1];
281 memcpy(_buffer
, truncated
.c_str(), truncated
.size());
282 _buffer
[truncated
.size()] = '\0'; // For _compile use
283 _bufferSize
= truncated
.size();
286 void RegExp::prepareMatch(const UString
&s
)
288 delete[] _originalPos
; // Just to be sure..
290 if (utf8Support
== Supported
)
300 void RegExp::doneMatch()
302 delete[] _originalPos
; _originalPos
= 0;
303 delete[] _buffer
; _buffer
= 0;
306 UString
RegExp::match(const UString
&s
, bool *error
, int i
, int *pos
, int **ovector
)
309 assert(s
.data() == _originalS
.data()); // Make sure prepareMatch got called right..
321 if (i
> s
.size() || s
.isNull())
322 return UString::null();
324 #ifdef HAVE_PCREPOSIX
327 return UString::null();
329 // Set up the offset vector for the result.
330 // First 2/3 used for result, the last third used by PCRE.
332 int offsetVectorSize
;
333 int fixedSizeOffsetVector
[3];
335 offsetVectorSize
= 3;
336 offsetVector
= fixedSizeOffsetVector
;
338 offsetVectorSize
= (_numSubPatterns
+ 1) * 3;
339 offsetVector
= new int [offsetVectorSize
];
343 if (utf8Support
== Supported
) {
345 while (_originalPos
[startPos
] < i
)
351 int baseFlags
= utf8Support
== Supported
? PCRE_NO_UTF8_CHECK
: 0;
353 // See if we have to limit stack space...
355 int stackGlutton
= 0;
356 pcre_config(PCRE_CONFIG_STACKRECURSE
, (void*)&stackGlutton
);
359 limits
.flags
= PCRE_EXTRA_MATCH_LIMIT_RECURSION
;
360 // libPCRE docs claim that it munches about 500 bytes per recursion.
361 // The crash in #160792 actually showed pcre 7.4 using about 1300 bytes
362 // (and I've measured 800 in an another instance)
363 // So the usual 8MiB rlimit on Linux produces about 6452 frames.
364 // We go somewhat conservative, and use about 2/3rds of that, for 4300
365 // especially since we're not exactly light on the stack, either
366 // ### TODO: get some build system help to use getrlimit.
367 limits
.match_limit_recursion
= 4300;
370 const int numMatches
= pcre_exec(_regex
, stackGlutton
? &limits
: 0, _buffer
, _bufferSize
, startPos
, baseFlags
, offsetVector
, offsetVectorSize
);
372 //Now go through and patch up the offsetVector
373 if (utf8Support
== Supported
)
374 for (int c
= 0; c
< 2 * numMatches
; ++c
)
375 if (offsetVector
[c
] != -1)
376 offsetVector
[c
] = _originalPos
[offsetVector
[c
]];
378 if (numMatches
< 0) {
380 if (numMatches
!= PCRE_ERROR_NOMATCH
)
381 fprintf(stderr
, "KJS: pcre_exec() failed with result %d\n", numMatches
);
383 if (offsetVector
!= fixedSizeOffsetVector
)
384 delete [] offsetVector
;
385 if (numMatches
== PCRE_ERROR_MATCHLIMIT
|| numMatches
== PCRE_ERROR_RECURSIONLIMIT
)
387 return UString::null();
390 *pos
= offsetVector
[0];
392 *ovector
= offsetVector
;
393 return s
.substr(offsetVector
[0], offsetVector
[1] - offsetVector
[0]);
398 return UString::null();
400 const unsigned maxMatch
= 10;
401 regmatch_t rmatch
[maxMatch
];
403 char *str
= strdup(s
.ascii()); // TODO: why ???
404 if (regexec(&_regex
, str
+ i
, maxMatch
, rmatch
, 0)) {
406 return UString::null();
411 *pos
= rmatch
[0].rm_so
+ i
;
412 return s
.substr(rmatch
[0].rm_so
+ i
, rmatch
[0].rm_eo
- rmatch
[0].rm_so
);
415 // map rmatch array to ovector used in PCRE case
417 for(unsigned j
= 1; j
< maxMatch
&& rmatch
[j
].rm_so
>= 0 ; j
++)
419 int ovecsize
= (_numSubPatterns
+1)*3; // see above
420 *ovector
= new int[ovecsize
];
421 for (unsigned j
= 0; j
< _numSubPatterns
+ 1; j
++) {
424 (*ovector
)[2*j
] = rmatch
[j
].rm_so
+ i
;
425 (*ovector
)[2*j
+1] = rmatch
[j
].rm_eo
+ i
;
428 *pos
= (*ovector
)[0];
429 return s
.substr((*ovector
)[0], (*ovector
)[1] - (*ovector
)[0]);