utf8: add unit test for g_utf8_make_valid
[glib.git] / glib / update-pcre / ucp.patch
blob402020fa889ab02debcaa9dd59eab6d7a26c2a14
1 From 23d48c5fc7aa889dc7798f9c64acd43d9cb34683 Mon Sep 17 00:00:00 2001
2 From: Christian Persch <chpe@gnome.org>
3 Date: Sun, 12 Feb 2012 21:20:33 +0100
4 Subject: [PATCH] regex: Use glib for unicode data
6 Use g_unichar_type() and g_unichar_get_script() instead of pcre tables.
7 ---
8 glib/pcre/pcre_compile.c | 26 +++---
9 glib/pcre/pcre_dfa_exec.c | 96 ++++++++--------
10 glib/pcre/pcre_exec.c | 26 +++---
11 glib/pcre/pcre_internal.h | 11 +--
12 glib/pcre/pcre_tables.c | 16 +++
13 glib/pcre/pcre_xclass.c | 24 ++--
14 glib/pcre/ucp.h | 265 +++++++++++++++++++++++----------------------
15 7 files changed, 239 insertions(+), 225 deletions(-)
17 diff --git a/glib/pcre/pcre_compile.c b/glib/pcre/pcre_compile.c
18 index 21bef80..a6c84e1 100644
19 --- a/glib/pcre/pcre_compile.c
20 +++ b/glib/pcre/pcre_compile.c
21 @@ -2920,43 +2920,43 @@ Returns: TRUE if auto-possessifying is OK
22 static BOOL
23 check_char_prop(int c, int ptype, int pdata, BOOL negated)
25 -const ucd_record *prop = GET_UCD(c);
26 +const pcre_uint8 chartype = UCD_CHARTYPE(c);
27 switch(ptype)
29 case PT_LAMP:
30 - return (prop->chartype == ucp_Lu ||
31 - prop->chartype == ucp_Ll ||
32 - prop->chartype == ucp_Lt) == negated;
33 + return (chartype == ucp_Lu ||
34 + chartype == ucp_Ll ||
35 + chartype == ucp_Lt) == negated;
37 case PT_GC:
38 - return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
39 + return (pdata == PRIV(ucp_gentype)[chartype]) == negated;
41 case PT_PC:
42 - return (pdata == prop->chartype) == negated;
43 + return (pdata == chartype) == negated;
45 case PT_SC:
46 - return (pdata == prop->script) == negated;
47 + return (pdata == UCD_SCRIPT(c)) == negated;
49 /* These are specials */
51 case PT_ALNUM:
52 - return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
53 - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
54 + return (PRIV(ucp_gentype)[chartype] == ucp_L ||
55 + PRIV(ucp_gentype)[chartype] == ucp_N) == negated;
57 case PT_SPACE: /* Perl space */
58 - return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
59 + return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
60 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
61 == negated;
63 case PT_PXSPACE: /* POSIX space */
64 - return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
65 + return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
66 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
67 c == CHAR_FF || c == CHAR_CR)
68 == negated;
70 case PT_WORD:
71 - return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
72 - PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
73 + return (PRIV(ucp_gentype)[chartype] == ucp_L ||
74 + PRIV(ucp_gentype)[chartype] == ucp_N ||
75 c == CHAR_UNDERSCORE) == negated;
77 return FALSE;
78 diff --git a/glib/pcre/pcre_dfa_exec.c b/glib/pcre/pcre_dfa_exec.c
79 index 9565d46..3f913ce 100644
80 --- a/glib/pcre/pcre_dfa_exec.c
81 +++ b/glib/pcre/pcre_dfa_exec.c
82 @@ -1060,7 +1060,7 @@ for (;;)
83 if (clen > 0)
85 BOOL OK;
86 - const ucd_record * prop = GET_UCD(c);
87 + const pcre_uint8 chartype = UCD_CHARTYPE(c);
88 switch(code[1])
90 case PT_ANY:
91 @@ -1068,43 +1068,43 @@ for (;;)
92 break;
94 case PT_LAMP:
95 - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
96 - prop->chartype == ucp_Lt;
97 + OK = chartype == ucp_Lu || chartype == ucp_Ll ||
98 + chartype == ucp_Lt;
99 break;
101 case PT_GC:
102 - OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
103 + OK = PRIV(ucp_gentype)[chartype] == code[2];
104 break;
106 case PT_PC:
107 - OK = prop->chartype == code[2];
108 + OK = chartype == code[2];
109 break;
111 case PT_SC:
112 - OK = prop->script == code[2];
113 + OK = UCD_SCRIPT(c) == code[2];
114 break;
116 /* These are specials for combination cases. */
118 case PT_ALNUM:
119 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
120 - PRIV(ucp_gentype)[prop->chartype] == ucp_N;
121 + OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
122 + PRIV(ucp_gentype)[chartype] == ucp_N;
123 break;
125 case PT_SPACE: /* Perl space */
126 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
127 + OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
128 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
129 break;
131 case PT_PXSPACE: /* POSIX space */
132 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
133 + OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
134 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
135 c == CHAR_FF || c == CHAR_CR;
136 break;
138 case PT_WORD:
139 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
140 - PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
141 + OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
142 + PRIV(ucp_gentype)[chartype] == ucp_N ||
143 c == CHAR_UNDERSCORE;
144 break;
146 @@ -1294,7 +1294,7 @@ for (;;)
147 if (clen > 0)
149 BOOL OK;
150 - const ucd_record * prop = GET_UCD(c);
151 + const pcre_uint8 chartype = UCD_CHARTYPE(c);
152 switch(code[2])
154 case PT_ANY:
155 @@ -1302,43 +1302,43 @@ for (;;)
156 break;
158 case PT_LAMP:
159 - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
160 - prop->chartype == ucp_Lt;
161 + OK = chartype == ucp_Lu || chartype == ucp_Ll ||
162 + chartype == ucp_Lt;
163 break;
165 case PT_GC:
166 - OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
167 + OK = PRIV(ucp_gentype)[chartype] == code[3];
168 break;
170 case PT_PC:
171 - OK = prop->chartype == code[3];
172 + OK = chartype == code[3];
173 break;
175 case PT_SC:
176 - OK = prop->script == code[3];
177 + OK = UCD_SCRIPT(c) == code[3];
178 break;
180 /* These are specials for combination cases. */
182 case PT_ALNUM:
183 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
184 - PRIV(ucp_gentype)[prop->chartype] == ucp_N;
185 + OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
186 + PRIV(ucp_gentype)[chartype] == ucp_N;
187 break;
189 case PT_SPACE: /* Perl space */
190 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
191 + OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
192 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
193 break;
195 case PT_PXSPACE: /* POSIX space */
196 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
197 + OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
198 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
199 c == CHAR_FF || c == CHAR_CR;
200 break;
202 case PT_WORD:
203 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
204 - PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
205 + OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
206 + PRIV(ucp_gentype)[chartype] == ucp_N ||
207 c == CHAR_UNDERSCORE;
208 break;
210 @@ -1541,7 +1541,7 @@ for (;;)
211 if (clen > 0)
213 BOOL OK;
214 - const ucd_record * prop = GET_UCD(c);
215 + const pcre_uint8 chartype = UCD_CHARTYPE(c);
216 switch(code[2])
218 case PT_ANY:
219 @@ -1549,43 +1549,43 @@ for (;;)
220 break;
222 case PT_LAMP:
223 - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
224 - prop->chartype == ucp_Lt;
225 + OK = chartype == ucp_Lu || chartype == ucp_Ll ||
226 + chartype == ucp_Lt;
227 break;
229 case PT_GC:
230 - OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
231 + OK = PRIV(ucp_gentype)[chartype] == code[3];
232 break;
234 case PT_PC:
235 - OK = prop->chartype == code[3];
236 + OK = chartype == code[3];
237 break;
239 case PT_SC:
240 - OK = prop->script == code[3];
241 + OK = UCD_SCRIPT(c) == code[3];
242 break;
244 /* These are specials for combination cases. */
246 case PT_ALNUM:
247 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
248 - PRIV(ucp_gentype)[prop->chartype] == ucp_N;
249 + OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
250 + PRIV(ucp_gentype)[chartype] == ucp_N;
251 break;
253 case PT_SPACE: /* Perl space */
254 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
255 + OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
256 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
257 break;
259 case PT_PXSPACE: /* POSIX space */
260 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
261 + OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
262 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
263 c == CHAR_FF || c == CHAR_CR;
264 break;
266 case PT_WORD:
267 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
268 - PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
269 + OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
270 + PRIV(ucp_gentype)[chartype] == ucp_N ||
271 c == CHAR_UNDERSCORE;
272 break;
274 @@ -1813,7 +1813,7 @@ for (;;)
275 if (clen > 0)
277 BOOL OK;
278 - const ucd_record * prop = GET_UCD(c);
279 + const pcre_uint8 chartype = UCD_CHARTYPE(c);
280 switch(code[1 + IMM2_SIZE + 1])
282 case PT_ANY:
283 @@ -1821,43 +1821,43 @@ for (;;)
284 break;
286 case PT_LAMP:
287 - OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
288 - prop->chartype == ucp_Lt;
289 + OK = chartype == ucp_Lu || chartype == ucp_Ll ||
290 + chartype == ucp_Lt;
291 break;
293 case PT_GC:
294 - OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
295 + OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2];
296 break;
298 case PT_PC:
299 - OK = prop->chartype == code[1 + IMM2_SIZE + 2];
300 + OK = chartype == code[1 + IMM2_SIZE + 2];
301 break;
303 case PT_SC:
304 - OK = prop->script == code[1 + IMM2_SIZE + 2];
305 + OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2];
306 break;
308 /* These are specials for combination cases. */
310 case PT_ALNUM:
311 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
312 - PRIV(ucp_gentype)[prop->chartype] == ucp_N;
313 + OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
314 + PRIV(ucp_gentype)[chartype] == ucp_N;
315 break;
317 case PT_SPACE: /* Perl space */
318 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
319 + OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
320 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
321 break;
323 case PT_PXSPACE: /* POSIX space */
324 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
325 + OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
326 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
327 c == CHAR_FF || c == CHAR_CR;
328 break;
330 case PT_WORD:
331 - OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
332 - PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
333 + OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
334 + PRIV(ucp_gentype)[chartype] == ucp_N ||
335 c == CHAR_UNDERSCORE;
336 break;
338 diff --git a/glib/pcre/pcre_exec.c b/glib/pcre/pcre_exec.c
339 index 830b8b5..c89a3f9 100644
340 --- a/glib/pcre/pcre_exec.c
341 +++ b/glib/pcre/pcre_exec.c
342 @@ -2565,7 +2565,7 @@ for (;;)
344 GETCHARINCTEST(c, eptr);
346 - const ucd_record *prop = GET_UCD(c);
347 + const pcre_uint8 chartype = UCD_CHARTYPE(c);
349 switch(ecode[1])
351 @@ -2574,44 +2574,44 @@ for (;;)
352 break;
354 case PT_LAMP:
355 - if ((prop->chartype == ucp_Lu ||
356 - prop->chartype == ucp_Ll ||
357 - prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
358 + if ((chartype == ucp_Lu ||
359 + chartype == ucp_Ll ||
360 + chartype == ucp_Lt) == (op == OP_NOTPROP))
361 RRETURN(MATCH_NOMATCH);
362 break;
364 case PT_GC:
365 - if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
366 + if ((ecode[2] != PRIV(ucp_gentype)[chartype]) == (op == OP_PROP))
367 RRETURN(MATCH_NOMATCH);
368 break;
370 case PT_PC:
371 - if ((ecode[2] != prop->chartype) == (op == OP_PROP))
372 + if ((ecode[2] != chartype) == (op == OP_PROP))
373 RRETURN(MATCH_NOMATCH);
374 break;
376 case PT_SC:
377 - if ((ecode[2] != prop->script) == (op == OP_PROP))
378 + if ((ecode[2] != UCD_SCRIPT(c)) == (op == OP_PROP))
379 RRETURN(MATCH_NOMATCH);
380 break;
382 /* These are specials */
384 case PT_ALNUM:
385 - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
386 - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
387 + if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
388 + PRIV(ucp_gentype)[chartype] == ucp_N) == (op == OP_NOTPROP))
389 RRETURN(MATCH_NOMATCH);
390 break;
392 case PT_SPACE: /* Perl space */
393 - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
394 + if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
395 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
396 == (op == OP_NOTPROP))
397 RRETURN(MATCH_NOMATCH);
398 break;
400 case PT_PXSPACE: /* POSIX space */
401 - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
402 + if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
403 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
404 c == CHAR_FF || c == CHAR_CR)
405 == (op == OP_NOTPROP))
406 @@ -2619,8 +2619,8 @@ for (;;)
407 break;
409 case PT_WORD:
410 - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
411 - PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
412 + if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
413 + PRIV(ucp_gentype)[chartype] == ucp_N ||
414 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
415 RRETURN(MATCH_NOMATCH);
416 break;
417 diff --git a/glib/pcre/pcre_internal.h b/glib/pcre/pcre_internal.h
418 index 181c312..234af1b 100644
419 --- a/glib/pcre/pcre_internal.h
420 +++ b/glib/pcre/pcre_internal.h
421 @@ -2329,15 +2329,12 @@ extern const int PRIV(ucp_typerange)[];
422 #ifdef SUPPORT_UCP
423 /* UCD access macros */
425 -#define UCD_BLOCK_SIZE 128
426 -#define GET_UCD(ch) (PRIV(ucd_records) + \
427 - PRIV(ucd_stage2)[PRIV(ucd_stage1)[(ch) / UCD_BLOCK_SIZE] * \
428 - UCD_BLOCK_SIZE + (ch) % UCD_BLOCK_SIZE])
429 +unsigned int _pcre_ucp_othercase(const unsigned int c);
431 -#define UCD_CHARTYPE(ch) GET_UCD(ch)->chartype
432 -#define UCD_SCRIPT(ch) GET_UCD(ch)->script
433 +#define UCD_CHARTYPE(ch) (pcre_uint8)g_unichar_type((gunichar)(ch))
434 +#define UCD_SCRIPT(ch) (pcre_uint8)g_unichar_get_script((gunichar)(ch))
435 #define UCD_CATEGORY(ch) PRIV(ucp_gentype)[UCD_CHARTYPE(ch)]
436 -#define UCD_OTHERCASE(ch) (ch + GET_UCD(ch)->other_case)
437 +#define UCD_OTHERCASE(ch) (_pcre_ucp_othercase(ch))
439 #endif /* SUPPORT_UCP */
441 diff --git a/glib/pcre/pcre_tables.c b/glib/pcre/pcre_tables.c
442 index 7ac2d89..e401974 100644
443 --- a/glib/pcre/pcre_tables.c
444 +++ b/glib/pcre/pcre_tables.c
445 @@ -584,6 +584,22 @@ const ucp_type_table PRIV(utt)[] = {
447 const int PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table);
449 +unsigned int
450 +_pcre_ucp_othercase(const unsigned int c)
452 + int other_case = NOTACHAR;
454 + if (g_unichar_islower(c))
455 + other_case = g_unichar_toupper(c);
456 + else if (g_unichar_isupper(c))
457 + other_case = g_unichar_tolower(c);
459 + if (other_case == c)
460 + other_case = NOTACHAR;
462 + return other_case;
465 #endif /* SUPPORT_UTF */
467 /* End of pcre_tables.c */
468 diff --git a/glib/pcre/pcre_xclass.c b/glib/pcre/pcre_xclass.c
469 index dca7a39..e5a55d7 100644
470 --- a/glib/pcre/pcre_xclass.c
471 +++ b/glib/pcre/pcre_xclass.c
472 @@ -127,7 +127,7 @@ while ((t = *data++) != XCL_END)
473 #ifdef SUPPORT_UCP
474 else /* XCL_PROP & XCL_NOTPROP */
476 - const ucd_record *prop = GET_UCD(c);
477 + const pcre_uint8 chartype = UCD_CHARTYPE(c);
479 switch(*data)
481 @@ -136,46 +136,46 @@ while ((t = *data++) != XCL_END)
482 break;
484 case PT_LAMP:
485 - if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
486 - prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
487 + if ((chartype == ucp_Lu || chartype == ucp_Ll ||
488 + chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
489 break;
491 case PT_GC:
492 - if ((data[1] == PRIV(ucp_gentype)[prop->chartype]) == (t == XCL_PROP))
493 + if ((data[1] == PRIV(ucp_gentype)[chartype]) == (t == XCL_PROP))
494 return !negated;
495 break;
497 case PT_PC:
498 - if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated;
499 + if ((data[1] == chartype) == (t == XCL_PROP)) return !negated;
500 break;
502 case PT_SC:
503 - if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated;
504 + if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated;
505 break;
507 case PT_ALNUM:
508 - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
509 - PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (t == XCL_PROP))
510 + if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
511 + PRIV(ucp_gentype)[chartype] == ucp_N) == (t == XCL_PROP))
512 return !negated;
513 break;
515 case PT_SPACE: /* Perl space */
516 - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
517 + if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
518 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
519 == (t == XCL_PROP))
520 return !negated;
521 break;
523 case PT_PXSPACE: /* POSIX space */
524 - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
525 + if ((PRIV(ucp_gentype)[chartype] == ucp_Z ||
526 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
527 c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
528 return !negated;
529 break;
531 case PT_WORD:
532 - if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
533 - PRIV(ucp_gentype)[prop->chartype] == ucp_N || c == CHAR_UNDERSCORE)
534 + if ((PRIV(ucp_gentype)[chartype] == ucp_L ||
535 + PRIV(ucp_gentype)[chartype] == ucp_N || c == CHAR_UNDERSCORE)
536 == (t == XCL_PROP))
537 return !negated;
538 break;
539 diff --git a/glib/pcre/ucp.h b/glib/pcre/ucp.h
540 index 59c3bec..53a48c9 100644
541 --- a/glib/pcre/ucp.h
542 +++ b/glib/pcre/ucp.h
543 @@ -10,6 +10,7 @@ the UCD access macros. New values that are added for new releases of Unicode
544 should always be at the end of each enum, for backwards compatibility. */
546 /* These are the general character categories. */
547 +#include "gunicode.h"
549 enum {
550 ucp_C, /* Other */
551 @@ -24,148 +25,148 @@ enum {
552 /* These are the particular character types. */
554 enum {
555 - ucp_Cc, /* Control */
556 - ucp_Cf, /* Format */
557 - ucp_Cn, /* Unassigned */
558 - ucp_Co, /* Private use */
559 - ucp_Cs, /* Surrogate */
560 - ucp_Ll, /* Lower case letter */
561 - ucp_Lm, /* Modifier letter */
562 - ucp_Lo, /* Other letter */
563 - ucp_Lt, /* Title case letter */
564 - ucp_Lu, /* Upper case letter */
565 - ucp_Mc, /* Spacing mark */
566 - ucp_Me, /* Enclosing mark */
567 - ucp_Mn, /* Non-spacing mark */
568 - ucp_Nd, /* Decimal number */
569 - ucp_Nl, /* Letter number */
570 - ucp_No, /* Other number */
571 - ucp_Pc, /* Connector punctuation */
572 - ucp_Pd, /* Dash punctuation */
573 - ucp_Pe, /* Close punctuation */
574 - ucp_Pf, /* Final punctuation */
575 - ucp_Pi, /* Initial punctuation */
576 - ucp_Po, /* Other punctuation */
577 - ucp_Ps, /* Open punctuation */
578 - ucp_Sc, /* Currency symbol */
579 - ucp_Sk, /* Modifier symbol */
580 - ucp_Sm, /* Mathematical symbol */
581 - ucp_So, /* Other symbol */
582 - ucp_Zl, /* Line separator */
583 - ucp_Zp, /* Paragraph separator */
584 - ucp_Zs /* Space separator */
585 + ucp_Cc = G_UNICODE_CONTROL, /* Control */
586 + ucp_Cf = G_UNICODE_FORMAT, /* Format */
587 + ucp_Cn = G_UNICODE_UNASSIGNED, /* Unassigned */
588 + ucp_Co = G_UNICODE_PRIVATE_USE, /* Private use */
589 + ucp_Cs = G_UNICODE_SURROGATE, /* Surrogate */
590 + ucp_Ll = G_UNICODE_LOWERCASE_LETTER, /* Lower case letter */
591 + ucp_Lm = G_UNICODE_MODIFIER_LETTER, /* Modifier letter */
592 + ucp_Lo = G_UNICODE_OTHER_LETTER, /* Other letter */
593 + ucp_Lt = G_UNICODE_TITLECASE_LETTER, /* Title case letter */
594 + ucp_Lu = G_UNICODE_UPPERCASE_LETTER, /* Upper case letter */
595 + ucp_Mc = G_UNICODE_SPACING_MARK, /* Spacing mark */
596 + ucp_Me = G_UNICODE_ENCLOSING_MARK, /* Enclosing mark */
597 + ucp_Mn = G_UNICODE_NON_SPACING_MARK, /* Non-spacing mark */
598 + ucp_Nd = G_UNICODE_DECIMAL_NUMBER, /* Decimal number */
599 + ucp_Nl = G_UNICODE_LETTER_NUMBER, /* Letter number */
600 + ucp_No = G_UNICODE_OTHER_NUMBER, /* Other number */
601 + ucp_Pc = G_UNICODE_CONNECT_PUNCTUATION, /* Connector punctuation */
602 + ucp_Pd = G_UNICODE_DASH_PUNCTUATION, /* Dash punctuation */
603 + ucp_Pe = G_UNICODE_CLOSE_PUNCTUATION, /* Close punctuation */
604 + ucp_Pf = G_UNICODE_FINAL_PUNCTUATION, /* Final punctuation */
605 + ucp_Pi = G_UNICODE_INITIAL_PUNCTUATION, /* Initial punctuation */
606 + ucp_Po = G_UNICODE_OTHER_PUNCTUATION, /* Other punctuation */
607 + ucp_Ps = G_UNICODE_OPEN_PUNCTUATION, /* Open punctuation */
608 + ucp_Sc = G_UNICODE_CURRENCY_SYMBOL, /* Currency symbol */
609 + ucp_Sk = G_UNICODE_MODIFIER_SYMBOL, /* Modifier symbol */
610 + ucp_Sm = G_UNICODE_MATH_SYMBOL, /* Mathematical symbol */
611 + ucp_So = G_UNICODE_OTHER_SYMBOL, /* Other symbol */
612 + ucp_Zl = G_UNICODE_LINE_SEPARATOR, /* Line separator */
613 + ucp_Zp = G_UNICODE_PARAGRAPH_SEPARATOR, /* Paragraph separator */
614 + ucp_Zs = G_UNICODE_SPACE_SEPARATOR /* Space separator */
617 /* These are the script identifications. */
619 enum {
620 - ucp_Arabic,
621 - ucp_Armenian,
622 - ucp_Bengali,
623 - ucp_Bopomofo,
624 - ucp_Braille,
625 - ucp_Buginese,
626 - ucp_Buhid,
627 - ucp_Canadian_Aboriginal,
628 - ucp_Cherokee,
629 - ucp_Common,
630 - ucp_Coptic,
631 - ucp_Cypriot,
632 - ucp_Cyrillic,
633 - ucp_Deseret,
634 - ucp_Devanagari,
635 - ucp_Ethiopic,
636 - ucp_Georgian,
637 - ucp_Glagolitic,
638 - ucp_Gothic,
639 - ucp_Greek,
640 - ucp_Gujarati,
641 - ucp_Gurmukhi,
642 - ucp_Han,
643 - ucp_Hangul,
644 - ucp_Hanunoo,
645 - ucp_Hebrew,
646 - ucp_Hiragana,
647 - ucp_Inherited,
648 - ucp_Kannada,
649 - ucp_Katakana,
650 - ucp_Kharoshthi,
651 - ucp_Khmer,
652 - ucp_Lao,
653 - ucp_Latin,
654 - ucp_Limbu,
655 - ucp_Linear_B,
656 - ucp_Malayalam,
657 - ucp_Mongolian,
658 - ucp_Myanmar,
659 - ucp_New_Tai_Lue,
660 - ucp_Ogham,
661 - ucp_Old_Italic,
662 - ucp_Old_Persian,
663 - ucp_Oriya,
664 - ucp_Osmanya,
665 - ucp_Runic,
666 - ucp_Shavian,
667 - ucp_Sinhala,
668 - ucp_Syloti_Nagri,
669 - ucp_Syriac,
670 - ucp_Tagalog,
671 - ucp_Tagbanwa,
672 - ucp_Tai_Le,
673 - ucp_Tamil,
674 - ucp_Telugu,
675 - ucp_Thaana,
676 - ucp_Thai,
677 - ucp_Tibetan,
678 - ucp_Tifinagh,
679 - ucp_Ugaritic,
680 - ucp_Yi,
681 + ucp_Arabic = G_UNICODE_SCRIPT_ARABIC,
682 + ucp_Armenian = G_UNICODE_SCRIPT_ARMENIAN,
683 + ucp_Bengali = G_UNICODE_SCRIPT_BENGALI,
684 + ucp_Bopomofo = G_UNICODE_SCRIPT_BOPOMOFO,
685 + ucp_Braille = G_UNICODE_SCRIPT_BRAILLE,
686 + ucp_Buginese = G_UNICODE_SCRIPT_BUGINESE,
687 + ucp_Buhid = G_UNICODE_SCRIPT_BUHID,
688 + ucp_Canadian_Aboriginal = G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL,
689 + ucp_Cherokee = G_UNICODE_SCRIPT_CHEROKEE,
690 + ucp_Common = G_UNICODE_SCRIPT_COMMON,
691 + ucp_Coptic = G_UNICODE_SCRIPT_COPTIC,
692 + ucp_Cypriot = G_UNICODE_SCRIPT_CYPRIOT,
693 + ucp_Cyrillic = G_UNICODE_SCRIPT_CYRILLIC,
694 + ucp_Deseret = G_UNICODE_SCRIPT_DESERET,
695 + ucp_Devanagari = G_UNICODE_SCRIPT_DEVANAGARI,
696 + ucp_Ethiopic = G_UNICODE_SCRIPT_ETHIOPIC,
697 + ucp_Georgian = G_UNICODE_SCRIPT_GEORGIAN,
698 + ucp_Glagolitic = G_UNICODE_SCRIPT_GLAGOLITIC,
699 + ucp_Gothic = G_UNICODE_SCRIPT_GOTHIC,
700 + ucp_Greek = G_UNICODE_SCRIPT_GREEK,
701 + ucp_Gujarati = G_UNICODE_SCRIPT_GUJARATI,
702 + ucp_Gurmukhi = G_UNICODE_SCRIPT_GURMUKHI,
703 + ucp_Han = G_UNICODE_SCRIPT_HAN,
704 + ucp_Hangul = G_UNICODE_SCRIPT_HANGUL,
705 + ucp_Hanunoo = G_UNICODE_SCRIPT_HANUNOO,
706 + ucp_Hebrew = G_UNICODE_SCRIPT_HEBREW,
707 + ucp_Hiragana = G_UNICODE_SCRIPT_HIRAGANA,
708 + ucp_Inherited = G_UNICODE_SCRIPT_INHERITED,
709 + ucp_Kannada = G_UNICODE_SCRIPT_KANNADA,
710 + ucp_Katakana = G_UNICODE_SCRIPT_KATAKANA,
711 + ucp_Kharoshthi = G_UNICODE_SCRIPT_KHAROSHTHI,
712 + ucp_Khmer = G_UNICODE_SCRIPT_KHMER,
713 + ucp_Lao = G_UNICODE_SCRIPT_LAO,
714 + ucp_Latin = G_UNICODE_SCRIPT_LATIN,
715 + ucp_Limbu = G_UNICODE_SCRIPT_LIMBU,
716 + ucp_Linear_B = G_UNICODE_SCRIPT_LINEAR_B,
717 + ucp_Malayalam = G_UNICODE_SCRIPT_MALAYALAM,
718 + ucp_Mongolian = G_UNICODE_SCRIPT_MONGOLIAN,
719 + ucp_Myanmar = G_UNICODE_SCRIPT_MYANMAR,
720 + ucp_New_Tai_Lue = G_UNICODE_SCRIPT_NEW_TAI_LUE,
721 + ucp_Ogham = G_UNICODE_SCRIPT_OGHAM,
722 + ucp_Old_Italic = G_UNICODE_SCRIPT_OLD_ITALIC,
723 + ucp_Old_Persian = G_UNICODE_SCRIPT_OLD_PERSIAN,
724 + ucp_Oriya = G_UNICODE_SCRIPT_ORIYA,
725 + ucp_Osmanya = G_UNICODE_SCRIPT_OSMANYA,
726 + ucp_Runic = G_UNICODE_SCRIPT_RUNIC,
727 + ucp_Shavian = G_UNICODE_SCRIPT_SHAVIAN,
728 + ucp_Sinhala = G_UNICODE_SCRIPT_SINHALA,
729 + ucp_Syloti_Nagri = G_UNICODE_SCRIPT_SYLOTI_NAGRI,
730 + ucp_Syriac = G_UNICODE_SCRIPT_SYRIAC,
731 + ucp_Tagalog = G_UNICODE_SCRIPT_TAGALOG,
732 + ucp_Tagbanwa = G_UNICODE_SCRIPT_TAGBANWA,
733 + ucp_Tai_Le = G_UNICODE_SCRIPT_TAI_LE,
734 + ucp_Tamil = G_UNICODE_SCRIPT_TAMIL,
735 + ucp_Telugu = G_UNICODE_SCRIPT_TELUGU,
736 + ucp_Thaana = G_UNICODE_SCRIPT_THAANA,
737 + ucp_Thai = G_UNICODE_SCRIPT_THAI,
738 + ucp_Tibetan = G_UNICODE_SCRIPT_TIBETAN,
739 + ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH,
740 + ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC,
741 + ucp_Yi = G_UNICODE_SCRIPT_YI,
742 /* New for Unicode 5.0: */
743 - ucp_Balinese,
744 - ucp_Cuneiform,
745 - ucp_Nko,
746 - ucp_Phags_Pa,
747 - ucp_Phoenician,
748 + ucp_Balinese = G_UNICODE_SCRIPT_BALINESE,
749 + ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM,
750 + ucp_Nko = G_UNICODE_SCRIPT_NKO,
751 + ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA,
752 + ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN,
753 /* New for Unicode 5.1: */
754 - ucp_Carian,
755 - ucp_Cham,
756 - ucp_Kayah_Li,
757 - ucp_Lepcha,
758 - ucp_Lycian,
759 - ucp_Lydian,
760 - ucp_Ol_Chiki,
761 - ucp_Rejang,
762 - ucp_Saurashtra,
763 - ucp_Sundanese,
764 - ucp_Vai,
765 + ucp_Carian = G_UNICODE_SCRIPT_CARIAN,
766 + ucp_Cham = G_UNICODE_SCRIPT_CHAM,
767 + ucp_Kayah_Li = G_UNICODE_SCRIPT_KAYAH_LI,
768 + ucp_Lepcha = G_UNICODE_SCRIPT_LEPCHA,
769 + ucp_Lycian = G_UNICODE_SCRIPT_LYCIAN,
770 + ucp_Lydian = G_UNICODE_SCRIPT_LYDIAN,
771 + ucp_Ol_Chiki = G_UNICODE_SCRIPT_OL_CHIKI,
772 + ucp_Rejang = G_UNICODE_SCRIPT_REJANG,
773 + ucp_Saurashtra = G_UNICODE_SCRIPT_SAURASHTRA,
774 + ucp_Sundanese = G_UNICODE_SCRIPT_SUNDANESE,
775 + ucp_Vai = G_UNICODE_SCRIPT_VAI,
776 /* New for Unicode 5.2: */
777 - ucp_Avestan,
778 - ucp_Bamum,
779 - ucp_Egyptian_Hieroglyphs,
780 - ucp_Imperial_Aramaic,
781 - ucp_Inscriptional_Pahlavi,
782 - ucp_Inscriptional_Parthian,
783 - ucp_Javanese,
784 - ucp_Kaithi,
785 - ucp_Lisu,
786 - ucp_Meetei_Mayek,
787 - ucp_Old_South_Arabian,
788 - ucp_Old_Turkic,
789 - ucp_Samaritan,
790 - ucp_Tai_Tham,
791 - ucp_Tai_Viet,
792 + ucp_Avestan = G_UNICODE_SCRIPT_AVESTAN,
793 + ucp_Bamum = G_UNICODE_SCRIPT_BAMUM,
794 + ucp_Egyptian_Hieroglyphs = G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS,
795 + ucp_Imperial_Aramaic = G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC,
796 + ucp_Inscriptional_Pahlavi = G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI,
797 + ucp_Inscriptional_Parthian = G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN,
798 + ucp_Javanese = G_UNICODE_SCRIPT_JAVANESE,
799 + ucp_Kaithi = G_UNICODE_SCRIPT_KAITHI,
800 + ucp_Lisu = G_UNICODE_SCRIPT_LISU,
801 + ucp_Meetei_Mayek = G_UNICODE_SCRIPT_MEETEI_MAYEK,
802 + ucp_Old_South_Arabian = G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN,
803 + ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC,
804 + ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
805 + ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
806 + ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET,
807 /* New for Unicode 6.0.0: */
808 - ucp_Batak,
809 - ucp_Brahmi,
810 - ucp_Mandaic,
811 + ucp_Batak = G_UNICODE_SCRIPT_BATAK,
812 + ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI,
813 + ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC,
814 /* New for Unicode 6.1.0: */
815 - ucp_Chakma,
816 - ucp_Meroitic_Cursive,
817 - ucp_Meroitic_Hieroglyphs,
818 - ucp_Miao,
819 - ucp_Sharada,
820 - ucp_Sora_Sompeng,
821 - ucp_Takri
822 + ucp_Chakma = G_UNICODE_SCRIPT_CHAKMA,
823 + ucp_Meroitic_Cursive = G_UNICODE_SCRIPT_MEROITIC_CURSIVE,
824 + ucp_Meroitic_Hieroglyphs = G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS,
825 + ucp_Miao = G_UNICODE_SCRIPT_MIAO,
826 + ucp_Sharada = G_UNICODE_SCRIPT_SHARADA,
827 + ucp_Sora_Sompeng = G_UNICODE_SCRIPT_SORA_SOMPENG,
828 + ucp_Takri = G_UNICODE_SCRIPT_TAKRI,
831 #endif
833 1.7.5.1.217.g4e3aa.dirty