3 # Original script modified in November 2003 to take advantage of
4 # the character-validation range routines, and updated to the
5 # current Unicode information (Version 4.0.1)
7 # NOTE: there is an 'alias' facility for blocks which are not present in
8 # the current release, but are needed for ABI compatibility. This
9 # must be accomplished MANUALLY! Please see the comments below under
# Provenance of the Unicode data; both values are interpolated into the
# banner comments of the generated files.  `sources` is a whitespace-separated
# pair of file names: the blocks file first, then the character-data file.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
20 # blockAliases is a small hack - it is used for mapping block names which
21 # were used in the 3.1 release, but are missing or changed in the current
22 # release. The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
# Register the legacy block names that must keep resolving.  Each entry has
# the form "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"; the entries are
# appended in the same order as before.
for _alias_spec in (
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
):
    blockAliases.append(_alias_spec)
29 # minTableSize gives the minimum number of ranges which must be present
30 # before a range table is produced. If there are fewer than this
31 # number, inline comparisons are generated
# `sources` names the two input files separated by whitespace: the blocks
# file first, then the character-data (categories) file.
blockfile, catfile = string.split(sources)
38 # Now process the "blocks" file, reducing it to a dictionary
39 # indexed by blockname, containing a tuple with the applicable
44 blocks
= open(blockfile
, "r")
46 print "Missing %s, aborting ..." % blockfile
49 for line
in blocks
.readlines():
52 line
= string
.strip(line
)
56 fields
= string
.split(line
, ';')
57 range = string
.strip(fields
[0])
58 (start
, end
) = string
.split(range, "..")
59 name
= string
.strip(fields
[1])
60 name
= string
.replace(name
, ' ', '')
62 print "Failed to process line: %s" % (line
)
67 BlockNames
[name
].append((start
, end
))
69 BlockNames
[name
] = [(start
, end
)]
71 print "Parsed %d blocks descriptions" % (len(BlockNames
.keys()))
73 for block
in blockAliases
:
74 alias
= string
.split(block
,':')
75 alist
= string
.split(alias
[1],',')
77 if BlockNames
.has_key(comp
):
78 if alias
[0] not in BlockNames
:
79 BlockNames
[alias
[0]] = []
80 for r
in BlockNames
[comp
]:
81 BlockNames
[alias
[0]].append(r
)
83 print "Alias %s: %s not in Blocks" % (alias
[0], comp
)
87 # Next process the Categories file. This is more complex, since
88 # the file is in code sequence, and we need to invert it. We use
89 # a dictionary with index category-name, with each entry containing
90 # all the ranges (codepoints) of that category. Note that category
91 # names comprise two parts - the general category, and the "subclass"
92 # within that category. Therefore, both "general category" (which is
93 # the first character of the 2-character category-name) and the full
94 # (2-character) name are entered into this dictionary.
97 data
= open(catfile
, "r")
99 print "Missing %s, aborting ..." % catfile
104 for line
in data
.readlines():
107 line
= string
.strip(line
)
111 fields
= string
.split(line
, ';')
112 point
= string
.strip(fields
[0])
116 if point
[0] >= '0' and point
[0] <= '9':
117 value
= value
+ ord(point
[0]) - ord('0')
118 elif point
[0] >= 'A' and point
[0] <= 'F':
119 value
= value
+ 10 + ord(point
[0]) - ord('A')
120 elif point
[0] >= 'a' and point
[0] <= 'f':
121 value
= value
+ 10 + ord(point
[0]) - ord('a')
125 print "Failed to process line: %s" % (line
)
129 # update entry for "full name"
131 Categories
[name
].append(value
)
134 Categories
[name
] = [value
]
136 print "Failed to process line: %s" % (line
)
137 # update "general category" name
139 Categories
[name
[0]].append(value
)
142 Categories
[name
[0]] = [value
]
144 print "Failed to process line: %s" % (line
)
147 print "Parsed %d char generating %d categories" % (nbchar
, len(Categories
.keys()))
150 # The data is now all read. Time to process it into a more useful form.
152 # reduce the number list into ranges
153 for cat
in Categories
.keys():
154 list = Categories
[cat
]
164 elif val
== prev
+ 1:
168 ranges
.append((prev
, prev
))
173 ranges
.append((start
, prev
))
178 ranges
.append((prev
, prev
))
180 ranges
.append((start
, prev
))
181 Categories
[cat
] = ranges
184 # Ensure all data is in alphabetical order, since we will be doing binary
185 # searches on the tables.
187 bkeys
= BlockNames
.keys()
190 ckeys
= Categories
.keys()
194 # Generate the resulting files
197 header
= open("include/libxml/xmlunicode.h", "w")
199 print "Failed to open include/libxml/xmlunicode.h"
203 output
= open("xmlunicode.c", "w")
205 print "Failed to open xmlunicode.c"
# Human-readable local timestamp, interpolated into the "Generation date"
# line of the generated files' banner comments.  time.localtime() without an
# argument uses the current time.
date = time.asctime(time.localtime())
212 * Summary: Unicode character APIs
213 * Description: API for the Unicode character APIs
215 * This file is automatically generated from the
216 * UCS description files of the Unicode Character Database
218 * using the genUnicode.py Python script.
220 * Generation date: %s
222 * Author: Daniel Veillard
225 #ifndef __XML_UNICODE_H__
226 #define __XML_UNICODE_H__
228 #include <libxml/xmlversion.h>
230 #ifdef LIBXML_UNICODE_ENABLED
236 """ % (webpage
, date
, sources
));
240 * xmlunicode.c: this module implements the Unicode character APIs
242 * This file is automatically generated from the
243 * UCS description files of the Unicode Character Database
245 * using the genUnicode.py Python script.
247 * Generation date: %s
249 * Daniel Veillard <veillard@redhat.com>
255 #ifdef LIBXML_UNICODE_ENABLED
258 #include <libxml/xmlversion.h>
259 #include <libxml/xmlunicode.h>
260 #include <libxml/chvalid.h>
262 typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */
265 const char *rangename;
270 xmlUnicodeRange *table;
272 } xmlUnicodeNameTable;
275 static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
277 static xmlUnicodeRange xmlUnicodeBlocks[] = {
278 """ % (webpage
, date
, sources
));
282 name
= string
.replace(block
, '-', '')
287 output
.write(' {"%s", xmlUCSIs%s}' % (block
, name
))
288 output
.write('};\n\n')
290 output
.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
297 output
.write(' {"%s", xmlUCSIsCat%s}' % (name
, name
))
298 output
.write('};\n\n')
301 # For any categories with more than minTableSize ranges we generate
302 # a range table suitable for xmlCharInRange
305 if len(Categories
[name
]) > minTableSize
:
308 ranges
= Categories
[name
]
315 pline
= "static const xmlChSRange xml%sS[] = {" % name
316 sptr
= "xml%sS" % name
323 output
.write(pline
+ " };\n")
324 pline
= "static const xmlChLRange xml%sL[] = {" % name
325 lptr
= "xml%sL" % name
330 output
.write(pline
+ "\n")
332 pline
+= "{%s, %s}" % (hex(low
), hex(high
))
333 output
.write(pline
+ " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
334 % (name
, numshort
, numlong
, sptr
, lptr
))
338 """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
339 static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
343 * @tptr: pointer to the name table
344 * @name: name to be found
346 * binary table lookup for user-supplied name
348 * Returns pointer to range function if found, otherwise NULL
351 *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
352 int low, high, mid, cmp;
353 xmlUnicodeRange *sptr;
355 if ((tptr == NULL) || (tname == NULL)) return(NULL);
358 high = tptr->numentries - 1;
360 while (low <= high) {
361 mid = (low + high) / 2;
362 if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
363 return (sptr[mid].func);
372 """ % (len(BlockNames
), len(Categories
)) )
375 name
= string
.replace(block
, '-', '')
376 header
.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name
)
377 output
.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name
))
378 output
.write(" *\n * Check whether the character is part of %s UCS Block\n"%
380 output
.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
381 output
.write("int\nxmlUCSIs%s(int code) {\n return(" % name
)
383 for (start
, end
) in BlockNames
[block
]:
385 output
.write(" ||\n ")
388 output
.write("((code >= %s) && (code <= %s))" % (start
, end
))
389 output
.write(");\n}\n\n")
391 header
.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
395 * @code: UCS code point
396 * @block: UCS block name
398 * Check whether the character is part of the UCS Block
400 * Returns 1 if true, 0 if false and -1 on unknown block
403 xmlUCSIsBlock(int code, const char *block) {
406 func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
415 ranges
= Categories
[name
]
416 header
.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name
)
417 output
.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name
))
418 output
.write(" *\n * Check whether the character is part of %s UCS Category\n"%
420 output
.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
421 output
.write("int\nxmlUCSIsCat%s(int code) {\n" % name
)
422 if len(Categories
[name
]) > minTableSize
:
423 output
.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
428 (begin
, end
) = range;
430 output
.write(" return(");
433 output
.write(" ||\n ");
435 output
.write("(code == %s)" % (hex(begin
)))
437 output
.write("((code >= %s) && (code <= %s))" % (
438 hex(begin
), hex(end
)))
439 output
.write(");\n}\n\n")
441 header
.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
445 * @code: UCS code point
446 * @cat: UCS Category name
448 * Check whether the character is part of the UCS Category
450 * Returns 1 if true, 0 if false and -1 on unknown category
453 xmlUCSIsCat(int code, const char *cat) {
456 func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
462 #define bottom_xmlunicode
463 #include "elfgcchack.h"
464 #endif /* LIBXML_UNICODE_ENABLED */
472 #endif /* LIBXML_UNICODE_ENABLED */
474 #endif /* __XML_UNICODE_H__ */