Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / layout / mathml / tools / encode.pl
blob7c2292ac6f81dddf812cff9fcf06eff752b71c0b
1 #!/bin/perl
4 # ***** BEGIN LICENSE BLOCK *****
5 # Version: MPL 1.1/GPL 2.0/LGPL 2.1
7 # The contents of this file are subject to the Mozilla Public License Version
8 # 1.1 (the "License"); you may not use this file except in compliance with
9 # the License. You may obtain a copy of the License at
10 # http://www.mozilla.org/MPL/
12 # Software distributed under the License is distributed on an "AS IS" basis,
13 # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
14 # for the specific language governing rights and limitations under the
15 # License.
17 # The Original Code is Mozilla MathML Project.
19 # The Initial Developer of the Original Code is
20 # The University Of Queensland.
21 # Portions created by the Initial Developer are Copyright (C) 2001
22 # the Initial Developer. All Rights Reserved.
24 # Contributor(s):
25 # Roger B. Sidje <rbs@maths.uq.edu.au> - Original Author
27 # Alternatively, the contents of this file may be used under the terms of
28 # either of the GNU General Public License Version 2 or later (the "GPL"),
29 # or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 # in which case the provisions of the GPL or the LGPL are applicable instead
31 # of those above. If you wish to allow use of your version of this file only
32 # under the terms of either the GPL or the LGPL, and not to allow others to
33 # use your version of this file under the terms of the MPL, indicate your
34 # decision by deleting the provisions above and replace them with the notice
35 # and other provisions required by the GPL or the LGPL. If you do not delete
36 # the provisions above, a recipient may use your version of this file under
37 # the terms of any one of the MPL, the GPL or the LGPL.
39 # ***** END LICENSE BLOCK *****
41 # RBS - Last Modified: March 14, 2001.
44 # Usage: perl encode.pl [-h] -f font-encoding-table.html [-t truetype | type1]
47 require 'getopts.pl';
48 our($opt_h, $opt_f, $opt_t);
50 require 'mathfont.pl';
# When true, output_stretchy_data writes each entry and its entity comment
# on a single line; otherwise the comment goes on its own line first.
my $DEBUG = 1;

# Location of the MathML DTD (read by load_DTD to annotate the output).
my $DTDfile = "C:\\Mozilla\\src\\mozilla\\layout\\mathml\\content\\src\\mathml.dtd";

# Location of the PUA assignments that are loaded at start-up.
my $PUAfile = "C:\\Mozilla\\src\\mozilla\\layout\\mathml\\base\\src\\mathfontPUA.properties";

# Where the PUA assignments are written back; same file by default.
my $newPUAfile = $PUAfile;

# Basename of the running script (everything after the last / or \).
my $progname = ($0 =~ /([^\/\\]+)$/)[0];
#########################################################################
# usage:
#
# Abort the script with the help text. If a non-empty reason is passed,
# a "Bad usage: <reason>" line is printed before the help text.
# This routine never returns: it always ends with die.
#########################################################################
sub usage
{
  my($reason) = @_;

  my($helptext) = <<__USAGE__;
Usage: perl $progname [-h] -f fontfile.html [-t truetype | type1]
Purpose
This script takes in input a Mozilla's MathFont Encoding Table and
outputs the data needed to support the font, i.e., the data for the
ucvmath module and the data for the MathFont Property File.
Options
-h (help, this message)
-f fontfile.html
The file that contains the font's encoding table. To see examples,
launch 'view-source' on the .html files of the default
fonts at: http://www.mozilla.org/projects/mathml/fonts/encoding/
These .html files are *exactly* the files passed to this script
with this -f option. For example, to get the data currently used
by Mozilla for the Math1 font, this script was executed as:
encode.pl -f math1.html
The script parses the .html file to extract the mapping enclosed in
<!--cmap-->
...
<!--/cmap-->
Not only does this ease maintenance, but it also means that the website
reflects the latest updates. Note however that the MathFont Property
Files are also hand-tuned to refine the results and to add further
customization. Such refinements should not be done on the files generated
by this script directly (see below). Otherwise you will overwrite and
thus lose your changes if you run the script again.
-t encodingtype
This option should not be used if the TrueType and Type1 versions
have the same encoding. If there are different encodings, use
-t truetype : to process the TrueType encoding table inside the file
-t type1 : to process the Type1 encoding table inside the file
OUTPUT:
Given the fontfile.html on input, three files are created on output:
1. a file called fontname-ucvmath.txt : this is the data to pass to
"umaptable" for the ucvmath module.
For example, running "encode.pl -f math1.html" produces math1-ucvmath.txt.
Then, running "umaptable -uf < math1-ucvmath.txt > mathematica1.uf" will
produce the resulting file that is currently in mozilla/intl/uconv/ucvmath.
"umaptable" is a standalone C program that can be built from
mozilla/intl/uconv/tools and the executable copied where you want.
2. fontname-properties.txt : this is the data for the MathFont Property File.
Continuing with the example of Math1, a file called math1-properties.txt
will be created. Then after hand-tuning (if necessary), the file is copied
to its final destination:
mozilla/layout/mathml/base/src/mathfontMath1.properties
3. fontname-encoding.html : the is a prettified output that shows the
resulting mappings. For the example of Math1, this is the file that is at:
http://www.mozilla.org/projects/mathml/fonts/encoding/math1-encoding.html
4. Furthermore, since new assignments can be made to the PUA during
processing, the mathfontPUA.properties file is updated with any new
assignments that may have been made.
__USAGE__

  # only decorate the reason when there is one to show
  if ($reason) {
    $reason = "\nBad usage: $reason\n";
  }

  # the help text ends with a newline, so die adds no "at ... line N"
  die "$reason\n$helptext";
}
# Command line: -h (help), -f file (required), -t type (optional)
usage("Missing arguments") unless &Getopts('hf:t:');
usage("") if $opt_h;

# The file that contains the MathFont Encoding Table(s); it is looked up
# inside the "encoding" sub-directory.
usage("Missing file") unless $opt_f;
my $encoding_file = "encoding\\" . $opt_f;

# The encoding type must be one of:
# . "" (empty), when TrueType and Type1 share a single encoding
# . "truetype", to process the TrueType encoding table inside the file
# . "type1", to process the Type1 encoding table inside the file
my $encoding_type = $opt_t;
if ($encoding_type ne "" &&
    $encoding_type ne "type1" &&
    $encoding_type ne "truetype") {
  usage("Unexpected type");
}

# Processing starts here
#######################################################################

# global variables -- in capitals

# $UNIDATA{$unicode} holds a whitespace-separated list of entities and
# annotated unicode points that all resolve to that $unicode point.
# %UNIDATA is declared in mathfont.pl

# $GLYPHDATA{$glyph} holds a whitespace-separated list of entities and
# annotated unicode points that all resolve to that $glyph index.
my %GLYPHDATA;

# $MAP{$glyph} holds the ultimate resolved unicode point of the $glyph
# (could be a PUA value).
my %MAP;

# 1. load the current assignments to the PUA
&load_PUA($PUAfile);

# 2. parse the supplied encoding data
my $fontfamily = parse_mapping_table($encoding_file, $encoding_type);

# 3. resolve all mappings
generate_mapping_data($fontfamily);

# 4. save the PUA so that any new assignment is preserved
&save_PUA($PUAfile, $newPUAfile);

# 5. dump the results
output_stretchy_data($fontfamily, $encoding_type);
output_mapping_data($fontfamily, $encoding_type);
output_encoding_map($fontfamily, $encoding_type);

exit(0);
193 #########################################################################
194 # parse_mapping_table:
196 # Parse a file containing the mapping between glyph indices and
197 # unicode points. The file is formatted as:
198 # <!--cmap:FontFamilyName(:truetype|:type1)-->
199 # 0xNN 0xNNNN #optional trailing comment
200 # 0xNN 0xNNNN:0
201 # 0xNN 0xNNNN:1
202 # -0xNN
203 # 0xNN 0xNNNN:T, 0xABCD
204 # 0xNN 0xNNNN:G, 0xABCD
205 # 0xNN >PUA #required comment with &entity;
206 # ...etc
207 # <!--/cmap:FontFamilyName(:truetype|:type1)-->
209 # Note that there is an enclosing: <cmap> ... </cmap> with the *required*
210 # FontFamilyName and the optional type (exactly 'truetype' or 'type1').
212 # The first field may contain a dash '-', in which case the line is skipped.
213 # The second field is the glyph index.
214 # The third field is a comma-delimited list of its associated unicode points
215 # with annotations w.r.t their applicability ('T', 'L', 'M', 'B', 'R', 'G', '0'-'9').
217 # Partial glyphs can apply to different chars. The annotation for '0' is
218 # optional, i.e., it is assumed if no other annotation has been specified.
219 # The list can run over many lines provided the preceding line ends
220 # with a comma.
222 # If the keyword '>PUA' is present, or the unicode point is in plane 1,
223 # a PUA code is assigned, and is associated with the required '&entity;'
224 # that must be provided in the comment field.
225 #########################################################################
sub parse_mapping_table
{
  # Arguments:
  #   $file - path of the .html file holding the <!--cmap:...--> section
  #   $type - "" (or undef), "truetype" or "type1"
  # Returns the font family name found in the cmap tag.
  # Side effect: fills %GLYPHDATA, then calls verify_mapping_table().
  # Dies if the file cannot be read, if the cmap tags are missing or
  # mismatched, or if a '>PUA'/plane 1 line has no &entity; in its comment.
  my($file, $type) = @_;

  # three-argument open on a lexical handle: $file is only ever a path
  open(my $fh, '<', $file) || die "Cannot find $file\n";
  my($data) = join("", <$fh>);
  close($fh);

  my($fontfamily);
  $type = $type ? ':' . $type : '';
  if ($data =~ m|<!--cmap:([^>]+?)\Q$type\E-->(.+?)<!--/cmap:([^>]+?)\Q$type\E-->|s) {
    die "ERROR *** Bad mapping: mistmatching tags $type start:$1 end:$3" unless $1 eq $3;
    ($fontfamily, $data) = ($1, $2);
    # A ':' left in the name means the file has typed tables but no -t was
    # given. Report $fontfamily: the /:/ match below resets $1.
    die "ERROR *** No specified font type $fontfamily" if $fontfamily =~ /:/;
  }
  else {
    die "ERROR *** Bad mapping: data must be enclosed in the <cmap> tag";
  }

  my($isContinuation) = 0;
  my($oldline, $oldcomment) = ('', '');

  my(@lines) = split("\n", $data);
  foreach my $line (@lines) {
    # skip bogus lines
    next if $line =~ m/^-/;

    # remove leading and trailing whitespace
    $line =~ s/^\s+//; $line =~ s/\s+$//;

    # cache comments in case the keyword '>PUA' is present
    my($comment) = ($line =~ m/#(.*)/) ? $1 : "";
    $line =~ s/\s*#.*//;

    # see if this is the continuation of a longer line
    if ($isContinuation) {
      $line = $oldline . ' ' . $line;
      $comment = $oldcomment . ' ' . $comment;
      $isContinuation = 0;
    }

    # if the line ends with a comma, the next line is a continuation
    if ($line =~ m/,$/) {
      $oldline = $line;
      $oldcomment = $comment;
      $isContinuation = 1;
      next;
    }

    # get the mapping on the line: glyph index, then the rest
    next unless ($line =~ m/^(0x..)\s+(.+)$/);
    my($glyph, $mapping) = ($1, $2);

    # see if this a '>PUA' or plane 1 character
    if (($mapping eq '>PUA') || ($mapping =~ m/^0x1..../)) {
      # the referrers are the &entity; names listed in the comment
      my($entitylist) = '';
      while ($comment =~ /&(.+?);/g) {
        $entitylist .= ' ' . $1;
      }
      $entitylist =~ s/^\s//;
      die "ERROR *** No entities found: $line" unless $entitylist;
      $GLYPHDATA{$glyph} = $entitylist;

      # continue to next line
      next;
    }

    # skip bogus lines where uncertainties still remain
    # (the temporary trailing space lets one pattern cover "end of list")
    $mapping .= ' ';
    next unless $mapping =~ /^0x....\s*[, ]/ || $mapping =~ /^0x....:[TLMBRG0-9]\s*[, ]/;
    chop($mapping);

    # convert from comma-delimited to whitespace-delimited
    $mapping =~ s/\s*,\s*/ /g;

    # add explicit 0 at size0
    $mapping =~ s/(0x....) /$1:0 /;
    $mapping =~ s/(0x....)$/$1:0/;

    # $GLYPHDATA{$glyph} is a list of referrers to that glyph
    $GLYPHDATA{$glyph} = $mapping;
  }

  # check for correctness
  &verify_mapping_table();

  return $fontfamily;
}
316 #########################################################################
317 # verify_mapping_table:
319 # helper to check that some common mistakes were not made when
320 # setting up the encoding table.
321 #########################################################################
sub verify_mapping_table
{
  # Walk every referrer of every glyph in %GLYPHDATA and die with an
  # "ERROR *** Duplicate" message if the same referrer shows up on another
  # glyph's list, or more than once on its own list.
  my(@glyphs) = keys %GLYPHDATA;

  for my $glyph (@glyphs) {
    for my $entry (split(' ', $GLYPHDATA{$glyph})) {
      # the entry must not be listed on a distinct line
      for my $other (@glyphs) {
        if ($other ne $glyph && $GLYPHDATA{$other} =~ /\b$entry\b/) {
          die "ERROR *** Duplicate: $glyph $GLYPHDATA{$glyph} vs. $other $GLYPHDATA{$other}";
        }
      }

      # the entry must not be listed twice on its own line: drop the
      # first occurrence from a copy and look for a second one
      (my $rest = $GLYPHDATA{$glyph}) =~ s/\b$entry\b//;
      if ($rest =~ /\b$entry\b/) {
        die "ERROR *** Duplicate: $glyph $GLYPHDATA{$glyph}";
      }
    }
  }
}
343 #########################################################################
344 # generate_mapping_data:
346 # This routine resolves the unicode points (PUA and non-PUA) of all
347 # elements. Upon completion, the assignments for all entities, annotated
348 # unicode points, and glyph indices are known.
349 #########################################################################
sub generate_mapping_data
{
  # $font is accepted from the caller but is not used in this routine
  my($font) = @_;

  for my $glyph (sort keys %GLYPHDATA) {
    # $GLYPHDATA{$glyph} is the list of referrers to that glyph
    my(@referrers) = split(' ', $GLYPHDATA{$glyph});

    # look for a referrer that already pins down the unicode point:
    # either the explicit size0 annotation or a known assignment
    my($resolved) = '';
    for my $referrer (@referrers) {
      if ($referrer =~ /(0x....):0/) {
        $resolved = $1;
        last;
      }
      $resolved = get_unicode($referrer);
      last if $resolved;
    }

    if (!$resolved) {
      # nobody is resolved yet: make a new assignment to the PUA
      $resolved = &assign_PUA($GLYPHDATA{$glyph});
    }
    else {
      # make every referrer (except the size0 one) resolve to that point
      for my $referrer (@referrers) {
        next if $referrer =~ /0x....:0/;
        if (!defined $UNIDATA{$resolved}) {
          $UNIDATA{$resolved} = $referrer;
        }
        elsif ($UNIDATA{$resolved} !~ /\b$referrer\b/) {
          $UNIDATA{$resolved} .= " $referrer";
        }
      }
    }

    # now, we know the unicode point of the glyph
    die "ERROR *** Duplicate $glyph" if defined $MAP{$glyph};
    $MAP{$glyph} = $resolved;
  }

  # check that all went well
  verify_mapping_data();
}
398 #########################################################################
399 # verify_mapping_data:
401 # helper to check the validity of the resolved mapping.
402 #########################################################################
sub verify_mapping_data
{
  # Walk every referrer of every slot in %UNIDATA and die with an
  # "ERROR *** Duplicate" message if the referrer is attached to two
  # distinct PUA slots, or appears twice on its own slot.
  my(@slots) = keys %UNIDATA;

  for my $unicode (@slots) {
    for my $entry (split(' ', $UNIDATA{$unicode})) {
      for my $other (@slots) {
        next if $other eq $unicode;
        # different mappings outside the PUA are of no concern because
        # they are not kept in the mathfontPUA.properties file
        next unless is_pua($other) && is_pua($unicode);
        if ($UNIDATA{$other} =~ /\b$entry\b/) {
          # two PUA slots claim the same annotated code point
          die "ERROR *** Duplicate: $unicode $UNIDATA{$unicode} vs. $other $UNIDATA{$other}";
        }
      }

      # the entry must not be listed twice on the same slot: drop the
      # first occurrence from a copy and look for a second one
      (my $rest = $UNIDATA{$unicode}) =~ s/\b$entry\b//;
      if ($rest =~ /\b$entry\b/) {
        die "ERROR *** Duplicate: $unicode $UNIDATA{$unicode}";
      }
    }
  }
}
#########################################################################
# is_pua:
#
# Return 1 if the argument is a '0x'-prefixed hexadecimal value that lies
# in the range 0xE000..0xF8FF, and 0 otherwise (including when the '0x'
# prefix is missing).
#########################################################################
sub is_pua
{
  my($value) = @_;

  # without the leading '0x' the value is never considered
  return 0 unless defined $value && $value =~ /^0x(.*)/s;

  my($codepoint) = hex($1);
  return ($codepoint >= 0xE000 && $codepoint <= 0xF8FF) ? 1 : 0;
}
439 #########################################################################
440 # output_mapping_data:
442 # output the mapping data that goes in the ucvmath module
443 #########################################################################
sub output_mapping_data
{
  # Arguments:
  #   $font - font family name (used to build the output file name)
  #   $type - "" (or undef), "truetype" or "type1"
  # Writes one "glyph unicode" line per entry of %MAP, sorted by glyph,
  # into "<font>-ucvmath[-<type>].txt" (lowercased, spaces removed) in the
  # current directory. Dies if the file cannot be opened or written.
  my($font, $type) = @_;

  $type = $type ? "-$type" : '';
  my($file) = "$font-ucvmath$type.txt";
  $file =~ s/ //g;
  $file = lc($file);
  print "Saving mapping data of $font in: $file\n";

  # three-argument open on a lexical handle: the name is only ever a path
  open(my $fh, '>', $file) || die "Cannot open $file\n";
  foreach my $glyph (sort keys %MAP) {
    print {$fh} "$glyph $MAP{$glyph}\n";
  }
  # buffered write errors only surface when the handle is closed
  close($fh) || die "Cannot write $file: $!\n";
}
463 #########################################################################
464 # output_stretchy_data:
466 # output the list of stretchy data in the compact format expected by
467 # the MathFont Property File.
468 #########################################################################
sub output_stretchy_data
{
  # Arguments:
  #   $font - font family name (used to build the output file name)
  #   $type - "" (or undef), "truetype" or "type1"
  # Writes "<font>-properties[-<type>].txt" (lowercased, spaces removed):
  # one line per unicode point that has partial glyphs and/or glyphs of
  # larger sizes, annotated with the entities read from the DTD.
  # Dies if the file cannot be opened or written.
  my($font, $type) = @_;

  $type = $type ? "-$type" : '';
  my($file) = "$font-properties$type.txt";
  $file =~ s/ //g;
  $file = lc($file);

  print "Saving properties of $font in: $file\n";
  &load_DTD($DTDfile);

  # one pattern per part slot, in the order they are written out;
  # each captures the glyph index that carries the annotation
  my(@patterns) = qw {
    (\S+):[TL]
    (\S+):M
    (\S+):[BR]
    (\S+):G
  };

  # construct the _transpose_ of the GLYPHDATA table so that upon
  # completion, $table{unicode} = list of its associated glyph
  # indices with the annotations flipped on the other side.
  # Only the keys are visited: iterating over the hash itself would
  # also feed the values in as if they were glyph indices.
  my(%table);
  foreach my $glyph (sort keys %GLYPHDATA) {
    foreach my $entry (split(' ', $GLYPHDATA{$glyph})) {
      # the '0x' prefix is not kept here
      next unless $entry =~ m|0x(....):(.)|;
      my($unicode, $label) = ($1, $2);
      if (defined $table{$unicode}) {
        $table{$unicode} .= " $glyph:$label";
      }
      else {
        $table{$unicode} = "$glyph:$label";
      }
    }
  }

  # now, replace the glyph indices with their resolved unicode points
  open(my $fh, '>', $file) || die "Cannot open $file\n";
  foreach my $unicode (sort keys %table) {
    my($isMutable) = 0;

    # partial glyphs; a missing part is flagged with \uFFFD
    my($parts) = '';
    foreach my $pattern (@patterns) {
      if ($table{$unicode} =~ m/$pattern/) {
        my($glyph) = $1;
        $parts .= &indirect_pua($MAP{$glyph});
        $isMutable = 1;
      }
      else {
        $parts .= '\uFFFD';
      }
    }

    # glyphs of larger sizes: labels 1, 2, ... until one is missing
    my($sizes) = "\\u$unicode"; # size0 is the char itself
    my($label) = '1';
    while ($table{$unicode} =~ m/(\S+):$label/) {
      my($glyph) = $1;
      $sizes .= &indirect_pua($MAP{$glyph});
      ++$label;
      $isMutable = 1;
    }

    # ignore this character if it is not mutable
    next unless $isMutable;

    # entry for the list of glyphs; the comment is empty when the DTD
    # has no entity for this unicode point
    my($comment) = $ENTITY{"0x$unicode"};
    $comment = '' unless defined $comment;
    if ($DEBUG) {
      print {$fh} "\\u$unicode = $parts$sizes $comment\n";
    }
    else {
      print {$fh} "$comment\n\\u$unicode = $parts$sizes\n";
    }
  }
  # buffered write errors only surface when the handle is closed
  close($fh) || die "Cannot write $file: $!\n";
}
#########################################################################
# indirect_pua:
#
# Turn a resolved '0xNNNN' unicode point into the '\uNNNN' form written
# to the property file. Any point in 0xE000..0xF8FF is replaced by the
# flag '\uF8FF' (meaning: look it up in the PUA); 0xF8FF itself is
# therefore forbidden as a mapping and makes the routine die.
#########################################################################
sub indirect_pua
{
  my($code) = @_;

  my($codepoint) = hex($code);
  if ($codepoint == 0xF8FF) {
    die "ERROR *** Forbidden mapping";
  }

  # a PUA value is not written out, only the flag is
  return '\uF8FF' if $codepoint >= 0xE000 && $codepoint <= 0xF8FF;

  # otherwise re-use the existing unicode point
  (my $escaped = $code) =~ s/0x/\\u/;
  return $escaped;
}
572 #########################################################################
573 # output_encoding_map:
575 # output the encoding map in html format for visual comparison with the
576 # graphical character map.
577 #########################################################################
sub output_encoding_map
{
  # Arguments:
  #   $font - font family name (file name, page title and CSS font-family)
  #   $type - "" (or undef), "truetype" or "type1"
  # Writes "<font>[-t1|-ttf]-encoding.html" (lowercased, spaces removed):
  # a 16x16 table indexed by the two hex digits of the glyph index, each
  # used cell showing the resolved unicode point, the glyph itself and
  # the referrers of %GLYPHDATA. Dies if the file cannot be opened or
  # written.
  my($font, $type) = @_;
  $type = $type ? "-$type" : '';

  my($shorttype) = '';
  $shorttype = '-t1' if $type eq '-type1';
  $shorttype = '-ttf' if $type eq '-truetype';

  my($file) = "$font$shorttype-encoding.html";
  $file =~ s/ //g;
  $file = lc($file);
  print "Saving visual encoding result of $font in: $file\n";

  # three-argument open on a lexical handle: the name is only ever a path
  open(my $fh, '>', $file) || die "Cannot open $file\n";

  # header
  print {$fh} "<html>\n"
            . "<head>\n"
            . " <title>$font$type - Visual Encoding Result</title>\n"
            . " <style type='text/css'>.glyph {font-family: $font} </style>\n"
            . "</head>\n"
            . "<body>\n"
            . "<h2>$font$type - Visual Encoding Result</h2>\n\n";

  # column indices
  print {$fh} "<table border='1' cellpadding='4'>\n<tr align='center'><td> </td>";
  foreach my $j (0 .. 15) {
    print {$fh} sprintf("<td bgcolor='silver'>%X </td>", $j);
  }
  print {$fh} "</tr>\n";

  # cmap array: row = high hex digit, column = low hex digit
  foreach my $i (0 .. 15) {
    print {$fh} sprintf("<tr align='center'><td>%X </td>", $i);
    foreach my $j (0 .. 15) {
      my($glyph) = sprintf("0x%X%X", $i, $j);
      if (defined $GLYPHDATA{$glyph}) {
        my($unicode) = $MAP{$glyph};
        my($data) = $GLYPHDATA{$glyph};
        # 0xNNNN -> &#xNNNN; so that the browser renders the character
        $unicode =~ s|0(x....)|&#$1;|;
        # one referrer per line, with code point and annotation coloured
        $data =~ s| |<br />|g;
        $data =~ s|(0x....):(.)|<font color='darkblue'>$1</font>:<font color='brown'>$2</font>|g;
        print {$fh} "<td>"
                  . "<font size='-1' color='#666666'>$MAP{$glyph}</font><br />"
                  . "<span class='glyph'>$unicode</span><br />"
                  . $data
                  . "</td>";
      }
      else {
        print {$fh} '<td>&nbsp;</td>';
      }
    }
    print {$fh} "</tr>\n";
  }

  # footer
  print {$fh} "</table>\n\n</body>\n</html>\n";

  # buffered write errors only surface when the handle is closed
  close($fh) || die "Cannot write $file: $!\n";
}
639 #########################################################################
640 # load_DTD:
642 # load the mathml DTD so that we can comment outputs with entities
643 #########################################################################
sub load_DTD
{
  # Argument:
  #   $file - path of the MathML DTD
  # Fills %ENTITY: $ENTITY{'0xNNNN'} becomes "# name1, name2, ..." for
  # every <!ENTITY name '&#xNNNN;'> declaration found. For code points up
  # to 0xFF the character itself is recorded instead of the entity name.
  # Dies if the file cannot be opened.
  my($file) = @_;

  # three-argument open on a lexical handle: $file is only ever a path
  open(my $fh, '<', $file) || die "Cannot open $file\n";

  # read into a lexical so that the caller's $_ is left untouched
  while (my $line = <$fh>) {
    # a line may hold several declarations
    while ($line =~ /<!ENTITY\s+(\S+)\s+'&#x([^>]+);'>/g) {
      my($entity, $unicode) = ($1, "0x$2");
      $entity = chr(hex($unicode)) if hex($unicode) <= 0xFF;
      if (defined $ENTITY{$unicode}) {
        $ENTITY{$unicode} .= ", $entity";
      }
      else {
        $ENTITY{$unicode} = "# $entity";
      }
    }
  }
  close($fh);
}