Sentence::tokenize() now uses flex-based ::tokenize()
[vspell.git] / scripts / personal-name-extract-from-hcmuns.pl
blob671149daaf5029b5f29d4fb79ee0f5b13935872a
1 #!/usr/bin/perl
3 %entity = (
4 'à' => 'à',
5 'ả' => 'ä',
6 'ã' => 'ã',
7 'á' => 'á',
8 'ạ' => 'Õ',
9 'ă' => 'å',
10 'ằ' => '¢',
11 'ẳ' => 'Æ',
12 'ẵ' => 'Ç',
13 'ắ' => '¡',
14 'ặ' => '£',
15 'â' => 'â',
16 'ầ' => '¥',
17 'ẩ' => '¦',
18 'ẫ' => 'ç',
19 'ấ' => '¤',
20 'ậ' => '§',
21 'è' => 'è',
22 'ẻ' => 'ë',
23 'ẽ' => '¨',
24 'é' => 'é',
25 'ẹ' => '©',
26 'ê' => 'ê',
27 'ề' => '«',
28 'ể' => '¬',
29 'ễ' => '­',
30 'ế' => 'ª',
31 'ệ' => '®',
32 'ì' => 'ì',
33 'ỉ' => 'ï',
34 'ĩ' => 'î',
35 'í' => 'í',
36 'ị' => '¸',
37 'ò' => 'ò',
38 'ỏ' => 'ö',
39 'õ' => 'õ',
40 'ó' => 'ó',
41 'ọ' => '÷',
42 'ô' => 'ô',
43 'ồ' => '°',
44 'ổ' => '±',
45 'ỗ' => '²',
46 'ố' => '¯',
47 'ộ' => 'µ',
48 'ơ' => '½',
49 'ờ' => '¶',
50 'ở' => '·',
51 'ỡ' => 'Þ',
52 'ớ' => '¾',
53 'ợ' => 'þ',
54 'ù' => 'ù',
55 'ủ' => 'ü',
56 'ũ' => 'û',
57 'ú' => 'ú',
58 'ụ' => 'ø',
59 'ư' => 'ß',
60 'ừ' => '×',
61 'ử' => 'Ø',
62 'ữ' => 'æ',
63 'ứ' => 'Ñ',
64 'ự' => 'ñ',
65 'ỳ' => 'Ï',
66 'ỷ' => 'Ö',
67 'ỹ' => 'Û',
68 'ý' => 'ý',
69 'ỵ' => 'Ü',
70 'đ' => 'ð',
71 'À' => 'À',
72 'Ả' => 'Ä',
73 'Ã' => 'Ã',
74 'Á' => 'Á',
75 'Ạ' => '€',
76 'Ă' => 'Å',
77 'Ằ' => '‚',
78 'Ẳ' => '\x02',
79 'Ẵ' => '\x05',
80 'Ắ' => '�',
81 'Ặ' => 'ƒ',
82 'Â' => 'Â',
83 'Ầ' => '…',
84 'Ẩ' => '†',
85 'Ẫ' => '\x06',
86 'Ấ' => '$',
87 'Ậ' => '‡',
88 'È' => 'È',
89 'Ẻ' => 'Ë',
90 'Ẽ' => 'ˆ',
91 'É' => 'É',
92 'Ẹ' => '‰',
93 'Ê' => 'Ê',
94 'Ề' => '‹',
95 'Ể' => 'Œ',
96 'Ễ' => '�',
97 'Ế' => 'Š',
98 'Ệ' => 'Ž',
99 'Í' => 'Í',
100 'Ỉ' => '›',
101 'Ĩ' => 'Î',
102 'Ì' => 'Ì',
103 'Ị' => '˜',
104 'Ò' => 'Ò',
105 'Ỏ' => '™',
106 'Õ' => ' ',
107 'Ó' => 'Ó',
108 'Ọ' => 'š',
109 'Ô' => 'Ô',
110 'Ồ' => '�',
111 'Ổ' => '‘',
112 'Ỗ' => '’',
113 'Ố' => '�',
114 'Ộ' => '“',
115 'Ơ' => '´',
116 'Ờ' => '–',
117 'Ở' => '—',
118 'Ỡ' => '³',
119 'Ớ' => '•',
120 'Ợ' => '”',
121 'Ù' => 'Ù',
122 'Ủ' => 'œ',
123 'Ũ' => '�',
124 'Ú' => 'Ú',
125 'Ụ' => 'ž',
126 'Ư' => '¿',
127 'Ừ' => '»',
128 'Ử' => '¼',
129 'Ữ' => 'ÿ',
130 'Ứ' => 'º',
131 'Ự' => 'q',
132 'Ỳ' => 'Ÿ',
133 'Ỷ' => '\x14',
134 'Ỹ' => '\x19',
135 'Ý' => 'Ý',
136 'Ỵ' => '\x1e',
137 'Đ' => 'Ð',
# Get all content
#
# Scans the input (files named on the command line, or STDIN) for table rows
# whose first cell looks like ` <td align="center">NNN</td>`.  For each such
# row the next three input lines are consumed:
#   1. a line that is read and discarded (the original stored it in $url
#      and never used it),
#   2. the line holding the surname cell  (<td>...</td>),
#   3. the line holding the given-name cell (<td>...</td>).
# One line "NAME, SURNAME" is printed per row, after every key of %entity
# found in it has been replaced by the corresponding value.
use strict;
use warnings;

our %entity;    # replacement table, filled in earlier in this file

# Build ONE alternation over all table keys, longest key first.  A single
# pass guarantees that text produced by one replacement is never rewritten
# by another key, and makes the result independent of hash ordering.
# quotemeta keeps any regex metacharacter in a key literal.
my $entity_alternation = join '|',
    map  { quotemeta }
    sort { length($b) <=> length($a) || $a cmp $b }
    keys %entity;
my $entity_re = length $entity_alternation ? qr/($entity_alternation)/ : undef;

ROW: while (my $row = <>) {
    # The leading blank is part of the pattern, exactly as in the original.
    next ROW unless $row =~ m{ <td align="center">[0-9]*</td>};

    # Read the three lines that follow the index cell.  Stop at EOF: calling
    # <> again after it returned undef would silently start reading STDIN.
    my @cells;
    for (1 .. 3) {
        my $cell = <>;
        last ROW unless defined $cell;
        push @cells, $cell;
    }
    my (undef, $surname, $name) = @cells;

    for my $cell ($surname, $name) {
        chomp $cell;
        # Keep only the cell content; a line without <td>...</td> is left
        # untouched, as before.  ($1, not \1, on the replacement side.)
        $cell =~ s{.*<td>(.*)</td>.*}{$1};
        $cell =~ s{\s+\z}{};
    }

    my $line = "$name, $surname\n";
    $line =~ s/$entity_re/$entity{$1}/g if defined $entity_re;
    print $line;
}
160 # Local Variables:
161 # coding: viscii
162 # End: