bugfixes in swfc
[swftools.git] / lib / gocr / unicode.c
blobd8ed703676e5e7c86bcf4cc25f8d3c9d73152b61
1 /*
2 This is an Optical-Character-Recognition program
3 Copyright (C) 2000-2007 Joerg Schulenburg
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 see README for EMAIL-address
22 #include "unicode.h"
23 #include <stdio.h>
/* FIXME jb global */
/* When nonzero, compose() prints a message on stderr for every
   composition that is not defined. */
int warn = 0;
28 /* Arguments: the character (main), and the modifier (accent, etc). See the
29 function if you want to know the modifiers.
30 Description: This function intends to be a small helper, to avoid having
31 to write switches in functions. It's therefore mainly to accents, and
32 specially for the most usual ones. It supports the basic greek
33 characters too, which is actually not very helpful.
34 Returns: the unicode character corresponding to the composed character.
36 ToDo:
37 - It seems to me, that tables should be more effectiv.
38 So we should use tables in future? (js)
40 wchar_t compose(wchar_t main, wchar_t modifier) {
41 /* supported by now: part of ISO8859-1, basic greek characters */
42 if( main == UNKNOWN || main == PICTURE ) return main;
43 #ifdef DEBUG
44 if(modifier!=UNICODE_NULL && modifier!=SPACE)
45 printf(" compose(%c,%d)",(char)main,(int)modifier);
46 #endif
47 if(main>127 && modifier!=0 && modifier!=SPACE && warn)
48 fprintf(stderr,"# Warning compose %04x + %04x>127\n",
49 (int)modifier,(int)main);
50 switch (modifier) {
51 case UNICODE_NULL:
52 case SPACE:
53 return (wchar_t)main;
55 case APOSTROPHE: /* do NOT USE this. It's here for compatibility only.
56 Use ACUTE_ACCENT instead. */
57 fprintf( stderr, "COMPOSE: got APOSTROPHE instead of ACUTE_ACCENT");
59 case ACUTE_ACCENT: /* acute/cedilla */
60 switch (main) {
61 case 'a': return LATIN_SMALL_LETTER_A_WITH_ACUTE;
62 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_ACUTE;
63 case LATIN_SMALL_LETTER_AE: return LATIN_SMALL_LETTER_AE_WITH_ACUTE;
64 case LATIN_CAPITAL_LETTER_AE: return LATIN_CAPITAL_LETTER_AE_WITH_ACUTE;
65 case 'c': return LATIN_SMALL_LETTER_C_WITH_ACUTE;
66 case 'C': return LATIN_CAPITAL_LETTER_C_WITH_ACUTE;
67 case 'e': return LATIN_SMALL_LETTER_E_WITH_ACUTE;
68 case 'E': return LATIN_CAPITAL_LETTER_E_WITH_ACUTE;
69 case 'g': return LATIN_SMALL_LETTER_G_WITH_ACUTE;
70 case 'G': return LATIN_CAPITAL_LETTER_G_WITH_ACUTE;
71 case 'i': return LATIN_SMALL_LETTER_I_WITH_ACUTE;
72 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_ACUTE;
73 case 'l': return LATIN_SMALL_LETTER_L_WITH_ACUTE;
74 case 'L': return LATIN_CAPITAL_LETTER_L_WITH_ACUTE;
75 case 'n': return LATIN_SMALL_LETTER_N_WITH_ACUTE;
76 case 'N': return LATIN_CAPITAL_LETTER_N_WITH_ACUTE;
77 case 'o': return LATIN_SMALL_LETTER_O_WITH_ACUTE;
78 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_ACUTE;
79 case '0': return LATIN_CAPITAL_LETTER_O_WITH_ACUTE;
80 case 'r': return LATIN_SMALL_LETTER_R_WITH_ACUTE;
81 case 'R': return LATIN_CAPITAL_LETTER_R_WITH_ACUTE;
82 case 's': return LATIN_SMALL_LETTER_S_WITH_ACUTE;
83 case 'S': return LATIN_CAPITAL_LETTER_S_WITH_ACUTE;
84 case 'u': return LATIN_SMALL_LETTER_U_WITH_ACUTE;
85 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_ACUTE;
86 case 'y': return LATIN_SMALL_LETTER_Y_WITH_ACUTE;
87 case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_ACUTE;
88 case 'z': return LATIN_SMALL_LETTER_Z_WITH_ACUTE;
89 case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_ACUTE;
90 default:
91 if(warn)fprintf( stderr, " COMPOSE: ACUTE_ACCENT+%04x not defined\n",(int)main);
93 break;
95 case BREVE: /* caron (latin2) "u"-above-... (small bow) */
96 switch (main) {
97 /* FIXME write separate heuristics for breve */
98 case 'a': return LATIN_SMALL_LETTER_A_WITH_BREVE;
99 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_BREVE;
100 case 'e': return LATIN_SMALL_LETTER_E_WITH_BREVE;
101 case 'E': return LATIN_CAPITAL_LETTER_E_WITH_BREVE;
102 case 'g': return LATIN_SMALL_LETTER_G_WITH_BREVE;
103 case 'G': return LATIN_CAPITAL_LETTER_G_WITH_BREVE;
104 case 'i': return LATIN_SMALL_LETTER_I_WITH_BREVE;
105 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_BREVE;
106 case 'o': return LATIN_SMALL_LETTER_O_WITH_BREVE;
107 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_BREVE;
108 case 'u': return LATIN_SMALL_LETTER_U_WITH_BREVE;
109 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_BREVE;
110 default:
111 if(warn)fprintf( stderr, " COMPOSE: BREVE+%04x not defined\n",(int)main);
113 break;
115 case CARON: /* caron (latin2) "v"-above-... */
116 switch (main) {
117 case 'a': return LATIN_SMALL_LETTER_A_WITH_CARON;
118 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_CARON;
119 case 'c': return LATIN_SMALL_LETTER_C_WITH_CARON;
120 case 'C': return LATIN_CAPITAL_LETTER_C_WITH_CARON;
121 case 'e': return LATIN_SMALL_LETTER_E_WITH_CARON;
122 case 'E': return LATIN_CAPITAL_LETTER_E_WITH_CARON;
123 case 'i': return LATIN_SMALL_LETTER_I_WITH_CARON;
124 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_CARON;
125 case 'o': return LATIN_SMALL_LETTER_O_WITH_CARON;
126 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_CARON;
127 case '0': return LATIN_CAPITAL_LETTER_O_WITH_CARON;
128 case 's': return LATIN_SMALL_LETTER_S_WITH_CARON;
129 case 'S': return LATIN_CAPITAL_LETTER_S_WITH_CARON;
130 case 'u': return LATIN_SMALL_LETTER_U_WITH_CARON;
131 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_CARON;
132 case 'z': return LATIN_SMALL_LETTER_Z_WITH_CARON;
133 case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_CARON;
134 default:
135 if(warn)fprintf( stderr, " COMPOSE: CARON+%04x not defined\n",(int)main);
137 break;
139 case CEDILLA:
140 switch (main) {
141 case 'c': return LATIN_SMALL_LETTER_C_WITH_CEDILLA;
142 case 'C': return LATIN_CAPITAL_LETTER_C_WITH_CEDILLA;
143 default:
144 if(warn)fprintf( stderr, " COMPOSE: CEDILLA+%04x not defined\n",(int)main);
146 break;
148 case TILDE:
149 switch (main) {
150 case 'a': return LATIN_SMALL_LETTER_A_WITH_TILDE;
151 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_TILDE;
152 case 'i': return LATIN_SMALL_LETTER_I_WITH_TILDE;
153 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_TILDE;
154 case 'n': return LATIN_SMALL_LETTER_N_WITH_TILDE;
155 case 'N': return LATIN_CAPITAL_LETTER_N_WITH_TILDE;
156 case 'o': return LATIN_SMALL_LETTER_O_WITH_TILDE;
157 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_TILDE;
158 case '0': return LATIN_CAPITAL_LETTER_O_WITH_TILDE;
159 case 'u': return LATIN_SMALL_LETTER_U_WITH_TILDE;
160 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_TILDE;
161 default:
162 if(warn)fprintf( stderr, " COMPOSE: TILDE+%04x not defined\n",(int)main);
164 break;
166 case GRAVE_ACCENT:
167 switch (main) {
168 case 'a': return LATIN_SMALL_LETTER_A_WITH_GRAVE;
169 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_GRAVE;
170 case 'e': return LATIN_SMALL_LETTER_E_WITH_GRAVE;
171 case 'E': return LATIN_CAPITAL_LETTER_E_WITH_GRAVE;
172 case 'i': return LATIN_SMALL_LETTER_I_WITH_GRAVE;
173 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_GRAVE;
174 case 'n': return LATIN_SMALL_LETTER_N_WITH_GRAVE;
175 case 'N': return LATIN_CAPITAL_LETTER_N_WITH_GRAVE;
176 case 'o': return LATIN_SMALL_LETTER_O_WITH_GRAVE;
177 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_GRAVE;
178 case '0': return LATIN_CAPITAL_LETTER_O_WITH_GRAVE;
179 case 'u': return LATIN_SMALL_LETTER_U_WITH_GRAVE;
180 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_GRAVE;
181 default:
182 if(warn)fprintf( stderr, " COMPOSE: GRAVE_ACCENT+%04x not defined\n",(int)main);
184 break;
186 case QUOTATION_MARK: /* do NOT USE this. It's here for compatibility only.
187 Use DIAERESIS instead. */
188 fprintf( stderr, "COMPOSE: got APOSTROPHE instead of ACUTE_ACCENT");
190 case DIAERESIS:
191 switch (main) {
192 case 'a': return LATIN_SMALL_LETTER_A_WITH_DIAERESIS;
193 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS;
194 case 'e': return LATIN_SMALL_LETTER_E_WITH_DIAERESIS;
195 case 'E': return LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS;
196 case 'i': return LATIN_SMALL_LETTER_I_WITH_DIAERESIS;
197 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS;
198 case 'o': return LATIN_SMALL_LETTER_O_WITH_DIAERESIS;
199 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS;
200 case '0': return LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS;
201 case 'u': return LATIN_SMALL_LETTER_U_WITH_DIAERESIS;
202 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS;
203 case 'y': return LATIN_SMALL_LETTER_Y_WITH_DIAERESIS;
204 case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
205 default:
206 if(warn)fprintf( stderr, " COMPOSE: DIAERESIS+%04x (%c) not defined\n",(int)main,(char)main);
208 break;
210 case CIRCUMFLEX_ACCENT: /* ^ */
211 switch (main) {
212 case 'a': return LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX;
213 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX;
214 case 'c': return LATIN_SMALL_LETTER_C_WITH_CIRCUMFLEX;
215 case 'C': return LATIN_CAPITAL_LETTER_C_WITH_CIRCUMFLEX;
216 case 'e': return LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX;
217 case 'E': return LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX;
218 case 'g': return LATIN_SMALL_LETTER_G_WITH_CIRCUMFLEX;
219 case 'G': return LATIN_CAPITAL_LETTER_G_WITH_CIRCUMFLEX;
220 case 'h': return LATIN_SMALL_LETTER_H_WITH_CIRCUMFLEX;
221 case 'H': return LATIN_CAPITAL_LETTER_H_WITH_CIRCUMFLEX;
222 case 'i': return LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX;
223 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX;
224 case 'j': return LATIN_SMALL_LETTER_J_WITH_CIRCUMFLEX;
225 case 'J': return LATIN_CAPITAL_LETTER_J_WITH_CIRCUMFLEX;
226 case 'o': return LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX;
227 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX;
228 case '0': return LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX;
229 case 's': return LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX;
230 case 'S': return LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX;
231 case 'u': return LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX;
232 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX;
233 case 'w': return LATIN_SMALL_LETTER_W_WITH_CIRCUMFLEX;
234 case 'W': return LATIN_CAPITAL_LETTER_W_WITH_CIRCUMFLEX;
235 case 'y': return LATIN_SMALL_LETTER_Y_WITH_CIRCUMFLEX;
236 case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_CIRCUMFLEX;
237 default:
238 if(warn)fprintf( stderr, " COMPOSE: CIRCUMFLEX_ACCENT+%04x not defined\n",(int)main);
240 break;
242 case MACRON: /* a minus sign above the char (latin2) */
243 switch (main) {
244 case 'a': return LATIN_SMALL_LETTER_A_WITH_MACRON;
245 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_MACRON;
246 case 'e': return LATIN_SMALL_LETTER_E_WITH_MACRON;
247 case 'E': return LATIN_CAPITAL_LETTER_E_WITH_MACRON;
248 case 'i': return LATIN_SMALL_LETTER_I_WITH_MACRON;
249 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_MACRON;
250 case 'o': return LATIN_SMALL_LETTER_O_WITH_MACRON;
251 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_MACRON;
252 case 'u': return LATIN_SMALL_LETTER_U_WITH_MACRON;
253 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_MACRON;
254 case 'y': return LATIN_SMALL_LETTER_Y_WITH_MACRON;
255 case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_MACRON;
256 case LATIN_SMALL_LETTER_AE: return LATIN_SMALL_LETTER_AE_WITH_MACRON;
257 case LATIN_CAPITAL_LETTER_AE: return LATIN_CAPITAL_LETTER_AE_WITH_MACRON;
258 case '=': return IDENTICAL_TO;
259 case '-': return '=';
260 case ' ': return MODIFIER_LETTER_MACRON;
261 default:
262 if(warn)fprintf( stderr, " COMPOSE: MACRON+%04x not defined\n",(int)main);
264 break;
266 case DOT_ABOVE: /* latin2 */
267 switch (main) {
268 case 'a': return LATIN_SMALL_LETTER_A_WITH_DOT_ABOVE;
269 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_DOT_ABOVE;
270 case 'c': return LATIN_SMALL_LETTER_C_WITH_DOT_ABOVE;
271 case 'C': return LATIN_CAPITAL_LETTER_C_WITH_DOT_ABOVE;
272 case 'e': return LATIN_SMALL_LETTER_E_WITH_DOT_ABOVE;
273 case 'E': return LATIN_CAPITAL_LETTER_E_WITH_DOT_ABOVE;
274 case 'g': return LATIN_SMALL_LETTER_G_WITH_DOT_ABOVE;
275 case 'G': return LATIN_CAPITAL_LETTER_G_WITH_DOT_ABOVE;
276 case 'l': return 'i'; /* correct wrong recognition */
277 case 'i': return 'i';
278 case LATIN_SMALL_LETTER_DOTLESS_I: return 'i';
279 case 'I': return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
280 case 'j': return 'j';
281 case 'o': return LATIN_SMALL_LETTER_O_WITH_DOT_ABOVE;
282 case 'O': return LATIN_CAPITAL_LETTER_O_WITH_DOT_ABOVE;
283 case 'z': return LATIN_SMALL_LETTER_Z_WITH_DOT_ABOVE;
284 case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_DOT_ABOVE;
285 case ',': return ';';
286 case '.': return ':';
287 default:
288 if(warn)fprintf( stderr, " COMPOSE: DOT_ABOVE+%04x not defined\n",(int)main);
290 break;
292 case RING_ABOVE:
293 switch (main) {
294 case 'a': return LATIN_SMALL_LETTER_A_WITH_RING_ABOVE;
295 case 'A': return LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE;
296 case 'u': return LATIN_SMALL_LETTER_U_WITH_RING_ABOVE;
297 case 'U': return LATIN_CAPITAL_LETTER_U_WITH_RING_ABOVE;
298 default:
299 if(warn)fprintf( stderr, " COMPOSE: RING_ABOVE+%04x not defined\n",(int)main);
301 break;
303 case 'e': /* e ligatures: ae, oe. */
304 case 'E':
305 switch (main) {
306 case 'a': return LATIN_SMALL_LETTER_AE;
307 case 'A': return LATIN_CAPITAL_LETTER_AE;
308 case 'o': return LATIN_SMALL_LIGATURE_OE;
309 case 'O': return LATIN_CAPITAL_LIGATURE_OE;
310 case '0': return LATIN_CAPITAL_LIGATURE_OE;
311 default:
312 if(warn)fprintf( stderr, " COMPOSE: %04x+e/E not defined\n",(int)main);
314 break;
316 case 'g': /* greek */
317 switch (main) {
318 /* missing 0x37A-0x390 */
319 /* weird cases: Q -> theta (it resembles a little, doesn't it?)
320 V -> psi (what can I do?) */
321 case 'A': return GREEK_CAPITAL_LETTER_ALPHA;
322 case 'B': return GREEK_CAPITAL_LETTER_BETA;
323 case 'G': return GREEK_CAPITAL_LETTER_GAMMA;
324 case 'D': return GREEK_CAPITAL_LETTER_DELTA;
325 case 'E': return GREEK_CAPITAL_LETTER_EPSILON;
326 case 'Z': return GREEK_CAPITAL_LETTER_ZETA;
327 case 'H': return GREEK_CAPITAL_LETTER_ETA;
328 case 'Q': return GREEK_CAPITAL_LETTER_THETA;
329 case 'I': return GREEK_CAPITAL_LETTER_IOTA;
330 case 'K': return GREEK_CAPITAL_LETTER_KAPPA;
331 case 'L': return GREEK_CAPITAL_LETTER_LAMDA;
332 case 'M': return GREEK_CAPITAL_LETTER_MU;
333 case 'N': return GREEK_CAPITAL_LETTER_NU;
334 case 'X': return GREEK_CAPITAL_LETTER_XI;
335 case 'O': return GREEK_CAPITAL_LETTER_OMICRON;
336 case 'P': return GREEK_CAPITAL_LETTER_PI;
337 case 'R': return GREEK_CAPITAL_LETTER_RHO;
338 case 'S': return GREEK_CAPITAL_LETTER_SIGMA;
339 case 'T': return GREEK_CAPITAL_LETTER_TAU;
340 case 'Y': return GREEK_CAPITAL_LETTER_UPSILON;
341 case 'F': return GREEK_CAPITAL_LETTER_PHI;
342 case 'C': return GREEK_CAPITAL_LETTER_CHI;
343 case 'V': return GREEK_CAPITAL_LETTER_PSI;
344 case 'W': return GREEK_CAPITAL_LETTER_OMEGA;
346 case '': return GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA;
347 case '': return GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA;
348 case '': return GREEK_SMALL_LETTER_ALPHA_WITH_TONOS;
349 case '': return GREEK_SMALL_LETTER_EPSILON_WITH_TONOS;
350 case '': return GREEK_SMALL_LETTER_ETA_WITH_TONOS;
351 case '': return GREEK_SMALL_LETTER_IOTA_WITH_TONOS;
352 case '': return GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
354 case 'a': return GREEK_SMALL_LETTER_ALPHA;
355 case 'b': return GREEK_SMALL_LETTER_BETA;
356 case 'g': return GREEK_SMALL_LETTER_GAMMA;
357 case 'd': return GREEK_SMALL_LETTER_DELTA;
358 case 'e': return GREEK_SMALL_LETTER_EPSILON;
359 case 'z': return GREEK_SMALL_LETTER_ZETA;
360 case 'h': return GREEK_SMALL_LETTER_ETA;
361 case 'q': return GREEK_SMALL_LETTER_THETA;
362 case 'i': return GREEK_SMALL_LETTER_IOTA;
363 case 'k': return GREEK_SMALL_LETTER_KAPPA;
364 case 'l': return GREEK_SMALL_LETTER_LAMDA;
365 case 'm': return GREEK_SMALL_LETTER_MU;
366 case 'n': return GREEK_SMALL_LETTER_NU;
367 case 'x': return GREEK_SMALL_LETTER_XI;
368 case 'o': return GREEK_SMALL_LETTER_OMICRON;
369 case 'p': return GREEK_SMALL_LETTER_PI;
370 case 'r': return GREEK_SMALL_LETTER_RHO;
371 case '&': return GREEK_SMALL_LETTER_FINAL_SIGMA;
372 case 's': return GREEK_SMALL_LETTER_SIGMA;
373 case 't': return GREEK_SMALL_LETTER_TAU;
374 case 'y': return GREEK_SMALL_LETTER_UPSILON;
375 case 'f': return GREEK_SMALL_LETTER_PHI;
376 case 'c': return GREEK_SMALL_LETTER_CHI;
377 case 'v': return GREEK_SMALL_LETTER_PSI;
378 case 'w': return GREEK_SMALL_LETTER_OMEGA;
380 case '': return GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA;
381 case '': return GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA;
382 case '': return GREEK_SMALL_LETTER_OMICRON_WITH_TONOS;
383 case '': return GREEK_SMALL_LETTER_UPSILON_WITH_TONOS;
384 case '': return GREEK_SMALL_LETTER_OMEGA_WITH_TONOS;
385 case '': return GREEK_BETA_SYMBOL;
386 case '': return GREEK_THETA_SYMBOL;
387 case '': return GREEK_UPSILON_WITH_HOOK_SYMBOL;
388 case '': return GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL;
389 case '': return GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL;
390 case '': return GREEK_PHI_SYMBOL;
391 case '': return GREEK_PI_SYMBOL;
393 default:
394 if(warn)fprintf( stderr, " COMPOSE: GREEK %04x not defined\n",(int)main);
396 break;
398 default:
399 fprintf( stderr, " COMPOSE: modifier %04x not defined\n",(int)modifier);
401 return (wchar_t)main;
/* Placeholder text that decode() returns for characters it lists but has
   no rendering for. */
#define UNDEFINED "~"
406 /* Arguments: character in Unicode format, type of format to convert to.
407 Returns: a string containing the Unicode character converted to the chosen
408 format. This string is statically allocated and should not be freed.
409 ToDo: better using tables?
411 const char *decode(wchar_t c, FORMAT type) {
412 /* static char d; --- js: big bug (missing \0) if &d returned */
413 /*FIXME jb static*/ static char bbuf[8*32]; /* space for 8 buffers, rotating */
414 /*FIXME jb static*/ static char *buf=bbuf; /* used for UTF8 sequences and undefined codes */
415 buf+=32; if(buf>=bbuf+8*32) buf=bbuf;
416 buf[0]=buf[1]=buf[2]=0;
417 switch (type) {
418 case ISO8859_1:
419 if ( c <= 0xFF ) { /* UNICODE == ISO8859-1 */
420 buf[0] = (char)c;
421 return buf;
423 switch (c) { /* not found in list, but perhaps we can describe it */
424 /* todo: add greek. GREEK_SMALL_LETTER_ALPHA = alpha */
426 /* general puctuation */
427 case HYPHEN:
428 return (const char *)"-";
429 case FIGURE_DASH:
430 case EN_DASH:
431 return (const char *)"--";
432 case EM_DASH:
433 return (const char *)"---";
434 case LEFT_SINGLE_QUOTATION_MARK:
435 return (const char *)"`";
436 case RIGHT_SINGLE_QUOTATION_MARK:
437 return (const char *)"'";
438 case SINGLE_LOW_9_QUOTATION_MARK:
439 return (const char *)",";
440 case SINGLE_HIGH_REVERSED_9_QUOTATION_MARK:
441 return (const char *)UNDEFINED;
442 case LEFT_DOUBLE_QUOTATION_MARK:
443 return (const char *)"``";
444 case RIGHT_DOUBLE_QUOTATION_MARK:
445 return (const char *)"''";
446 case DOUBLE_LOW_9_QUOTATION_MARK:
447 return (const char *)",,";
448 case DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK:
449 return (const char *)UNDEFINED;
450 case DAGGER:
451 return (const char *)"+";
452 case DOUBLE_DAGGER:
453 return (const char *)"*";
454 case BULLET:
455 return (const char *)"*";
456 case TRIANGULAR_BULLET:
457 return (const char *)"*";
458 case HYPHENATION_POINT:
459 return (const char *)"-";
460 case HORIZONTAL_ELLIPSIS:
461 return (const char *)"...";
462 case PER_MILLE_SIGN:
463 return (const char *)"%%"; /* awk! */
464 case SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK:
465 return (const char *)"<";
466 case SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK:
467 return (const char *)">";
468 case EURO_CURRENCY_SIGN:
469 return (const char *)"EUR"; /* change it! */
471 /* ligatures */
472 case LATIN_SMALL_LIGATURE_FF:
473 return (const char *)"ff";
474 case LATIN_SMALL_LIGATURE_FI:
475 return (const char *)"fi";
476 case LATIN_SMALL_LIGATURE_FL:
477 return (const char *)"fl";
478 case LATIN_SMALL_LIGATURE_FFI:
479 return (const char *)"ffi";
480 case LATIN_SMALL_LIGATURE_FFL:
481 return (const char *)"ffl";
482 case LATIN_SMALL_LIGATURE_LONG_S_T:
483 case LATIN_SMALL_LIGATURE_ST:
484 return (const char *)"st";
486 /* extra */
487 case UNKNOWN:
488 return (const char *)"_";
489 case PICTURE:
490 return (const char *)"_"; /* Due to Mobile OCR */
492 default:
493 /* snprintf seems to be no standard, so I use insecure sprintf */
494 sprintf(buf,"\\code(%04x)",(unsigned)c);
495 return buf; /* UNDEFINED; */
497 break;
498 case TeX:
499 if ( c >= SPACE && c <= TILDE ) { /* ASCII */
500 switch (c) {
501 case '$':
502 return (const char *)"\\$";
503 case '&':
504 return (const char *)"\\&";
505 case '%':
506 return (const char *)"\\%";
507 case '#':
508 return (const char *)"\\#";
509 case '_':
510 return (const char *)"\\_";
511 case '{':
512 return (const char *)"\\{";
513 case '}':
514 return (const char *)"\\}";
515 case '\\':
516 return (const char *)"$\\backslash$";
517 case '~':
518 return (const char *)"\\~{}";
519 case '^':
520 return (const char *)"\\^{}";
521 default:
522 buf[0] = (char)c;
523 return (const char *)buf;
526 switch (c) {
527 /* ISO8859_1 */
528 case NO_BREAK_SPACE:
529 return (const char *)"~";
530 case INVERTED_EXCLAMATION_MARK:
531 return (const char *)"!'";
532 case CENT_SIGN:
533 return (const char *)"\\textcent"; /* \usepackage{textcomp} */
534 case POUND_SIGN:
535 return (const char *)"\\pounds";
536 case EURO_CURRENCY_SIGN:
537 return (const char *)"\\euro"; /* \usepackage{eurosans} */
538 case CURRENCY_SIGN:
539 return (const char *)"\\textcurrency"; /* \usepackage{textcomp} */
540 case YEN_SIGN:
541 return (const char *)"\\textyen"; /* \usepackage{textcomp} */
542 case BROKEN_BAR:
543 return (const char *)"\\textbrokenbar"; /* \usepackage{textcomp} */
544 case SECTION_SIGN:
545 return (const char *)"\\S";
546 case DIAERESIS:
547 return (const char *)"\"";
548 case COPYRIGHT_SIGN:
549 return (const char *)"\\copyright";
550 case FEMININE_ORDINAL_INDICATOR:
551 return (const char *)"$^{\\underbar{a}}$";
552 case LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK:
553 return (const char *)"\\flqq{}";
554 case NOT_SIGN:
555 return (const char *)"$\\lnot$";
556 case SOFT_HYPHEN:
557 return (const char *)"\\-";
558 case REGISTERED_SIGN:
559 return (const char *)"\\textregistered";/* \usepackage{textcomp} */
560 case MACRON:
561 return (const char *)"\\textasciimacron";/* \usepackage{textcomp} */
562 case DEGREE_SIGN:
563 return (const char *)"$^{o}$";
564 case PLUS_MINUS_SIGN:
565 return (const char *)"$\\pm$";
566 case SUPERSCRIPT_TWO:
567 return (const char *)"$^{2}$";
568 case SUPERSCRIPT_THREE:
569 return (const char *)"$^{3}$";
570 case ACUTE_ACCENT:
571 return (const char *)"\\( \\prime \\)";
572 case MICRO_SIGN:
573 return (const char *)"$\\mu$";
574 case PILCROW_SIGN:
575 return (const char *)"\\P";
576 case MIDDLE_DOT:
577 return (const char *)"$\\cdot$";
578 case CEDILLA:
579 return (const char *)"\\,";
580 case SUPERSCRIPT_ONE:
581 return (const char *)"$^{1}$";
582 case MASCULINE_ORDINAL_INDICATOR:
583 return (const char *)"$^{\\underbar{o}}$";
584 case RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK:
585 return (const char *)"\\frqq{}";
586 case VULGAR_FRACTION_ONE_QUARTER: /* these fractions are not good*/
587 return (const char *)"\\( 1\\over 4 \\)";
588 case VULGAR_FRACTION_ONE_HALF:
589 return (const char *)"\\( 1\\over 2 \\)";
590 case VULGAR_FRACTION_THREE_QUARTERS:
591 return (const char *)"\\( 3\\over 4 \\)";
592 case INVERTED_QUESTION_MARK:
593 return (const char *)"?'";
594 case LATIN_CAPITAL_LETTER_A_WITH_GRAVE:
595 return (const char *)"\\`A";
596 case LATIN_CAPITAL_LETTER_A_WITH_ACUTE:
597 return (const char *)"\\'A";
598 case LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX:
599 return (const char *)"\\^A";
600 case LATIN_CAPITAL_LETTER_A_WITH_TILDE:
601 return (const char *)"\\~A";
602 case LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS:
603 return (const char *)"\\\"A";
604 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
605 return (const char *)"\\AA";
606 case LATIN_CAPITAL_LETTER_AE:
607 return (const char *)"\\AE";
608 case LATIN_CAPITAL_LETTER_C_WITH_CARON:
609 return (const char *)"\\v{C}";
610 case LATIN_CAPITAL_LETTER_C_WITH_CEDILLA:
611 return (const char *)"\\C";
612 case LATIN_CAPITAL_LETTER_E_WITH_GRAVE:
613 return (const char *)"\\`E";
614 case LATIN_CAPITAL_LETTER_E_WITH_ACUTE:
615 return (const char *)"\\'E";
616 case LATIN_CAPITAL_LETTER_E_WITH_CARON:
617 return (const char *)"\\v{E}";
618 case LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX:
619 return (const char *)"\\^E";
620 case LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS:
621 return (const char *)"\\\"E";
622 case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
623 return (const char *)"\\`I";
624 case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
625 return (const char *)"\\'I";
626 case LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX:
627 return (const char *)"\\^I";
628 case LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS:
629 return (const char *)"\\\"I";
630 case LATIN_CAPITAL_LETTER_ETH:
631 return (const char *)UNDEFINED;
632 case LATIN_CAPITAL_LETTER_N_WITH_TILDE:
633 return (const char *)"\\~N";
634 case LATIN_CAPITAL_LETTER_O_WITH_GRAVE:
635 return (const char *)"\\`O";
636 case LATIN_CAPITAL_LETTER_O_WITH_ACUTE:
637 return (const char *)"\\'O";
638 case LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX:
639 return (const char *)"\\^O";
640 case LATIN_CAPITAL_LETTER_O_WITH_TILDE:
641 return (const char *)"\\~O";
642 case LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS:
643 return (const char *)"\\\"O";
644 case MULTIPLICATION_SIGN:
645 return (const char *)"$\\times$";
646 case LATIN_CAPITAL_LETTER_O_WITH_STROKE:
647 return (const char *)"\\O";
648 case LATIN_CAPITAL_LETTER_S_WITH_CARON:
649 return (const char *)"\\v{S}";
650 case LATIN_CAPITAL_LETTER_U_WITH_GRAVE:
651 return (const char *)"\\`U";
652 case LATIN_CAPITAL_LETTER_U_WITH_ACUTE:
653 return (const char *)"\\'U";
654 case LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX:
655 return (const char *)"\\^U";
656 case LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS:
657 return (const char *)"\\\"U";
658 case LATIN_CAPITAL_LETTER_Y_WITH_ACUTE:
659 return (const char *)"\\'Y";
660 case LATIN_CAPITAL_LETTER_Z_WITH_CARON:
661 return (const char *)"\\v{Z}";
662 case LATIN_CAPITAL_LETTER_THORN:
663 return (const char *)UNDEFINED;
664 case LATIN_SMALL_LETTER_SHARP_S:
665 return (const char *)"\\ss";
666 case LATIN_SMALL_LETTER_A_WITH_GRAVE:
667 return (const char *)"\\`a";
668 case LATIN_SMALL_LETTER_A_WITH_ACUTE:
669 return (const char *)"\\'a";
670 case LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX:
671 return (const char *)"\\^a";
672 case LATIN_SMALL_LETTER_A_WITH_TILDE:
673 return (const char *)"\\~a";
674 case LATIN_SMALL_LETTER_A_WITH_DIAERESIS:
675 return (const char *)"\\\"a";
676 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
677 return (const char *)"\\aa";
678 case LATIN_SMALL_LETTER_AE:
679 return (const char *)"\\ae";
680 case LATIN_SMALL_LETTER_C_WITH_CARON:
681 return (const char *)"\\v{c}";
682 case LATIN_SMALL_LETTER_C_WITH_CEDILLA:
683 return (const char *)"\\c";
684 case LATIN_SMALL_LETTER_E_WITH_GRAVE:
685 return (const char *)"\\`e";
686 case LATIN_SMALL_LETTER_E_WITH_ACUTE:
687 return (const char *)"\\'e";
688 case LATIN_SMALL_LETTER_E_WITH_CARON:
689 return (const char *)"\\v{e}";
690 case LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX:
691 return (const char *)"\\^e";
692 case LATIN_SMALL_LETTER_E_WITH_DIAERESIS:
693 return (const char *)"\\\"e";
694 case LATIN_SMALL_LETTER_I_WITH_GRAVE:
695 return (const char *)"\\`i";
696 case LATIN_SMALL_LETTER_I_WITH_ACUTE:
697 return (const char *)"\\'i";
698 case LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX:
699 return (const char *)"\\^i";
700 case LATIN_SMALL_LETTER_I_WITH_DIAERESIS:
701 return (const char *)"\\\"i";
702 case LATIN_SMALL_LETTER_ETH:
703 return (const char *)UNDEFINED;
704 case LATIN_SMALL_LETTER_N_WITH_TILDE:
705 return (const char *)"\\~n";
706 case LATIN_SMALL_LETTER_O_WITH_GRAVE:
707 return (const char *)"\\`o";
708 case LATIN_SMALL_LETTER_O_WITH_ACUTE:
709 return (const char *)"\\'o";
710 case LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX:
711 return (const char *)"\\^o";
712 case LATIN_SMALL_LETTER_O_WITH_TILDE:
713 return (const char *)"\\~o";
714 case LATIN_SMALL_LETTER_O_WITH_DIAERESIS:
715 return (const char *)"\\\"o";
716 case DIVISION_SIGN:
717 return (const char *)"$\\div$";
718 case LATIN_SMALL_LETTER_O_WITH_STROKE:
719 return (const char *)"\\o";
720 case LATIN_SMALL_LETTER_S_WITH_CARON:
721 return (const char *)"\\v{s}";
722 case LATIN_SMALL_LETTER_U_WITH_GRAVE:
723 return (const char *)"\\`u";
724 case LATIN_SMALL_LETTER_U_WITH_ACUTE:
725 return (const char *)"\\'u";
726 case LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX:
727 return (const char *)"\\^u";
728 case LATIN_SMALL_LETTER_U_WITH_DIAERESIS:
729 return (const char *)"\\\"u";
730 case LATIN_SMALL_LETTER_Y_WITH_ACUTE:
731 return (const char *)"\\'y";
732 case LATIN_SMALL_LETTER_THORN:
733 return (const char *)UNDEFINED;
734 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
735 return (const char *)"\\\"y";
736 case LATIN_SMALL_LETTER_Z_WITH_CARON:
737 return (const char *)"\\v{z}";
739 /* greek */
740 /* some (punctuation, accents, accented capital) greek letters missing*/
741 case GREEK_CAPITAL_LETTER_ALPHA:
742 return (const char *)"A";
743 case GREEK_CAPITAL_LETTER_BETA:
744 return (const char *)"B";
745 case GREEK_CAPITAL_LETTER_GAMMA:
746 return (const char *)"\\( \\Gamma \\)";
747 case GREEK_CAPITAL_LETTER_DELTA:
748 return (const char *)"\\( \\Delta \\)";
749 case GREEK_CAPITAL_LETTER_EPSILON:
750 return (const char *)"E";
751 case GREEK_CAPITAL_LETTER_ZETA:
752 return (const char *)"Z";
753 case GREEK_CAPITAL_LETTER_ETA:
754 return (const char *)"H";
755 case GREEK_CAPITAL_LETTER_THETA:
756 return (const char *)"\\( \\Theta \\)";
757 case GREEK_CAPITAL_LETTER_IOTA:
758 return (const char *)"I";
759 case GREEK_CAPITAL_LETTER_KAPPA:
760 return (const char *)"K";
761 case GREEK_CAPITAL_LETTER_LAMDA:
762 return (const char *)"\\( \\Lambda \\)";
763 case GREEK_CAPITAL_LETTER_MU:
764 return (const char *)"M";
765 case GREEK_CAPITAL_LETTER_NU:
766 return (const char *)"N";
767 case GREEK_CAPITAL_LETTER_XI:
768 return (const char *)"\\( \\Xi \\)";
769 case GREEK_CAPITAL_LETTER_OMICRON:
770 return (const char *)"O";
771 case GREEK_CAPITAL_LETTER_PI:
772 return (const char *)"\\( \\Pi \\)";
773 case GREEK_CAPITAL_LETTER_RHO:
774 return (const char *)"P";
775 case GREEK_CAPITAL_LETTER_SIGMA:
776 return (const char *)"\\( \\Sigma \\)";
777 case GREEK_CAPITAL_LETTER_TAU:
778 return (const char *)"T";
779 case GREEK_CAPITAL_LETTER_UPSILON:
780 return (const char *)"\\( \\Upsilon \\)";
781 case GREEK_CAPITAL_LETTER_PHI:
782 return (const char *)"\\( \\Phi \\)";
783 case GREEK_CAPITAL_LETTER_CHI:
784 return (const char *)"\\( \\Chi \\)";
785 case GREEK_CAPITAL_LETTER_PSI:
786 return (const char *)"\\( \\Psi \\)";
787 case GREEK_CAPITAL_LETTER_OMEGA:
788 return (const char *)"\\( \\Omega \\)";
789 case GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA:
790 return (const char *)UNDEFINED;
791 case GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA:
792 return (const char *)UNDEFINED;
793 case GREEK_SMALL_LETTER_ALPHA_WITH_TONOS:
794 return (const char *)UNDEFINED;
795 case GREEK_SMALL_LETTER_EPSILON_WITH_TONOS:
796 return (const char *)UNDEFINED;
797 case GREEK_SMALL_LETTER_ETA_WITH_TONOS:
798 return (const char *)UNDEFINED;
799 case GREEK_SMALL_LETTER_IOTA_WITH_TONOS:
800 return (const char *)UNDEFINED;
801 case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
802 return (const char *)UNDEFINED;
803 case GREEK_SMALL_LETTER_ALPHA:
804 return (const char *)"\\( \\alpha \\)";
805 case GREEK_SMALL_LETTER_BETA:
806 return (const char *)"\\( \\beta \\)";
807 case GREEK_SMALL_LETTER_GAMMA:
808 return (const char *)"\\( \\gamma \\)";
809 case GREEK_SMALL_LETTER_DELTA:
810 return (const char *)"\\( \\delta \\)";
811 case GREEK_SMALL_LETTER_EPSILON:
812 return (const char *)"\\( \\epsilon \\)";
813 case GREEK_SMALL_LETTER_ZETA:
814 return (const char *)"\\( \\zeta \\)";
815 case GREEK_SMALL_LETTER_ETA:
816 return (const char *)"\\( \\eta \\)";
817 case GREEK_SMALL_LETTER_THETA:
818 return (const char *)"\\( \\theta \\)";
819 case GREEK_SMALL_LETTER_IOTA:
820 return (const char *)"\\( \\iota \\)";
821 case GREEK_SMALL_LETTER_KAPPA:
822 return (const char *)"\\( \\kappa \\)";
823 case GREEK_SMALL_LETTER_LAMDA:
824 return (const char *)"\\( \\lambda \\)";
825 case GREEK_SMALL_LETTER_MU:
826 return (const char *)"\\( \\mu \\)";
827 case GREEK_SMALL_LETTER_NU:
828 return (const char *)"\\( \\nu \\)";
829 case GREEK_SMALL_LETTER_XI:
830 return (const char *)"\\( \\xi \\)";
831 case GREEK_SMALL_LETTER_OMICRON:
832 return (const char *)"\\( \\omicron \\)";
833 case GREEK_SMALL_LETTER_PI:
834 return (const char *)"\\( \\pi \\)";
835 case GREEK_SMALL_LETTER_RHO:
836 return (const char *)"\\( \\rho \\)";
837 case GREEK_SMALL_LETTER_FINAL_SIGMA:
838 return (const char *)"\\( \\varsigma \\)";
839 case GREEK_SMALL_LETTER_SIGMA:
840 return (const char *)"\\( \\sigma \\)";
841 case GREEK_SMALL_LETTER_TAU:
842 return (const char *)"\\( \\tau \\)";
843 case GREEK_SMALL_LETTER_UPSILON:
844 return (const char *)"\\( \\upsilon \\)";
845 case GREEK_SMALL_LETTER_PHI:
846 return (const char *)"\\( \\varphi \\)";
847 case GREEK_SMALL_LETTER_CHI:
848 return (const char *)"\\( \\chi \\)";
849 case GREEK_SMALL_LETTER_PSI:
850 return (const char *)"\\( \\psi \\)";
851 case GREEK_SMALL_LETTER_OMEGA:
852 return (const char *)"\\( \\omega \\)";
853 case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA:
854 return (const char *)UNDEFINED;
855 case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA:
856 return (const char *)UNDEFINED;
857 case GREEK_SMALL_LETTER_OMICRON_WITH_TONOS:
858 return (const char *)UNDEFINED;
859 case GREEK_SMALL_LETTER_UPSILON_WITH_TONOS:
860 return (const char *)UNDEFINED;
861 case GREEK_SMALL_LETTER_OMEGA_WITH_TONOS:
862 return (const char *)UNDEFINED;
863 case GREEK_BETA_SYMBOL:
864 return (const char *)UNDEFINED;
865 case GREEK_THETA_SYMBOL:
866 return (const char *)"\\( \\vartheta \\)";
867 case GREEK_UPSILON_WITH_HOOK_SYMBOL:
868 return (const char *)UNDEFINED;
869 case GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL:
870 return (const char *)UNDEFINED;
871 case GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL:
872 return (const char *)UNDEFINED;
873 case GREEK_PHI_SYMBOL:
874 return (const char *)"\\( \\phi \\)";
875 case GREEK_PI_SYMBOL:
876 return (const char *)"\\( \\varpi \\)";
877 /* and some greek letters missing*/
879 /* punctuation (partial) */
880 case HYPHEN:
881 return (const char *)"-";
882 case NON_BREAKING_HYPHEN:
883 return (const char *)UNDEFINED;
884 case FIGURE_DASH:
885 case EN_DASH:
886 return (const char *)"--";
887 case EM_DASH:
888 return (const char *)"---";
889 case HORIZONTAL_BAR:
890 return (const char *)UNDEFINED;
891 case LEFT_SINGLE_QUOTATION_MARK:
892 return (const char *)"`";
893 case RIGHT_SINGLE_QUOTATION_MARK:
894 return (const char *)"'";
895 case SINGLE_LOW_9_QUOTATION_MARK:
896 return (const char *)"\\glq{}";
897 case SINGLE_HIGH_REVERSED_9_QUOTATION_MARK:
898 return (const char *)UNDEFINED;
899 case LEFT_DOUBLE_QUOTATION_MARK:
900 return (const char *)"``";
901 case RIGHT_DOUBLE_QUOTATION_MARK:
902 return (const char *)"''";
903 case DOUBLE_LOW_9_QUOTATION_MARK:
904 return (const char *)"\\glqq{}";
905 case DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK:
906 return (const char *)UNDEFINED;
907 case DAGGER:
908 return (const char *)"\\dag";
909 case DOUBLE_DAGGER:
910 return (const char *)"\\ddag";
911 case BULLET:
912 return (const char *)"$\\bullet$";
913 case TRIANGULAR_BULLET:
914 return (const char *)"$\\blacktriangleright";
915 case HYPHENATION_POINT:
916 return (const char *)"\\-";
917 case HORIZONTAL_ELLIPSIS:
918 return (const char *)"\\ldots";
919 case PER_MILLE_SIGN:
920 return (const char *)UNDEFINED;
921 case SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK:
922 return (const char *)"\\flq{}";
923 case SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK:
924 return (const char *)"\\frq{}";
925 /* ligatures */
926 case LATIN_SMALL_LIGATURE_FF:
927 return (const char *)"ff";
928 case LATIN_SMALL_LIGATURE_FI:
929 return (const char *)"fi";
930 case LATIN_SMALL_LIGATURE_FL:
931 return (const char *)"fl";
932 case LATIN_SMALL_LIGATURE_FFI:
933 return (const char *)"ffi";
934 case LATIN_SMALL_LIGATURE_FFL:
935 return (const char *)"ffl";
936 case LATIN_SMALL_LIGATURE_LONG_S_T:
937 case LATIN_SMALL_LIGATURE_ST:
938 return (const char *)"st";
939 /* reserved */
940 case 0:
941 return (const char *)"";
942 case UNKNOWN:
943 return (const char *)"\\_";
944 case PICTURE:
945 return (const char *)"(PICTURE)";
946 default:
947 /* snprintf is not standard before C99, so the insecure sprintf is used */
948 sprintf(buf,"\\symbol{%u}",(unsigned)c);
949 return buf; /* UNDEFINED; */
951 case HTML:
952 if ( c >= SPACE && c <= TILDE ) { /* ASCII */
953 switch (c) {
954 case '&':
955 return (const char *)"&amp;";
956 /* semicolon must not be coded */
957 case '\'':
958 return (const char *)"&apos;";
959 case '"':
960 return (const char *)"&quot;";
961 case '<':
962 return (const char *)"&lt;";
963 case '>':
964 return (const char *)"&gt;";
966 buf[0] = (char)c;
967 return buf;
969 switch (c) {
970 case PICTURE:
971 return (const char *)"<!--PICTURE-->";
972 case UNKNOWN:
973 return (const char *)"_"; /* better use colored symbol? */
974 case LINE_FEED:
975 return (const char *)"<br />"; /* \n handled somewhere else? */
976 case FORM_FEED:
977 case CARRIAGE_RETURN:
978 return (const char *)"<br />";
979 case NO_BREAK_SPACE:
980 return (const char *)"<nobr />";
981 case INVERTED_EXCLAMATION_MARK:
982 return (const char *)"&iexcl;";
983 case CENT_SIGN:
984 return (const char *)"&cent;";
985 case POUND_SIGN:
986 return (const char *)"&pound;";
987 case CURRENCY_SIGN:
988 return (const char *)"&curren;";
989 case YEN_SIGN:
990 return (const char *)"&yen;";
991 case BROKEN_BAR:
992 return (const char *)"&brvbar;";
993 case SECTION_SIGN:
994 return (const char *)"&sect;";
995 case DIAERESIS:
996 return (const char *)"&uml;";
997 case COPYRIGHT_SIGN:
998 return (const char *)"&copy;";
999 case FEMININE_ORDINAL_INDICATOR:
1000 return (const char *)"&ordfem;";
1001 case LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK:
1002 return (const char *)"&laquo;";
1003 case NOT_SIGN:
1004 return (const char *)"&not;";
1005 case SOFT_HYPHEN:
1006 return (const char *)"&shy;";
1007 case REGISTERED_SIGN:
1008 return (const char *)"&reg;";
1009 case MACRON:
1010 return (const char *)"&macr;";
1011 case DEGREE_SIGN:
1012 return (const char *)"&deg;";
1013 case PLUS_MINUS_SIGN:
1014 return (const char *)"&plusmn;";
1015 case SUPERSCRIPT_TWO:
1016 return (const char *)"&sup2;";
1017 case SUPERSCRIPT_THREE:
1018 return (const char *)"&sup3;";
1019 case ACUTE_ACCENT:
1020 return (const char *)"&acute;";
1021 case MICRO_SIGN:
1022 return (const char *)"&micro;";
1023 case PILCROW_SIGN:
1024 return (const char *)"&para;";
1025 case MIDDLE_DOT:
1026 return (const char *)"&middot;";
1027 case CEDILLA:
1028 return (const char *)"&cedil;";
1029 case SUPERSCRIPT_ONE:
1030 return (const char *)"&sup1;";
1031 case MASCULINE_ORDINAL_INDICATOR:
1032 return (const char *)"&ordm;";
1033 case RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK:
1034 return (const char *)"&raquo;";
1035 case VULGAR_FRACTION_ONE_QUARTER:
1036 return (const char *)"&frac14;";
1037 case VULGAR_FRACTION_ONE_HALF:
1038 return (const char *)"&frac12;";
1039 case VULGAR_FRACTION_THREE_QUARTERS:
1040 return (const char *)"&frac34;";
1041 case INVERTED_QUESTION_MARK:
1042 return (const char *)"&iquest;";
1043 case LATIN_CAPITAL_LETTER_A_WITH_GRAVE:
1044 return (const char *)"&Agrave;";
1045 case LATIN_CAPITAL_LETTER_A_WITH_ACUTE:
1046 return (const char *)"&Aacute;";
1047 case LATIN_CAPITAL_LETTER_A_WITH_BREVE:
1048 return (const char *)"&Abreve;";
1049 case LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX:
1050 return (const char *)"&Acirc;";
1051 case LATIN_CAPITAL_LETTER_A_WITH_TILDE:
1052 return (const char *)"&Atilde;";
1053 case LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS:
1054 return (const char *)"&Auml;";
1055 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
1056 return (const char *)"&Aring;";
1057 case LATIN_CAPITAL_LETTER_AE:
1058 return (const char *)"&AElig;";
1059 case LATIN_CAPITAL_LETTER_C_WITH_CARON:
1060 return (const char *)"&Ccaron;";
1061 case LATIN_CAPITAL_LETTER_C_WITH_CEDILLA:
1062 return (const char *)"&Ccedil;";
1063 case LATIN_CAPITAL_LETTER_E_WITH_GRAVE:
1064 return (const char *)"&Egrave;";
1065 case LATIN_CAPITAL_LETTER_E_WITH_ACUTE:
1066 return (const char *)"&Eacute;";
1067 case LATIN_CAPITAL_LETTER_E_WITH_CARON:
1068 return (const char *)"&Ecaron;";
1069 case LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX:
1070 return (const char *)"&Ecirc;";
1071 case LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS:
1072 return (const char *)"&Euml;";
1073 case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
1074 return (const char *)"&Igrave;";
1075 case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
1076 return (const char *)"&Iacute;";
1077 case LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX:
1078 return (const char *)"&Icirc;";
1079 case LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS:
1080 return (const char *)"&Iuml;";
1081 case LATIN_CAPITAL_LETTER_ETH:
1082 return (const char *)"&ETH;";
1083 case LATIN_CAPITAL_LETTER_N_WITH_TILDE:
1084 return (const char *)"&Ntilde;";
1085 case LATIN_CAPITAL_LETTER_O_WITH_GRAVE:
1086 return (const char *)"&Ograve;";
1087 case LATIN_CAPITAL_LETTER_O_WITH_ACUTE:
1088 return (const char *)"&Oacute;";
1089 case LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX:
1090 return (const char *)"&Ocirc;";
1091 case LATIN_CAPITAL_LETTER_O_WITH_TILDE:
1092 return (const char *)"&Otilde;";
1093 case LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS:
1094 return (const char *)"&Ouml;";
1095 case MULTIPLICATION_SIGN:
1096 return (const char *)"&times";
1097 case LATIN_CAPITAL_LETTER_O_WITH_STROKE:
1098 return (const char *)"&Oslash;";
1099 case LATIN_CAPITAL_LETTER_S_WITH_CARON:
1100 return (const char *)"&Scaron;";
1101 case LATIN_CAPITAL_LETTER_U_WITH_GRAVE:
1102 return (const char *)"&Ugrave;";
1103 case LATIN_CAPITAL_LETTER_U_WITH_ACUTE:
1104 return (const char *)"&Uacute;";
1105 case LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX:
1106 return (const char *)"&Ucirc;";
1107 case LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS:
1108 return (const char *)"&Uuml;";
1109 case LATIN_CAPITAL_LETTER_Y_WITH_ACUTE:
1110 return (const char *)"&Yacute;";
1111 case LATIN_CAPITAL_LETTER_Z_WITH_CARON:
1112 return (const char *)"&Zcaron;";
1113 case LATIN_CAPITAL_LETTER_THORN:
1114 return (const char *)"&THORN;";
1115 case LATIN_SMALL_LETTER_SHARP_S:
1116 return (const char *)"&szlig;";
1117 case LATIN_SMALL_LETTER_A_WITH_GRAVE:
1118 return (const char *)"&agrave;";
1119 case LATIN_SMALL_LETTER_A_WITH_ACUTE:
1120 return (const char *)"&aacute;";
1121 case LATIN_SMALL_LETTER_A_WITH_BREVE:
1122 return (const char *)"&abreve;";
1123 case LATIN_SMALL_LETTER_A_WITH_CARON:
1124 return (const char *)"&acaron;";
1125 case LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX:
1126 return (const char *)"&acirc;";
1127 case LATIN_SMALL_LETTER_A_WITH_TILDE:
1128 return (const char *)"&atilde;";
1129 case LATIN_SMALL_LETTER_A_WITH_DIAERESIS:
1130 return (const char *)"&auml;";
1131 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
1132 return (const char *)"&aring;";
1133 case LATIN_SMALL_LETTER_AE:
1134 return (const char *)"&aelig;";
1135 case LATIN_SMALL_LETTER_C_WITH_CARON:
1136 return (const char *)"&ccaron;";
1137 case LATIN_SMALL_LETTER_C_WITH_CEDILLA:
1138 return (const char *)"&ccedil;";
1139 case LATIN_SMALL_LETTER_E_WITH_GRAVE:
1140 return (const char *)"&egrave;";
1141 case LATIN_SMALL_LETTER_E_WITH_ACUTE:
1142 return (const char *)"&eacute;";
1143 case LATIN_SMALL_LETTER_E_WITH_CARON:
1144 return (const char *)"&ecaron;";
1145 case LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX:
1146 return (const char *)"&ecirc;";
1147 case LATIN_SMALL_LETTER_E_WITH_DIAERESIS:
1148 return (const char *)"&euml;";
1149 case LATIN_SMALL_LETTER_I_WITH_GRAVE:
1150 return (const char *)"&igrave;";
1151 case LATIN_SMALL_LETTER_I_WITH_ACUTE:
1152 return (const char *)"&iacute;";
1153 case LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX:
1154 return (const char *)"&icirc;";
1155 case LATIN_SMALL_LETTER_I_WITH_DIAERESIS:
1156 return (const char *)"&iuml;";
1157 case LATIN_SMALL_LETTER_ETH:
1158 return (const char *)"&eth;";
1159 case LATIN_SMALL_LETTER_N_WITH_TILDE:
1160 return (const char *)"&ntilde;";
1161 case LATIN_SMALL_LETTER_O_WITH_GRAVE:
1162 return (const char *)"&ograve;";
1163 case LATIN_SMALL_LETTER_O_WITH_ACUTE:
1164 return (const char *)"&oacute;";
1165 case LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX:
1166 return (const char *)"&ocirc;";
1167 case LATIN_SMALL_LETTER_O_WITH_TILDE:
1168 return (const char *)"&otilde;";
1169 case LATIN_SMALL_LETTER_O_WITH_DIAERESIS:
1170 return (const char *)"&ouml;";
1171 case DIVISION_SIGN:
1172 return (const char *)"&divide;";
1173 case LATIN_SMALL_LETTER_O_WITH_STROKE:
1174 return (const char *)"&oslash;";
1175 case LATIN_SMALL_LETTER_S_WITH_CARON:
1176 return (const char *)"&scaron;";
1177 case LATIN_SMALL_LETTER_U_WITH_GRAVE:
1178 return (const char *)"&ugrave;";
1179 case LATIN_SMALL_LETTER_U_WITH_ACUTE:
1180 return (const char *)"&uacute;";
1181 case LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX:
1182 return (const char *)"&ucirc;";
1183 case LATIN_SMALL_LETTER_U_WITH_DIAERESIS:
1184 return (const char *)"&uuml;";
1185 case LATIN_SMALL_LETTER_Y_WITH_ACUTE:
1186 return (const char *)"&yacute;";
1187 case LATIN_SMALL_LETTER_THORN:
1188 return (const char *)"&thorn;";
1189 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1190 return (const char *)"&yuml;";
1191 case LATIN_SMALL_LETTER_Z_WITH_CARON:
1192 return (const char *)"&zcaron;";
1193 case EURO_CURRENCY_SIGN:
1194 return (const char *)"&euro;";
1195 case 0:
1196 return (const char *)"";
1197 default:
1198 sprintf(buf,"&#%u;",(unsigned)c);
1199 return buf; /* undefined */
1201 /* break; unreachable code */
1202 case XML: /* only 5 &xxx;-ENTITIES are defined by default */
1203 if ( c >= SPACE && c <= TILDE ) { /* ASCII */
1204 switch (c) {
1205 case '&':
1206 return (const char *)"&amp;";
1207 case '\'':
1208 return (const char *)"&apos;";
1209 case '"':
1210 return (const char *)"&quot;";
1211 case '<':
1212 return (const char *)"&lt;";
1213 case '>':
1214 return (const char *)"&gt;";
1216 buf[0] = (char)c;
1217 return buf;
1219 switch (c) { /* subject of change! */
1220 case PICTURE:
1221 return (const char *)"(PICTURE)";
1222 case UNKNOWN:
1223 return (const char *)"_"; /* better use colored symbol? */
1224 case LINE_FEED: /* \n handled somewhere else? */
1225 case FORM_FEED:
1226 case CARRIAGE_RETURN:
1227 return (const char *)"<br />";
1228 case NO_BREAK_SPACE:
1229 return (const char *)"<nobr />";
1230 case 0:
1231 return (const char *)"";
1232 default:
1233 sprintf(buf,"&#x%03x;",(unsigned)c);
1234 return buf; /* undefined */
1236 /* break; unreachable code */
1237 case SGML:
1238 switch (c) {
1239 default:
1240 sprintf(buf,"&#%u;",(unsigned)c);
1241 return buf; /* UNDEFINED */
1243 /* break; unreachable code */
1244 case ASCII: /* mainly used for debugging */
1245 if ( c=='\n' || (c>= 0x20 && c <= 0x7F) ) {
1246 buf[0] = (char)c;
1247 return buf;
1249 switch (c) {
1250 /* extra */
1251 case UNKNOWN:
1252 return (const char *)"(?)";
1253 case PICTURE:
1254 return (const char *)"(?)";
1256 default:
1257 /* snprintf is not standard before C99, so the insecure sprintf is used */
1258 if ((unsigned)c>255) sprintf(buf,"(0x%04x)",(unsigned)c);
1259 else sprintf(buf,"(0x%02x)",(unsigned)c);
1260 return buf; /* UNDEFINED; */
1262 /* break; unreachable code */
1263 default: /* use UTF8 as default, test with xterm -u8 */
1264 /* extra */
1265 if ( c == UNKNOWN ) return (const char *)"_";
1266 if ( c == PICTURE ) return (const char *)"_"; /* Due to Mobile OCR */
1267 if ( c <= (wchar_t)0x0000007F ) { /* UTF8 == 7bit ASCII */
1268 buf[0] = (char)c;
1269 return buf;
1271 if ( c <= (wchar_t)0x000007FF ) { /* UTF8 == 11bit */
1272 buf[0] = (char)(0xc0|((c>> 6) & 0x1f)); /* 110xxxxx */
1273 buf[1] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */
1274 buf[2] = (char)0; /* terminate string */
1275 return buf;
1277 /* wchar_t is 16bit for Borland-C !? Jan07 */
1278 if ( c <= (wchar_t)0x0000FFFF ) { /* UTF8 == 16bit */
1279 buf[0] = (char)(0xe0|((c>>12) & 0x0f)); /* 1110xxxx */
1280 buf[1] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */
1281 buf[2] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */
1282 buf[3] = (char)0; /* terminate string */
1283 return buf;
1285 if ( c <= (wchar_t)0x001FFFFF ) { /* UTF8 == 21bit */
1286 buf[0] = (char)(0xf0|((c>>18) & 0x07)); /* 11110xxx */
1287 buf[1] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */
1288 buf[2] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */
1289 buf[3] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */
1290 buf[4] = (char)0; /* terminate string */
1291 return buf;
1293 if ( c <= (wchar_t)0x03FFFFFF ) { /* UTF8 == 26bit */
1294 buf[0] = (char)(0xf8|((c>>24) & 0x03)); /* 111110xx */
1295 buf[1] = (char)(0x80|((c>>18) & 0x3f)); /* 10xxxxxx */
1296 buf[2] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */
1297 buf[3] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */
1298 buf[4] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */
1299 buf[5] = (char)0; /* terminate string */
1300 return buf;
1302 if ( c <= (wchar_t)0x7FFFFFFF ) { /* UTF8 == 31bit */
1303 buf[0] = (char)(0xfc|((c>>30) & 0x01)); /* 1111110x */
1304 buf[1] = (char)(0x80|((c>>24) & 0x3f)); /* 10xxxxxx */
1305 buf[2] = (char)(0x80|((c>>18) & 0x3f)); /* 10xxxxxx */
1306 buf[3] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */
1307 buf[4] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */
1308 buf[5] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */
1309 buf[6] = (char)0; /* terminate string */
1310 return buf;
1312 return (const char *)UNDEFINED;