use somewhat optimized code path for regexps with only greedy splits (speedup)
[libre9.git] / tools / unigen.c
blob359151b3945332872f97cd773dbaa038a2ca2e7b
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
 * Strip leading and trailing whitespace from `str`, in place.
 *
 * Leading whitespace is removed by shifting the remaining text to the start
 * of the buffer; trailing whitespace is removed by writing a new terminator.
 *
 * str: NUL-terminated, writable string, or NULL.
 * Returns `str` itself (NULL when `str` is NULL).
 */
static char *trim (char *str) {
  char *s;
  size_t len;
  if (str == NULL) return NULL;
  /* <ctype.h> classifiers take an unsigned char value (or EOF); plain char may be negative */
  for (s = str; *s && isspace((unsigned char)*s); ++s) ;
  if (s > str) memmove(str, s, strlen(s)+1);
  /* shrink a length instead of walking a pointer, so an empty string never forms `str-1` */
  len = strlen(str);
  while (len > 0 && isspace((unsigned char)str[len-1])) --len;
  str[len] = 0;
  return str;
}
/*
 * Parse `s` as a hexadecimal number.
 *
 * `s` is trimmed in place first, then handed to strtol() with base 16.
 *
 * s: NUL-terminated, writable string, or NULL.
 * Returns the parsed value, or -1 when `s` is NULL or empty after trimming,
 * has characters left over after the number, is negative, or does not fit
 * in an int.
 */
static int hex2num (char *s) {
  char *end;
  long res;
  if (s == NULL) return -1;
  trim(s);
  if (!s[0]) return -1;
  /* keep the full long result: narrowing to int before the checks could turn an oversized value into a valid-looking one */
  errno = 0;
  res = strtol(s, &end, 16);
  if (end[0] || errno == ERANGE || res < 0 || res > INT_MAX) return -1;
  return (int)res;
}
/*
 * One parsed input record, as filled in by read_record().
 * `name` and `class` point into read_record()'s static line buffer, so they
 * are only valid until the next read_record() call.
 */
typedef struct {
  int code;           /* value of the first field, parsed as hex */
  const char *name;   /* second field, trimmed */
  const char *class;  /* third field, trimmed (main() compares it with "Lu"/"Ll") */
  int upper, lower;   /* mapping fields; equal to `code` when the field is empty */
} UCInfo;
/* Cursor into the line currently being tokenized; must be set before calling tok(). */
static char *stx = NULL;

/*
 * Return the next ';'-separated field starting at `stx`.
 *
 * The separator is overwritten with NUL and `stx` is moved just past it.
 * Once the end of the string is reached, `stx` stays on the terminator and
 * every further call returns an empty string.
 */
static char *tok (void) {
  char *field = stx;
  stx += strcspn(stx, ";");
  if (*stx != '\0') {
    *stx = '\0';
    ++stx;
  }
  return field;
}
49 // <0: eof; 0: bad record; >0: good record
50 static int read_record (FILE *fi, UCInfo *ui) {
51 static char str[8192];
52 char *tk, *u, *l;
53 int f;
54 if (fgets(str, sizeof(str)-1, fi) == NULL) return -1;
55 stx = trim(str);
56 if ((tk = tok()) == NULL) return 0;
57 if ((ui->code = hex2num(tk)) < 0) return 0;
58 if (ui->code > 65535) return -1;
59 if ((ui->name = trim(tok())) == NULL) return 0;
60 if ((ui->class = trim(tok())) == NULL) return 0;
61 // skip unused fields
62 for (f = 9; f > 0; --f) if (tok() == NULL) { printf("%d\n", f); return 0; }
63 if ((u = trim(tok())) == NULL) return 0;
64 if ((l = trim(tok())) == NULL) return 0;
65 if (!u[0]) ui->upper = ui->code; else ui->upper = hex2num(u);
66 if (!l[0]) ui->lower = ui->code; else ui->lower = hex2num(l);
67 if (ui->upper < 0 || ui->lower < 0) return 0;
68 if (ui->upper > 65535 || ui->lower > 65535) abort();
69 return 1;
/* One entry of the generated case-mapping table. */
typedef struct {
  int code;  /* code point the entry describes */
  int l;     /* its lower-case mapping */
  int u;     /* its upper-case mapping */
} mm;

/* Entries collected by main(), in input order. */
static mm map[65535];
80 int main (int argc, char *argv[]) {
81 UCInfo ui;
82 int rc, f, totalmap = 0;
83 FILE *fi, *fo;
84 if (argc != 3) { fprintf(stderr, "usage: %s unitable.c unitable.txt\n", argv[0]); exit(1); }
85 if ((fi = fopen(argv[2], "r")) == NULL) { fprintf(stderr, "FATAL: can't open input file: '%s'\n", argv[2]); exit(1); }
86 for (;;) {
87 rc = read_record(fi, &ui);
88 if (rc < 0) break;
89 if (rc == 0) continue;
90 if (ui.code < 128) continue;
91 if (strcmp(ui.class, "Lu") != 0 && strcmp(ui.class, "Ll") != 0) continue;
92 if (ui.upper == ui.code && ui.lower == ui.code) continue;
93 for (f = 0; f < totalmap; ++f) {
94 if (map[f].code == ui.code) { fprintf(stderr, "FATAL: duplicate entries in the map!\n"); exit(1); }
95 if (map[f].code > ui.code) { fprintf(stderr, "FATAL: invalid entry order in the map!\n"); exit(1); }
97 if (totalmap > 65535) { fprintf(stderr, "FATAL: too many entries in the map!\n"); exit(1); }
98 map[totalmap].code = ui.code;
99 map[totalmap].l = ui.lower;
100 map[totalmap].u = ui.upper;
101 ++totalmap;
103 fclose(fi);
104 qsort(map, totalmap, sizeof(map[0]), ({
105 int cmp (const void *p0, const void *p1) {
106 const mm *i0 = (const mm *)p0;
107 const mm *i1 = (const mm *)p1;
108 return (i0->code < i1->code ? -1 : (i0->code > i1->code ? 1 : 0));
110 cmp;
111 }));
112 fo = fopen(argv[1], "w");
113 if (fo == NULL) { fprintf(stderr, "FATAL: can't create output file: '%s'\n", argv[1]); exit(1); }
114 fprintf(fo, "static const struct casemap unicode_case_mapping[%d] = {\n", totalmap);
115 for (f = 0; f < totalmap; ++f) fprintf(fo, "{0x%04x,0x%04x,0x%04x},\n", map[f].code, map[f].l, map[f].u);
116 fprintf(fo, "%s\n", "};");
117 fclose(fo);
118 return 0;