/*
 * Number of elements in a true array (not a pointer): the total size divided
 * by the size of one element. The argument is parenthesized in the expansion
 * so that expression arguments group as intended.
 */
#define ARRAY_COUNT(arr) (sizeof(arr) / sizeof((arr)[0]))
17 int length
; // number of code points used in the unicode array. TODO: use dynamic memory allocation
19 uint8_t bytes
[2]; // output bytes that the unicode sequence is converted to (e.g. 'A' = 0x0A)
// Global charmap table: created in main() with hashtable_new(), filled by
// read_charmap() via hashtable_insert(), and looked up with hashtable_query().
22 static struct HashTable
*charmap
;
// Writes "error: " followed by the printf-style message built from msgfmt and
// the variadic arguments to stderr.
// NOTE(review): the tail of this function is not visible here; it presumably
// terminates the program after printing — confirm.
24 static void fatal_error(const char *msgfmt
, ...)
28 fputs("error: ", stderr
);
30 va_start(args
, msgfmt
);
31 vfprintf(stderr
, msgfmt
, args
);
// Reports a problem in an input file to stderr. The output starts with
// "<filename>: line <lineNum>: " and is followed by the printf-style message
// built from msgfmt and the variadic arguments.
// NOTE(review): the tail of this function is not visible here; it presumably
// terminates the program after printing — confirm.
39 static void parse_error(const char *filename
, int lineNum
, const char *msgfmt
, ...)
// Location prefix first, so every diagnostic names the file and line.
43 fprintf(stderr
, "%s: line %i: ", filename
, lineNum
);
45 va_start(args
, msgfmt
);
46 vfprintf(stderr
, msgfmt
, args
);
54 // Reads the whole file and returns a null-terminated buffer with its contents
// Failures to open or read the file are reported through fatal_error together
// with the strerror(errno) text.
55 void *read_text_file(const char *filename
)
// Binary mode: the bytes are read exactly as stored on disk.
57 FILE *file
= fopen(filename
, "rb");
62 fatal_error("failed to open file '%s' for reading: %s", filename
, strerror(errno
));
// Seek to the end of the file (presumably to measure its size; the statement
// that computes `size` is not visible here).
65 fseek(file
, 0, SEEK_END
);
// One extra byte is reserved for the terminating null.
69 buffer
= malloc(size
+ 1);
// Rewind before reading the contents.
72 fseek(file
, 0, SEEK_SET
);
// The whole file is read as one item of `size` bytes.
// NOTE(review): fread returns 0 when the item size is 0, so an empty file
// would be reported as a read error here — confirm whether that is intended.
73 if (fread(buffer
, size
, 1, file
) != 1)
74 fatal_error("error reading from file '%s': %s", filename
, strerror(errno
));
76 // null-terminate the buffer
// NOTE(review): only the signature is visible here. Callers in this file assign
// the result back to their read cursor (e.g. `line = skip_whitespace(line)`), so
// it presumably returns a pointer past any leading whitespace in str — confirm.
84 static char *skip_whitespace(char *str
)
91 // null terminates the current line and returns a pointer to the next line
// The buffer is modified in place. When str is already at the end of the
// string, str itself is returned.
92 static char *line_split(char *str
)
97 return str
; // end of string
100 *str
= 0; // terminate line
// Parses an integer at str with strtol using base 0, so the base is chosen
// from the literal's prefix (0x for hexadecimal, leading 0 for octal,
// otherwise decimal).
// The caller stores the returned pointer as its new cursor and reads the value
// through num; the exact return/failure contract is in the part of the body
// not visible here — TODO confirm.
// NOTE(review): strtol returns a (signed) long that is stored directly in an
// unsigned int, so negative or out-of-range input is converted silently.
104 static char *parse_number(const char *str
, unsigned int *num
)
107 unsigned int n
= strtol(str
, &endptr
, 0);
/*
 * Returns non-zero when c may appear in a C identifier (a letter, a digit or
 * an underscore), and 0 otherwise.
 */
static int is_identifier_char(char c)
{
    /*
     * <ctype.h> functions require a value representable as unsigned char (or
     * EOF). Plain char may be signed, and bytes of multi-byte UTF-8 sequences
     * are then negative, so convert before the call.
     */
    return isalnum((unsigned char)c) || c == '_';
}
// Translates the character that follows a backslash by looking it up in a
// local table indexed by character value.
// The table contents and the fallback return value are not visible here;
// callers in this file treat a result of 0 as an unknown escape sequence.
121 static int get_escape_char(int c
)
123 const uint8_t escapeTable
[] =
// The cast to unsigned makes a negative c fail the bounds check instead of
// indexing before the table. Zero entries are not returned from this branch.
137 if ((unsigned int)c
< ARRAY_COUNT(escapeTable
) && escapeTable
[c
] != 0)
138 return escapeTable
[c
];
// Loads the charmap file `filename` and inserts one CharmapEntry per
// definition line into the global charmap table. Each entry maps a quoted
// sequence of characters to at most two byte values. Malformed input is
// reported through parse_error with the file name and line number.
143 static void read_charmap(const char *filename
)
145 char *filedata
= read_text_file(filename
);
// `line` is the read cursor; it starts at the beginning of the file contents.
146 char *line
= filedata
;
// Terminate the current line in place and remember where the next one begins.
151 char *nextLine
= line_split(line
);
153 struct CharmapEntry entry
;
155 line
= skip_whitespace(line
);
156 if (line
[0] != 0 && line
[0] != '#') // ignore empty lines and comments
163 parse_error(filename
, lineNum
, "expected '");
166 // perform analysis of charmap entry, we are in the quote
// len has reached the capacity of entry.unicode: no room for another character.
174 else if(len
== ARRAY_COUNT(entry
.unicode
))
176 // TODO: Use dynamic memory allocation so this is unnecessary.
177 parse_error(filename
, lineNum
, "string limit exceeded");
// Backslash escape inside the quoted sequence.
179 else if (*line
== '\\')
181 line
++; // advance to get the character being escaped
186 // Backslash at end of line is ignored
// Store the translated escape; a result of 0 means it is not a known escape.
189 entry
.unicode
[len
] = get_escape_char(*line
);
190 if (entry
.unicode
[len
] == 0)
191 parse_error(filename
, lineNum
, "unknown escape sequence \\%c", *line
);
192 line
++; // increment again to get past the escape sequence.
// Otherwise decode the next UTF-8 sequence into the next unicode slot;
// utf8_decode returns the updated cursor.
196 line
= utf8_decode(line
, &entry
.unicode
[len
]);
198 parse_error(filename
, lineNum
, "invalid UTF8");
205 line
= skip_whitespace(line
);
// NOTE(review): whitespace has already been skipped, so *line printed here is
// presumably the unexpected character found where '=' should be, not the
// character before it as the wording suggests — confirm the message.
207 parse_error(filename
, lineNum
, "expected = after character \\%c", *line
);
// Start collecting the byte values on the right-hand side.
210 entry
.bytesCount
= 0;
// Guard checked before a value is stored: an entry holds at most 2 bytes.
217 if (entry
.bytesCount
>= 2)
218 parse_error(filename
, lineNum
, "more than 2 values specified");
220 line
= skip_whitespace(line
);
222 line
= parse_number(line
, &value
);
224 parse_error(filename
, lineNum
, "expected number after =");
226 parse_error(filename
, lineNum
, "0x%X is larger than 1 byte", value
);
228 entry
.bytes
[entry
.bytesCount
] = value
;
231 line
= skip_whitespace(line
);
235 parse_error(filename
, lineNum
, "junk at end of line");
// Reject a second definition of the same sequence, then store the entry.
239 if (hashtable_query(charmap
, &entry
) != NULL
)
240 parse_error(filename
, lineNum
, "entry for character already exists");
241 hashtable_insert(charmap
, &entry
);
// Derives a line number for the position `pos` inside the buffer that begins
// at `start`; convert_string passes the result to parse_error.
// The loop visits every character from start up to (not including) pos.
// NOTE(review): the counting statements are not visible here; presumably
// newlines are counted — confirm, including the number reported for the first line.
251 static int count_line_num(const char *start
, const char *pos
)
256 for (c
= start
; c
< pos
; c
++)
// Converts the quoted string of a `_(` sequence into charmap bytes and writes
// them to fout as "0x%02X," items.
//   pos           - read cursor inside the input buffer
//   fout          - stream that receives the converted bytes
//   inputFileName - used only for error messages
//   start         - beginning of the input buffer, used to compute line numbers
//   uncompressed  - when non-zero, the charmap lookup is limited to sequences
//                   of length 1
// The caller stores the returned pointer as its new read position.
264 static char *convert_string(char *pos
, FILE *fout
, const char *inputFileName
, char *start
, int uncompressed
)
270 pos
= skip_whitespace(pos
);
276 parse_error(inputFileName
, count_line_num(start
, pos
), "expected quoted string after '_('");
278 else if (*pos
!= '"')
279 parse_error(inputFileName
, count_line_num(start
, pos
), "unexpected character '%c'", *pos
);
284 // convert quoted string
// `input` is the lookup key built up character by character.
// last_valid_entry / last_valid_pos remember the most recent successful lookup
// and the cursor position that goes with it.
287 struct CharmapEntry input
;
288 struct CharmapEntry
*last_valid_entry
= NULL
;
289 struct CharmapEntry
*entry
;
292 char* last_valid_pos
= NULL
;
294 // safely erase the unicode area before use
295 memset(input
.unicode
, 0, sizeof (input
.unicode
));
298 // Find a charmap entry of longest length possible starting from this position
// ARRAY_COUNT only takes sizeof of entry->unicode, so entry is not
// dereferenced here even though it has not been assigned yet.
301 if ((uncompressed
&& length
== 1) || length
== ARRAY_COUNT(entry
->unicode
))
303 // Stop searching once length reaches the capacity of the unicode array
304 // (3 according to the original note; confirm against the struct definition).
305 // When uncompressed is set, stop after a single character instead, so
// that entries longer than 1 are never matched.
310 parse_error(inputFileName
, count_line_num(start
, pos
), "EOF in string literal");
// Escape sequence: translate the escaped character; the error below is for
// escapes that get_escape_char does not know.
314 c
= get_escape_char(*pos
);
316 parse_error(inputFileName
, count_line_num(start
, pos
), "unknown escape sequence \\%c", *pos
);
317 input
.unicode
[length
] = c
;
// Otherwise decode the next UTF-8 sequence into the key; utf8_decode returns
// the updated cursor.
322 pos
= utf8_decode(pos
, &input
.unicode
[length
]);
324 parse_error(inputFileName
, count_line_num(start
, pos
), "invalid unicode encountered in file");
// Look up the key built so far.
327 input
.length
= length
;
329 entry
= hashtable_query(charmap
, &input
);
// Remember this match and where it ended.
332 last_valid_entry
= entry
;
333 last_valid_pos
= pos
;
// Fall back to the last successful lookup and resume reading after it.
337 entry
= last_valid_entry
;
338 pos
= last_valid_pos
;
// NOTE(review): if no lookup succeeded, last_valid_pos is still NULL, so pos is
// NULL when count_line_num(start, pos) is evaluated for this message — confirm
// that the reported line number is meaningful on this path.
340 parse_error(inputFileName
, count_line_num(start
, pos
), "no charmap entry for U+%X", input
.unicode
[0]);
// Emit every byte of the matched entry as a hex literal followed by a comma.
341 for (i
= 0; i
< entry
->bytesCount
; i
++)
342 fprintf(fout
, "0x%02X,", entry
->bytes
[i
]);
344 pos
++; // skip over closing '"'
346 pos
++; // skip over closing ')'
// Copies the text of infilename to outfilename, replacing each `_(` string
// sequence with the output of convert_string. `//` comments, `/* */` comments
// and ordinary string literals are skipped over, per the branch comments below.
351 static void convert_file(const char *infilename
, const char *outfilename
)
353 char *in
= read_text_file(infilename
);
354 FILE *fout
= fopen(outfilename
, "wb");
// NOTE(review): the format string has two %s conversions but only
// strerror(errno) is passed; the file name argument (outfilename) is missing,
// which is undefined behavior when this error path runs.
357 fatal_error("failed to open file '%s' for writing: %s", strerror(errno
));
365 if (*pos
== 0) // end of file
372 // skip over // comment
376 // skip over next newline
385 // skip over /* */ comment
386 else if (*pos
== '*')
// NOTE(review): with &&, this loop stops as soon as either *pos is '*' or
// pos[1] is '/', not only at the closing "*/" pair. Continuing until the pair
// is found would need `!(*pos == '*' && pos[1] == '/')`. The loop body is not
// visible here — confirm and fix together with it.
389 while (*pos
!= '*' && pos
[1] != '/')
398 // skip over normal string literal
399 else if (*pos
== '"')
412 // check for _( sequence
// The underscore only counts when it is at the very start of the buffer or is
// not preceded by an identifier character, so names ending in '_' do not match.
413 else if ((*pos
== '_') && (pos
== in
|| !is_identifier_char(pos
[-1])))
415 int uncompressed
= 0;
418 if (*pos
== '_') // an extra _ signifies uncompressed strings. Enable uncompressed flag
// Write the text from start up to end unchanged, then let convert_string emit
// the converted bytes and return the new read position.
426 fwrite(start
, end
- start
, 1, fout
);
427 pos
= convert_string(pos
, fout
, infilename
, in
, uncompressed
);
// Write the text between start and the current position unchanged.
438 fwrite(start
, pos
- start
, 1, fout
);
443 static unsigned int charmap_hash(const void *value
)
445 const struct CharmapEntry
* entry
= value
;
446 unsigned int ret
= 0;
447 for (int i
= 0; i
< entry
->length
; i
++)
448 ret
= ret
* 17 + entry
->unicode
[i
];
// Comparison callback passed to hashtable_new() for the charmap table.
// Two entries are compared by length first and then element by element over
// the first `length` slots of their unicode arrays.
// NOTE(review): the values returned for "same" and "different" are not visible
// here — check them against what the hashtable implementation expects.
452 static int charmap_cmp(const void *a
, const void *b
)
454 const struct CharmapEntry
*ea
= a
;
455 const struct CharmapEntry
*eb
= b
;
// Entries of different lengths can be told apart without looking at the data.
456 if (ea
->length
!= eb
->length
)
458 for(int i
= 0; i
< ea
->length
; i
++)
459 if(ea
->unicode
[i
] != eb
->unicode
[i
])
/*
 * Prints the command-line synopsis to stderr.
 * execName is the program name shown at the start of the usage line.
 */
static void usage(const char *execName)
{
    FILE *out = stderr;

    fprintf(out, "Usage: %s CHARMAP INPUT OUTPUT\n", execName);
}
// Program entry point. The command line is CHARMAP INPUT OUTPUT:
//   argv[1] - charmap file loaded by read_charmap
//   argv[2] - input file to convert
//   argv[3] - output file to write
// The argument checking and the return value are in lines not visible here.
469 int main(int argc
, char **argv
)
// Create the global table with the charmap hash/compare callbacks and
// sizeof(struct CharmapEntry) as the element size. The 256 is presumably the
// table size — confirm against hashtable_new.
477 charmap
= hashtable_new(charmap_hash
, charmap_cmp
, 256, sizeof(struct CharmapEntry
));
// The charmap has to be loaded before the input file is converted.
479 read_charmap(argv
[1]);
480 convert_file(argv
[2], argv
[3]);
482 hashtable_free(charmap
);