src/strgen/strgen_base.cpp

   1 /* $Id$ */
   2
   3 /*
   4  * This file is part of OpenTTD.
   5  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
   6  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   7  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
   8  */
   9
  10 /** @file strgen_base.cpp Tool to create computer readable (stand-alone) translation files. */
  11
  12 #include "../stdafx.h"
  13 #include "../core/endian_func.hpp"
  14 #include "../string_func.h"
  15 #include "../table/control_codes.h"
  16
  17 #include "strgen.h"
  18
  19
  20 #include "../table/strgen_tables.h"
  21
  22 #include "../safeguards.h"
  23
  24 /* Compiles a list of strings into a compiled string list */
  25
static bool _translated;              ///< Whether the current language is not the master language
static bool _translation;             ///< Is the current file actually a translation or not
const char *_file = "(unknown file)"; ///< The filename of the input, so we can refer to it in errors/warnings
int _cur_line;                        ///< The current line we're parsing in the input file
int _errors, _warnings, _show_todo;   ///< Error and warning counters (both reset by StringReader::ParseFile) and the todo flags (EmitPlural only warns about untranslated plurals when bit 1 is set)
LanguagePackHeader _lang;             ///< Header information about a language.

static const ptrdiff_t MAX_COMMAND_PARAM_SIZE = 100; ///< Maximum size of every command block, not counting the name of the command itself
/* Declared up front because StringData::Version() calls it before its definition further down. */
static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei);
  35
  36 /**
  37  * Create a new case.
  38  * @param caseidx The index of the case.
  39  * @param string  The translation of the case.
  40  * @param next    The next chained case.
  41  */
  42 Case::Case(int caseidx, const char *string, Case *next) :
  43                 caseidx(caseidx), string(stredup(string)), next(next)
  44 {
  45 }
  46
  47 /** Free everything we allocated. */
  48 Case::~Case()
  49 {
  50         free(this->string);
  51         delete this->next;
  52 }
  53
  54 /**
  55  * Create a new string.
  56  * @param name    The name of the string.
  57  * @param english The english "translation" of the string.
  58  * @param index   The index in the string table.
  59  * @param line    The line this string was found on.
  60  */
  61 LangString::LangString(const char *name, const char *english, int index, int line) :
  62                 name(stredup(name)), english(stredup(english)), translated(NULL),
  63                 hash_next(0), index(index), line(line), translated_case(NULL)
  64 {
  65 }
  66
  67 /** Free everything we allocated. */
  68 LangString::~LangString()
  69 {
  70         free(this->name);
  71         free(this->english);
  72         free(this->translated);
  73         delete this->translated_case;
  74 }
  75
  76 /** Free all data related to the translation. */
  77 void LangString::FreeTranslation()
  78 {
  79         free(this->translated);
  80         this->translated = NULL;
  81
  82         delete this->translated_case;
  83         this->translated_case = NULL;
  84 }
  85
  86 /**
  87  * Create a new string data container.
  88  * @param max_strings The maximum number of strings.
  89  */
  90 StringData::StringData(size_t tabs) : tabs(tabs), max_strings(tabs * TAB_SIZE)
  91 {
  92         this->strings = CallocT<LangString *>(max_strings);
  93         this->hash_heads = CallocT<uint16>(max_strings);
  94         this->next_string_id = 0;
  95 }
  96
  97 /** Free everything we allocated. */
  98 StringData::~StringData()
  99 {
 100         for (size_t i = 0; i < this->max_strings; i++) delete this->strings[i];
 101         free(this->strings);
 102         free(this->hash_heads);
 103 }
 104
 105 /** Free all data related to the translation. */
 106 void StringData::FreeTranslation()
 107 {
 108         for (size_t i = 0; i < this->max_strings; i++) {
 109                 LangString *ls = this->strings[i];
 110                 if (ls != NULL) ls->FreeTranslation();
 111         }
 112 }
 113
 114 /**
 115  * Create a hash of the string for finding them back quickly.
 116  * @param s The string to hash.
 117  * @return The hashed string.
 118  */
 119 uint StringData::HashStr(const char *s) const
 120 {
 121         uint hash = 0;
 122         for (; *s != '\0'; s++) hash = ROL(hash, 3) ^ *s;
 123         return hash % this->max_strings;
 124 }
 125
 126 /**
 127  * Add a newly created LangString.
 128  * @param s  The name of the string.
 129  * @param ls The string to add.
 130  */
 131 void StringData::Add(const char *s, LangString *ls)
 132 {
 133         uint hash = this->HashStr(s);
 134         ls->hash_next = this->hash_heads[hash];
 135         /* Off-by-one for hash find. */
 136         this->hash_heads[hash] = ls->index + 1;
 137         this->strings[ls->index] = ls;
 138 }
 139
 140 /**
 141  * Find a LangString based on the string name.
 142  * @param s The string name to search on.
 143  * @return The LangString or NULL if it is not known.
 144  */
 145 LangString *StringData::Find(const char *s)
 146 {
 147         int idx = this->hash_heads[this->HashStr(s)];
 148
 149         while (--idx >= 0) {
 150                 LangString *ls = this->strings[idx];
 151
 152                 if (strcmp(ls->name, s) == 0) return ls;
 153                 idx = ls->hash_next;
 154         }
 155         return NULL;
 156 }
 157
 158 /**
 159  * Create a compound hash.
 160  * @param hash The hash to add the string hash to.
 161  * @param s    The string hash.
 162  * @return The new hash.
 163  */
 164 uint StringData::VersionHashStr(uint hash, const char *s) const
 165 {
 166         for (; *s != '\0'; s++) {
 167                 hash = ROL(hash, 3) ^ *s;
 168                 hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
 169         }
 170         return hash;
 171 }
 172
 173 /**
 174  * Make a hash of the file to get a unique "version number"
 175  * @return The version number.
 176  */
 177 uint StringData::Version() const
 178 {
 179         uint hash = 0;
 180
 181         for (size_t i = 0; i < this->max_strings; i++) {
 182                 const LangString *ls = this->strings[i];
 183
 184                 if (ls != NULL) {
 185                         const CmdStruct *cs;
 186                         const char *s;
 187                         char buf[MAX_COMMAND_PARAM_SIZE];
 188                         int argno;
 189                         int casei;
 190
 191                         s = ls->name;
 192                         hash ^= i * 0x717239;
 193                         hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
 194                         hash = this->VersionHashStr(hash, s + 1);
 195
 196                         s = ls->english;
 197                         while ((cs = ParseCommandString(&s, buf, &argno, &casei)) != NULL) {
 198                                 if (cs->flags & C_DONTCOUNT) continue;
 199
 200                                 hash ^= (cs - _cmd_structs) * 0x1234567;
 201                                 hash = (hash & 1 ? hash >> 1 ^ 0xF00BAA4 : hash >> 1);
 202                         }
 203                 }
 204         }
 205
 206         return hash;
 207 }
 208
 209 /**
 210  * Count the number of tab elements that are in use.
 211  * @param tab The tab to count the elements of.
 212  */
 213 uint StringData::CountInUse(uint tab) const
 214 {
 215         int i;
 216         for (i = TAB_SIZE; --i >= 0;) if (this->strings[(tab * TAB_SIZE) + i] != NULL) break;
 217         return i + 1;
 218 }
 219
static const char *_cur_ident; ///< Identifier that EmitPlural puts in front of its messages; it is not assigned in this part of the file.
 221
/** A parsed command together with the parameter text that followed it. */
struct CmdPair {
	const CmdStruct *a; ///< The command.
	const char *v;      ///< The parameter text of the command; ExtractCommandString stores "" when there is none.
};
 226
/** The commands of a single string, as filled in by ExtractCommandString. */
struct ParsedCommandStruct {
	uint np;                  ///< Number of entries of pairs that are in use.
	CmdPair pairs[32];        ///< Commands that consume no argument and are not flagged C_DONTCOUNT.
	const CmdStruct *cmd[32]; // argument consuming commands, ordered by param #
};
 232
/* Used when generating some advanced commands. */
static ParsedCommandStruct _cur_pcs; ///< The commands that EmitPlural, EmitGender and TranslateArgumentIdx look arguments up in.
static int _cur_argidx;              ///< Default argument index for EmitPlural and EmitGender; it is not assigned in this part of the file.
 236
 237 /** The buffer for writing a single string. */
 238 struct Buffer : SmallVector<byte, 256> {
 239         /**
 240          * Convenience method for adding a byte.
 241          * @param value The value to add.
 242          */
 243         void AppendByte(byte value)
 244         {
 245                 *this->Append() = value;
 246         }
 247
 248         /**
 249          * Add an Unicode character encoded in UTF-8 to the buffer.
 250          * @param value The character to add.
 251          */
 252         void AppendUtf8(uint32 value)
 253         {
 254                 if (value < 0x80) {
 255                         *this->Append() = value;
 256                 } else if (value < 0x800) {
 257                         *this->Append() = 0xC0 + GB(value,  6, 5);
 258                         *this->Append() = 0x80 + GB(value,  0, 6);
 259                 } else if (value < 0x10000) {
 260                         *this->Append() = 0xE0 + GB(value, 12, 4);
 261                         *this->Append() = 0x80 + GB(value,  6, 6);
 262                         *this->Append() = 0x80 + GB(value,  0, 6);
 263                 } else if (value < 0x110000) {
 264                         *this->Append() = 0xF0 + GB(value, 18, 3);
 265                         *this->Append() = 0x80 + GB(value, 12, 6);
 266                         *this->Append() = 0x80 + GB(value,  6, 6);
 267                         *this->Append() = 0x80 + GB(value,  0, 6);
 268                 } else {
 269                         strgen_warning("Invalid unicode value U+0x%X", value);
 270                 }
 271         }
 272 };
 273
 274 size_t Utf8Validate(const char *s)
 275 {
 276         uint32 c;
 277
 278         if (!HasBit(s[0], 7)) {
 279                 /* 1 byte */
 280                 return 1;
 281         } else if (GB(s[0], 5, 3) == 6 && IsUtf8Part(s[1])) {
 282                 /* 2 bytes */
 283                 c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
 284                 if (c >= 0x80) return 2;
 285         } else if (GB(s[0], 4, 4) == 14 && IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
 286                 /* 3 bytes */
 287                 c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
 288                 if (c >= 0x800) return 3;
 289         } else if (GB(s[0], 3, 5) == 30 && IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
 290                 /* 4 bytes */
 291                 c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
 292                 if (c >= 0x10000 && c <= 0x10FFFF) return 4;
 293         }
 294
 295         return 0;
 296 }
 297
 298
 299 void EmitSingleChar(Buffer *buffer, char *buf, int value)
 300 {
 301         if (*buf != '\0') strgen_warning("Ignoring trailing letters in command");
 302         buffer->AppendUtf8(value);
 303 }
 304
 305
/* The plural specifier looks like
 * {NUM} {PLURAL -1 passenger passengers}; it then picks either passenger or passengers depending on the count in NUM. */

/* This is encoded as
 *  CommandByte <plural form> <ARG#> <number of words> {length of each word} {each word} */
 311
 312 bool ParseRelNum(char **buf, int *value, int *offset)
 313 {
 314         const char *s = *buf;
 315         char *end;
 316         bool rel = false;
 317
 318         while (*s == ' ' || *s == '\t') s++;
 319         if (*s == '+') {
 320                 rel = true;
 321                 s++;
 322         }
 323         int v = strtol(s, &end, 0);
 324         if (end == s) return false;
 325         if (rel || v < 0) {
 326                 *value += v;
 327         } else {
 328                 *value = v;
 329         }
 330         if (offset != NULL && *end == ':') {
 331                 /* Take the Nth within */
 332                 s = end + 1;
 333                 *offset = strtol(s, &end, 0);
 334                 if (end == s) return false;
 335         }
 336         *buf = end;
 337         return true;
 338 }
 339
 340 /* Parse out the next word, or NULL */
 341 char *ParseWord(char **buf)
 342 {
 343         char *s = *buf, *r;
 344
 345         while (*s == ' ' || *s == '\t') s++;
 346         if (*s == '\0') return NULL;
 347
 348         if (*s == '"') {
 349                 r = ++s;
 350                 /* parse until next " or NUL */
 351                 for (;;) {
 352                         if (*s == '\0') break;
 353                         if (*s == '"') {
 354                                 *s++ = '\0';
 355                                 break;
 356                         }
 357                         s++;
 358                 }
 359         } else {
 360                 /* proceed until whitespace or NUL */
 361                 r = s;
 362                 for (;;) {
 363                         if (*s == '\0') break;
 364                         if (*s == ' ' || *s == '\t') {
 365                                 *s++ = '\0';
 366                                 break;
 367                         }
 368                         s++;
 369                 }
 370         }
 371         *buf = s;
 372         return r;
 373 }
 374
/* Forward declaration; EmitPlural and EmitGender call it before its definition. */
static int TranslateArgumentIdx(int arg, int offset = 0);
 377
 378 static void EmitWordList(Buffer *buffer, const char * const *words, uint nw)
 379 {
 380         buffer->AppendByte(nw);
 381         for (uint i = 0; i < nw; i++) buffer->AppendByte((byte)strlen(words[i]) + 1);
 382         for (uint i = 0; i < nw; i++) {
 383                 for (uint j = 0; words[i][j] != '\0'; j++) buffer->AppendByte(words[i][j]);
 384                 buffer->AppendByte(0);
 385         }
 386 }
 387
 388 void EmitPlural(Buffer *buffer, char *buf, int value)
 389 {
 390         int argidx = _cur_argidx;
 391         int offset = -1;
 392         int expected = _plural_forms[_lang.plural_form].plural_count;
 393         const char **words = AllocaM(const char *, max(expected, MAX_PLURALS));
 394         int nw = 0;
 395
 396         /* Parse out the number, if one exists. Otherwise default to prev arg. */
 397         if (!ParseRelNum(&buf, &argidx, &offset)) argidx--;
 398
 399         const CmdStruct *cmd = _cur_pcs.cmd[argidx];
 400         if (offset == -1) {
 401                 /* Use default offset */
 402                 if (cmd == NULL || cmd->default_plural_offset < 0) {
 403                         strgen_fatal("Command '%s' has no (default) plural position", cmd == NULL ? "<empty>" : cmd->cmd);
 404                 }
 405                 offset = cmd->default_plural_offset;
 406         }
 407
 408         /* Parse each string */
 409         for (nw = 0; nw < MAX_PLURALS; nw++) {
 410                 words[nw] = ParseWord(&buf);
 411                 if (words[nw] == NULL) break;
 412         }
 413
 414         if (nw == 0) {
 415                 strgen_fatal("%s: No plural words", _cur_ident);
 416         }
 417
 418         if (expected != nw) {
 419                 if (_translated) {
 420                         strgen_fatal("%s: Invalid number of plural forms. Expecting %d, found %d.", _cur_ident,
 421                                 expected, nw);
 422                 } else {
 423                         if ((_show_todo & 2) != 0) strgen_warning("'%s' is untranslated. Tweaking english string to allow compilation for plural forms", _cur_ident);
 424                         if (nw > expected) {
 425                                 nw = expected;
 426                         } else {
 427                                 for (; nw < expected; nw++) {
 428                                         words[nw] = words[nw - 1];
 429                                 }
 430                         }
 431                 }
 432         }
 433
 434         buffer->AppendUtf8(SCC_PLURAL_LIST);
 435         buffer->AppendByte(_lang.plural_form);
 436         buffer->AppendByte(TranslateArgumentIdx(argidx, offset));
 437         EmitWordList(buffer, words, nw);
 438 }
 439
 440
 441 void EmitGender(Buffer *buffer, char *buf, int value)
 442 {
 443         int argidx = _cur_argidx;
 444         int offset = 0;
 445         uint nw;
 446
 447         if (buf[0] == '=') {
 448                 buf++;
 449
 450                 /* This is a {G=DER} command */
 451                 nw = _lang.GetGenderIndex(buf);
 452                 if (nw >= MAX_NUM_GENDERS) strgen_fatal("G argument '%s' invalid", buf);
 453
 454                 /* now nw contains the gender index */
 455                 buffer->AppendUtf8(SCC_GENDER_INDEX);
 456                 buffer->AppendByte(nw);
 457         } else {
 458                 const char *words[MAX_NUM_GENDERS];
 459
 460                 /* This is a {G 0 foo bar two} command.
 461                  * If no relative number exists, default to +0 */
 462                 if (!ParseRelNum(&buf, &argidx, &offset)) {}
 463
 464                 const CmdStruct *cmd = _cur_pcs.cmd[argidx];
 465                 if (cmd == NULL || (cmd->flags & C_GENDER) == 0) {
 466                         strgen_fatal("Command '%s' can't have a gender", cmd == NULL ? "<empty>" : cmd->cmd);
 467                 }
 468
 469                 for (nw = 0; nw < MAX_NUM_GENDERS; nw++) {
 470                         words[nw] = ParseWord(&buf);
 471                         if (words[nw] == NULL) break;
 472                 }
 473                 if (nw != _lang.num_genders) strgen_fatal("Bad # of arguments for gender command");
 474
 475                 assert(IsInsideBS(cmd->value, SCC_CONTROL_START, UINT8_MAX));
 476                 buffer->AppendUtf8(SCC_GENDER_LIST);
 477                 buffer->AppendByte(TranslateArgumentIdx(argidx, offset));
 478                 EmitWordList(buffer, words, nw);
 479         }
 480 }
 481
 482 static const CmdStruct *FindCmd(const char *s, int len)
 483 {
 484         for (const CmdStruct *cs = _cmd_structs; cs != endof(_cmd_structs); cs++) {
 485                 if (strncmp(cs->cmd, s, len) == 0 && cs->cmd[len] == '\0') return cs;
 486         }
 487         return NULL;
 488 }
 489
 490 static uint ResolveCaseName(const char *str, size_t len)
 491 {
 492         /* First get a clean copy of only the case name, then resolve it. */
 493         char case_str[CASE_GENDER_LEN];
 494         len = min(lengthof(case_str) - 1, len);
 495         memcpy(case_str, str, len);
 496         case_str[len] = '\0';
 497
 498         uint8 case_idx = _lang.GetCaseIndex(case_str);
 499         if (case_idx >= MAX_NUM_CASES) strgen_fatal("Invalid case-name '%s'", case_str);
 500         return case_idx + 1;
 501 }
 502
 503
 504 /* returns NULL on eof
 505  * else returns command struct */
 506 static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei)
 507 {
 508         const char *s = *str, *start;
 509         char c;
 510
 511         *argno = -1;
 512         *casei = -1;
 513
 514         /* Scan to the next command, exit if there's no next command. */
 515         for (; *s != '{'; s++) {
 516                 if (*s == '\0') return NULL;
 517         }
 518         s++; // Skip past the {
 519
 520         if (*s >= '0' && *s <= '9') {
 521                 char *end;
 522
 523                 *argno = strtoul(s, &end, 0);
 524                 if (*end != ':') strgen_fatal("missing arg #");
 525                 s = end + 1;
 526         }
 527
 528         /* parse command name */
 529         start = s;
 530         do {
 531                 c = *s++;
 532         } while (c != '}' && c != ' ' && c != '=' && c != '.' && c != 0);
 533
 534         const CmdStruct *cmd = FindCmd(start, s - start - 1);
 535         if (cmd == NULL) {
 536                 strgen_error("Undefined command '%.*s'", (int)(s - start - 1), start);
 537                 return NULL;
 538         }
 539
 540         if (c == '.') {
 541                 const char *casep = s;
 542
 543                 if (!(cmd->flags & C_CASE)) {
 544                         strgen_fatal("Command '%s' can't have a case", cmd->cmd);
 545                 }
 546
 547                 do {
 548                         c = *s++;
 549                 } while (c != '}' && c != ' ' && c != '\0');
 550                 *casei = ResolveCaseName(casep, s - casep - 1);
 551         }
 552
 553         if (c == '\0') {
 554                 strgen_error("Missing } from command '%s'", start);
 555                 return NULL;
 556         }
 557
 558
 559         if (c != '}') {
 560                 if (c == '=') s--;
 561                 /* copy params */
 562                 start = s;
 563                 for (;;) {
 564                         c = *s++;
 565                         if (c == '}') break;
 566                         if (c == '\0') {
 567                                 strgen_error("Missing } from command '%s'", start);
 568                                 return NULL;
 569                         }
 570                         if (s - start == MAX_COMMAND_PARAM_SIZE) error("param command too long");
 571                         *param++ = c;
 572                 }
 573         }
 574         *param = '\0';
 575
 576         *str = s;
 577
 578         return cmd;
 579 }
 580
 581 /**
 582  * Prepare reading.
 583  * @param data        The data to fill during reading.
 584  * @param file        The file we are reading.
 585  * @param master      Are we reading the master file?
 586  * @param translation Are we reading a translation?
 587  */
 588 StringReader::StringReader(StringData &data, const char *file, bool master, bool translation) :
 589                 data(data), file(stredup(file)), master(master), translation(translation)
 590 {
 591 }
 592
 593 /** Make sure the right reader gets freed. */
 594 StringReader::~StringReader()
 595 {
 596         free(file);
 597 }
 598
 599 static void ExtractCommandString(ParsedCommandStruct *p, const char *s, bool warnings)
 600 {
 601         char param[MAX_COMMAND_PARAM_SIZE];
 602         int argno;
 603         int argidx = 0;
 604         int casei;
 605
 606         memset(p, 0, sizeof(*p));
 607
 608         for (;;) {
 609                 /* read until next command from a. */
 610                 const CmdStruct *ar = ParseCommandString(&s, param, &argno, &casei);
 611
 612                 if (ar == NULL) break;
 613
 614                 /* Sanity checking */
 615                 if (argno != -1 && ar->consumes == 0) strgen_fatal("Non consumer param can't have a paramindex");
 616
 617                 if (ar->consumes) {
 618                         if (argno != -1) argidx = argno;
 619                         if (argidx < 0 || (uint)argidx >= lengthof(p->cmd)) strgen_fatal("invalid param idx %d", argidx);
 620                         if (p->cmd[argidx] != NULL && p->cmd[argidx] != ar) strgen_fatal("duplicate param idx %d", argidx);
 621
 622                         p->cmd[argidx++] = ar;
 623                 } else if (!(ar->flags & C_DONTCOUNT)) { // Ignore some of them
 624                         if (p->np >= lengthof(p->pairs)) strgen_fatal("too many commands in string, max " PRINTF_SIZE, lengthof(p->pairs));
 625                         p->pairs[p->np].a = ar;
 626                         p->pairs[p->np].v = param[0] != '\0' ? stredup(param) : "";
 627                         p->np++;
 628                 }
 629         }
 630 }
 631
 632
 633 static const CmdStruct *TranslateCmdForCompare(const CmdStruct *a)
 634 {
 635         if (a == NULL) return NULL;
 636
 637         if (strcmp(a->cmd, "STRING1") == 0 ||
 638                         strcmp(a->cmd, "STRING2") == 0 ||
 639                         strcmp(a->cmd, "STRING3") == 0 ||
 640                         strcmp(a->cmd, "STRING4") == 0 ||
 641                         strcmp(a->cmd, "STRING5") == 0 ||
 642                         strcmp(a->cmd, "STRING6") == 0 ||
 643                         strcmp(a->cmd, "STRING7") == 0 ||
 644                         strcmp(a->cmd, "RAW_STRING") == 0) {
 645                 return FindCmd("STRING", 6);
 646         }
 647
 648         return a;
 649 }
 650
 651
 652 static bool CheckCommandsMatch(char *a, char *b, const char *name)
 653 {
 654         /* If we're not translating, i.e. we're compiling the base language,
 655          * it is pointless to do all these checks as it'll always be correct.
 656          * After all, all checks are based on the base language.
 657          */
 658         if (!_translation) return true;
 659
 660         ParsedCommandStruct templ;
 661         ParsedCommandStruct lang;
 662         bool result = true;
 663
 664         ExtractCommandString(&templ, b, true);
 665         ExtractCommandString(&lang, a, true);
 666
 667         /* For each string in templ, see if we find it in lang */
 668         if (templ.np != lang.np) {
 669                 strgen_warning("%s: template string and language string have a different # of commands", name);
 670                 result = false;
 671         }
 672
 673         for (uint i = 0; i < templ.np; i++) {
 674                 /* see if we find it in lang, and zero it out */
 675                 bool found = false;
 676                 for (uint j = 0; j < lang.np; j++) {
 677                         if (templ.pairs[i].a == lang.pairs[j].a &&
 678                                         strcmp(templ.pairs[i].v, lang.pairs[j].v) == 0) {
 679                                 /* it was found in both. zero it out from lang so we don't find it again */
 680                                 lang.pairs[j].a = NULL;
 681                                 found = true;
 682                                 break;
 683                         }
 684                 }
 685
 686                 if (!found) {
 687                         strgen_warning("%s: command '%s' exists in template file but not in language file", name, templ.pairs[i].a->cmd);
 688                         result = false;
 689                 }
 690         }
 691
 692         /* if we reach here, all non consumer commands match up.
 693          * Check if the non consumer commands match up also. */
 694         for (uint i = 0; i < lengthof(templ.cmd); i++) {
 695                 if (TranslateCmdForCompare(templ.cmd[i]) != lang.cmd[i]) {
 696                         strgen_warning("%s: Param idx #%d '%s' doesn't match with template command '%s'", name, i,
 697                                 lang.cmd[i]  == NULL ? "<empty>" : TranslateCmdForCompare(lang.cmd[i])->cmd,
 698                                 templ.cmd[i] == NULL ? "<empty>" : templ.cmd[i]->cmd);
 699                         result = false;
 700                 }
 701         }
 702
 703         return result;
 704 }
 705
 706 void StringReader::HandleString(char *str)
 707 {
 708         if (*str == '#') {
 709                 if (str[1] == '#' && str[2] != '#') this->HandlePragma(str + 2);
 710                 return;
 711         }
 712
 713         /* Ignore comments & blank lines */
 714         if (*str == ';' || *str == ' ' || *str == '\0') return;
 715
 716         char *s = strchr(str, ':');
 717         if (s == NULL) {
 718                 strgen_error("Line has no ':' delimiter");
 719                 return;
 720         }
 721
 722         char *t;
 723         /* Trim spaces.
 724          * After this str points to the command name, and s points to the command contents */
 725         for (t = s; t > str && (t[-1] == ' ' || t[-1] == '\t'); t--) {}
 726         *t = 0;
 727         s++;
 728
 729         /* Check string is valid UTF-8 */
 730         const char *tmp;
 731         for (tmp = s; *tmp != '\0';) {
 732                 size_t len = Utf8Validate(tmp);
 733                 if (len == 0) strgen_fatal("Invalid UTF-8 sequence in '%s'", s);
 734
 735                 WChar c;
 736                 Utf8Decode(&c, tmp);
 737                 if (c <= 0x001F || // ASCII control character range
 738                                 c == 0x200B || // Zero width space
 739                                 (c >= 0xE000 && c <= 0xF8FF) || // Private range
 740                                 (c >= 0xFFF0 && c <= 0xFFFF)) { // Specials range
 741                         strgen_fatal("Unwanted UTF-8 character U+%04X in sequence '%s'", c, s);
 742                 }
 743
 744                 tmp += len;
 745         }
 746
 747         /* Check if the string has a case..
 748          * The syntax for cases is IDENTNAME.case */
 749         char *casep = strchr(str, '.');
 750         if (casep != NULL) *casep++ = '\0';
 751
 752         /* Check if this string already exists.. */
 753         LangString *ent = this->data.Find(str);
 754
 755         if (this->master) {
 756                 if (casep != NULL) {
 757                         strgen_error("Cases in the base translation are not supported.");
 758                         return;
 759                 }
 760
 761                 if (ent != NULL) {
 762                         strgen_error("String name '%s' is used multiple times", str);
 763                         return;
 764                 }
 765
 766                 if (this->data.strings[this->data.next_string_id] != NULL) {
 767                         strgen_error("String ID 0x%X for '%s' already in use by '%s'", this->data.next_string_id, str, this->data.strings[this->data.next_string_id]->name);
 768                         return;
 769                 }
 770
 771                 /* Allocate a new LangString */
 772                 this->data.Add(str, new LangString(str, s, this->data.next_string_id++, _cur_line));
 773         } else {
 774                 if (ent == NULL) {
 775                         strgen_warning("String name '%s' does not exist in master file", str);
 776                         return;
 777                 }
 778
 779                 if (ent->translated && casep == NULL) {
 780                         strgen_error("String name '%s' is used multiple times", str);
 781                         return;
 782                 }
 783
 784                 /* make sure that the commands match */
 785                 if (!CheckCommandsMatch(s, ent->english, str)) return;
 786
 787                 if (casep != NULL) {
 788                         ent->translated_case = new Case(ResolveCaseName(casep, strlen(casep)), s, ent->translated_case);
 789                 } else {
 790                         ent->translated = stredup(s);
 791                         /* If the string was translated, use the line from the
 792                          * translated language so errors in the translated file
 793                          * are properly referenced to. */
 794                         ent->line = _cur_line;
 795                 }
 796         }
 797 }
 798
 799 void StringReader::HandlePragma(char *str)
 800 {
 801         if (!memcmp(str, "plural ", 7)) {
 802                 _lang.plural_form = atoi(str + 7);
 803                 if (_lang.plural_form >= lengthof(_plural_forms)) {
 804                         strgen_fatal("Invalid pluralform %d", _lang.plural_form);
 805                 }
 806         } else {
 807                 strgen_fatal("unknown pragma '%s'", str);
 808         }
 809 }
 810
 811 static void rstrip(char *buf)
 812 {
 813         size_t i = strlen(buf);
 814         while (i > 0 && (buf[i - 1] == '\r' || buf[i - 1] == '\n' || buf[i - 1] == ' ')) i--;
 815         buf[i] = '\0';
 816 }
 817
 818 void StringReader::ParseFile()
 819 {
 820         char buf[2048];
 821         _warnings = _errors = 0;
 822
 823         _translation = this->master || this->translation;
 824         _file = this->file;
 825
 826         /* For each new file we parse, reset the genders, and language codes. */
 827         MemSetT(&_lang, 0);
 828         strecpy(_lang.digit_group_separator, ",", lastof(_lang.digit_group_separator));
 829         strecpy(_lang.digit_group_separator_currency, ",", lastof(_lang.digit_group_separator_currency));
 830         strecpy(_lang.digit_decimal_separator, ".", lastof(_lang.digit_decimal_separator));
 831
 832         _cur_line = 1;
 833         while (this->ReadLine(buf, lastof(buf)) != NULL) {
 834                 rstrip(buf);
 835                 this->HandleString(buf);
 836                 _cur_line++;
 837         }
 838 }
 839
 840 /**
 841  * Write the header information.
 842  * @param data The data about the string.
 843  */
 844 void HeaderWriter::WriteHeader(const StringData &data)
 845 {
 846         int last = 0;
 847         for (size_t i = 0; i < data.max_strings; i++) {
 848                 if (data.strings[i] != NULL) {
 849                         this->WriteStringID(data.strings[i]->name, (int)i);
 850                         last = (int)i;
 851                 }
 852         }
 853
 854         this->WriteStringID("STR_LAST_STRINGID", last);
 855 }
 856
 857 static int TranslateArgumentIdx(int argidx, int offset)
 858 {
 859         int sum;
 860
 861         if (argidx < 0 || (uint)argidx >= lengthof(_cur_pcs.cmd)) {
 862                 strgen_fatal("invalid argidx %d", argidx);
 863         }
 864         const CmdStruct *cs = _cur_pcs.cmd[argidx];
 865         if (cs != NULL && cs->consumes <= offset) {
 866                 strgen_fatal("invalid argidx offset %d:%d", argidx, offset);
 867         }
 868
 869         if (_cur_pcs.cmd[argidx] == NULL) {
 870                 strgen_fatal("no command for this argidx %d", argidx);
 871         }
 872
 873         for (int i = sum = 0; i < argidx; i++) {
 874                 const CmdStruct *cs = _cur_pcs.cmd[i];
 875
 876                 sum += (cs != NULL) ? cs->consumes : 1;
 877         }
 878
 879         return sum + offset;
 880 }
 881
 882 static void PutArgidxCommand(Buffer *buffer)
 883 {
 884         buffer->AppendUtf8(SCC_ARG_INDEX);
 885         buffer->AppendByte(TranslateArgumentIdx(_cur_argidx));
 886 }
 887
 888
 889 static void PutCommandString(Buffer *buffer, const char *str)
 890 {
 891         _cur_argidx = 0;
 892
 893         while (*str != '\0') {
 894                 /* Process characters as they are until we encounter a { */
 895                 if (*str != '{') {
 896                         buffer->AppendByte(*str++);
 897                         continue;
 898                 }
 899
 900                 char param[MAX_COMMAND_PARAM_SIZE];
 901                 int argno;
 902                 int casei;
 903                 const CmdStruct *cs = ParseCommandString(&str, param, &argno, &casei);
 904                 if (cs == NULL) break;
 905
 906                 if (casei != -1) {
 907                         buffer->AppendUtf8(SCC_SET_CASE); // {SET_CASE}
 908                         buffer->AppendByte(casei);
 909                 }
 910
 911                 /* For params that consume values, we need to handle the argindex properly */
 912                 if (cs->consumes > 0) {
 913                         /* Check if we need to output a move-param command */
 914                         if (argno != -1 && argno != _cur_argidx) {
 915                                 _cur_argidx = argno;
 916                                 PutArgidxCommand(buffer);
 917                         }
 918
 919                         /* Output the one from the master string... it's always accurate. */
 920                         cs = _cur_pcs.cmd[_cur_argidx++];
 921                         if (cs == NULL) {
 922                                 strgen_fatal("%s: No argument exists at position %d", _cur_ident, _cur_argidx - 1);
 923                         }
 924                 }
 925
 926                 cs->proc(buffer, param, cs->value);
 927         }
 928 }
 929
 930 /**
 931  * Write the length as a simple gamma.
 932  * @param length The number to write.
 933  */
 934 void LanguageWriter::WriteLength(uint length)
 935 {
 936         char buffer[2];
 937         int offs = 0;
 938         if (length >= 0x4000) {
 939                 strgen_fatal("string too long");
 940         }
 941
 942         if (length >= 0xC0) {
 943                 buffer[offs++] = (length >> 8) | 0xC0;
 944         }
 945         buffer[offs++] = length & 0xFF;
 946         this->Write((byte*)buffer, offs);
 947 }
 948
 949 /**
 950  * Actually write the language.
 951  * @param data The data about the string.
 952  */
 953 void LanguageWriter::WriteLang(const StringData &data)
 954 {
 955         uint *in_use = AllocaM(uint, data.tabs);
 956         for (size_t tab = 0; tab < data.tabs; tab++) {
 957                 uint n = data.CountInUse((uint)tab);
 958
 959                 in_use[tab] = n;
 960                 _lang.offsets[tab] = TO_LE16(n);
 961
 962                 for (uint j = 0; j != in_use[tab]; j++) {
 963                         const LangString *ls = data.strings[(tab * TAB_SIZE) + j];
 964                         if (ls != NULL && ls->translated == NULL) _lang.missing++;
 965                 }
 966         }
 967
 968         _lang.ident = TO_LE32(LanguagePackHeader::IDENT);
 969         _lang.version = TO_LE32(data.Version());
 970         _lang.missing = TO_LE16(_lang.missing);
 971         _lang.winlangid = TO_LE16(_lang.winlangid);
 972
 973         this->WriteHeader(&_lang);
 974         Buffer buffer;
 975
 976         for (size_t tab = 0; tab < data.tabs; tab++) {
 977                 for (uint j = 0; j != in_use[tab]; j++) {
 978                         const LangString *ls = data.strings[(tab * TAB_SIZE) + j];
 979                         const Case *casep;
 980                         const char *cmdp;
 981
 982                         /* For undefined strings, just set that it's an empty string */
 983                         if (ls == NULL) {
 984                                 this->WriteLength(0);
 985                                 continue;
 986                         }
 987
 988                         _cur_ident = ls->name;
 989                         _cur_line = ls->line;
 990
 991                         /* Produce a message if a string doesn't have a translation. */
 992                         if (_show_todo > 0 && ls->translated == NULL) {
 993                                 if ((_show_todo & 2) != 0) {
 994                                         strgen_warning("'%s' is untranslated", ls->name);
 995                                 }
 996                                 if ((_show_todo & 1) != 0) {
 997                                         const char *s = "<TODO> ";
 998                                         while (*s != '\0') buffer.AppendByte(*s++);
 999                                 }
1000                         }
1001
1002                         /* Extract the strings and stuff from the english command string */
1003                         ExtractCommandString(&_cur_pcs, ls->english, false);
1004
1005                         if (ls->translated_case != NULL || ls->translated != NULL) {
1006                                 casep = ls->translated_case;
1007                                 cmdp = ls->translated;
1008                         } else {
1009                                 casep = NULL;
1010                                 cmdp = ls->english;
1011                         }
1012
1013                         _translated = cmdp != ls->english;
1014
1015                         if (casep != NULL) {
1016                                 const Case *c;
1017                                 uint num;
1018
1019                                 /* Need to output a case-switch.
1020                                  * It has this format
1021                                  * <0x9E> <NUM CASES> <CASE1> <LEN1> <STRING1> <CASE2> <LEN2> <STRING2> <CASE3> <LEN3> <STRING3> <STRINGDEFAULT>
1022                                  * Each LEN is printed using 2 bytes in big endian order. */
1023                                 buffer.AppendUtf8(SCC_SWITCH_CASE);
1024                                 /* Count the number of cases */
1025                                 for (num = 0, c = casep; c; c = c->next) num++;
1026                                 buffer.AppendByte(num);
1027
1028                                 /* Write each case */
1029                                 for (c = casep; c != NULL; c = c->next) {
1030                                         buffer.AppendByte(c->caseidx);
1031                                         /* Make some space for the 16-bit length */
1032                                         uint pos = buffer.Length();
1033                                         buffer.AppendByte(0);
1034                                         buffer.AppendByte(0);
1035                                         /* Write string */
1036                                         PutCommandString(&buffer, c->string);
1037                                         buffer.AppendByte(0); // terminate with a zero
1038                                         /* Fill in the length */
1039                                         uint size = buffer.Length() - (pos + 2);
1040                                         buffer[pos + 0] = GB(size, 8, 8);
1041                                         buffer[pos + 1] = GB(size, 0, 8);
1042                                 }
1043                         }
1044
1045                         if (cmdp != NULL) PutCommandString(&buffer, cmdp);
1046
1047                         this->WriteLength(buffer.Length());
1048                         this->Write(buffer.Begin(), buffer.Length());
1049                         buffer.Clear();
1050                 }
1051         }
1052 }