src/strgen/strgen_base.cpp

   1 /*
   2  * This file is part of OpenTTD.
   3  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
   4  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   5  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
   6  */
   7
   8 /** @file strgen_base.cpp Tool to create computer readable (stand-alone) translation files. */
   9
  10 #include "../stdafx.h"
  11 #include "../core/endian_func.hpp"
  12 #include "../string_func.h"
  13 #include "../table/control_codes.h"
  14
  15 #include "strgen.h"
  16
  17
  18 #include "../table/strgen_tables.h"
  19
  20 #include "../safeguards.h"
  21
  22 /* Compiles a list of strings into a compiled string list */
  23
  24 static bool _translated;              ///< Whether the current language is not the master language
  25 static bool _translation;             ///< Is the current file actually a translation or not
  26 const char *_file = "(unknown file)"; ///< The filename of the input, so we can refer to it in errors/warnings
  27 int _cur_line;                        ///< The current line we're parsing in the input file
  28 int _errors, _warnings, _show_todo;
  29 LanguagePackHeader _lang;             ///< Header information about a language.
  30
  31 static const ptrdiff_t MAX_COMMAND_PARAM_SIZE = 100; ///< Maximum size of every command block, not counting the name of the command itself
  32 static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei);
  33
  34 /**
  35  * Create a new case.
  36  * @param caseidx The index of the case.
  37  * @param string  The translation of the case.
  38  * @param next    The next chained case.
  39  */
  40 Case::Case(int caseidx, const char *string, Case *next) :
  41                 caseidx(caseidx), string(stredup(string)), next(next)
  42 {
  43 }
  44
  45 /** Free everything we allocated. */
  46 Case::~Case()
  47 {
  48         free(this->string);
  49         delete this->next;
  50 }
  51
  52 /**
  53  * Create a new string.
  54  * @param name    The name of the string.
  55  * @param english The english "translation" of the string.
  56  * @param index   The index in the string table.
  57  * @param line    The line this string was found on.
  58  */
  59 LangString::LangString(const char *name, const char *english, size_t index, int line) :
  60                 name(stredup(name)), english(stredup(english)), translated(nullptr),
  61                 hash_next(0), index(index), line(line), translated_case(nullptr)
  62 {
  63 }
  64
  65 /** Free everything we allocated. */
  66 LangString::~LangString()
  67 {
  68         free(this->name);
  69         free(this->english);
  70         free(this->translated);
  71         delete this->translated_case;
  72 }
  73
  74 /** Free all data related to the translation. */
  75 void LangString::FreeTranslation()
  76 {
  77         free(this->translated);
  78         this->translated = nullptr;
  79
  80         delete this->translated_case;
  81         this->translated_case = nullptr;
  82 }
  83
  84 /**
  85  * Create a new string data container.
  86  * @param tabs The maximum number of strings.
  87  */
  88 StringData::StringData(size_t tabs) : tabs(tabs), max_strings(tabs * TAB_SIZE)
  89 {
  90         this->strings = CallocT<LangString *>(max_strings);
  91         this->hash_heads = CallocT<size_t>(max_strings);
  92         this->next_string_id = 0;
  93 }
  94
  95 /** Free everything we allocated. */
  96 StringData::~StringData()
  97 {
  98         for (size_t i = 0; i < this->max_strings; i++) delete this->strings[i];
  99         free(this->strings);
 100         free(this->hash_heads);
 101 }
 102
 103 /** Free all data related to the translation. */
 104 void StringData::FreeTranslation()
 105 {
 106         for (size_t i = 0; i < this->max_strings; i++) {
 107                 LangString *ls = this->strings[i];
 108                 if (ls != nullptr) ls->FreeTranslation();
 109         }
 110 }
 111
 112 /**
 113  * Create a hash of the string for finding them back quickly.
 114  * @param s The string to hash.
 115  * @return The hashed string.
 116  */
 117 uint StringData::HashStr(const char *s) const
 118 {
 119         uint hash = 0;
 120         for (; *s != '\0'; s++) hash = ROL(hash, 3) ^ *s;
 121         return hash % this->max_strings;
 122 }
 123
 124 /**
 125  * Add a newly created LangString.
 126  * @param s  The name of the string.
 127  * @param ls The string to add.
 128  */
 129 void StringData::Add(const char *s, LangString *ls)
 130 {
 131         uint hash = this->HashStr(s);
 132         ls->hash_next = this->hash_heads[hash];
 133         /* Off-by-one for hash find. */
 134         this->hash_heads[hash] = ls->index + 1;
 135         this->strings[ls->index] = ls;
 136 }
 137
 138 /**
 139  * Find a LangString based on the string name.
 140  * @param s The string name to search on.
 141  * @return The LangString or nullptr if it is not known.
 142  */
 143 LangString *StringData::Find(const char *s)
 144 {
 145         size_t idx = this->hash_heads[this->HashStr(s)];
 146
 147         while (idx-- > 0) {
 148                 LangString *ls = this->strings[idx];
 149
 150                 if (strcmp(ls->name, s) == 0) return ls;
 151                 idx = ls->hash_next;
 152         }
 153         return nullptr;
 154 }
 155
 156 /**
 157  * Create a compound hash.
 158  * @param hash The hash to add the string hash to.
 159  * @param s    The string hash.
 160  * @return The new hash.
 161  */
 162 uint StringData::VersionHashStr(uint hash, const char *s) const
 163 {
 164         for (; *s != '\0'; s++) {
 165                 hash = ROL(hash, 3) ^ *s;
 166                 hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
 167         }
 168         return hash;
 169 }
 170
 171 /**
 172  * Make a hash of the file to get a unique "version number"
 173  * @return The version number.
 174  */
 175 uint StringData::Version() const
 176 {
 177         uint hash = 0;
 178
 179         for (size_t i = 0; i < this->max_strings; i++) {
 180                 const LangString *ls = this->strings[i];
 181
 182                 if (ls != nullptr) {
 183                         const CmdStruct *cs;
 184                         const char *s;
 185                         char buf[MAX_COMMAND_PARAM_SIZE];
 186                         int argno;
 187                         int casei;
 188
 189                         s = ls->name;
 190                         hash ^= i * 0x717239;
 191                         hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
 192                         hash = this->VersionHashStr(hash, s + 1);
 193
 194                         s = ls->english;
 195                         while ((cs = ParseCommandString(&s, buf, &argno, &casei)) != nullptr) {
 196                                 if (cs->flags & C_DONTCOUNT) continue;
 197
 198                                 hash ^= (cs - _cmd_structs) * 0x1234567;
 199                                 hash = (hash & 1 ? hash >> 1 ^ 0xF00BAA4 : hash >> 1);
 200                         }
 201                 }
 202         }
 203
 204         return hash;
 205 }
 206
 207 /**
 208  * Count the number of tab elements that are in use.
 209  * @param tab The tab to count the elements of.
 210  */
 211 uint StringData::CountInUse(uint tab) const
 212 {
 213         int i;
 214         for (i = TAB_SIZE; --i >= 0;) if (this->strings[(tab * TAB_SIZE) + i] != nullptr) break;
 215         return i + 1;
 216 }
 217
 218 static const char *_cur_ident;
 219
 220 struct CmdPair {
 221         const CmdStruct *a;
 222         const char *v;
 223 };
 224
 225 struct ParsedCommandStruct {
 226         uint np;
 227         CmdPair pairs[32];
 228         const CmdStruct *cmd[32]; // ordered by param #
 229 };
 230
 231 /* Used when generating some advanced commands. */
 232 static ParsedCommandStruct _cur_pcs;
 233 static int _cur_argidx;
 234
 235 /** The buffer for writing a single string. */
 236 struct Buffer : std::vector<byte> {
 237         /**
 238          * Convenience method for adding a byte.
 239          * @param value The value to add.
 240          */
 241         void AppendByte(byte value)
 242         {
 243                 this->push_back(value);
 244         }
 245
 246         /**
 247          * Add an Unicode character encoded in UTF-8 to the buffer.
 248          * @param value The character to add.
 249          */
 250         void AppendUtf8(uint32 value)
 251         {
 252                 if (value < 0x80) {
 253                         this->push_back(value);
 254                 } else if (value < 0x800) {
 255                         this->push_back(0xC0 + GB(value,  6, 5));
 256                         this->push_back(0x80 + GB(value,  0, 6));
 257                 } else if (value < 0x10000) {
 258                         this->push_back(0xE0 + GB(value, 12, 4));
 259                         this->push_back(0x80 + GB(value,  6, 6));
 260                         this->push_back(0x80 + GB(value,  0, 6));
 261                 } else if (value < 0x110000) {
 262                         this->push_back(0xF0 + GB(value, 18, 3));
 263                         this->push_back(0x80 + GB(value, 12, 6));
 264                         this->push_back(0x80 + GB(value,  6, 6));
 265                         this->push_back(0x80 + GB(value,  0, 6));
 266                 } else {
 267                         strgen_warning("Invalid unicode value U+0x%X", value);
 268                 }
 269         }
 270 };
 271
 272 size_t Utf8Validate(const char *s)
 273 {
 274         uint32 c;
 275
 276         if (!HasBit(s[0], 7)) {
 277                 /* 1 byte */
 278                 return 1;
 279         } else if (GB(s[0], 5, 3) == 6 && IsUtf8Part(s[1])) {
 280                 /* 2 bytes */
 281                 c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
 282                 if (c >= 0x80) return 2;
 283         } else if (GB(s[0], 4, 4) == 14 && IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
 284                 /* 3 bytes */
 285                 c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
 286                 if (c >= 0x800) return 3;
 287         } else if (GB(s[0], 3, 5) == 30 && IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
 288                 /* 4 bytes */
 289                 c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
 290                 if (c >= 0x10000 && c <= 0x10FFFF) return 4;
 291         }
 292
 293         return 0;
 294 }
 295
 296
 297 void EmitSingleChar(Buffer *buffer, char *buf, int value)
 298 {
 299         if (*buf != '\0') strgen_warning("Ignoring trailing letters in command");
 300         buffer->AppendUtf8(value);
 301 }
 302
 303
 304 /* The plural specifier looks like
 305  * {NUM} {PLURAL -1 passenger passengers} then it picks either passenger/passengers depending on the count in NUM */
 306
 307 /* This is encoded like
 308  *  CommandByte <ARG#> <NUM> {Length of each string} {each string} */
 309
 310 bool ParseRelNum(char **buf, int *value, int *offset)
 311 {
 312         const char *s = *buf;
 313         char *end;
 314         bool rel = false;
 315
 316         while (*s == ' ' || *s == '\t') s++;
 317         if (*s == '+') {
 318                 rel = true;
 319                 s++;
 320         }
 321         int v = strtol(s, &end, 0);
 322         if (end == s) return false;
 323         if (rel || v < 0) {
 324                 *value += v;
 325         } else {
 326                 *value = v;
 327         }
 328         if (offset != nullptr && *end == ':') {
 329                 /* Take the Nth within */
 330                 s = end + 1;
 331                 *offset = strtol(s, &end, 0);
 332                 if (end == s) return false;
 333         }
 334         *buf = end;
 335         return true;
 336 }
 337
 338 /* Parse out the next word, or nullptr */
 339 char *ParseWord(char **buf)
 340 {
 341         char *s = *buf, *r;
 342
 343         while (*s == ' ' || *s == '\t') s++;
 344         if (*s == '\0') return nullptr;
 345
 346         if (*s == '"') {
 347                 r = ++s;
 348                 /* parse until next " or NUL */
 349                 for (;;) {
 350                         if (*s == '\0') break;
 351                         if (*s == '"') {
 352                                 *s++ = '\0';
 353                                 break;
 354                         }
 355                         s++;
 356                 }
 357         } else {
 358                 /* proceed until whitespace or NUL */
 359                 r = s;
 360                 for (;;) {
 361                         if (*s == '\0') break;
 362                         if (*s == ' ' || *s == '\t') {
 363                                 *s++ = '\0';
 364                                 break;
 365                         }
 366                         s++;
 367                 }
 368         }
 369         *buf = s;
 370         return r;
 371 }
 372
 373 /* Forward declaration */
 374 static int TranslateArgumentIdx(int arg, int offset = 0);
 375
 376 static void EmitWordList(Buffer *buffer, const char * const *words, uint nw)
 377 {
 378         buffer->AppendByte(nw);
 379         for (uint i = 0; i < nw; i++) buffer->AppendByte((byte)strlen(words[i]) + 1);
 380         for (uint i = 0; i < nw; i++) {
 381                 for (uint j = 0; words[i][j] != '\0'; j++) buffer->AppendByte(words[i][j]);
 382                 buffer->AppendByte(0);
 383         }
 384 }
 385
 386 void EmitPlural(Buffer *buffer, char *buf, int value)
 387 {
 388         int argidx = _cur_argidx;
 389         int offset = -1;
 390         int expected = _plural_forms[_lang.plural_form].plural_count;
 391         const char **words = AllocaM(const char *, max(expected, MAX_PLURALS));
 392         int nw = 0;
 393
 394         /* Parse out the number, if one exists. Otherwise default to prev arg. */
 395         if (!ParseRelNum(&buf, &argidx, &offset)) argidx--;
 396
 397         const CmdStruct *cmd = _cur_pcs.cmd[argidx];
 398         if (offset == -1) {
 399                 /* Use default offset */
 400                 if (cmd == nullptr || cmd->default_plural_offset < 0) {
 401                         strgen_fatal("Command '%s' has no (default) plural position", cmd == nullptr ? "<empty>" : cmd->cmd);
 402                 }
 403                 offset = cmd->default_plural_offset;
 404         }
 405
 406         /* Parse each string */
 407         for (nw = 0; nw < MAX_PLURALS; nw++) {
 408                 words[nw] = ParseWord(&buf);
 409                 if (words[nw] == nullptr) break;
 410         }
 411
 412         if (nw == 0) {
 413                 strgen_fatal("%s: No plural words", _cur_ident);
 414         }
 415
 416         if (expected != nw) {
 417                 if (_translated) {
 418                         strgen_fatal("%s: Invalid number of plural forms. Expecting %d, found %d.", _cur_ident,
 419                                 expected, nw);
 420                 } else {
 421                         if ((_show_todo & 2) != 0) strgen_warning("'%s' is untranslated. Tweaking english string to allow compilation for plural forms", _cur_ident);
 422                         if (nw > expected) {
 423                                 nw = expected;
 424                         } else {
 425                                 for (; nw < expected; nw++) {
 426                                         words[nw] = words[nw - 1];
 427                                 }
 428                         }
 429                 }
 430         }
 431
 432         buffer->AppendUtf8(SCC_PLURAL_LIST);
 433         buffer->AppendByte(_lang.plural_form);
 434         buffer->AppendByte(TranslateArgumentIdx(argidx, offset));
 435         EmitWordList(buffer, words, nw);
 436 }
 437
 438
 439 void EmitGender(Buffer *buffer, char *buf, int value)
 440 {
 441         int argidx = _cur_argidx;
 442         int offset = 0;
 443         uint nw;
 444
 445         if (buf[0] == '=') {
 446                 buf++;
 447
 448                 /* This is a {G=DER} command */
 449                 nw = _lang.GetGenderIndex(buf);
 450                 if (nw >= MAX_NUM_GENDERS) strgen_fatal("G argument '%s' invalid", buf);
 451
 452                 /* now nw contains the gender index */
 453                 buffer->AppendUtf8(SCC_GENDER_INDEX);
 454                 buffer->AppendByte(nw);
 455         } else {
 456                 const char *words[MAX_NUM_GENDERS];
 457
 458                 /* This is a {G 0 foo bar two} command.
 459                  * If no relative number exists, default to +0 */
 460                 if (!ParseRelNum(&buf, &argidx, &offset)) {}
 461
 462                 const CmdStruct *cmd = _cur_pcs.cmd[argidx];
 463                 if (cmd == nullptr || (cmd->flags & C_GENDER) == 0) {
 464                         strgen_fatal("Command '%s' can't have a gender", cmd == nullptr ? "<empty>" : cmd->cmd);
 465                 }
 466
 467                 for (nw = 0; nw < MAX_NUM_GENDERS; nw++) {
 468                         words[nw] = ParseWord(&buf);
 469                         if (words[nw] == nullptr) break;
 470                 }
 471                 if (nw != _lang.num_genders) strgen_fatal("Bad # of arguments for gender command");
 472
 473                 assert(IsInsideBS(cmd->value, SCC_CONTROL_START, UINT8_MAX));
 474                 buffer->AppendUtf8(SCC_GENDER_LIST);
 475                 buffer->AppendByte(TranslateArgumentIdx(argidx, offset));
 476                 EmitWordList(buffer, words, nw);
 477         }
 478 }
 479
 480 static const CmdStruct *FindCmd(const char *s, int len)
 481 {
 482         for (const CmdStruct *cs = _cmd_structs; cs != endof(_cmd_structs); cs++) {
 483                 if (strncmp(cs->cmd, s, len) == 0 && cs->cmd[len] == '\0') return cs;
 484         }
 485         return nullptr;
 486 }
 487
 488 static uint ResolveCaseName(const char *str, size_t len)
 489 {
 490         /* First get a clean copy of only the case name, then resolve it. */
 491         char case_str[CASE_GENDER_LEN];
 492         len = min(lengthof(case_str) - 1, len);
 493         memcpy(case_str, str, len);
 494         case_str[len] = '\0';
 495
 496         uint8 case_idx = _lang.GetCaseIndex(case_str);
 497         if (case_idx >= MAX_NUM_CASES) strgen_fatal("Invalid case-name '%s'", case_str);
 498         return case_idx + 1;
 499 }
 500
 501
 502 /* returns nullptr on eof
 503  * else returns command struct */
 504 static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei)
 505 {
 506         const char *s = *str, *start;
 507         char c;
 508
 509         *argno = -1;
 510         *casei = -1;
 511
 512         /* Scan to the next command, exit if there's no next command. */
 513         for (; *s != '{'; s++) {
 514                 if (*s == '\0') return nullptr;
 515         }
 516         s++; // Skip past the {
 517
 518         if (*s >= '0' && *s <= '9') {
 519                 char *end;
 520
 521                 *argno = strtoul(s, &end, 0);
 522                 if (*end != ':') strgen_fatal("missing arg #");
 523                 s = end + 1;
 524         }
 525
 526         /* parse command name */
 527         start = s;
 528         do {
 529                 c = *s++;
 530         } while (c != '}' && c != ' ' && c != '=' && c != '.' && c != 0);
 531
 532         const CmdStruct *cmd = FindCmd(start, s - start - 1);
 533         if (cmd == nullptr) {
 534                 strgen_error("Undefined command '%.*s'", (int)(s - start - 1), start);
 535                 return nullptr;
 536         }
 537
 538         if (c == '.') {
 539                 const char *casep = s;
 540
 541                 if (!(cmd->flags & C_CASE)) {
 542                         strgen_fatal("Command '%s' can't have a case", cmd->cmd);
 543                 }
 544
 545                 do {
 546                         c = *s++;
 547                 } while (c != '}' && c != ' ' && c != '\0');
 548                 *casei = ResolveCaseName(casep, s - casep - 1);
 549         }
 550
 551         if (c == '\0') {
 552                 strgen_error("Missing } from command '%s'", start);
 553                 return nullptr;
 554         }
 555
 556
 557         if (c != '}') {
 558                 if (c == '=') s--;
 559                 /* copy params */
 560                 start = s;
 561                 for (;;) {
 562                         c = *s++;
 563                         if (c == '}') break;
 564                         if (c == '\0') {
 565                                 strgen_error("Missing } from command '%s'", start);
 566                                 return nullptr;
 567                         }
 568                         if (s - start == MAX_COMMAND_PARAM_SIZE) error("param command too long");
 569                         *param++ = c;
 570                 }
 571         }
 572         *param = '\0';
 573
 574         *str = s;
 575
 576         return cmd;
 577 }
 578
 579 /**
 580  * Prepare reading.
 581  * @param data        The data to fill during reading.
 582  * @param file        The file we are reading.
 583  * @param master      Are we reading the master file?
 584  * @param translation Are we reading a translation?
 585  */
 586 StringReader::StringReader(StringData &data, const char *file, bool master, bool translation) :
 587                 data(data), file(stredup(file)), master(master), translation(translation)
 588 {
 589 }
 590
 591 /** Make sure the right reader gets freed. */
 592 StringReader::~StringReader()
 593 {
 594         free(file);
 595 }
 596
 597 static void ExtractCommandString(ParsedCommandStruct *p, const char *s, bool warnings)
 598 {
 599         char param[MAX_COMMAND_PARAM_SIZE];
 600         int argno;
 601         int argidx = 0;
 602         int casei;
 603
 604         memset(p, 0, sizeof(*p));
 605
 606         for (;;) {
 607                 /* read until next command from a. */
 608                 const CmdStruct *ar = ParseCommandString(&s, param, &argno, &casei);
 609
 610                 if (ar == nullptr) break;
 611
 612                 /* Sanity checking */
 613                 if (argno != -1 && ar->consumes == 0) strgen_fatal("Non consumer param can't have a paramindex");
 614
 615                 if (ar->consumes) {
 616                         if (argno != -1) argidx = argno;
 617                         if (argidx < 0 || (uint)argidx >= lengthof(p->cmd)) strgen_fatal("invalid param idx %d", argidx);
 618                         if (p->cmd[argidx] != nullptr && p->cmd[argidx] != ar) strgen_fatal("duplicate param idx %d", argidx);
 619
 620                         p->cmd[argidx++] = ar;
 621                 } else if (!(ar->flags & C_DONTCOUNT)) { // Ignore some of them
 622                         if (p->np >= lengthof(p->pairs)) strgen_fatal("too many commands in string, max " PRINTF_SIZE, lengthof(p->pairs));
 623                         p->pairs[p->np].a = ar;
 624                         p->pairs[p->np].v = param[0] != '\0' ? stredup(param) : "";
 625                         p->np++;
 626                 }
 627         }
 628 }
 629
 630
 631 static const CmdStruct *TranslateCmdForCompare(const CmdStruct *a)
 632 {
 633         if (a == nullptr) return nullptr;
 634
 635         if (strcmp(a->cmd, "STRING1") == 0 ||
 636                         strcmp(a->cmd, "STRING2") == 0 ||
 637                         strcmp(a->cmd, "STRING3") == 0 ||
 638                         strcmp(a->cmd, "STRING4") == 0 ||
 639                         strcmp(a->cmd, "STRING5") == 0 ||
 640                         strcmp(a->cmd, "STRING6") == 0 ||
 641                         strcmp(a->cmd, "STRING7") == 0 ||
 642                         strcmp(a->cmd, "RAW_STRING") == 0) {
 643                 return FindCmd("STRING", 6);
 644         }
 645
 646         return a;
 647 }
 648
 649
 650 static bool CheckCommandsMatch(char *a, char *b, const char *name)
 651 {
 652         /* If we're not translating, i.e. we're compiling the base language,
 653          * it is pointless to do all these checks as it'll always be correct.
 654          * After all, all checks are based on the base language.
 655          */
 656         if (!_translation) return true;
 657
 658         ParsedCommandStruct templ;
 659         ParsedCommandStruct lang;
 660         bool result = true;
 661
 662         ExtractCommandString(&templ, b, true);
 663         ExtractCommandString(&lang, a, true);
 664
 665         /* For each string in templ, see if we find it in lang */
 666         if (templ.np != lang.np) {
 667                 strgen_warning("%s: template string and language string have a different # of commands", name);
 668                 result = false;
 669         }
 670
 671         for (uint i = 0; i < templ.np; i++) {
 672                 /* see if we find it in lang, and zero it out */
 673                 bool found = false;
 674                 for (uint j = 0; j < lang.np; j++) {
 675                         if (templ.pairs[i].a == lang.pairs[j].a &&
 676                                         strcmp(templ.pairs[i].v, lang.pairs[j].v) == 0) {
 677                                 /* it was found in both. zero it out from lang so we don't find it again */
 678                                 lang.pairs[j].a = nullptr;
 679                                 found = true;
 680                                 break;
 681                         }
 682                 }
 683
 684                 if (!found) {
 685                         strgen_warning("%s: command '%s' exists in template file but not in language file", name, templ.pairs[i].a->cmd);
 686                         result = false;
 687                 }
 688         }
 689
 690         /* if we reach here, all non consumer commands match up.
 691          * Check if the non consumer commands match up also. */
 692         for (uint i = 0; i < lengthof(templ.cmd); i++) {
 693                 if (TranslateCmdForCompare(templ.cmd[i]) != lang.cmd[i]) {
 694                         strgen_warning("%s: Param idx #%d '%s' doesn't match with template command '%s'", name, i,
 695                                 lang.cmd[i]  == nullptr ? "<empty>" : TranslateCmdForCompare(lang.cmd[i])->cmd,
 696                                 templ.cmd[i] == nullptr ? "<empty>" : templ.cmd[i]->cmd);
 697                         result = false;
 698                 }
 699         }
 700
 701         return result;
 702 }
 703
 704 void StringReader::HandleString(char *str)
 705 {
 706         if (*str == '#') {
 707                 if (str[1] == '#' && str[2] != '#') this->HandlePragma(str + 2);
 708                 return;
 709         }
 710
 711         /* Ignore comments & blank lines */
 712         if (*str == ';' || *str == ' ' || *str == '\0') return;
 713
 714         char *s = strchr(str, ':');
 715         if (s == nullptr) {
 716                 strgen_error("Line has no ':' delimiter");
 717                 return;
 718         }
 719
 720         char *t;
 721         /* Trim spaces.
 722          * After this str points to the command name, and s points to the command contents */
 723         for (t = s; t > str && (t[-1] == ' ' || t[-1] == '\t'); t--) {}
 724         *t = 0;
 725         s++;
 726
 727         /* Check string is valid UTF-8 */
 728         const char *tmp;
 729         for (tmp = s; *tmp != '\0';) {
 730                 size_t len = Utf8Validate(tmp);
 731                 if (len == 0) strgen_fatal("Invalid UTF-8 sequence in '%s'", s);
 732
 733                 WChar c;
 734                 Utf8Decode(&c, tmp);
 735                 if (c <= 0x001F || // ASCII control character range
 736                                 c == 0x200B || // Zero width space
 737                                 (c >= 0xE000 && c <= 0xF8FF) || // Private range
 738                                 (c >= 0xFFF0 && c <= 0xFFFF)) { // Specials range
 739                         strgen_fatal("Unwanted UTF-8 character U+%04X in sequence '%s'", c, s);
 740                 }
 741
 742                 tmp += len;
 743         }
 744
 745         /* Check if the string has a case..
 746          * The syntax for cases is IDENTNAME.case */
 747         char *casep = strchr(str, '.');
 748         if (casep != nullptr) *casep++ = '\0';
 749
 750         /* Check if this string already exists.. */
 751         LangString *ent = this->data.Find(str);
 752
 753         if (this->master) {
 754                 if (casep != nullptr) {
 755                         strgen_error("Cases in the base translation are not supported.");
 756                         return;
 757                 }
 758
 759                 if (ent != nullptr) {
 760                         strgen_error("String name '%s' is used multiple times", str);
 761                         return;
 762                 }
 763
 764                 if (this->data.strings[this->data.next_string_id] != nullptr) {
 765                         strgen_error("String ID 0x" PRINTF_SIZEX " for '%s' already in use by '%s'", this->data.next_string_id, str, this->data.strings[this->data.next_string_id]->name);
 766                         return;
 767                 }
 768
 769                 /* Allocate a new LangString */
 770                 this->data.Add(str, new LangString(str, s, this->data.next_string_id++, _cur_line));
 771         } else {
 772                 if (ent == nullptr) {
 773                         strgen_warning("String name '%s' does not exist in master file", str);
 774                         return;
 775                 }
 776
 777                 if (ent->translated && casep == nullptr) {
 778                         strgen_error("String name '%s' is used multiple times", str);
 779                         return;
 780                 }
 781
 782                 /* make sure that the commands match */
 783                 if (!CheckCommandsMatch(s, ent->english, str)) return;
 784
 785                 if (casep != nullptr) {
 786                         ent->translated_case = new Case(ResolveCaseName(casep, strlen(casep)), s, ent->translated_case);
 787                 } else {
 788                         ent->translated = stredup(s);
 789                         /* If the string was translated, use the line from the
 790                          * translated language so errors in the translated file
 791                          * are properly referenced to. */
 792                         ent->line = _cur_line;
 793                 }
 794         }
 795 }
 796
 797 void StringReader::HandlePragma(char *str)
 798 {
 799         if (!memcmp(str, "plural ", 7)) {
 800                 _lang.plural_form = atoi(str + 7);
 801                 if (_lang.plural_form >= lengthof(_plural_forms)) {
 802                         strgen_fatal("Invalid pluralform %d", _lang.plural_form);
 803                 }
 804         } else {
 805                 strgen_fatal("unknown pragma '%s'", str);
 806         }
 807 }
 808
 809 static void rstrip(char *buf)
 810 {
 811         size_t i = strlen(buf);
 812         while (i > 0 && (buf[i - 1] == '\r' || buf[i - 1] == '\n' || buf[i - 1] == ' ')) i--;
 813         buf[i] = '\0';
 814 }
 815
 816 void StringReader::ParseFile()
 817 {
 818         char buf[2048];
 819         _warnings = _errors = 0;
 820
 821         _translation = this->master || this->translation;
 822         _file = this->file;
 823
 824         /* For each new file we parse, reset the genders, and language codes. */
 825         MemSetT(&_lang, 0);
 826         strecpy(_lang.digit_group_separator, ",", lastof(_lang.digit_group_separator));
 827         strecpy(_lang.digit_group_separator_currency, ",", lastof(_lang.digit_group_separator_currency));
 828         strecpy(_lang.digit_decimal_separator, ".", lastof(_lang.digit_decimal_separator));
 829
 830         _cur_line = 1;
 831         while (this->data.next_string_id < this->data.max_strings && this->ReadLine(buf, lastof(buf)) != nullptr) {
 832                 rstrip(buf);
 833                 this->HandleString(buf);
 834                 _cur_line++;
 835         }
 836
 837         if (this->data.next_string_id == this->data.max_strings) {
 838                 strgen_error("Too many strings, maximum allowed is " PRINTF_SIZE, this->data.max_strings);
 839         }
 840 }
 841
 842 /**
 843  * Write the header information.
 844  * @param data The data about the string.
 845  */
 846 void HeaderWriter::WriteHeader(const StringData &data)
 847 {
 848         int last = 0;
 849         for (size_t i = 0; i < data.max_strings; i++) {
 850                 if (data.strings[i] != nullptr) {
 851                         this->WriteStringID(data.strings[i]->name, (int)i);
 852                         last = (int)i;
 853                 }
 854         }
 855
 856         this->WriteStringID("STR_LAST_STRINGID", last);
 857 }
 858
 859 static int TranslateArgumentIdx(int argidx, int offset)
 860 {
 861         int sum;
 862
 863         if (argidx < 0 || (uint)argidx >= lengthof(_cur_pcs.cmd)) {
 864                 strgen_fatal("invalid argidx %d", argidx);
 865         }
 866         const CmdStruct *cs = _cur_pcs.cmd[argidx];
 867         if (cs != nullptr && cs->consumes <= offset) {
 868                 strgen_fatal("invalid argidx offset %d:%d", argidx, offset);
 869         }
 870
 871         if (_cur_pcs.cmd[argidx] == nullptr) {
 872                 strgen_fatal("no command for this argidx %d", argidx);
 873         }
 874
 875         for (int i = sum = 0; i < argidx; i++) {
 876                 const CmdStruct *cs = _cur_pcs.cmd[i];
 877
 878                 sum += (cs != nullptr) ? cs->consumes : 1;
 879         }
 880
 881         return sum + offset;
 882 }
 883
 884 static void PutArgidxCommand(Buffer *buffer)
 885 {
 886         buffer->AppendUtf8(SCC_ARG_INDEX);
 887         buffer->AppendByte(TranslateArgumentIdx(_cur_argidx));
 888 }
 889
 890
 891 static void PutCommandString(Buffer *buffer, const char *str)
 892 {
 893         _cur_argidx = 0;
 894
 895         while (*str != '\0') {
 896                 /* Process characters as they are until we encounter a { */
 897                 if (*str != '{') {
 898                         buffer->AppendByte(*str++);
 899                         continue;
 900                 }
 901
 902                 char param[MAX_COMMAND_PARAM_SIZE];
 903                 int argno;
 904                 int casei;
 905                 const CmdStruct *cs = ParseCommandString(&str, param, &argno, &casei);
 906                 if (cs == nullptr) break;
 907
 908                 if (casei != -1) {
 909                         buffer->AppendUtf8(SCC_SET_CASE); // {SET_CASE}
 910                         buffer->AppendByte(casei);
 911                 }
 912
 913                 /* For params that consume values, we need to handle the argindex properly */
 914                 if (cs->consumes > 0) {
 915                         /* Check if we need to output a move-param command */
 916                         if (argno != -1 && argno != _cur_argidx) {
 917                                 _cur_argidx = argno;
 918                                 PutArgidxCommand(buffer);
 919                         }
 920
 921                         /* Output the one from the master string... it's always accurate. */
 922                         cs = _cur_pcs.cmd[_cur_argidx++];
 923                         if (cs == nullptr) {
 924                                 strgen_fatal("%s: No argument exists at position %d", _cur_ident, _cur_argidx - 1);
 925                         }
 926                 }
 927
 928                 cs->proc(buffer, param, cs->value);
 929         }
 930 }
 931
 932 /**
 933  * Write the length as a simple gamma.
 934  * @param length The number to write.
 935  */
 936 void LanguageWriter::WriteLength(uint length)
 937 {
 938         char buffer[2];
 939         int offs = 0;
 940         if (length >= 0x4000) {
 941                 strgen_fatal("string too long");
 942         }
 943
 944         if (length >= 0xC0) {
 945                 buffer[offs++] = (length >> 8) | 0xC0;
 946         }
 947         buffer[offs++] = length & 0xFF;
 948         this->Write((byte*)buffer, offs);
 949 }
 950
 951 /**
 952  * Actually write the language.
 953  * @param data The data about the string.
 954  */
 955 void LanguageWriter::WriteLang(const StringData &data)
 956 {
 957         uint *in_use = AllocaM(uint, data.tabs);
 958         for (size_t tab = 0; tab < data.tabs; tab++) {
 959                 uint n = data.CountInUse((uint)tab);
 960
 961                 in_use[tab] = n;
 962                 _lang.offsets[tab] = TO_LE16(n);
 963
 964                 for (uint j = 0; j != in_use[tab]; j++) {
 965                         const LangString *ls = data.strings[(tab * TAB_SIZE) + j];
 966                         if (ls != nullptr && ls->translated == nullptr) _lang.missing++;
 967                 }
 968         }
 969
 970         _lang.ident = TO_LE32(LanguagePackHeader::IDENT);
 971         _lang.version = TO_LE32(data.Version());
 972         _lang.missing = TO_LE16(_lang.missing);
 973         _lang.winlangid = TO_LE16(_lang.winlangid);
 974
 975         this->WriteHeader(&_lang);
 976         Buffer buffer;
 977
 978         for (size_t tab = 0; tab < data.tabs; tab++) {
 979                 for (uint j = 0; j != in_use[tab]; j++) {
 980                         const LangString *ls = data.strings[(tab * TAB_SIZE) + j];
 981                         const Case *casep;
 982                         const char *cmdp;
 983
 984                         /* For undefined strings, just set that it's an empty string */
 985                         if (ls == nullptr) {
 986                                 this->WriteLength(0);
 987                                 continue;
 988                         }
 989
 990                         _cur_ident = ls->name;
 991                         _cur_line = ls->line;
 992
 993                         /* Produce a message if a string doesn't have a translation. */
 994                         if (_show_todo > 0 && ls->translated == nullptr) {
 995                                 if ((_show_todo & 2) != 0) {
 996                                         strgen_warning("'%s' is untranslated", ls->name);
 997                                 }
 998                                 if ((_show_todo & 1) != 0) {
 999                                         const char *s = "<TODO> ";
1000                                         while (*s != '\0') buffer.AppendByte(*s++);
1001                                 }
1002                         }
1003
1004                         /* Extract the strings and stuff from the english command string */
1005                         ExtractCommandString(&_cur_pcs, ls->english, false);
1006
1007                         if (ls->translated_case != nullptr || ls->translated != nullptr) {
1008                                 casep = ls->translated_case;
1009                                 cmdp = ls->translated;
1010                         } else {
1011                                 casep = nullptr;
1012                                 cmdp = ls->english;
1013                         }
1014
1015                         _translated = cmdp != ls->english;
1016
1017                         if (casep != nullptr) {
1018                                 const Case *c;
1019                                 uint num;
1020
1021                                 /* Need to output a case-switch.
1022                                  * It has this format
1023                                  * <0x9E> <NUM CASES> <CASE1> <LEN1> <STRING1> <CASE2> <LEN2> <STRING2> <CASE3> <LEN3> <STRING3> <STRINGDEFAULT>
1024                                  * Each LEN is printed using 2 bytes in big endian order. */
1025                                 buffer.AppendUtf8(SCC_SWITCH_CASE);
1026                                 /* Count the number of cases */
1027                                 for (num = 0, c = casep; c; c = c->next) num++;
1028                                 buffer.AppendByte(num);
1029
1030                                 /* Write each case */
1031                                 for (c = casep; c != nullptr; c = c->next) {
1032                                         buffer.AppendByte(c->caseidx);
1033                                         /* Make some space for the 16-bit length */
1034                                         uint pos = (uint)buffer.size();
1035                                         buffer.AppendByte(0);
1036                                         buffer.AppendByte(0);
1037                                         /* Write string */
1038                                         PutCommandString(&buffer, c->string);
1039                                         buffer.AppendByte(0); // terminate with a zero
1040                                         /* Fill in the length */
1041                                         uint size = (uint)buffer.size() - (pos + 2);
1042                                         buffer[pos + 0] = GB(size, 8, 8);
1043                                         buffer[pos + 1] = GB(size, 0, 8);
1044                                 }
1045                         }
1046
1047                         if (cmdp != nullptr) PutCommandString(&buffer, cmdp);
1048
1049                         this->WriteLength((uint)buffer.size());
1050                         this->Write(buffer.data(), buffer.size());
1051                         buffer.clear();
1052                 }
1053         }
1054 }