src/strgen/strgen_base.cpp

   1 /*
   2  * This file is part of OpenTTD.
   3  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
   4  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   5  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
   6  */
   7
   8 /** @file strgen_base.cpp Tool to create computer readable (stand-alone) translation files. */
   9
  10 #include "../stdafx.h"
  11 #include "../core/alloc_func.hpp"
  12 #include "../core/endian_func.hpp"
  13 #include "../core/mem_func.hpp"
  14 #include "../error_func.h"
  15 #include "../string_func.h"
  16 #include "../table/control_codes.h"
  17
  18 #include "strgen.h"
  19
  20
  21 #include "../table/strgen_tables.h"
  22
  23 #include "../safeguards.h"
  24
  25 /* Compiles a list of strings into a compiled string list */
  26
static bool _translated;              ///< Whether the current language is not the master language
static bool _translation;             ///< Is the current file actually a translation or not
const char *_file = "(unknown file)"; ///< The filename of the input, so we can refer to it in errors/warnings
int _cur_line;                        ///< The current line we're parsing in the input file
/* _errors and _warnings are reset to zero at the start of StringReader::ParseFile (presumably the
 * diagnostic counters). _show_todo is a set of flag bits: bit 1 (value 2) enables the "untranslated"
 * warning in EmitPlural, and ParseFile sets bit 2 (value 4) when the file is not a translation. */
int _errors, _warnings, _show_todo;
LanguagePackHeader _lang;             ///< Header information about a language.
  33
  34 static const CmdStruct *ParseCommandString(const char **str, std::string &param, int *argno, int *casei);
  35
  36 /**
  37  * Create a new case.
  38  * @param caseidx The index of the case.
  39  * @param string  The translation of the case.
  40  */
  41 Case::Case(int caseidx, const std::string &string) :
  42                 caseidx(caseidx), string(string)
  43 {
  44 }
  45
  46 /**
  47  * Create a new string.
  48  * @param name    The name of the string.
  49  * @param english The english "translation" of the string.
  50  * @param index   The index in the string table.
  51  * @param line    The line this string was found on.
  52  */
  53 LangString::LangString(const std::string &name, const std::string &english, size_t index, int line) :
  54                 name(name), english(english), index(index), line(line)
  55 {
  56 }
  57
  58 /** Free all data related to the translation. */
  59 void LangString::FreeTranslation()
  60 {
  61         this->translated.clear();
  62         this->translated_cases.clear();
  63 }
  64
  65 /**
  66  * Create a new string data container.
  67  * @param tabs The maximum number of strings.
  68  */
  69 StringData::StringData(size_t tabs) : tabs(tabs), max_strings(tabs * TAB_SIZE)
  70 {
  71         this->strings.resize(max_strings);
  72         this->next_string_id = 0;
  73 }
  74
  75 /** Free all data related to the translation. */
  76 void StringData::FreeTranslation()
  77 {
  78         for (size_t i = 0; i < this->max_strings; i++) {
  79                 LangString *ls = this->strings[i].get();
  80                 if (ls != nullptr) ls->FreeTranslation();
  81         }
  82 }
  83
  84 /**
  85  * Add a newly created LangString.
  86  * @param s  The name of the string.
  87  * @param ls The string to add.
  88  */
  89 void StringData::Add(std::unique_ptr<LangString> ls)
  90 {
  91         this->name_to_string[ls->name] = ls.get();
  92         this->strings[ls->index].swap(ls);
  93 }
  94
  95 /**
  96  * Find a LangString based on the string name.
  97  * @param s The string name to search on.
  98  * @return The LangString or nullptr if it is not known.
  99  */
 100 LangString *StringData::Find(const std::string_view s)
 101 {
 102         auto it = this->name_to_string.find(s);
 103         if (it == this->name_to_string.end()) return nullptr;
 104
 105         return it->second;
 106 }
 107
 108 /**
 109  * Create a compound hash.
 110  * @param hash The hash to add the string hash to.
 111  * @param s    The string hash.
 112  * @return The new hash.
 113  */
 114 uint StringData::VersionHashStr(uint hash, const char *s) const
 115 {
 116         for (; *s != '\0'; s++) {
 117                 hash = std::rotl(hash, 3) ^ *s;
 118                 hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
 119         }
 120         return hash;
 121 }
 122
 123 /**
 124  * Make a hash of the file to get a unique "version number"
 125  * @return The version number.
 126  */
 127 uint StringData::Version() const
 128 {
 129         uint hash = 0;
 130
 131         for (size_t i = 0; i < this->max_strings; i++) {
 132                 const LangString *ls = this->strings[i].get();
 133
 134                 if (ls != nullptr) {
 135                         const CmdStruct *cs;
 136                         const char *s;
 137                         std::string buf;
 138                         int argno;
 139                         int casei;
 140
 141                         s = ls->name.c_str();
 142                         hash ^= i * 0x717239;
 143                         hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
 144                         hash = this->VersionHashStr(hash, s + 1);
 145
 146                         s = ls->english.c_str();
 147                         while ((cs = ParseCommandString(&s, buf, &argno, &casei)) != nullptr) {
 148                                 if (cs->flags & C_DONTCOUNT) continue;
 149
 150                                 hash ^= (cs - _cmd_structs) * 0x1234567;
 151                                 hash = (hash & 1 ? hash >> 1 ^ 0xF00BAA4 : hash >> 1);
 152                         }
 153                 }
 154         }
 155
 156         return hash;
 157 }
 158
 159 /**
 160  * Count the number of tab elements that are in use.
 161  * @param tab The tab to count the elements of.
 162  */
 163 uint StringData::CountInUse(uint tab) const
 164 {
 165         int i;
 166         for (i = TAB_SIZE; --i >= 0;) if (this->strings[(tab * TAB_SIZE) + i] != nullptr) break;
 167         return i + 1;
 168 }
 169
/* Identifier used by EmitPlural in its diagnostics to say which string is being processed
 * (NOTE(review): where it is assigned is outside the visible part of the file). */
static const char *_cur_ident;

/* Used when generating some advanced commands. */
static ParsedCommandStruct _cur_pcs; // Consuming commands of the current string, indexed by argument index.
static int _cur_argidx;              // Argument index the next command refers to; reset to 0 by PutCommandString.
 175
 176 /** The buffer for writing a single string. */
 177 struct Buffer : std::vector<uint8_t> {
 178         /**
 179          * Convenience method for adding a byte.
 180          * @param value The value to add.
 181          */
 182         void AppendByte(uint8_t value)
 183         {
 184                 this->push_back(value);
 185         }
 186
 187         /**
 188          * Add an Unicode character encoded in UTF-8 to the buffer.
 189          * @param value The character to add.
 190          */
 191         void AppendUtf8(uint32_t value)
 192         {
 193                 if (value < 0x80) {
 194                         this->push_back(value);
 195                 } else if (value < 0x800) {
 196                         this->push_back(0xC0 + GB(value,  6, 5));
 197                         this->push_back(0x80 + GB(value,  0, 6));
 198                 } else if (value < 0x10000) {
 199                         this->push_back(0xE0 + GB(value, 12, 4));
 200                         this->push_back(0x80 + GB(value,  6, 6));
 201                         this->push_back(0x80 + GB(value,  0, 6));
 202                 } else if (value < 0x110000) {
 203                         this->push_back(0xF0 + GB(value, 18, 3));
 204                         this->push_back(0x80 + GB(value, 12, 6));
 205                         this->push_back(0x80 + GB(value,  6, 6));
 206                         this->push_back(0x80 + GB(value,  0, 6));
 207                 } else {
 208                         StrgenWarning("Invalid unicode value U+0x{:X}", value);
 209                 }
 210         }
 211 };
 212
 213 size_t Utf8Validate(const char *s)
 214 {
 215         uint32_t c;
 216
 217         if (!HasBit(s[0], 7)) {
 218                 /* 1 byte */
 219                 return 1;
 220         } else if (GB(s[0], 5, 3) == 6 && IsUtf8Part(s[1])) {
 221                 /* 2 bytes */
 222                 c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
 223                 if (c >= 0x80) return 2;
 224         } else if (GB(s[0], 4, 4) == 14 && IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
 225                 /* 3 bytes */
 226                 c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
 227                 if (c >= 0x800) return 3;
 228         } else if (GB(s[0], 3, 5) == 30 && IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
 229                 /* 4 bytes */
 230                 c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
 231                 if (c >= 0x10000 && c <= 0x10FFFF) return 4;
 232         }
 233
 234         return 0;
 235 }
 236
 237
 238 void EmitSingleChar(Buffer *buffer, char *buf, int value)
 239 {
 240         if (*buf != '\0') StrgenWarning("Ignoring trailing letters in command");
 241         buffer->AppendUtf8(value);
 242 }
 243
 244
 245 /* The plural specifier looks like
 246  * {NUM} {PLURAL -1 passenger passengers} then it picks either passenger/passengers depending on the count in NUM */
 247
 248 /* This is encoded like
 249  *  CommandByte <ARG#> <NUM> {Length of each string} {each string} */
 250
 251 bool ParseRelNum(char **buf, int *value, int *offset)
 252 {
 253         const char *s = *buf;
 254         char *end;
 255         bool rel = false;
 256
 257         while (*s == ' ' || *s == '\t') s++;
 258         if (*s == '+') {
 259                 rel = true;
 260                 s++;
 261         }
 262         int v = std::strtol(s, &end, 0);
 263         if (end == s) return false;
 264         if (rel || v < 0) {
 265                 *value += v;
 266         } else {
 267                 *value = v;
 268         }
 269         if (offset != nullptr && *end == ':') {
 270                 /* Take the Nth within */
 271                 s = end + 1;
 272                 *offset = std::strtol(s, &end, 0);
 273                 if (end == s) return false;
 274         }
 275         *buf = end;
 276         return true;
 277 }
 278
 279 /* Parse out the next word, or nullptr */
 280 char *ParseWord(char **buf)
 281 {
 282         char *s = *buf, *r;
 283
 284         while (*s == ' ' || *s == '\t') s++;
 285         if (*s == '\0') return nullptr;
 286
 287         if (*s == '"') {
 288                 r = ++s;
 289                 /* parse until next " or NUL */
 290                 for (;;) {
 291                         if (*s == '\0') break;
 292                         if (*s == '"') {
 293                                 *s++ = '\0';
 294                                 break;
 295                         }
 296                         s++;
 297                 }
 298         } else {
 299                 /* proceed until whitespace or NUL */
 300                 r = s;
 301                 for (;;) {
 302                         if (*s == '\0') break;
 303                         if (*s == ' ' || *s == '\t') {
 304                                 *s++ = '\0';
 305                                 break;
 306                         }
 307                         s++;
 308                 }
 309         }
 310         *buf = s;
 311         return r;
 312 }
 313
 314 /* Forward declaration */
 315 static int TranslateArgumentIdx(int arg, int offset = 0);
 316
 317 static void EmitWordList(Buffer *buffer, const std::vector<const char *> &words, uint nw)
 318 {
 319         /* Maximum word length in bytes, excluding trailing NULL. */
 320         constexpr uint MAX_WORD_LENGTH = UINT8_MAX - 2;
 321
 322         buffer->AppendByte(nw);
 323         for (uint i = 0; i < nw; i++) {
 324                 size_t len = strlen(words[i]) + 1;
 325                 if (len >= UINT8_MAX) StrgenFatal("WordList {}/{} string '{}' too long, max bytes {}", i + 1, nw, words[i], MAX_WORD_LENGTH);
 326                 buffer->AppendByte(static_cast<uint8_t>(len));
 327         }
 328         for (uint i = 0; i < nw; i++) {
 329                 for (uint j = 0; words[i][j] != '\0'; j++) buffer->AppendByte(words[i][j]);
 330                 buffer->AppendByte(0);
 331         }
 332 }
 333
 334 void EmitPlural(Buffer *buffer, char *buf, int)
 335 {
 336         int argidx = _cur_argidx;
 337         int offset = -1;
 338         int expected = _plural_forms[_lang.plural_form].plural_count;
 339         std::vector<const char *> words(std::max(expected, MAX_PLURALS), nullptr);
 340         int nw = 0;
 341
 342         /* Parse out the number, if one exists. Otherwise default to prev arg. */
 343         if (!ParseRelNum(&buf, &argidx, &offset)) argidx--;
 344
 345         const CmdStruct *cmd = _cur_pcs.consuming_commands[argidx];
 346         if (offset == -1) {
 347                 /* Use default offset */
 348                 if (cmd == nullptr || cmd->default_plural_offset < 0) {
 349                         StrgenFatal("Command '{}' has no (default) plural position", cmd == nullptr ? "<empty>" : cmd->cmd);
 350                 }
 351                 offset = cmd->default_plural_offset;
 352         }
 353
 354         /* Parse each string */
 355         for (nw = 0; nw < MAX_PLURALS; nw++) {
 356                 words[nw] = ParseWord(&buf);
 357                 if (words[nw] == nullptr) break;
 358         }
 359
 360         if (nw == 0) {
 361                 StrgenFatal("{}: No plural words", _cur_ident);
 362         }
 363
 364         if (expected != nw) {
 365                 if (_translated) {
 366                         StrgenFatal("{}: Invalid number of plural forms. Expecting {}, found {}.", _cur_ident,
 367                                 expected, nw);
 368                 } else {
 369                         if ((_show_todo & 2) != 0) StrgenWarning("'{}' is untranslated. Tweaking english string to allow compilation for plural forms", _cur_ident);
 370                         if (nw > expected) {
 371                                 nw = expected;
 372                         } else {
 373                                 for (; nw < expected; nw++) {
 374                                         words[nw] = words[nw - 1];
 375                                 }
 376                         }
 377                 }
 378         }
 379
 380         buffer->AppendUtf8(SCC_PLURAL_LIST);
 381         buffer->AppendByte(_lang.plural_form);
 382         buffer->AppendByte(TranslateArgumentIdx(argidx, offset));
 383         EmitWordList(buffer, words, nw);
 384 }
 385
 386 void EmitGender(Buffer *buffer, char *buf, int)
 387 {
 388         int argidx = _cur_argidx;
 389         int offset = 0;
 390         uint nw;
 391
 392         if (buf[0] == '=') {
 393                 buf++;
 394
 395                 /* This is a {G=DER} command */
 396                 nw = _lang.GetGenderIndex(buf);
 397                 if (nw >= MAX_NUM_GENDERS) StrgenFatal("G argument '{}' invalid", buf);
 398
 399                 /* now nw contains the gender index */
 400                 buffer->AppendUtf8(SCC_GENDER_INDEX);
 401                 buffer->AppendByte(nw);
 402         } else {
 403                 std::vector<const char *> words(MAX_NUM_GENDERS, nullptr);
 404
 405                 /* This is a {G 0 foo bar two} command.
 406                  * If no relative number exists, default to +0 */
 407                 ParseRelNum(&buf, &argidx, &offset);
 408
 409                 const CmdStruct *cmd = _cur_pcs.consuming_commands[argidx];
 410                 if (cmd == nullptr || (cmd->flags & C_GENDER) == 0) {
 411                         StrgenFatal("Command '{}' can't have a gender", cmd == nullptr ? "<empty>" : cmd->cmd);
 412                 }
 413
 414                 for (nw = 0; nw < MAX_NUM_GENDERS; nw++) {
 415                         words[nw] = ParseWord(&buf);
 416                         if (words[nw] == nullptr) break;
 417                 }
 418                 if (nw != _lang.num_genders) StrgenFatal("Bad # of arguments for gender command");
 419
 420                 assert(IsInsideBS(cmd->value, SCC_CONTROL_START, UINT8_MAX));
 421                 buffer->AppendUtf8(SCC_GENDER_LIST);
 422                 buffer->AppendByte(TranslateArgumentIdx(argidx, offset));
 423                 EmitWordList(buffer, words, nw);
 424         }
 425 }
 426
 427 static const CmdStruct *FindCmd(const char *s, int len)
 428 {
 429         for (const auto &cs : _cmd_structs) {
 430                 if (strncmp(cs.cmd, s, len) == 0 && cs.cmd[len] == '\0') return &cs;
 431         }
 432         return nullptr;
 433 }
 434
 435 static uint ResolveCaseName(const char *str, size_t len)
 436 {
 437         /* First get a clean copy of only the case name, then resolve it. */
 438         char case_str[CASE_GENDER_LEN];
 439         len = std::min(lengthof(case_str) - 1, len);
 440         memcpy(case_str, str, len);
 441         case_str[len] = '\0';
 442
 443         uint8_t case_idx = _lang.GetCaseIndex(case_str);
 444         if (case_idx >= MAX_NUM_CASES) StrgenFatal("Invalid case-name '{}'", case_str);
 445         return case_idx + 1;
 446 }
 447
 448
/**
 * Parse the next command ("{...}") in a string.
 * The accepted layout is "{[argno:]NAME[.case][ params|=params]}".
 * @param str    The position to start at; on success it is moved to just after the closing '}'.
 *               It is left untouched when nullptr is returned.
 * @param param  The parameter text of the command is appended to this string (it is not cleared first).
 * @param argno  Receives the explicit argument number, or -1 when the command has none.
 * @param casei  Receives the result of ResolveCaseName for a ".case" suffix, or -1 when there is none.
 * @return The command, or nullptr when there is no further command, the command is unknown,
 *         or its closing '}' is missing (the latter two also report an error).
 */
static const CmdStruct *ParseCommandString(const char **str, std::string &param, int *argno, int *casei)
{
	const char *s = *str, *start;
	char c;

	*argno = -1;
	*casei = -1;

	/* Scan to the next command, exit if there's no next command. */
	for (; *s != '{'; s++) {
		if (*s == '\0') return nullptr;
	}
	s++; // Skip past the {

	/* A leading digit means an explicit argument number, which must be followed by ':'. */
	if (*s >= '0' && *s <= '9') {
		char *end;

		*argno = std::strtoul(s, &end, 0);
		if (*end != ':') StrgenFatal("missing arg #");
		s = end + 1;
	}

	/* parse command name */
	start = s;
	do {
		c = *s++;
	} while (c != '}' && c != ' ' && c != '=' && c != '.' && c != 0);

	/* s is now one past the character that ended the name, hence the -1 for the name length. */
	const CmdStruct *cmd = FindCmd(start, s - start - 1);
	if (cmd == nullptr) {
		std::string command(start, s - start - 1);
		StrgenError("Undefined command '{}'", command);
		return nullptr;
	}

	if (c == '.') {
		/* The name is followed by a case; it runs until '}', a space or the end of the string. */
		const char *casep = s;

		if (!(cmd->flags & C_CASE)) {
			StrgenFatal("Command '{}' can't have a case", cmd->cmd);
		}

		do {
			c = *s++;
		} while (c != '}' && c != ' ' && c != '\0');
		*casei = ResolveCaseName(casep, s - casep - 1);
	}

	if (c == '\0') {
		StrgenError("Missing }} from command '{}'", start);
		return nullptr;
	}


	if (c != '}') {
		/* A '=' is part of the parameters, so step back onto it; a separating space is not. */
		if (c == '=') s--;
		/* copy params */
		start = s;
		for (;;) {
			c = *s++;
			if (c == '}') break;
			if (c == '\0') {
				StrgenError("Missing }} from command '{}'", start);
				return nullptr;
			}
			param += c;
		}
	}

	*str = s;

	return cmd;
}
 524
 525 /**
 526  * Prepare reading.
 527  * @param data        The data to fill during reading.
 528  * @param file        The file we are reading.
 529  * @param master      Are we reading the master file?
 530  * @param translation Are we reading a translation?
 531  */
 532 StringReader::StringReader(StringData &data, const std::string &file, bool master, bool translation) :
 533                 data(data), file(file), master(master), translation(translation)
 534 {
 535 }
 536
 537 ParsedCommandStruct ExtractCommandString(const char *s, bool)
 538 {
 539         int argno;
 540         int argidx = 0;
 541         int casei;
 542
 543         ParsedCommandStruct p;
 544
 545         for (;;) {
 546                 /* read until next command from a. */
 547                 std::string param;
 548                 const CmdStruct *ar = ParseCommandString(&s, param, &argno, &casei);
 549
 550                 if (ar == nullptr) break;
 551
 552                 /* Sanity checking */
 553                 if (argno != -1 && ar->consumes == 0) StrgenFatal("Non consumer param can't have a paramindex");
 554
 555                 if (ar->consumes) {
 556                         if (argno != -1) argidx = argno;
 557                         if (argidx < 0 || (uint)argidx >= p.consuming_commands.max_size()) StrgenFatal("invalid param idx {}", argidx);
 558                         if (p.consuming_commands[argidx] != nullptr && p.consuming_commands[argidx] != ar) StrgenFatal("duplicate param idx {}", argidx);
 559
 560                         p.consuming_commands[argidx++] = ar;
 561                 } else if (!(ar->flags & C_DONTCOUNT)) { // Ignore some of them
 562                         p.non_consuming_commands.emplace_back(CmdPair{ar, std::move(param)});
 563                 }
 564         }
 565
 566         return p;
 567 }
 568
 569
 570 const CmdStruct *TranslateCmdForCompare(const CmdStruct *a)
 571 {
 572         if (a == nullptr) return nullptr;
 573
 574         if (strcmp(a->cmd, "STRING1") == 0 ||
 575                         strcmp(a->cmd, "STRING2") == 0 ||
 576                         strcmp(a->cmd, "STRING3") == 0 ||
 577                         strcmp(a->cmd, "STRING4") == 0 ||
 578                         strcmp(a->cmd, "STRING5") == 0 ||
 579                         strcmp(a->cmd, "STRING6") == 0 ||
 580                         strcmp(a->cmd, "STRING7") == 0 ||
 581                         strcmp(a->cmd, "RAW_STRING") == 0) {
 582                 return FindCmd("STRING", 6);
 583         }
 584
 585         return a;
 586 }
 587
 588
 589 static bool CheckCommandsMatch(const char *a, const char *b, const char *name)
 590 {
 591         /* If we're not translating, i.e. we're compiling the base language,
 592          * it is pointless to do all these checks as it'll always be correct.
 593          * After all, all checks are based on the base language.
 594          */
 595         if (!_translation) return true;
 596
 597         bool result = true;
 598
 599         ParsedCommandStruct templ = ExtractCommandString(b, true);
 600         ParsedCommandStruct lang = ExtractCommandString(a, true);
 601
 602         /* For each string in templ, see if we find it in lang */
 603         if (templ.non_consuming_commands.max_size() != lang.non_consuming_commands.max_size()) {
 604                 StrgenWarning("{}: template string and language string have a different # of commands", name);
 605                 result = false;
 606         }
 607
 608         for (auto &templ_nc : templ.non_consuming_commands) {
 609                 /* see if we find it in lang, and zero it out */
 610                 bool found = false;
 611                 for (auto &lang_nc : lang.non_consuming_commands) {
 612                         if (templ_nc.cmd == lang_nc.cmd && templ_nc.param == lang_nc.param) {
 613                                 /* it was found in both. zero it out from lang so we don't find it again */
 614                                 lang_nc.cmd = nullptr;
 615                                 found = true;
 616                                 break;
 617                         }
 618                 }
 619
 620                 if (!found) {
 621                         StrgenWarning("{}: command '{}' exists in template file but not in language file", name, templ_nc.cmd->cmd);
 622                         result = false;
 623                 }
 624         }
 625
 626         /* if we reach here, all non consumer commands match up.
 627          * Check if the non consumer commands match up also. */
 628         for (uint i = 0; i < templ.consuming_commands.max_size(); i++) {
 629                 if (TranslateCmdForCompare(templ.consuming_commands[i]) != lang.consuming_commands[i]) {
 630                         StrgenWarning("{}: Param idx #{} '{}' doesn't match with template command '{}'", name, i,
 631                                 lang.consuming_commands[i]  == nullptr ? "<empty>" : TranslateCmdForCompare(lang.consuming_commands[i])->cmd,
 632                                 templ.consuming_commands[i] == nullptr ? "<empty>" : templ.consuming_commands[i]->cmd);
 633                         result = false;
 634                 }
 635         }
 636
 637         return result;
 638 }
 639
/**
 * Handle a single line of the input file.
 * Lines starting with "##" (but not "###") are passed on as pragma, other lines starting
 * with '#', ';' or a space and empty lines are skipped. Every other line must have the
 * form "NAME[.case] :text". In the master file a new string is added; in other files the
 * text is stored as translation (or translated case) of the existing string.
 * @param str The line to handle; it is modified in place as NUL terminators are written into it.
 */
void StringReader::HandleString(char *str)
{
	if (*str == '#') {
		if (str[1] == '#' && str[2] != '#') this->HandlePragma(str + 2);
		return;
	}

	/* Ignore comments & blank lines */
	if (*str == ';' || *str == ' ' || *str == '\0') return;

	char *s = strchr(str, ':');
	if (s == nullptr) {
		StrgenError("Line has no ':' delimiter");
		return;
	}

	char *t;
	/* Trim spaces.
	 * After this str points to the command name, and s points to the command contents */
	for (t = s; t > str && (t[-1] == ' ' || t[-1] == '\t'); t--) {}
	*t = 0;
	s++;

	/* Check string is valid UTF-8 */
	const char *tmp;
	for (tmp = s; *tmp != '\0';) {
		size_t len = Utf8Validate(tmp);
		if (len == 0) StrgenFatal("Invalid UTF-8 sequence in '{}'", s);

		char32_t c;
		Utf8Decode(&c, tmp);
		if (c <= 0x001F || // ASCII control character range
				c == 0x200B || // Zero width space
				(c >= 0xE000 && c <= 0xF8FF) || // Private range
				(c >= 0xFFF0 && c <= 0xFFFF)) { // Specials range
			StrgenFatal("Unwanted UTF-8 character U+{:04X} in sequence '{}'", (int)c, s);
		}

		tmp += len;
	}

	/* Check if the string has a case..
	 * The syntax for cases is IDENTNAME.case
	 * The name was terminated above, so only the name part is searched for the '.'. */
	char *casep = strchr(str, '.');
	if (casep != nullptr) *casep++ = '\0';

	/* Check if this string already exists.. */
	LangString *ent = this->data.Find(str);

	if (this->master) {
		if (casep != nullptr) {
			StrgenError("Cases in the base translation are not supported.");
			return;
		}

		if (ent != nullptr) {
			StrgenError("String name '{}' is used multiple times", str);
			return;
		}

		if (this->data.strings[this->data.next_string_id] != nullptr) {
			StrgenError("String ID 0x{:X} for '{}' already in use by '{}'", this->data.next_string_id, str, this->data.strings[this->data.next_string_id]->name);
			return;
		}

		/* Allocate a new LangString */
		this->data.Add(std::make_unique<LangString>(str, s, this->data.next_string_id++, _cur_line));
	} else {
		if (ent == nullptr) {
			StrgenWarning("String name '{}' does not exist in master file", str);
			return;
		}

		/* Only the plain (case-less) translation may not be given twice. */
		if (!ent->translated.empty() && casep == nullptr) {
			StrgenError("String name '{}' is used multiple times", str);
			return;
		}

		/* make sure that the commands match */
		if (!CheckCommandsMatch(s, ent->english.c_str(), str)) return;

		if (casep != nullptr) {
			ent->translated_cases.emplace_back(ResolveCaseName(casep, strlen(casep)), s);
		} else {
			ent->translated = s;
			/* If the string was translated, use the line from the
			 * translated language so errors in the translated file
			 * are properly referenced to. */
			ent->line = _cur_line;
		}
	}
}
 732
 733 void StringReader::HandlePragma(char *str)
 734 {
 735         if (!memcmp(str, "plural ", 7)) {
 736                 _lang.plural_form = atoi(str + 7);
 737                 if (_lang.plural_form >= lengthof(_plural_forms)) {
 738                         StrgenFatal("Invalid pluralform {}", _lang.plural_form);
 739                 }
 740         } else {
 741                 StrgenFatal("unknown pragma '{}'", str);
 742         }
 743 }
 744
 745 static void StripTrailingWhitespace(std::string &str)
 746 {
 747         str.erase(str.find_last_not_of("\r\n ") + 1);
 748 }
 749
 750 void StringReader::ParseFile()
 751 {
 752         _warnings = _errors = 0;
 753
 754         _translation = this->translation;
 755         _file = this->file.c_str();
 756
 757         /* Abusing _show_todo to replace "warning" with "info" for translations. */
 758         _show_todo &= 3;
 759         if (!this->translation) _show_todo |= 4;
 760
 761         /* For each new file we parse, reset the genders, and language codes. */
 762         MemSetT(&_lang, 0);
 763         strecpy(_lang.digit_group_separator, ",");
 764         strecpy(_lang.digit_group_separator_currency, ",");
 765         strecpy(_lang.digit_decimal_separator, ".");
 766
 767         _cur_line = 1;
 768         while (this->data.next_string_id < this->data.max_strings) {
 769                 std::optional<std::string> line = this->ReadLine();
 770                 if (!line.has_value()) return;
 771
 772                 StripTrailingWhitespace(line.value());
 773                 this->HandleString(line.value().data());
 774                 _cur_line++;
 775         }
 776
 777         if (this->data.next_string_id == this->data.max_strings) {
 778                 StrgenError("Too many strings, maximum allowed is {}", this->data.max_strings);
 779         }
 780 }
 781
 782 /**
 783  * Write the header information.
 784  * @param data The data about the string.
 785  */
 786 void HeaderWriter::WriteHeader(const StringData &data)
 787 {
 788         int last = 0;
 789         for (size_t i = 0; i < data.max_strings; i++) {
 790                 if (data.strings[i] != nullptr) {
 791                         this->WriteStringID(data.strings[i]->name, (int)i);
 792                         last = (int)i;
 793                 }
 794         }
 795
 796         this->WriteStringID("STR_LAST_STRINGID", last);
 797 }
 798
 799 static int TranslateArgumentIdx(int argidx, int offset)
 800 {
 801         int sum;
 802
 803         if (argidx < 0 || (uint)argidx >= _cur_pcs.consuming_commands.max_size()) {
 804                 StrgenFatal("invalid argidx {}", argidx);
 805         }
 806         const CmdStruct *cs = _cur_pcs.consuming_commands[argidx];
 807         if (cs != nullptr && cs->consumes <= offset) {
 808                 StrgenFatal("invalid argidx offset {}:{}", argidx, offset);
 809         }
 810
 811         if (_cur_pcs.consuming_commands[argidx] == nullptr) {
 812                 StrgenFatal("no command for this argidx {}", argidx);
 813         }
 814
 815         for (int i = sum = 0; i < argidx; i++) {
 816                 cs = _cur_pcs.consuming_commands[i];
 817
 818                 sum += (cs != nullptr) ? cs->consumes : 1;
 819         }
 820
 821         return sum + offset;
 822 }
 823
 824 static void PutArgidxCommand(Buffer *buffer)
 825 {
 826         buffer->AppendUtf8(SCC_ARG_INDEX);
 827         buffer->AppendByte(TranslateArgumentIdx(_cur_argidx));
 828 }
 829
 830
 831 static void PutCommandString(Buffer *buffer, const char *str)
 832 {
 833         _cur_argidx = 0;
 834
 835         while (*str != '\0') {
 836                 /* Process characters as they are until we encounter a { */
 837                 if (*str != '{') {
 838                         buffer->AppendByte(*str++);
 839                         continue;
 840                 }
 841
 842                 std::string param;
 843                 int argno;
 844                 int casei;
 845                 const CmdStruct *cs = ParseCommandString(&str, param, &argno, &casei);
 846                 if (cs == nullptr) break;
 847
 848                 if (casei != -1) {
 849                         buffer->AppendUtf8(SCC_SET_CASE); // {SET_CASE}
 850                         buffer->AppendByte(casei);
 851                 }
 852
 853                 /* For params that consume values, we need to handle the argindex properly */
 854                 if (cs->consumes > 0) {
 855                         /* Check if we need to output a move-param command */
 856                         if (argno != -1 && argno != _cur_argidx) {
 857                                 _cur_argidx = argno;
 858                                 PutArgidxCommand(buffer);
 859                         }
 860
 861                         /* Output the one from the master string... it's always accurate. */
 862                         cs = _cur_pcs.consuming_commands[_cur_argidx++];
 863                         if (cs == nullptr) {
 864                                 StrgenFatal("{}: No argument exists at position {}", _cur_ident, _cur_argidx - 1);
 865                         }
 866                 }
 867
 868                 cs->proc(buffer, param.data(), cs->value);
 869         }
 870 }
 871
 872 /**
 873  * Write the length as a simple gamma.
 874  * @param length The number to write.
 875  */
 876 void LanguageWriter::WriteLength(uint length)
 877 {
 878         char buffer[2];
 879         int offs = 0;
 880         if (length >= 0x4000) {
 881                 StrgenFatal("string too long");
 882         }
 883
 884         if (length >= 0xC0) {
 885                 buffer[offs++] = (length >> 8) | 0xC0;
 886         }
 887         buffer[offs++] = length & 0xFF;
 888         this->Write((uint8_t*)buffer, offs);
 889 }
 890
/**
 * Actually write the language.
 *
 * A first pass over all tabs records, per tab, the number of in-use string
 * slots (as reported by StringData::CountInUse) in the language header and
 * counts the defined strings whose translation is empty. After the header has
 * been written, a second pass writes one length-prefixed entry per in-use
 * slot; undefined slots are written as a zero length.
 * @param data The data about the string.
 */
void LanguageWriter::WriteLang(const StringData &data)
{
        /* Number of in-use slots per tab; filled here and reused by the second pass. */
        std::vector<uint> in_use;
        for (size_t tab = 0; tab < data.tabs; tab++) {
                uint n = data.CountInUse((uint)tab);

                in_use.push_back(n);
                _lang.offsets[tab] = TO_LE16(n);

                /* Count defined strings of this tab that have no (non-empty) translation. */
                for (uint j = 0; j != in_use[tab]; j++) {
                        const LangString *ls = data.strings[(tab * TAB_SIZE) + j].get();
                        if (ls != nullptr && ls->translated.empty()) _lang.missing++;
                }
        }

        /* Fill in the remaining header fields; 'missing' is only converted after counting has finished. */
        _lang.ident = TO_LE32(LanguagePackHeader::IDENT);
        _lang.version = TO_LE32(data.Version());
        _lang.missing = TO_LE16(_lang.missing);
        _lang.winlangid = TO_LE16(_lang.winlangid);

        this->WriteHeader(&_lang);
        /* Scratch buffer for a single string; cleared after each string has been written. */
        Buffer buffer;

        for (size_t tab = 0; tab < data.tabs; tab++) {
                for (uint j = 0; j != in_use[tab]; j++) {
                        const LangString *ls = data.strings[(tab * TAB_SIZE) + j].get();
                        /* The string that is written as the default (non-case) text. */
                        const std::string *cmdp;

                        /* For undefined strings, just set that it's an empty string */
                        if (ls == nullptr) {
                                this->WriteLength(0);
                                continue;
                        }

                        /* Publish which string is being processed before any helper is called. */
                        _cur_ident = ls->name.c_str();
                        _cur_line = ls->line;

                        /* Produce a message if a string doesn't have a translation:
                         * bit value 2 of _show_todo emits a warning, bit value 1 prefixes
                         * the output with "<TODO> ". */
                        if (_show_todo > 0 && ls->translated.empty()) {
                                if ((_show_todo & 2) != 0) {
                                        StrgenWarning("'{}' is untranslated", ls->name);
                                }
                                if ((_show_todo & 1) != 0) {
                                        const char *s = "<TODO> ";
                                        while (*s != '\0') buffer.AppendByte(*s++);
                                }
                        }

                        /* Extract the strings and stuff from the english command string */
                        _cur_pcs = ExtractCommandString(ls->english.c_str(), false);

                        /* Use the translation when there is one, or when there are translated cases
                         * (the translation may then be empty); otherwise fall back to english. */
                        if (!ls->translated_cases.empty() || !ls->translated.empty()) {
                                cmdp = &ls->translated;
                        } else {
                                cmdp = &ls->english;
                        }

                        _translated = cmdp != &ls->english;

                        if (!ls->translated_cases.empty()) {
                                /* Need to output a case-switch.
                                 * It has this format
                                 * <0x9E> <NUM CASES> <CASE1> <LEN1> <STRING1> <CASE2> <LEN2> <STRING2> <CASE3> <LEN3> <STRING3> <STRINGDEFAULT>
                                 * Each LEN is printed using 2 bytes in big endian order. */
                                buffer.AppendUtf8(SCC_SWITCH_CASE);
                                buffer.AppendByte((uint8_t)ls->translated_cases.size());

                                /* Write each case */
                                for (const Case &c : ls->translated_cases) {
                                        buffer.AppendByte(c.caseidx);
                                        /* Make some space for the 16-bit length */
                                        uint pos = (uint)buffer.size();
                                        buffer.AppendByte(0);
                                        buffer.AppendByte(0);
                                        /* Write string */
                                        PutCommandString(&buffer, c.string.c_str());
                                        buffer.AppendByte(0); // terminate with a zero
                                        /* Fill in the length; it covers the string and its terminating zero,
                                         * but not the two length bytes themselves. */
                                        uint size = (uint)buffer.size() - (pos + 2);
                                        buffer[pos + 0] = GB(size, 8, 8);
                                        buffer[pos + 1] = GB(size, 0, 8);
                                }
                        }

                        /* The default string follows the (optional) case-switch. */
                        if (!cmdp->empty()) PutCommandString(&buffer, cmdp->c_str());

                        /* Emit the length prefix followed by the encoded bytes, then reuse the buffer. */
                        this->WriteLength((uint)buffer.size());
                        this->Write(buffer.data(), buffer.size());
                        buffer.clear();
                }
        }
}