gnu/dist/groff/src/preproc/refer/token.cpp

   1 /*      $NetBSD$        */
   2
   3 // -*- C++ -*-
   4 /* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
   5      Written by James Clark (jjc@jclark.com)
   6
   7 This file is part of groff.
   8
   9 groff is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 2, or (at your option) any later
  12 version.
  13
  14 groff is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License along
  20 with groff; see the file COPYING.  If not, write to the Free Software
  21 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
  22
  23 #include "refer.h"
  24 #include "token.h"
  25
// Number of slots in token_table; hash values are reduced modulo this.
#define TOKEN_TABLE_SIZE 1009
// I believe that in Icelandic, thorn sorts after z.  "{" is the character
// immediately following 'z' in ASCII -- presumably chosen so that a plain
// byte-wise comparison of sort keys places thorn after z.
#define THORN_SORT_KEY "{"
  29
// One slot of the token hash table.
struct token_table_entry {
  // Text of the token stored in this slot; 0 while the slot is unused.
  const char *tok;
  // Attributes recorded for tok by store_token.
  token_info ti;
  // Leaves the slot empty (tok == 0).
  token_table_entry();
};
  35
// Hash table of known tokens.  The starting slot for a token is its
// hash_string value modulo TOKEN_TABLE_SIZE; store_token and lookup_token
// step downward (wrapping to the last slot) when that slot holds another
// token.
token_table_entry token_table[TOKEN_TABLE_SIZE];
// Number of slots of token_table filled so far by store_token.
int ntokens = 0;
  38
  39 static void skip_name(const char **ptr, const char *end)
  40 {
  41   if (*ptr < end) {
  42     switch (*(*ptr)++) {
  43     case '(':
  44       if (*ptr < end) {
  45         *ptr += 1;
  46         if (*ptr < end)
  47           *ptr += 1;
  48       }
  49       break;
  50     case '[':
  51       while (*ptr < end)
  52         if (*(*ptr)++ == ']')
  53           break;
  54       break;
  55     }
  56   }
  57 }
  58
  59 int get_token(const char **ptr, const char *end)
  60 {
  61   if (*ptr >= end)
  62     return 0;
  63   char c = *(*ptr)++;
  64   if (c == '\\' && *ptr < end) {
  65     switch (**ptr) {
  66     default:
  67       *ptr += 1;
  68       break;
  69     case '(':
  70     case '[':
  71       skip_name(ptr, end);
  72       break;
  73     case '*':
  74     case 'f':
  75       *ptr += 1;
  76       skip_name(ptr, end);
  77       break;
  78     }
  79   }
  80   return 1;
  81 }
  82
  83 token_info::token_info()
  84 : type(TOKEN_OTHER), sort_key(0), other_case(0)
  85 {
  86 }
  87
  88 void token_info::set(token_type t, const char *sk, const char *oc)
  89 {
  90   assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
  91   type = t;
  92   sort_key = sk;
  93   other_case = oc;
  94 }
  95
  96 void token_info::sortify(const char *start, const char *end, string &result)
  97      const
  98 {
  99   if (sort_key)
 100     result += sort_key;
 101   else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
 102     for (; start < end; start++)
 103       if (csalpha(*start))
 104         result += cmlower(*start);
 105   }
 106 }
 107
 108 int token_info::sortify_non_empty(const char *start, const char *end) const
 109 {
 110   if (sort_key)
 111     return *sort_key != '\0';
 112   if (type != TOKEN_UPPER && type != TOKEN_LOWER)
 113     return 0;
 114   for (; start < end; start++)
 115     if (csalpha(*start))
 116       return 1;
 117   return 0;
 118 }
 119
 120
 121 void token_info::lower_case(const char *start, const char *end,
 122                             string &result) const
 123 {
 124   if (type != TOKEN_UPPER) {
 125     while (start < end)
 126       result += *start++;
 127   }
 128   else if (other_case)
 129     result += other_case;
 130   else {
 131     while (start < end)
 132       result += cmlower(*start++);
 133   }
 134 }
 135
 136 void token_info::upper_case(const char *start, const char *end,
 137                             string &result) const
 138 {
 139   if (type != TOKEN_LOWER) {
 140     while (start < end)
 141       result += *start++;
 142   }
 143   else if (other_case)
 144     result += other_case;
 145   else {
 146     while (start < end)
 147       result += cmupper(*start++);
 148   }
 149 }
 150
 151 token_table_entry::token_table_entry()
 152 : tok(0)
 153 {
 154 }
 155
 156 static void store_token(const char *tok, token_type typ,
 157                         const char *sk = 0, const char *oc = 0)
 158 {
 159   unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
 160   for (;;) {
 161     if (token_table[n].tok == 0) {
 162       if (++ntokens == TOKEN_TABLE_SIZE)
 163         assert(0);
 164       token_table[n].tok = tok;
 165       break;
 166     }
 167     if (strcmp(tok, token_table[n].tok) == 0)
 168       break;
 169     if (n == 0)
 170       n = TOKEN_TABLE_SIZE - 1;
 171     else
 172       --n;
 173   }
 174   token_table[n].ti.set(typ, sk, oc);
 175 }
 176
 177
 178 token_info default_token_info;
 179
 180 const token_info *lookup_token(const char *start, const char *end)
 181 {
 182   unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
 183   for (;;) {
 184     if (token_table[n].tok == 0)
 185       break;
 186     if (strlen(token_table[n].tok) == size_t(end - start)
 187         && memcmp(token_table[n].tok, start, end - start) == 0)
 188       return &(token_table[n].ti);
 189     if (n == 0)
 190       n = TOKEN_TABLE_SIZE - 1;
 191     else
 192       --n;
 193   }
 194   return &default_token_info;
 195 }
 196
 197 static void init_ascii()
 198 {
 199   const char *p;
 200   for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
 201     char buf[2];
 202     buf[0] = *p;
 203     buf[1] = '\0';
 204     store_token(strsave(buf), TOKEN_LOWER);
 205     buf[0] = cmupper(buf[0]);
 206     store_token(strsave(buf), TOKEN_UPPER);
 207   }
 208   for (p = "0123456789"; *p; p++) {
 209     char buf[2];
 210     buf[0] = *p;
 211     buf[1] = '\0';
 212     const char *s = strsave(buf);
 213     store_token(s, TOKEN_OTHER, s);
 214   }
 215   for (p = ".,:;?!"; *p; p++) {
 216     char buf[2];
 217     buf[0] = *p;
 218     buf[1] = '\0';
 219     store_token(strsave(buf), TOKEN_PUNCT);
 220   }
 221   store_token("-", TOKEN_HYPHEN);
 222 }
 223
// Register a lower/upper pair of letter tokens.  Both share sort_key
// (0 means no explicit key) and each records the other spelling as its
// other-case form.
static void store_letter(const char *lower, const char *upper,
                  const char *sort_key = 0)
{
  // The lower-case form is stored first, then the upper-case form.
  store_token(lower, TOKEN_LOWER, sort_key, upper);
  store_token(upper, TOKEN_UPPER, sort_key, lower);
}
 230
 231 static void init_letter(unsigned char uc_code, unsigned char lc_code,
 232                  const char *sort_key)
 233 {
 234   char lbuf[2];
 235   lbuf[0] = lc_code;
 236   lbuf[1] = 0;
 237   char ubuf[2];
 238   ubuf[0] = uc_code;
 239   ubuf[1] = 0;
 240   store_letter(strsave(lbuf), strsave(ubuf), sort_key);
 241 }
 242
 243 static void init_latin1()
 244 {
 245   init_letter(0xc0, 0xe0, "a");
 246   init_letter(0xc1, 0xe1, "a");
 247   init_letter(0xc2, 0xe2, "a");
 248   init_letter(0xc3, 0xe3, "a");
 249   init_letter(0xc4, 0xe4, "a");
 250   init_letter(0xc5, 0xe5, "a");
 251   init_letter(0xc6, 0xe6, "ae");
 252   init_letter(0xc7, 0xe7, "c");
 253   init_letter(0xc8, 0xe8, "e");
 254   init_letter(0xc9, 0xe9, "e");
 255   init_letter(0xca, 0xea, "e");
 256   init_letter(0xcb, 0xeb, "e");
 257   init_letter(0xcc, 0xec, "i");
 258   init_letter(0xcd, 0xed, "i");
 259   init_letter(0xce, 0xee, "i");
 260   init_letter(0xcf, 0xef, "i");
 261
 262   init_letter(0xd0, 0xf0, "d");
 263   init_letter(0xd1, 0xf1, "n");
 264   init_letter(0xd2, 0xf2, "o");
 265   init_letter(0xd3, 0xf3, "o");
 266   init_letter(0xd4, 0xf4, "o");
 267   init_letter(0xd5, 0xf5, "o");
 268   init_letter(0xd6, 0xf6, "o");
 269   init_letter(0xd8, 0xf8, "o");
 270   init_letter(0xd9, 0xf9, "u");
 271   init_letter(0xda, 0xfa, "u");
 272   init_letter(0xdb, 0xfb, "u");
 273   init_letter(0xdc, 0xfc, "u");
 274   init_letter(0xdd, 0xfd, "y");
 275   init_letter(0xde, 0xfe, THORN_SORT_KEY);
 276
 277   store_token("\337", TOKEN_LOWER, "ss", "SS");
 278   store_token("\377", TOKEN_LOWER, "y", "Y");
 279 }
 280
 281 static void init_two_char_letter(char l1, char l2, char u1, char u2,
 282                                  const char *sk = 0)
 283 {
 284   char buf[6];
 285   buf[0] = '\\';
 286   buf[1] = '(';
 287   buf[2] = l1;
 288   buf[3] = l2;
 289   buf[4] = '\0';
 290   const char *p = strsave(buf);
 291   buf[2] = u1;
 292   buf[3] = u2;
 293   store_letter(p, strsave(buf), sk);
 294   buf[1] = '[';
 295   buf[4] = ']';
 296   buf[5] = '\0';
 297   p = strsave(buf);
 298   buf[2] = l1;
 299   buf[3] = l2;
 300   store_letter(strsave(buf), p, sk);
 301
 302 }
 303
 304 static void init_special_chars()
 305 {
 306   const char *p;
 307   for (p = "':^`~"; *p; p++)
 308     for (const char *q = "aeiouy"; *q; q++) {
 309       // Use a variable to work around bug in gcc 2.0
 310       char c = cmupper(*q);
 311       init_two_char_letter(*p, *q, *p, c);
 312     }
 313   for (p = "/l/o~n,coeaeij"; *p; p += 2) {
 314     // Use variables to work around bug in gcc 2.0
 315     char c0 = cmupper(p[0]);
 316     char c1 = cmupper(p[1]);
 317     init_two_char_letter(p[0], p[1], c0, c1);
 318   }
 319   init_two_char_letter('v', 's', 'v', 'S', "s");
 320   init_two_char_letter('v', 'z', 'v', 'Z', "z");
 321   init_two_char_letter('o', 'a', 'o', 'A', "a");
 322   init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
 323   init_two_char_letter('-', 'd', '-', 'D');
 324
 325   store_token("\\(ss", TOKEN_LOWER, 0, "SS");
 326   store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
 327
 328   store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
 329   store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
 330   store_token("\\(hy", TOKEN_HYPHEN);
 331   store_token("\\[hy]", TOKEN_HYPHEN);
 332   store_token("\\(en", TOKEN_RANGE_SEP);
 333   store_token("\\[en]", TOKEN_RANGE_SEP);
 334 }
 335
 336 static void init_strings()
 337 {
 338   char buf[6];
 339   buf[0] = '\\';
 340   buf[1] = '*';
 341   for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
 342     buf[2] = *p;
 343     buf[3] = '\0';
 344     store_token(strsave(buf), TOKEN_ACCENT);
 345     buf[2] = '[';
 346     buf[3] = *p;
 347     buf[4] = ']';
 348     buf[5] = '\0';
 349     store_token(strsave(buf), TOKEN_ACCENT);
 350   }
 351
 352   // -ms special letters
 353   store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
 354   store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
 355   store_letter("\\*(d-", "\\*(D-");
 356   store_letter("\\*[d-]", "\\*[D-]");
 357   store_letter("\\*(ae", "\\*(Ae", "ae");
 358   store_letter("\\*[ae]", "\\*[Ae]", "ae");
 359   store_letter("\\*(oe", "\\*(Oe", "oe");
 360   store_letter("\\*[oe]", "\\*[Oe]", "oe");
 361
 362   store_token("\\*3", TOKEN_LOWER, "y", "Y");
 363   store_token("\\*8", TOKEN_LOWER, "ss", "SS");
 364   store_token("\\*q", TOKEN_LOWER, "o", "O");
 365 }
 366
 367 struct token_initer {
 368   token_initer();
 369 };
 370
 371 static token_initer the_token_initer;
 372
 373 token_initer::token_initer()
 374 {
 375   init_ascii();
 376   init_latin1();
 377   init_special_chars();
 378   init_strings();
 379   default_token_info.set(TOKEN_OTHER);
 380 }