No empty .Rs/.Re
[netbsd-mini2440.git] / gnu / dist / groff / src / preproc / refer / token.cpp
blob5f39b7531ddcbd17fdf295189502b3bfa662ab42
1 /* $NetBSD$ */
3 // -*- C++ -*-
4 /* Copyright (C) 1989, 1990, 1991, 1992, 2001 Free Software Foundation, Inc.
5 Written by James Clark (jjc@jclark.com)
7 This file is part of groff.
9 groff is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 2, or (at your option) any later
12 version.
14 groff is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
19 You should have received a copy of the GNU General Public License along
20 with groff; see the file COPYING. If not, write to the Free Software
21 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
23 #include "refer.h"
24 #include "token.h"
// Number of slots in the token hash table (1009 is a prime number).
#define TOKEN_TABLE_SIZE 1009
// I believe that in Icelandic thorn sorts after z; "{" is the ASCII
// character that immediately follows 'z'.
#define THORN_SORT_KEY "{"
30 struct token_table_entry {
31 const char *tok;
32 token_info ti;
33 token_table_entry();
36 token_table_entry token_table[TOKEN_TABLE_SIZE];
37 int ntokens = 0;
// Advance *ptr past one name, never moving beyond end.
//   "(xy"   - the parenthesis plus up to two following characters
//   "[...]" - everything up to and including the closing ']' (or up to
//             end when there is no ']')
//   other   - just that single character
// Nothing happens when *ptr is already at end.
static void skip_name(const char **ptr, const char *end)
{
  const char *cur = *ptr;
  if (cur >= end)
    return;
  const char opener = *cur++;
  if (opener == '(') {
    // Consume at most two characters of the name.
    for (int remaining = 2; remaining > 0 && cur < end; --remaining)
      ++cur;
  }
  else if (opener == '[') {
    while (cur < end) {
      const char ch = *cur++;
      if (ch == ']')
        break;
    }
  }
  *ptr = cur;
}
59 int get_token(const char **ptr, const char *end)
61 if (*ptr >= end)
62 return 0;
63 char c = *(*ptr)++;
64 if (c == '\\' && *ptr < end) {
65 switch (**ptr) {
66 default:
67 *ptr += 1;
68 break;
69 case '(':
70 case '[':
71 skip_name(ptr, end);
72 break;
73 case '*':
74 case 'f':
75 *ptr += 1;
76 skip_name(ptr, end);
77 break;
80 return 1;
83 token_info::token_info()
84 : type(TOKEN_OTHER), sort_key(0), other_case(0)
88 void token_info::set(token_type t, const char *sk, const char *oc)
90 assert(oc == 0 || t == TOKEN_UPPER || t == TOKEN_LOWER);
91 type = t;
92 sort_key = sk;
93 other_case = oc;
96 void token_info::sortify(const char *start, const char *end, string &result)
97 const
99 if (sort_key)
100 result += sort_key;
101 else if (type == TOKEN_UPPER || type == TOKEN_LOWER) {
102 for (; start < end; start++)
103 if (csalpha(*start))
104 result += cmlower(*start);
108 int token_info::sortify_non_empty(const char *start, const char *end) const
110 if (sort_key)
111 return *sort_key != '\0';
112 if (type != TOKEN_UPPER && type != TOKEN_LOWER)
113 return 0;
114 for (; start < end; start++)
115 if (csalpha(*start))
116 return 1;
117 return 0;
121 void token_info::lower_case(const char *start, const char *end,
122 string &result) const
124 if (type != TOKEN_UPPER) {
125 while (start < end)
126 result += *start++;
128 else if (other_case)
129 result += other_case;
130 else {
131 while (start < end)
132 result += cmlower(*start++);
136 void token_info::upper_case(const char *start, const char *end,
137 string &result) const
139 if (type != TOKEN_LOWER) {
140 while (start < end)
141 result += *start++;
143 else if (other_case)
144 result += other_case;
145 else {
146 while (start < end)
147 result += cmupper(*start++);
151 token_table_entry::token_table_entry()
152 : tok(0)
156 static void store_token(const char *tok, token_type typ,
157 const char *sk = 0, const char *oc = 0)
159 unsigned n = hash_string(tok, strlen(tok)) % TOKEN_TABLE_SIZE;
160 for (;;) {
161 if (token_table[n].tok == 0) {
162 if (++ntokens == TOKEN_TABLE_SIZE)
163 assert(0);
164 token_table[n].tok = tok;
165 break;
167 if (strcmp(tok, token_table[n].tok) == 0)
168 break;
169 if (n == 0)
170 n = TOKEN_TABLE_SIZE - 1;
171 else
172 --n;
174 token_table[n].ti.set(typ, sk, oc);
178 token_info default_token_info;
180 const token_info *lookup_token(const char *start, const char *end)
182 unsigned n = hash_string(start, end - start) % TOKEN_TABLE_SIZE;
183 for (;;) {
184 if (token_table[n].tok == 0)
185 break;
186 if (strlen(token_table[n].tok) == size_t(end - start)
187 && memcmp(token_table[n].tok, start, end - start) == 0)
188 return &(token_table[n].ti);
189 if (n == 0)
190 n = TOKEN_TABLE_SIZE - 1;
191 else
192 --n;
194 return &default_token_info;
197 static void init_ascii()
199 const char *p;
200 for (p = "abcdefghijklmnopqrstuvwxyz"; *p; p++) {
201 char buf[2];
202 buf[0] = *p;
203 buf[1] = '\0';
204 store_token(strsave(buf), TOKEN_LOWER);
205 buf[0] = cmupper(buf[0]);
206 store_token(strsave(buf), TOKEN_UPPER);
208 for (p = "0123456789"; *p; p++) {
209 char buf[2];
210 buf[0] = *p;
211 buf[1] = '\0';
212 const char *s = strsave(buf);
213 store_token(s, TOKEN_OTHER, s);
215 for (p = ".,:;?!"; *p; p++) {
216 char buf[2];
217 buf[0] = *p;
218 buf[1] = '\0';
219 store_token(strsave(buf), TOKEN_PUNCT);
221 store_token("-", TOKEN_HYPHEN);
224 static void store_letter(const char *lower, const char *upper,
225 const char *sort_key = 0)
227 store_token(lower, TOKEN_LOWER, sort_key, upper);
228 store_token(upper, TOKEN_UPPER, sort_key, lower);
231 static void init_letter(unsigned char uc_code, unsigned char lc_code,
232 const char *sort_key)
234 char lbuf[2];
235 lbuf[0] = lc_code;
236 lbuf[1] = 0;
237 char ubuf[2];
238 ubuf[0] = uc_code;
239 ubuf[1] = 0;
240 store_letter(strsave(lbuf), strsave(ubuf), sort_key);
243 static void init_latin1()
245 init_letter(0xc0, 0xe0, "a");
246 init_letter(0xc1, 0xe1, "a");
247 init_letter(0xc2, 0xe2, "a");
248 init_letter(0xc3, 0xe3, "a");
249 init_letter(0xc4, 0xe4, "a");
250 init_letter(0xc5, 0xe5, "a");
251 init_letter(0xc6, 0xe6, "ae");
252 init_letter(0xc7, 0xe7, "c");
253 init_letter(0xc8, 0xe8, "e");
254 init_letter(0xc9, 0xe9, "e");
255 init_letter(0xca, 0xea, "e");
256 init_letter(0xcb, 0xeb, "e");
257 init_letter(0xcc, 0xec, "i");
258 init_letter(0xcd, 0xed, "i");
259 init_letter(0xce, 0xee, "i");
260 init_letter(0xcf, 0xef, "i");
262 init_letter(0xd0, 0xf0, "d");
263 init_letter(0xd1, 0xf1, "n");
264 init_letter(0xd2, 0xf2, "o");
265 init_letter(0xd3, 0xf3, "o");
266 init_letter(0xd4, 0xf4, "o");
267 init_letter(0xd5, 0xf5, "o");
268 init_letter(0xd6, 0xf6, "o");
269 init_letter(0xd8, 0xf8, "o");
270 init_letter(0xd9, 0xf9, "u");
271 init_letter(0xda, 0xfa, "u");
272 init_letter(0xdb, 0xfb, "u");
273 init_letter(0xdc, 0xfc, "u");
274 init_letter(0xdd, 0xfd, "y");
275 init_letter(0xde, 0xfe, THORN_SORT_KEY);
277 store_token("\337", TOKEN_LOWER, "ss", "SS");
278 store_token("\377", TOKEN_LOWER, "y", "Y");
281 static void init_two_char_letter(char l1, char l2, char u1, char u2,
282 const char *sk = 0)
284 char buf[6];
285 buf[0] = '\\';
286 buf[1] = '(';
287 buf[2] = l1;
288 buf[3] = l2;
289 buf[4] = '\0';
290 const char *p = strsave(buf);
291 buf[2] = u1;
292 buf[3] = u2;
293 store_letter(p, strsave(buf), sk);
294 buf[1] = '[';
295 buf[4] = ']';
296 buf[5] = '\0';
297 p = strsave(buf);
298 buf[2] = l1;
299 buf[3] = l2;
300 store_letter(strsave(buf), p, sk);
304 static void init_special_chars()
306 const char *p;
307 for (p = "':^`~"; *p; p++)
308 for (const char *q = "aeiouy"; *q; q++) {
309 // Use a variable to work around bug in gcc 2.0
310 char c = cmupper(*q);
311 init_two_char_letter(*p, *q, *p, c);
313 for (p = "/l/o~n,coeaeij"; *p; p += 2) {
314 // Use variables to work around bug in gcc 2.0
315 char c0 = cmupper(p[0]);
316 char c1 = cmupper(p[1]);
317 init_two_char_letter(p[0], p[1], c0, c1);
319 init_two_char_letter('v', 's', 'v', 'S', "s");
320 init_two_char_letter('v', 'z', 'v', 'Z', "z");
321 init_two_char_letter('o', 'a', 'o', 'A', "a");
322 init_two_char_letter('T', 'p', 'T', 'P', THORN_SORT_KEY);
323 init_two_char_letter('-', 'd', '-', 'D');
325 store_token("\\(ss", TOKEN_LOWER, 0, "SS");
326 store_token("\\[ss]", TOKEN_LOWER, 0, "SS");
328 store_token("\\(Sd", TOKEN_LOWER, "d", "\\(-D");
329 store_token("\\[Sd]", TOKEN_LOWER, "d", "\\[-D]");
330 store_token("\\(hy", TOKEN_HYPHEN);
331 store_token("\\[hy]", TOKEN_HYPHEN);
332 store_token("\\(en", TOKEN_RANGE_SEP);
333 store_token("\\[en]", TOKEN_RANGE_SEP);
336 static void init_strings()
338 char buf[6];
339 buf[0] = '\\';
340 buf[1] = '*';
341 for (const char *p = "'`^^,:~v_o./;"; *p; p++) {
342 buf[2] = *p;
343 buf[3] = '\0';
344 store_token(strsave(buf), TOKEN_ACCENT);
345 buf[2] = '[';
346 buf[3] = *p;
347 buf[4] = ']';
348 buf[5] = '\0';
349 store_token(strsave(buf), TOKEN_ACCENT);
352 // -ms special letters
353 store_letter("\\*(th", "\\*(Th", THORN_SORT_KEY);
354 store_letter("\\*[th]", "\\*[Th]", THORN_SORT_KEY);
355 store_letter("\\*(d-", "\\*(D-");
356 store_letter("\\*[d-]", "\\*[D-]");
357 store_letter("\\*(ae", "\\*(Ae", "ae");
358 store_letter("\\*[ae]", "\\*[Ae]", "ae");
359 store_letter("\\*(oe", "\\*(Oe", "oe");
360 store_letter("\\*[oe]", "\\*[Oe]", "oe");
362 store_token("\\*3", TOKEN_LOWER, "y", "Y");
363 store_token("\\*8", TOKEN_LOWER, "ss", "SS");
364 store_token("\\*q", TOKEN_LOWER, "o", "O");
// Helper type whose constructor fills the token table.
struct token_initer {
  token_initer();
};
371 static token_initer the_token_initer;
373 token_initer::token_initer()
375 init_ascii();
376 init_latin1();
377 init_special_chars();
378 init_strings();
379 default_token_info.set(TOKEN_OTHER);