Prepare for release 1.6.1.
[xombrero.git] / tldlist.c
blob4b6ea9345f30ec76ca1c2d5c2c73df8c228652c0
1 /*
2 * Copyright (c) 2012 Elias Norberg <xyzzy@kudzu.se>
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 #include <xombrero.h>
19 #define TLD_TREE_END_NODE 1
20 #define TLD_TREE_EXCEPTION 2
22 struct tld_tree_node {
23 struct tld_tree_node *next;
24 struct tld_tree_node *child;
25 const char *lbl;
26 char flags;
29 struct tld_tree_node tld_tree_root = { NULL, NULL, "" };
31 #define TREE_INSERT_CHILD(n, data) \
32 n->child = g_malloc(sizeof *n); \
33 n->child->next = NULL; \
34 n->child->child = NULL; \
35 n->child->flags = 0; \
36 n->child->lbl = data;
38 #define TREE_INSERT_NEXT(n, data) \
39 n->next = g_malloc(sizeof *n); \
40 n->next->next = NULL; \
41 n->next->child = NULL; \
42 n->next->flags = 0; \
43 n->next->lbl = data;
45 #define P_BASE (36)
46 #define P_TMIN (1)
47 #define P_TMAX (26)
48 #define P_SKEW (38)
49 #define P_DAMP (700)
50 #define INITIAL_BIAS (72)
51 #define INITIAL_N (128)
53 int
54 adapt(int delta, int numpoints, int firsttime)
56 int k;
58 if (firsttime)
59 delta = delta / P_DAMP;
60 else
61 delta = delta / 2;
63 delta += (delta / numpoints);
65 k = 0;
66 while (delta > (((P_BASE - P_TMIN) * P_TMAX) / 2)) {
67 delta = delta / (P_BASE - P_TMIN);
68 k += P_BASE;
71 k += (((P_BASE - P_TMIN + 1) * delta) / (delta + P_SKEW));
72 return (k);
75 int
76 get_minimum_char(char *str, int n)
78 gunichar ch = 0;
79 gunichar min = 0xffffff;
81 for(; *str; str = g_utf8_next_char(str)) {
82 ch = g_utf8_get_char(str);
83 if (ch >= n && ch < min)
84 min = ch;
87 return (min);
90 char
91 encode_digit(int n)
93 if (n < 26)
94 return n + 'a';
95 return (n - 26) + '0';
98 char *
99 punycode_encode(char *str)
101 char output[1024];
102 char *s;
103 gunichar c;
104 int need_coding = 0;
105 int l, len, i;
107 int n = INITIAL_N;
108 int delta = 0;
109 int bias = INITIAL_BIAS;
110 int h, b, m, k, t, q;
112 l = 0;
113 for (s=str; *s; s = g_utf8_next_char(s)) {
114 c = g_utf8_get_char(s);
115 if (c < 128)
116 output[l++] = *s;
117 else
118 need_coding = 1;
121 output[l] = '\0';
123 if (!need_coding)
124 return g_strdup(output);
126 h = b = strlen(output);
128 if (l > 0)
129 output[l++] = '-';
130 output[l] = '\0';
132 len = g_utf8_strlen(str, -1);
133 while (h < len) {
134 m = get_minimum_char(str, n);
135 delta += (m - n) * (h + 1);
136 n = m;
137 for (s=str; *s; s = g_utf8_next_char(s)) {
138 c = g_utf8_get_char(s);
140 if (c < n) delta ++;
141 if (c == n) {
142 q = delta;
143 for (k=P_BASE;; k+=P_BASE) {
144 if (k <= bias)
145 t = P_TMIN;
146 else if(k >= bias + P_TMAX)
147 t = P_TMAX;
148 else
149 t = k - bias;
151 if (q < t)
152 break;
154 output[l++] = encode_digit(t+((q-t)%(P_BASE-t)));
155 q = (q - t) / (P_BASE - t);
157 output[l++] = encode_digit(q);
158 bias = adapt(delta, h + 1, h == b);
159 delta = 0;
160 h ++;
163 delta ++;
164 n ++;
167 output[l] = '\0';
168 for (i=l+4;i>=4;i--)
169 output[i] = output[i-4];
170 l += 4;
171 output[0] = 'x';
172 output[1] = 'n';
173 output[2] = '-';
174 output[3] = '-';
175 output[l] = '\0';
176 return g_strdup(output);
180 * strrchr2(str, saveptr, ch)
182 * Walk backwards through str, jumping to next 'ch'
183 * On first call, *saveptr should be set to NULL.
184 * On following calls, supply the same saveptr.
186 * Returns NULL when the whole string 'str' has been
187 * looped through. Otherwise returns the position
188 * before the next 'ch'.
190 const char *
191 strrchr2(const char *str, const char **saveptr, int ch)
193 const char *ptr;
195 if (str != NULL && *saveptr == NULL) {
196 *saveptr = str + strlen(str);
197 } else if (str == *saveptr) {
198 return (NULL);
201 for (ptr= *saveptr - 1; ptr != str && *ptr != ch; ptr--)
204 *saveptr = ptr;
205 if (ptr != str)
206 return (ptr+1);
207 return (ptr);
211 * tld_tree_add(rule)
213 * Adds a tld-rule to the tree
215 void
216 tld_tree_add(const char *rule)
218 struct tld_tree_node *n;
219 const char *lbl;
220 const char *saveptr;
222 saveptr = NULL;
223 lbl = strrchr2(rule, &saveptr, '.');
225 for (n = &tld_tree_root; lbl != NULL;) {
227 if (strcmp(n->lbl, lbl) == 0) {
228 lbl = strrchr2(rule, &saveptr, '.');
230 if (!n->child)
231 break;
233 n = n->child;
234 continue;
237 if (n->next == NULL) {
238 TREE_INSERT_NEXT(n, lbl);
239 n = n->next;
241 lbl = strrchr2(rule, &saveptr, '.');
242 break;
244 n = n->next;
247 while (lbl) {
248 TREE_INSERT_CHILD(n, lbl);
250 lbl = strrchr2(rule, &saveptr, '.');
251 n = n->child;
254 n->flags |= TLD_TREE_END_NODE;
255 if (n->lbl[0] == '!') {
256 n->flags |= TLD_TREE_EXCEPTION;
257 n->lbl ++;
261 void
262 tld_tree_init()
264 FILE *fd;
265 char buf[1024];
266 char file[PATH_MAX];
267 char *ptr, *next_lbl;
268 char *enc_lbl;
269 char *rule, *rp;
270 char extra_ch;
272 snprintf(file, sizeof file, "%s" PS "tld-rules", resource_dir);
273 fd = fopen(file, "r");
274 if (fd == NULL) {
275 /* a poor replacement for the real list - but it's
276 * better than nothing.
278 tld_tree_add("*");
279 startpage_add("Could not open %s: this file is required "
280 "to handle TLD whitelisting properly", file);
281 return;
284 for (;;) {
285 ptr = fgets(buf, sizeof buf, fd);
286 if (ptr == NULL || feof(fd))
287 break;
289 /* skip comments */
290 if ((ptr = strstr(buf, "//")) != NULL)
291 *ptr = '\0';
292 /* skip anything after space or tab */
293 for (ptr = buf; *ptr; ptr++)
294 if (*ptr == ' ' || *ptr == '\t' ||
295 *ptr == '\n' || *ptr == '\r')
296 break;
297 *ptr = '\0';
299 if (!strlen(buf))
300 continue;
302 extra_ch = 0;
303 ptr = buf;
304 if (buf[0] == '!' && buf[0] == '*') {
305 extra_ch = buf[0];
306 ptr ++;
310 rule = NULL;
311 /* split into labels, and convert them one by one */
312 for (;;) {
313 if ((next_lbl = strchr(ptr, '.')))
314 *next_lbl = '\0';
316 enc_lbl = punycode_encode(ptr);
317 if (rule) {
318 rp = rule;
319 rule = g_strdup_printf("%s%s%s", rp, enc_lbl,
320 next_lbl ? "." : "");
321 g_free(rp);
322 g_free(enc_lbl);
323 } else {
324 rule = g_strdup_printf("%.1s%s%s",
325 extra_ch ? buf:"", enc_lbl,
326 next_lbl ? ".":"");
327 g_free(enc_lbl);
330 if (!next_lbl)
331 break;
332 ptr = next_lbl + 1;
334 tld_tree_add(rule);
337 fclose(fd);
341 * tld_get_suffix(domain)
343 * Find the public suffix for domain.
345 * Returns a pointer to the suffix position
346 * in domain, or NULL if no public suffix
347 * was found.
349 char *
350 tld_get_suffix(const char *domain)
352 struct tld_tree_node *n;
353 const char *suffix;
354 const char *lbl, *saveptr;
355 const char *tmp_saveptr, *tmp_lbl;
357 if (domain == NULL)
358 return (NULL);
359 if (domain[0] == '.')
360 return (NULL);
362 saveptr = NULL;
363 suffix = NULL;
364 lbl = strrchr2(domain, &saveptr, '.');
366 for (n = &tld_tree_root; n != NULL && lbl != NULL;) {
368 if (!strlen(n->lbl)) {
369 n = n->next;
370 continue;
373 if (n->lbl[0] == '*') {
374 if (n->flags & TLD_TREE_END_NODE) {
376 tmp_saveptr = saveptr;
377 tmp_lbl = lbl;
379 lbl = strrchr2(domain, &saveptr, '.');
381 /* Save possible public suffix */
382 suffix = lbl;
383 saveptr = tmp_saveptr;
384 lbl = tmp_lbl;
387 n = n->next;
388 continue;
391 if (strcmp(n->lbl, lbl) == 0) {
392 if (n->flags & TLD_TREE_EXCEPTION) {
393 /* We're done looking */
394 suffix = lbl;
395 break;
398 lbl = strrchr2(domain, &saveptr, '.');
400 /* Possible public suffix - other rules might
401 * still apply
403 if (n->flags & TLD_TREE_END_NODE)
404 suffix = lbl;
406 /* Domain too short */
407 if (lbl == NULL) {
408 /* Check if we have a child that is '*' */
409 for (n = n->child; n; n = n->next)
410 if (n->lbl[0] == '*')
411 suffix = NULL;
412 break;
415 if (n->child == NULL)
416 break;
418 n = n->child;
419 continue;
422 if (n->next == NULL)
423 break;
424 n = n->next;
427 /* If we can't find a matching suffix, it can mean that either
428 * a) the user is surfing a local prefix
429 * b) the list is not properly updated
431 * In any case - in order not to break stuff while surfing
432 * new TLD's, we return the public suffix as the top 2 labels
434 * www.abc.xyz therefore has public suffix 'abc.xyz'
436 if (!suffix) {
437 saveptr = NULL;
438 strrchr2(domain, &saveptr, '.');
439 lbl = strrchr2(domain, &saveptr, '.');
440 suffix = lbl;
443 return ((char*)suffix);