add build bug on newline in action and privmsg
[fillybot.git] / entities.c
blob6332bb76608a0d01a77f756ac00c180644fca9b2
1 #include "entities.h"
3 #include <errno.h>
4 #include <string.h>
5 #include <stdlib.h>
7 #define UNICODE_MAX 0x10FFFFul
/*
 * Named character references and their UTF-8 replacement text.
 *
 * Column 0 is the reference name without the leading '&' but including
 * the terminating ';'; column 1 is the replacement encoded as UTF-8.
 *
 * The table is searched with bsearch(), so the rows MUST stay sorted by
 * the byte values of column 0 (strcmp order).  Note that digits sort
 * before ';', which sorts before letters: "sup1;" < "sup;" < "supe;".
 *
 * Invisible characters (spaces, joiners, directional marks, soft hyphen)
 * are written as hex escapes so they cannot be silently altered by an
 * editor or be mistaken for an ASCII space.
 */
static const char *named_entities[][2] =
{
	{ "AElig;", "Æ" },
	{ "Aacute;", "Á" },
	{ "Acirc;", "Â" },
	{ "Agrave;", "À" },
	{ "Alpha;", "Α" },
	{ "Aring;", "Å" },
	{ "Atilde;", "Ã" },
	{ "Auml;", "Ä" },
	{ "Beta;", "Β" },
	{ "Ccedil;", "Ç" },
	{ "Chi;", "Χ" },
	{ "Dagger;", "‡" },
	{ "Delta;", "Δ" },
	{ "ETH;", "Ð" },
	{ "Eacute;", "É" },
	{ "Ecirc;", "Ê" },
	{ "Egrave;", "È" },
	{ "Epsilon;", "Ε" },
	{ "Eta;", "Η" },
	{ "Euml;", "Ë" },
	{ "Gamma;", "Γ" },
	{ "Iacute;", "Í" },
	{ "Icirc;", "Î" },
	{ "Igrave;", "Ì" },
	{ "Iota;", "Ι" },
	{ "Iuml;", "Ï" },
	{ "Kappa;", "Κ" },
	{ "Lambda;", "Λ" },
	{ "Mu;", "Μ" },
	{ "Ntilde;", "Ñ" },
	{ "Nu;", "Ν" },
	{ "OElig;", "Œ" },
	{ "Oacute;", "Ó" },
	{ "Ocirc;", "Ô" },
	{ "Ograve;", "Ò" },
	{ "Omega;", "Ω" },
	{ "Omicron;", "Ο" },
	{ "Oslash;", "Ø" },
	{ "Otilde;", "Õ" },
	{ "Ouml;", "Ö" },
	{ "Phi;", "Φ" },
	{ "Pi;", "Π" },
	{ "Prime;", "″" },
	{ "Psi;", "Ψ" },
	{ "Rho;", "Ρ" },
	{ "Scaron;", "Š" },
	{ "Sigma;", "Σ" },
	{ "THORN;", "Þ" },
	{ "Tau;", "Τ" },
	{ "Theta;", "Θ" },
	{ "Uacute;", "Ú" },
	{ "Ucirc;", "Û" },
	{ "Ugrave;", "Ù" },
	{ "Upsilon;", "Υ" },
	{ "Uuml;", "Ü" },
	{ "Xi;", "Ξ" },
	{ "Yacute;", "Ý" },
	{ "Yuml;", "Ÿ" },
	{ "Zeta;", "Ζ" },
	{ "aacute;", "á" },
	{ "acirc;", "â" },
	{ "acute;", "´" },
	{ "aelig;", "æ" },
	{ "agrave;", "à" },
	{ "alefsym;", "ℵ" },
	{ "alpha;", "α" },
	{ "amp;", "&" },
	{ "and;", "∧" },
	{ "ang;", "∠" },
	{ "apos;", "'" },
	{ "aring;", "å" },
	{ "asymp;", "≈" },
	{ "atilde;", "ã" },
	{ "auml;", "ä" },
	{ "bdquo;", "„" },
	{ "beta;", "β" },
	{ "brvbar;", "¦" },
	{ "bull;", "•" },
	{ "cap;", "∩" },
	{ "ccedil;", "ç" },
	{ "cedil;", "¸" },
	{ "cent;", "¢" },
	{ "chi;", "χ" },
	{ "circ;", "ˆ" },
	{ "clubs;", "♣" },
	{ "cong;", "≅" },
	{ "copy;", "©" },
	{ "crarr;", "↵" },
	{ "cup;", "∪" },
	{ "curren;", "¤" },
	{ "dArr;", "⇓" },
	{ "dagger;", "†" },
	{ "darr;", "↓" },
	{ "deg;", "°" },
	{ "delta;", "δ" },
	{ "diams;", "♦" },
	{ "divide;", "÷" },
	{ "eacute;", "é" },
	{ "ecirc;", "ê" },
	{ "egrave;", "è" },
	{ "empty;", "∅" },
	{ "emsp;", "\xE2\x80\x83" },	/* U+2003 EM SPACE */
	{ "ensp;", "\xE2\x80\x82" },	/* U+2002 EN SPACE */
	{ "epsilon;", "ε" },
	{ "equiv;", "≡" },
	{ "eta;", "η" },
	{ "eth;", "ð" },
	{ "euml;", "ë" },
	{ "euro;", "€" },
	{ "exist;", "∃" },
	{ "fnof;", "ƒ" },
	{ "forall;", "∀" },
	{ "frac12;", "½" },
	{ "frac14;", "¼" },
	{ "frac34;", "¾" },
	{ "frasl;", "⁄" },
	{ "gamma;", "γ" },
	{ "ge;", "≥" },
	{ "gt;", ">" },
	{ "hArr;", "⇔" },
	{ "harr;", "↔" },
	{ "hearts;", "♥" },
	{ "hellip;", "…" },
	{ "iacute;", "í" },
	{ "icirc;", "î" },
	{ "iexcl;", "¡" },
	{ "igrave;", "ì" },
	{ "image;", "ℑ" },
	{ "infin;", "∞" },
	{ "int;", "∫" },
	{ "iota;", "ι" },
	{ "iquest;", "¿" },
	{ "isin;", "∈" },
	{ "iuml;", "ï" },
	{ "kappa;", "κ" },
	{ "lArr;", "⇐" },
	{ "lambda;", "λ" },
	{ "lang;", "〈" },
	{ "laquo;", "«" },
	{ "larr;", "←" },
	{ "lceil;", "⌈" },
	{ "ldquo;", "“" },
	{ "le;", "≤" },
	{ "lfloor;", "⌊" },
	{ "lowast;", "∗" },
	{ "loz;", "◊" },
	{ "lrm;", "\xE2\x80\x8E" },
	{ "lsaquo;", "‹" },
	{ "lsquo;", "‘" },
	{ "lt;", "<" },
	{ "macr;", "¯" },
	{ "mdash;", "—" },
	{ "micro;", "µ" },
	{ "middot;", "·" },
	{ "minus;", "−" },
	{ "mu;", "μ" },
	{ "nabla;", "∇" },
	{ "nbsp;", "\xC2\xA0" },	/* U+00A0 NO-BREAK SPACE */
	{ "ndash;", "–" },
	{ "ne;", "≠" },
	{ "ni;", "∋" },
	{ "not;", "¬" },
	{ "notin;", "∉" },
	{ "nsub;", "⊄" },
	{ "ntilde;", "ñ" },
	{ "nu;", "ν" },
	{ "oacute;", "ó" },
	{ "ocirc;", "ô" },
	{ "oelig;", "œ" },
	{ "ograve;", "ò" },
	{ "oline;", "‾" },
	{ "omega;", "ω" },
	{ "omicron;", "ο" },
	{ "oplus;", "⊕" },
	{ "or;", "∨" },
	{ "ordf;", "ª" },
	{ "ordm;", "º" },
	{ "oslash;", "ø" },
	{ "otilde;", "õ" },
	{ "otimes;", "⊗" },
	{ "ouml;", "ö" },
	{ "para;", "¶" },
	{ "part;", "∂" },
	{ "permil;", "‰" },
	{ "perp;", "⊥" },
	{ "phi;", "φ" },
	{ "pi;", "π" },
	{ "piv;", "ϖ" },
	{ "plusmn;", "±" },
	{ "pound;", "£" },
	{ "prime;", "′" },
	{ "prod;", "∏" },
	{ "prop;", "∝" },
	{ "psi;", "ψ" },
	{ "quot;", "\"" },
	{ "rArr;", "⇒" },
	{ "radic;", "√" },
	{ "rang;", "〉" },
	{ "raquo;", "»" },
	{ "rarr;", "→" },
	{ "rceil;", "⌉" },
	{ "rdquo;", "”" },
	{ "real;", "ℜ" },
	{ "reg;", "®" },
	{ "rfloor;", "⌋" },
	{ "rho;", "ρ" },
	{ "rlm;", "\xE2\x80\x8F" },
	{ "rsaquo;", "›" },
	{ "rsquo;", "’" },
	{ "sbquo;", "‚" },
	{ "scaron;", "š" },
	{ "sdot;", "⋅" },
	{ "sect;", "§" },
	{ "shy;", "\xC2\xAD" },
	{ "sigma;", "σ" },
	{ "sigmaf;", "ς" },
	{ "sim;", "∼" },
	{ "spades;", "♠" },
	{ "sub;", "⊂" },
	{ "sube;", "⊆" },
	{ "sum;", "∑" },
	/* '1'..'3' sort before ';', so these precede "sup;". */
	{ "sup1;", "¹" },
	{ "sup2;", "²" },
	{ "sup3;", "³" },
	{ "sup;", "⊃" },
	{ "supe;", "⊇" },
	{ "szlig;", "ß" },
	{ "tau;", "τ" },
	{ "there4;", "∴" },
	{ "theta;", "θ" },
	{ "thetasym;", "ϑ" },
	{ "thinsp;", "\xE2\x80\x89" },	/* U+2009 THIN SPACE */
	{ "thorn;", "þ" },
	{ "tilde;", "˜" },
	{ "times;", "×" },
	{ "trade;", "™" },
	{ "uArr;", "⇑" },
	{ "uacute;", "ú" },
	{ "uarr;", "↑" },
	{ "ucirc;", "û" },
	{ "ugrave;", "ù" },
	{ "uml;", "¨" },
	{ "upsih;", "ϒ" },
	{ "upsilon;", "υ" },
	{ "uuml;", "ü" },
	{ "weierp;", "℘" },
	{ "xi;", "ξ" },
	{ "yacute;", "ý" },
	{ "yen;", "¥" },
	{ "yuml;", "ÿ" },
	{ "zeta;", "ζ" },
	{ "zwj;", "\xE2\x80\x8D" },
	{ "zwnj;", "\xE2\x80\x8C" }
};
/*
 * bsearch() comparator: `key` is the text following '&', `element`
 * points at one { name, replacement } row.
 *
 * Only the first strlen(name) bytes of the key are compared, so a key
 * that merely starts with the row's name compares equal to it.
 */
static int cmp(const void *key, const void *element)
{
	const char *const *row = element;
	const char *name = row[0];

	return strncmp(key, name, strlen(name));
}
272 static const char *get_named_entity(const char *name)
274 const char **entity = bsearch(name, named_entities,
275 sizeof(named_entities) / sizeof(*named_entities),
276 sizeof(*named_entities), cmp);
278 return entity ? entity[1] : NULL;
/*
 * Encode the code point `cp` as UTF-8 into `buffer`.
 *
 * Returns the number of bytes written (1 to 4), or 0 without touching
 * the buffer when `cp` is above 0x10FFFF.  No terminating NUL is
 * written.  Surrogate code points are not filtered out here.
 */
static size_t putc_utf8(unsigned long cp, char *buffer)
{
	unsigned char *out = (unsigned char *)buffer;
	unsigned long lead;
	size_t length;

	if(cp <= 0x7Ful)
	{
		out[0] = (unsigned char)cp;
		return 1;
	}
	else if(cp <= 0x7FFul)
	{
		length = 2;
		lead = 0xC0ul;	/* 110xxxxx */
	}
	else if(cp <= 0xFFFFul)
	{
		length = 3;
		lead = 0xE0ul;	/* 1110xxxx */
	}
	else if(cp <= 0x10FFFFul)
	{
		length = 4;
		lead = 0xF0ul;	/* 11110xxx */
	}
	else
	{
		return 0;
	}

	/* Fill continuation bytes (10xxxxxx) from the last one backwards,
	 * consuming six payload bits each time. */
	for(size_t i = length - 1; i > 0; i--)
	{
		out[i] = (unsigned char)(0x80ul | (cp & 0x3Ful));
		cp >>= 6;
	}

	/* Whatever bits remain go into the lead byte. */
	out[0] = (unsigned char)(lead | cp);
	return length;
}
318 static _Bool parse_entity(const char *current, char **to,
319 const char **from)
321 const char *end = strchr(current, ';');
322 if(!end) return 0;
324 if(current[1] == '#')
326 char *tail = NULL;
327 errno = 0;
329 _Bool hex = current[2] == 'x' || current[2] == 'X';
331 unsigned long cp = strtoul(
332 current + (hex ? 3 : 2), &tail, hex ? 16 : 10);
334 if(tail == end && !errno && cp <= UNICODE_MAX)
336 *to += putc_utf8(cp, *to);
337 *from = end + 1;
339 return 1;
342 else
344 const char *entity = get_named_entity(&current[1]);
345 if(entity)
347 size_t len = strlen(entity);
348 memcpy(*to, entity, len);
350 *to += len;
351 *from = end + 1;
353 return 1;
357 return 0;
/*
 * Decode the HTML character references in `src` and store the result,
 * NUL-terminated, in `dest`.  Text that is not a recognised reference
 * (including a lone '&') is copied through unchanged.
 *
 * If `src` is NULL the string in `dest` is decoded in place.
 *
 * Returns the length of the decoded string, not counting the NUL.
 *
 * NOTE(review): the caller is assumed to provide room for at least
 * strlen(src) + 1 bytes in `dest`; no size is passed in, so this
 * function cannot check it.
 */
size_t decode_html_entities_utf8(char *dest, const char *src)
{
	if(!src) src = dest;

	char *to = dest;
	const char *from = src;

	const char *current;
	while((current = strchr(from, '&')))
	{
		/* memmove, not memcpy: when decoding in place `to` and `from`
		 * point into the same buffer (and are equal until the first
		 * reference has been replaced), so the regions may overlap. */
		memmove(to, from, (size_t)(current - from));
		to += current - from;

		if(parse_entity(current, &to, &from))
			continue;

		/* Not a reference: emit the '&' itself and resume after it. */
		from = current;
		*to++ = *from++;
	}

	size_t remaining = strlen(from);

	/* Same overlap consideration as above for the tail. */
	memmove(to, from, remaining);
	to += remaining;

	*to = 0;
	return (size_t)(to - dest);
}