/* Largest code point accepted for a numeric character reference. */
#define UNICODE_MAX 0x10FFFFul
/*
 * Named character references and their UTF-8 replacement text.
 *
 * Column 0 is the entity name including its terminating ';', column 1 is
 * the replacement. The table is searched with bsearch(), so rows must stay
 * sorted by name in strcmp() order.
 *
 * NOTE(review): only the rows present in this excerpt are listed here;
 * confirm against the complete entity list before relying on coverage.
 */
static const char *named_entities[][2] = {
	{ "lrm;", "\xE2\x80\x8E" },
	{ "rlm;", "\xE2\x80\x8F" },
	{ "shy;", "\xC2\xAD" },
	{ "thetasym;", "ϑ" },
	{ "zwj;", "\xE2\x80\x8D" },
	{ "zwnj;", "\xE2\x80\x8C" }
};
/*
 * bsearch() comparator for the entity table.
 *
 * key     - text to look up (the characters following '&').
 * element - pointer to a table row; the row's first string is the name.
 *
 * Only the first strlen(name) characters of the key are compared, so the
 * key may continue past the entity name (it is a prefix match).
 * Returns <0, 0 or >0 exactly like strncmp().
 */
static int cmp(const void *key, const void *element)
{
	const char *name = *(const char *const *)element;

	return strncmp((const char *)key, name, strlen(name));
}
272 static const char *get_named_entity(const char *name
)
274 const char **entity
= bsearch(name
, named_entities
,
275 sizeof(named_entities
) / sizeof(*named_entities
),
276 sizeof(*named_entities
), cmp
);
278 return entity
? entity
[1] : NULL
;
/*
 * Encode one code point as UTF-8.
 *
 * cp     - code point to encode.
 * buffer - destination; must have room for up to 4 bytes. No terminator
 *          is written.
 *
 * Returns the number of bytes stored (1-4), or 0 when cp is above
 * 0x10FFFF and nothing was written. Surrogate values are not filtered.
 */
static size_t putc_utf8(unsigned long cp, char *buffer)
{
	unsigned char *bytes = (unsigned char *)buffer;

	if (cp <= 0x007Ful) {
		bytes[0] = (unsigned char)cp;
		return 1;
	}

	if (cp <= 0x07FFul) {
		bytes[1] = (unsigned char)((2u << 6) | (cp & 0x3Fu));
		bytes[0] = (unsigned char)((6u << 5) | (cp >> 6));
		return 2;
	}

	if (cp <= 0xFFFFul) {
		bytes[2] = (unsigned char)((2u << 6) | (cp & 0x3Fu));
		bytes[1] = (unsigned char)((2u << 6) | ((cp >> 6) & 0x3Fu));
		bytes[0] = (unsigned char)((14u << 4) | (cp >> 12));
		return 3;
	}

	if (cp <= 0x10FFFFul) {
		bytes[3] = (unsigned char)((2u << 6) | (cp & 0x3Fu));
		bytes[2] = (unsigned char)((2u << 6) | ((cp >> 6) & 0x3Fu));
		bytes[1] = (unsigned char)((2u << 6) | ((cp >> 12) & 0x3Fu));
		bytes[0] = (unsigned char)((30u << 3) | (cp >> 18));
		return 4;
	}

	return 0;
}
318 static _Bool
parse_entity(const char *current
, char **to
,
321 const char *end
= strchr(current
, ';');
324 if(current
[1] == '#')
329 _Bool hex
= current
[2] == 'x' || current
[2] == 'X';
331 unsigned long cp
= strtoul(
332 current
+ (hex
? 3 : 2), &tail
, hex
? 16 : 10);
334 if(tail
== end
&& !errno
&& cp
<= UNICODE_MAX
)
336 *to
+= putc_utf8(cp
, *to
);
344 const char *entity
= get_named_entity(¤t
[1]);
347 size_t len
= strlen(entity
);
348 memcpy(*to
, entity
, len
);
/*
 * Decode HTML character references in `src` into UTF-8 text in `dest`.
 *
 * dest - output buffer; must be able to hold the decoded text plus a
 *        terminating NUL.
 * src  - NUL-terminated input. When NULL, `dest` is decoded in place.
 *
 * Text that is not a recognised reference (including a lone '&') is
 * copied through unchanged. Returns the length of the decoded string,
 * not counting the terminating NUL.
 */
size_t decode_html_entities_utf8(char *dest, const char *src)
{
	if (!src)
		src = dest;

	char *to = dest;
	const char *from = src;
	const char *current;

	while ((current = strchr(from, '&'))) {
		/* memmove, not memcpy: the regions overlap when decoding in place. */
		size_t literal = (size_t)(current - from);
		memmove(to, from, literal);
		to += literal;

		if (parse_entity(current, &to, &from))
			continue;

		/* Not a valid reference: emit the '&' verbatim and resume after it. */
		from = current;
		*to++ = *from++;
	}

	size_t remaining = strlen(from);
	memmove(to, from, remaining);
	to += remaining;
	*to = 0;

	return (size_t)(to - dest);
}