1 /* $Id: preconv.c,v 1.15 2015/10/06 18:32:19 schwarze Exp $ */
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
20 #include <sys/types.h>
26 #include "libmandoc.h"
/*
 * preconv_encode: translate the input character found at
 * ib->buf + *ii into an escape of the form \[uXXXX] that is written
 * at ob->buf + *oi, advancing both offsets.
 *
 * NOTE(review): this listing is fragmentary; declarations, branch
 * bodies and return statements between the visible lines are not
 * shown, so only the visible statements are described here.
 *
 * ib, ii: input buffer and the current read offset into it.
 * ob, oi: output buffer and the current write offset into it.
 * filenc: dereferenced below as a bit mask tested against and
 *         cleared of MPARSE_UTF8 and MPARSE_LATIN1; its declaration
 *         is not visible here.
 */
29 preconv_encode(struct buf
*ib
, size_t *ii
, struct buf
*ob
, size_t *oi
,
/* Look at the input as unsigned bytes, starting at the read offset. */
36 cu
= (unsigned char *)ib
->buf
+ *ii
;
/*
 * Without MPARSE_UTF8 in *filenc the multi-byte decoder is not used.
 * NOTE(review): the branch target is not visible; presumably the
 * single-byte path near the end of the function.
 */
39 if ( ! (*filenc
& MPARSE_UTF8
))
/*
 * Test the bits of the lead byte from bit 7 downwards while they are
 * set, for at most five bits.  The loop body is not visible;
 * presumably it increments nby.
 */
43 while (nby
< 5 && *cu
& (1 << (7 - nby
)))
49 if (accum
< 0x02) /* Obfuscated ASCII. */
57 if (accum
> 0x04) /* Beyond Unicode. */
60 default: /* Bad sequence header. */
/*
 * The next two tests look at accum together with bits of the
 * following byte (0x20, then 0x30); the trailing comments name the
 * cases being rejected.
 */
67 if ((accum
== 0x00 && ! (*cu
& 0x20)) || /* Use 2-byte. */
68 (accum
== 0x0d && *cu
& 0x20)) /* Surrogates. */
72 if ((accum
== 0x00 && ! (*cu
& 0x30)) || /* Use 3-byte. */
73 (accum
== 0x04 && *cu
& 0x30)) /* Beyond Unicode. */
/* The two top bits of a continuation byte must be 1 and 0. */
81 if ((*cu
& 0xc0) != 0x80) /* Invalid continuation. */
/*
 * The decoded value must be below 0x110000 and outside the range
 * 0xd800 to 0xdfff; the earlier checks are expected to ensure this.
 */
89 assert(accum
< 0x110000);
90 assert(accum
< 0xd800 || accum
> 0xdfff);
/*
 * Write the escape and advance the output offset by the length
 * reported by snprintf.  The bound of 11 fits the longest result
 * allowed by the assertion above: three prefix bytes, six hex
 * digits, the closing bracket and the terminating NUL.
 */
92 *oi
+= snprintf(ob
->buf
+ *oi
, 11, "\\[u%.4X]", accum
);
/* Set the input offset to the position cu has reached. */
93 *ii
= (char *)cu
- ib
->buf
;
/* Clear the MPARSE_LATIN1 bit in *filenc after a decoded sequence. */
94 *filenc
&= ~MPARSE_LATIN1
;
/*
 * Single-byte path: only taken further when MPARSE_LATIN1 is set in
 * *filenc.  NOTE(review): what happens otherwise is not visible.
 */
98 if ( ! (*filenc
& MPARSE_LATIN1
))
/*
 * Write the byte at the read offset as an escape, using its unsigned
 * value as the code point, and step the read offset past it.
 */
101 *oi
+= snprintf(ob
->buf
+ *oi
, 11,
102 "\\[u%.4X]", (unsigned char)ib
->buf
[(*ii
)++]);
/* Clear the MPARSE_UTF8 bit in *filenc after a single-byte escape. */
104 *filenc
&= ~MPARSE_UTF8
;
/*
 * preconv_cue: inspect the line starting at b->buf + offset for an
 * encoding cue and return the matching MPARSE_* flag(s).
 *
 * The visible return statements yield MPARSE_LATIN1 for a cue naming
 * iso-latin-1, and MPARSE_UTF8 | MPARSE_LATIN1 when the line does not
 * have the expected header and trailer or when no known encoding is
 * matched.
 *
 * NOTE(review): this listing is fragmentary; the initialisation of
 * sz, the loop bodies and the statement returned for utf-8 are not
 * visible here.
 */
109 preconv_cue(const struct buf
*b
, size_t offset
)
111 const char *ln
, *eoln
, *eoph
;
/* Start of the line to inspect. */
114 ln
= b
->buf
+ offset
;
117 /* Look for the end-of-line. */
119 if (NULL
== (eoln
= memchr(ln
, '\n', sz
)))
122 /* Check if we have the correct header/trailer. */
/*
 * sz becomes the line length without the newline.  The line must be
 * at least 10 bytes long, its first 7 bytes must equal the header
 * literal and the 3 bytes just before the newline must equal the
 * trailer literal; otherwise both encodings are reported.
 */
124 if ((sz
= (size_t)(eoln
- ln
)) < 10 ||
125 memcmp(ln
, ".\\\" -*-", 7) || memcmp(eoln
- 3, "-*-", 3))
126 return MPARSE_UTF8
| MPARSE_LATIN1
;
128 /* Move after the header and adjust for the trailer. */
/* Loop over leading blanks while bytes remain (body not visible). */
134 while (sz
> 0 && ' ' == *ln
) {
141 /* Find the end-of-phrase marker (or eoln). */
143 if (NULL
== (eoph
= memchr(ln
, ';', sz
)))
148 /* Only account for the "coding" phrase. */
/*
 * phsz is the phrase length; a phrase shorter than 7 bytes or one
 * not starting with the key, compared without regard to case, is
 * not a coding phrase.
 */
150 if ((phsz
= eoph
- ln
) < 7 ||
151 strncasecmp(ln
, "coding:", 7)) {
/* Loop over blanks again while bytes remain (body not visible). */
160 while (sz
> 0 && ' ' == *ln
) {
167 /* Check us against known encodings. */
/*
 * The length tests ensure the phrase holds at least as many bytes as
 * each name being compared (5 and 11); names match without regard to
 * case.  NOTE(review): the value returned for utf-8 is not visible;
 * presumably MPARSE_UTF8.
 */
169 if (phsz
> 4 && !strncasecmp(ln
, "utf-8", 5))
171 if (phsz
> 10 && !strncasecmp(ln
, "iso-latin-1", 11))
172 return MPARSE_LATIN1
;
/* No recognised encoding name: report both encodings. */
175 return MPARSE_UTF8
| MPARSE_LATIN1
;