7 /* Handle =?utf-8?q?Welcome_to_=22foobar=22_ML?= via RFC 2047 */
8 const utf8_from_encodedword : (encoded : byte[:] -> byte[:])
/*
 * States of the byte-at-a-time scanner in utf8_from_encodedword,
 * which keeps its current state in a variable of this type and
 * starts out in `Boring_ASCII.
 */
11 type rfc2047_state = union
/*
 * Entered by utf8_from_encodedword right after the encoded text has
 * been handed to the decoder; from here the scanner can move on to
 * `Just_Finished_Text.
 */
16 `Saw_Closing_Question_Mark
/*
 * Per-byte ASCII case helper used by caseieq.
 *
 * The visible test (0x40 < b < 0x5B) selects exactly the bytes
 * 'A' (0x41) through 'Z' (0x5A).
 * NOTE(review): the branch bodies are not visible here; going by the
 * name, those bytes are presumably mapped to their lowercase
 * counterparts and every other byte is returned as-is -- confirm.
 */
21 const ascii_lc = { b : byte
22 if b > 0x40 && b < 0x5B
/*
 * Compares s and t byte by byte, passing each byte through ascii_lc
 * first, so that (as the name suggests) the comparison ignores ASCII
 * case.
 *
 * NOTE(review): the loop reads t[j] for every j < s.len, so it relies
 * on t being at least as long as s. Presumably a length comparison
 * precedes the loop -- confirm.
 */
29 const caseieq = { s : byte[:], t : byte[:]
33 for var j : std.size = 0; j < s.len; ++j
/* a mismatch after case folding decides the result for this pair */
34 if ascii_lc(s[j]) != ascii_lc(t[j])
/*
 * Converts a single ASCII hexadecimal digit to its numeric value,
 * wrapped in `std.Ok:
 *
 *     '0'..'9' ->  0..9
 *     'A'..'F' -> 10..15
 *     'a'..'f' -> 10..15
 *
 * NOTE(review): the fall-through for bytes that are not hex digits is
 * not visible here; presumably it yields `std.Err -- confirm.
 */
42 const atox = { b : byte
43 if b >= ('0' : byte) && b <= ('9' : byte)
44 -> `std.Ok (b - ('0' : byte))
46 if b >= ('A' : byte) && b <= ('F' : byte)
47 -> `std.Ok (b - ('A' : byte) + 0xA)
49 if b >= ('a' : byte) && b <= ('f' : byte)
50 -> `std.Ok (b - ('a' : byte) + 0xA)
/*
 * Decoder for the text of a "Q"-encoded word (utf8_from_encodedword
 * selects it when the encoding field is "q"). Output is collected in
 * a fresh strbuf, and on success the finished buffer is returned
 * wrapped in `std.Ok.
 *
 * Visible per-byte rules:
 *   - '_' is written out as a space
 *   - bytes not matched by an earlier case are copied unchanged
 *   - hex digit values arrive as `std.Ok n; an `std.Err jumps to the
 *     err label
 *
 * NOTE(review): the err label is outside the visible lines;
 * presumably it makes the function return `std.Err -- confirm.
 */
56 const decode_q = { s : byte[:]
57 var dec : std.strbuf# = std.mksb()
58 for var j : std.size = 0; j < s.len; ++j
63 | '_': std.sbputb(dec, (' ' : byte))
/* this digit is weighted by 16 -- presumably the high nibble of an =XX escape */
71 | `std.Ok n: b += (16 * n)
72 | `std.Err void: goto err
76 | `std.Err void: goto err
/* anything else stands for itself */
80 | _: std.sbputb(dec, s[j])
84 -> `std.Ok std.sbfin(dec)
/*
 * Predicate over a single byte, following the "token" grammar quoted
 * in the body; utf8_from_encodedword calls it on input bytes while it
 * scans an encoded-word.
 *
 * The first visible check singles out every byte <= ' ' or > '~',
 * i.e. SPACE, control characters, DEL and all non-ASCII bytes.
 * NOTE(review): the result of that branch and the handling of the
 * "especials" are not visible here; presumably all of them make the
 * predicate false -- confirm.
 * NOTE(review): the `" <">` entry in the especials list looks like
 * the RFC's line-wrapped `<">` (a double quote) -- confirm against
 * RFC 2047, section 2.
 */
91 const token_safe_byte = { b : byte
93 * Any CHAR except SPACE, CTLs, and especials
95 * especials = "(" / ")" / "<" / ">" / "@" / "," / ";" /
96 * ":" / " <"> / "/" / "[" / "]" / "?" / "." / "="
99 if b <= (' ' : byte) || b > ('~' : byte)
/*
 * Predicate for a byte of the encoded-text part of an encoded-word:
 * true for printable, non-space ASCII (0x21 through 0x7e) other than
 * '?', false for everything else.
 */
122 const text_safe_byte = { b : byte
124 * 1*<Any printable ASCII character other than "?"
126 * ; (but see "Use of encoded-words in message
127 * ; headers", section 5)
129 * Section 5 adds context to where encoded-words are allowed.
130 * We over-enforce that in the state machine, so we don't
131 * have to worry about it here. I think.
133 -> b > 0x20 && b < 0x7f && (b : char) != '?'
/*
 * Decodes RFC 2047 encoded-words (=?charset?encoding?text?=) found
 * in `encoded` and returns the resulting bytes.
 *
 * The input is scanned one byte at a time by a state machine over
 * rfc2047_state, starting in `Boring_ASCII:
 *
 *   - Outside encoded-words, space / tab / CR / LF are treated as
 *     whitespace (runs of it appear to be folded into a single 0x20)
 *     and other bytes are copied to the output unchanged.
 *   - The charset field is compared, ignoring case, against "utf-8".
 *   - The encoding field selects the decoder: "q" -> decode_q,
 *     "b" -> utf8_from_base64.
 *   - Every state inside an encoded-word checks that the word has
 *     not grown past 75 bytes (the RFC 2047 length limit).
 *   - The decoded output is finally checked with util.non_ctrl_utf8,
 *     since Q and B decoding operate on raw bytes.
 *
 * If the scan ends in `Boring_ASCII or `Just_Finished_Text the
 * decoded buffer is returned; any other final state means the input
 * was not a well-formed encoded-word.
 * NOTE(review): the labels are outside the visible lines; presumably
 * the malformed case is the one that returns
 * only_fold_whitespace(encoded) -- confirm.
 */
136 const utf8_from_encodedword = { encoded : byte[:]
137 var decoded : std.strbuf# = std.mksb()
138 var charset : byte[:] = [][:]
139 var encoding : byte[:] = [][:]
140 var s : rfc2047_state = `Boring_ASCII
143 var last_was_whitespace : bool = true
144 var this_word_start : std.size = 0
145 var cs_start : std.size = 0
146 var e_start : std.size = 0
147 var text_start : std.size = 0
148 var decode_word : (s : byte[:] -> std.result(byte[:], void))
150 var err : std.strbuf# = std.mksb()
152 for var j : std.size = 0; j < encoded.len; ++j
156 var c : char = (b : char)
159 if !last_was_whitespace
162 s = `Saw_Opening_Equals
164 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
165 if last_was_whitespace
168 std.sbputb(decoded, 0x20)
169 last_was_whitespace = true
171 std.sbputb(decoded, b)
172 last_was_whitespace = false
175 | `Just_Finished_Text:
177 * RFC 2047, section 5 requires that in
178 * some contexts, encoded text be separated
179 * from surrounding ASCII by linear whitespace.
180 * That "in some contexts" is hard, so we
181 * enforce it everywhere. This state is for
182 * requiring that, right after the ?=, we
183 * do not have a non-whitespace character
185 var c : char = (b : char)
186 if c != ' ' && c != '\t' && c != '\r' && c != '\n'
191 elif c == '\n' && j < encoded.len - 1
193 * Between =?...?= atoms, "\n "
194 * should be discarded, not folded
197 if encoded[j + 1] == 0x20 || encoded[j + 1] == 0x09
200 last_was_whitespace = true
204 std.sbputb(decoded, b)
205 last_was_whitespace = true
207 | `Saw_Opening_Equals:
212 if j + 1 >= encoded.len
220 if j - this_word_start > 75
225 s = `Reading_Encoding
226 charset = encoded[cs_start:j]
227 if j + 1 >= encoded.len
232 if !token_safe_byte(b)
237 if j - this_word_start > 75
242 encoding = encoded[e_start:j]
244 TODO: RFC 2231 means we should
245 strip trailing *FOO from this.
247 if !caseieq(charset, "utf-8")
251 if caseieq(encoding, "q")
252 decode_word = decode_q
253 elif caseieq(encoding, "b")
254 decode_word = utf8_from_base64
/*
 * At least one more byte must follow the end of the encoding field,
 * the same look-ahead test the earlier states apply before moving on.
 */
258 if j + 1 >= encoded.len
264 if !token_safe_byte(b)
269 if j - this_word_start > 75
274 match decode_word(encoded[text_start:j])
276 std.sbputs(decoded, dec)
281 s = `Saw_Closing_Question_Mark
283 if !text_safe_byte(b)
287 | `Saw_Closing_Question_Mark:
288 if j - this_word_start > 75
293 s = `Just_Finished_Text
301 * Because Q and B encoding work on a byte-level, there's
302 * a chance that what we have isn't valid UTF-8. That would
305 * TODO: we die if tabs are in the subject here. Is that bad?
307 if !util.non_ctrl_utf8(std.sbpeek(decoded))
312 | `Boring_ASCII: goto done
313 | `Just_Finished_Text: goto done
314 | _: goto not_encodedword
320 -> only_fold_whitespace(encoded)
323 -> std.sbfin(decoded)
326 const only_fold_whitespace = { s : byte[:]
327 var r : std.strbuf# = std.mksb()
328 var last_was_whitespace : bool = false
330 var c : char = (b : char)
331 if c == '\n' || c == '\r' || c == '\t' || c == ' '
332 if last_was_whitespace
337 last_was_whitespace = true
340 last_was_whitespace = false