2 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
3 * Understanding is not required. Only obedience.
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, version 3 of the License ONLY.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 module chiroptera
.decode
is aliced
;
23 import iv
.utfutil
: utf8CodeLen
, utf8Valid
;
27 // ////////////////////////////////////////////////////////////////////////// //
// Returns the byte length of a UTF-8 sequence, judged from its lead byte `ch`:
// 1 for ASCII (< 0x80), otherwise 2..6 according to the lead-byte bit pattern
// (the 5- and 6-byte forms are recognised as well).
// NOTE(review): the result for a byte matching none of the patterns is not visible here;
// callers in this file test the result with `< 1`, so it is presumably negative -- confirm.
30 public byte utf8CodeLen (char ch) pure nothrow @trusted @nogc {
31 //pragma(inline, true);
32 if (ch < 0x80) return 1;
33 if ((ch&0b1111_1110) == 0b1111_1100) return 6;
34 if ((ch&0b1111_1100) == 0b1111_1000) return 5;
35 if ((ch&0b1111_1000) == 0b1111_0000) return 4;
36 if ((ch&0b1111_0000) == 0b1110_0000) return 3;
37 if ((ch&0b1110_0000) == 0b1100_0000) return 2;
42 // ////////////////////////////////////////////////////////////////////////// //
// Checks the bytes of `buf` for well-formed UTF-8 sequences.
// The check fails when `utf8CodeLen` rejects a lead byte, when more continuation bytes are
// required than `left` allows, or when a continuation byte is not of the form 0b10xx_xxxx.
// NOTE(review): the outer loop, the updates of `left` and the final result are not visible here.
43 public bool utf8Valid (const(void)[] buf) pure nothrow @trusted @nogc {
44 const(ubyte)* bp = cast(const(ubyte)*)buf.ptr;
45 auto left = buf.length;
// `len` is the number of continuation bytes that must follow the lead byte
47 auto len = utf8CodeLen(*bp++)-1;
48 if (len < 0 || len > left) return false;
50 while (len-- > 0) if (((*bp++)&0b1100_0000) != 0b1000_0000) return false;
57 // ////////////////////////////////////////////////////////////////////////// //
// Tests whether the code point `ch` is allowed in a nick: ASCII digits, ASCII letters,
// '-', '_', '.', or anything accepted by `isValidCyrillicUni`.
// NOTE(review): the `return` keyword in front of the expression is not visible here;
// presumably the expression is the returned value -- confirm.
58 public bool isValidNickUniChar (immutable dchar ch
) pure nothrow @safe @nogc {
61 (ch
>= '0' && ch
<= '9') ||
62 (ch
>= 'A' && ch
<= 'Z') ||
63 (ch
>= 'a' && ch
<= 'z') ||
64 ch
== '-' || ch
== '_' || ch
== '.' ||
65 isValidCyrillicUni(ch
);
// Validates a UTF-8 encoded nick `s`.
// An empty string is rejected. Each byte is fed to the decoder `dc`; validation fails as soon
// as the decoder reports `invalid`, or a completed code point fails `isValidNickUniChar`.
// NOTE(review): the declaration of `dc` and the result after the loop are not visible here.
69 public bool isValidUTFNick (const(char)[] s
) pure nothrow @safe @nogc {
70 if (s
.length
== 0) return false;
72 foreach (immutable char ch
; s
) {
73 dc
.decode(cast(ubyte)ch
);
74 if (dc
.invalid
) return false;
// only completed code points are checked against the nick alphabet
75 if (dc
.complete
&& !isValidNickUniChar(dc
.codepoint
)) return false;
81 // ////////////////////////////////////////////////////////////////////////// //
// Returns true only for the two control characters this module keeps as-is: TAB and LF.
82 public bool isGoodCtlChar (immutable char ch
) pure nothrow @safe @nogc {
84 return (ch
== '\t' || ch
== '\n');
88 // ////////////////////////////////////////////////////////////////////////// //
// Returns false if `buf` contains DEL (127) or a control character below 32 that
// `isGoodCtlChar` does not accept.
// NOTE(review): the result after the loop is not visible here; presumably `true` -- confirm.
89 public bool isGoodText (const(char)[] buf
) pure nothrow @trusted @nogc {
90 foreach (immutable char ch
; buf
) {
91 if (ch
== 127 ||
(ch
< 32 && !isGoodCtlChar(ch
))) return false;
97 // ////////////////////////////////////////////////////////////////////////// //
// Tests whether byte `ch` may appear in a file name.
// Rejected: anything <= 32 (control characters and space), DEL (127), '/' and '\\'.
// Accepted: every byte >= 128.
// NOTE(review): the result for the remaining ASCII characters is not visible here.
98 private bool isGoodFileNameChar (immutable char ch
) pure nothrow @safe @nogc {
99 if (ch
<= 32 || ch
== 127) return false;
100 if (ch
>= 128) return true;
101 if (ch
== '/' || ch
== '\\') return false;
106 // ////////////////////////////////////////////////////////////////////////// //
107 // this also sanitizes it
// Lower-cases ASCII 'A'..'Z' in `s` and replaces unwanted control characters.
// Pass 1 sets `needwork` when it meets DEL, a control character rejected by `isGoodCtlChar`,
// or an ASCII capital. Pass 2 builds `res`:
//   CR followed by LF appends nothing for the CR; any other CR becomes '\n';
//   rejected control characters become ' '; DEL becomes '~'; capitals go through `tolower`.
// NOTE(review): the `typeof(null)` branch body, the early exit when no work is needed, the
// declaration of `res` and the branch that copies all other bytes are not visible here.
108 public T
toLowerStr (T
:const(char)[]) (T s
) nothrow @trusted {
109 static if (is(T
== typeof(null))) {
112 bool needwork
= false;
113 foreach (immutable char ch
; s
) {
114 if (ch
== 127 ||
(ch
< 32 && !isGoodCtlChar(ch
)) ||
(ch
>= 'A' && ch
<= 'Z')) {
123 res
.reserve(s
.length
);
124 foreach (immutable idx
, char ch
; s
) {
125 if (ch
== 13) { if (idx
+1 >= s
.length || s
.ptr
[idx
+1] != 10) res
~= '\n'; }
126 else if (ch
< 32 && !isGoodCtlChar(ch
)) res
~= ' ';
127 else if (ch
== 127) res
~= '~';
128 else if (ch
>= 'A' && ch
<= 'Z') res
~= ch
.tolower
;
131 return cast(T
)res
; // it is safe to cast here
137 // ////////////////////////////////////////////////////////////////////////// //
138 // this also sanitizes it
// Replaces every character of `s` that fails `isGoodFileNameChar` with '_'.
// Pass 1 sets `needwork` on the first bad character. The result `res` is a freshly
// allocated array of the same length, patched in place.
// NOTE(review): the `typeof(null)` branch body, the early exit when no work is needed and
// the statement that copies `s` into `res` are not visible here.
139 public T
sanitizeFileNameStr (T
:const(char)[]) (T s
) nothrow @trusted {
140 static if (is(T
== typeof(null))) {
143 bool needwork
= false;
144 foreach (immutable char ch
; s
) if (!isGoodFileNameChar(ch
)) { needwork
= true; break; }
148 char[] res
= new char[s
.length
];
150 foreach (ref char ch
; res
) {
151 if (!isGoodFileNameChar(ch
)) ch
= '_';
153 return cast(T
)res
; // it is safe to cast here
159 // ////////////////////////////////////////////////////////////////////////// //
// Replaces unwanted control characters in `s`, building the result in `res`:
//   CR followed by LF appends nothing for the CR; any other CR becomes '\n';
//   control characters rejected by `isGoodCtlChar` become ' '; DEL (127) becomes '~'.
// NOTE(review): the `typeof(null)` branch body, any "nothing to do" fast path, the declaration
// of `res` and the branch that copies all other bytes are not visible here.
160 public T
sanitizeStr (T
:const(char)[]) (T s
) nothrow @trusted {
161 static if (is(T
== typeof(null))) {
168 res
.reserve(s
.length
);
169 foreach (immutable idx
, char ch
; s
) {
170 if (ch
== 13) { if (idx
+1 >= s
.length || s
.ptr
[idx
+1] != 10) res
~= '\n'; }
171 else if (ch
< 32 && !isGoodCtlChar(ch
)) res
~= ' ';
172 else if (ch
== 127) res
~= '~';
175 return cast(T
)res
; // it is safe to cast here
181 // ////////////////////////////////////////////////////////////////////////// //
// Turns `s` into a single clean line.
// Pass 1 sets `found` on a control character, DEL, or a space that is either the first
// character or follows a byte <= 32. Pass 2 rebuilds the text in `res`: control characters
// and DEL are turned into spaces, and a blank is skipped when `res` is empty or already ends
// with a blank. Finally trailing blanks are cut off `res`.
// NOTE(review): the `typeof(null)` branch body, the declarations of `found`/`res`, the early
// exit when nothing was found and the append statement in pass 2 are not visible here.
182 public T
sanitizeStrLine (T
:const(char)[]) (T s
) nothrow @trusted {
183 static if (is(T
== typeof(null))) {
187 foreach (immutable idx
, char ch
; s
) {
188 if (ch
< 32 || ch
== 127) { found
= true; break; }
189 if (ch
== 32 && (idx
== 0 || s
.ptr
[idx
-1] <= 32)) { found
= true; break; }
195 res
.reserve(s
.length
);
196 foreach (char ch
; s
) {
197 if (ch
< 32 || ch
== 127) ch
= ' ';
198 if (ch
<= 32 && (res
.length
== 0 || res
[$-1] <= 32)) continue;
// drop trailing blanks
201 while (res
.length
&& res
[$-1] <= 32) res
= res
[0..$-1];
202 return cast(T
)res
; // it is safe to cast here
208 // ////////////////////////////////////////////////////////////////////////// //
209 // for decoded subject parts
// Replaces control characters, DEL (127) and '_' with a space.
// Pass 1 sets `found` on the first such character; the result `res` is a freshly allocated
// array of the same length, patched in place.
// NOTE(review): the `typeof(null)` branch body, the declaration of `found`, the early exit and
// the statement that copies `s` into `res` are not visible here.
210 public T
sanitizeStrSubjPart (T
:const(char)[]) (T s
) nothrow @trusted {
211 static if (is(T
== typeof(null))) {
215 foreach (immutable idx
, immutable char ch
; s
) {
216 if (ch
< 32 || ch
== 127 || ch
== '_') { found
= true; break; }
221 char[] res
= new char[s
.length
];
223 foreach (ref char ch
; res
) if (ch
< 32 || ch
== 127 || ch
== '_') ch
= ' ';
224 return cast(T
)res
; // it is safe to cast here
230 // ////////////////////////////////////////////////////////////////////////// //
231 // this also sanitizes it
// Converts arbitrary bytes to sanitized UTF-8.
// Pass 1 sets `found` on a byte >= 127 or a control character rejected by `isGoodCtlChar`.
// Text that `utf8Valid` accepts is handed to `sanitizeStr`. Otherwise the bytes are
// walked twice; in each walk a byte is given to `utf8Encode` as the code point with the same
// numeric value. In the second walk CR, rejected control characters and DEL are replaced
// the same way `sanitizeStr` replaces them.
// NOTE(review): the `typeof(null)` branch body, the declarations of `found`/`uc`/`res`, the
// purpose of the first walk (presumably sizing the result) and the statements that append
// the encoded bytes are not visible here.
232 public T
binaryToUtf8 (T
:const(char)[]) (T s
) nothrow @trusted {
233 static if (is(T
== typeof(null))) {
237 foreach (immutable char ch
; s
) {
238 if (ch
>= 127 ||
(ch
< 32 && !isGoodCtlChar(ch
))) { found
= true; break; }
243 import iv
.utfutil
: utf8Valid
;
244 if (utf8Valid(s
)) return sanitizeStr(s
);
249 foreach (immutable char ch
; s
) {
251 immutable int len
= utf8Encode(uc
[], cast(dchar)ch
);
257 foreach (immutable idx
, char ch
; s
) {
259 if (ch
== 13) { if (idx
+1 >= s
.length || s
.ptr
[idx
+1] != 10) res
~= '\n'; }
260 else if (ch
< 32 && !isGoodCtlChar(ch
)) res
~= ' ';
261 else if (ch
== 127) res
~= '~';
264 immutable int len
= utf8Encode(uc
[], cast(dchar)ch
);
269 return cast(T
)res
; // it is safe to cast here
275 // ////////////////////////////////////////////////////////////////////////// //
276 // this also sanitizes it
// Repairs and sanitizes text that is supposed to be UTF-8.
// Pass 1 sets `found` on a byte >= 127 or a control character rejected by `isGoodCtlChar`.
// Text that `utf8Valid` accepts is handed to `sanitizeStr`. Otherwise the result is rebuilt
// byte by byte in `res`:
//   while `utfleft` is non-zero the byte is copied verbatim and the counter is decremented;
//   CR, rejected control characters and DEL are replaced as in `sanitizeStr`;
//   a lead byte with `utf8CodeLen` < 1 becomes '?';
//   a sequence longer than the remaining input becomes '?' and ends the loop;
//   a sequence that `utf8Valid` rejects becomes '?'.
// NOTE(review): the `typeof(null)` branch body, the declarations of `found`/`res`/`utfleft`,
// the condition guarding the multi-byte branch and the code that accepts a good sequence
// (presumably setting `utfleft`) are not visible here.
277 public T
utf8ToUtf8 (T
:const(char)[]) (T s
) nothrow @trusted {
278 static if (is(T
== typeof(null))) {
282 foreach (immutable char ch
; s
) {
283 if (ch
>= 127 ||
(ch
< 32 && !isGoodCtlChar(ch
))) { found
= true; break; }
288 import iv
.utfutil
: utf8Valid
;
289 if (utf8Valid(s
)) return sanitizeStr(s
);
292 res
.reserve(s
.length
);
294 foreach (immutable idx
, char ch
; s
) {
295 if (utfleft
) { --utfleft
; res
~= ch
; continue; }
297 if (ch
== 13) { if (idx
+1 >= s
.length || s
.ptr
[idx
+1] != 10) res
~= '\n'; }
298 else if (ch
< 32 && !isGoodCtlChar(ch
)) res
~= ' ';
299 else if (ch
== 127) res
~= '~';
302 immutable byte ulen
= utf8CodeLen(ch
);
303 if (ulen
< 1) { res
~= '?'; continue; }
304 if (s
.length
-idx
< ulen
) { res
~= '?'; break; }
305 if (!utf8Valid(s
[idx
..idx
+ulen
])) { res
~= '?'; continue; }
310 return cast(T
)res
; // it is safe to cast here
316 // ////////////////////////////////////////////////////////////////////////// //
// Recognises a reply prefix at the start of subject `s`: 'r'/'R', then 'e'/'E', then any
// number of bytes <= 32, then ':'. At least three characters are required.
// NOTE(review): the `break`s imply an enclosing loop that is not visible here, so repeated
// prefixes are presumably removed one after another; the code that actually cuts the prefix,
// the start value of `pp` and the returned value are not visible either -- confirm.
317 public T
subjRemoveRe(T
:const(char)[]) (T s
) nothrow @trusted {
318 static if (is(T
== typeof(null))) {
323 if (s
.length
< 3) break;
324 if (s
.ptr
[0] != 'r' && s
.ptr
[0] != 'R') break;
325 if (s
.ptr
[1] != 'e' && s
.ptr
[1] != 'E') break;
// skip blanks between "re" and the colon
327 while (pp
< s
.length
&& s
.ptr
[pp
] <= 32) ++pp
;
328 if (pp
>= s
.length || s
.ptr
[pp
] != ':') break;
336 // ////////////////////////////////////////////////////////////////////////// //
// `b64alphabet` is the 64-character base64 alphabet in value order.
// `b64dc` is the decoding table indexed by input byte, initialised from a function literal:
//   0xff -- invalid byte (the default for every entry);
//   0..63 -- value of an alphabet character (its index in `b64alphabet`);
//   0xfe -- the '=' padding character;
//   0xf0 -- byte 127; `decodeBase64` treats 0xf0 as a byte to skip.
// NOTE(review): further table entries may be set on lines that are not visible here
// (presumably whitespace mapped to 0xf0) -- confirm.
337 private static immutable string b64alphabet
= "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
339 private static immutable ubyte[256] b64dc
= () {
340 ubyte[256] res
= 0xff; // invalid
341 foreach (immutable idx
, immutable char ch
; b64alphabet
) {
342 res
[cast(ubyte)ch
] = cast(ubyte)idx
;
344 res
['='] = 0xfe; // padding
347 res
[127] = 0xf0; // just in case
// Decodes base64 text from `datavoid` into a new `char[]`.
// On malformed input `error` is set to true, the partial output `dcx` is deleted and a copy
// of the placeholder text "<invalid base64 data>" is returned.
// The nested `decodeChunk` flushes the 6-bit values collected in `bts` (`btspos` of them):
//   0 values -- nothing to do; 1 value -- failure (not enough bits for a byte);
//   2 or more -- first output byte; 3 or more -- second byte; 4 -- third byte.
// In the main loop each input byte is mapped through `b64dc`: 0xff is an error, 0xf0 is
// skipped. Input that ends with a partial group and no padding is flushed as if unpadded.
// NOTE(review): the declarations of `dcx`/`bts`/`btspos`, the statements that advance `data`,
// the exact padding ('=') handling around `inPadding` and the final return are only partly
// visible here -- confirm against the complete source.
351 public char[] decodeBase64 (const(void)[] datavoid
, out bool error
) nothrow @trusted {
352 const(ubyte)[] data
= cast(const(ubyte)[])datavoid
;
354 bool inPadding
= false;
// upper bound of the decoded size: 3 bytes per 4 input characters, plus slack
359 dcx
.reserve((data
.length
+3U)/4U*3U+8U);
362 bool decodeChunk () nothrow @trusted {
363 if (btspos
== 0) return true;
364 if (btspos
== 1) return false; //throw new Base64Exception("incomplete data in base64 decoder");
365 dcx
~= cast(char)((bts.ptr
[0]<<2)|
((bts.ptr
[1]&0x30)>>4)); // 2 and more
366 if (btspos
> 2) dcx
~= cast(char)(((bts.ptr
[1]&0x0f)<<4)|
((bts.ptr
[2]&0x3c)>>2)); // 3 and more
367 if (btspos
> 3) dcx
~= cast(char)(((bts.ptr
[2]&0x03)<<6)|
bts.ptr
[3]);
371 while (data
.length
) {
372 immutable ubyte cb
= b64dc
.ptr
[data
.ptr
[0]];
373 if (cb
== 0xff) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
375 if (cb
== 0xf0) continue; // empty
379 if (!decodeChunk()) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
382 if (++btspos
== 4) { inPadding
= false; btspos
= 0; }
386 if (btspos
!= 0) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
389 bts.ptr
[btspos
++] = cb
;
391 if (!decodeChunk()) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
396 if (btspos
!= 0 && !inPadding
) {
397 // assume that it is not padded
398 if (!decodeChunk()) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
405 // ////////////////////////////////////////////////////////////////////////// //
// Decodes quoted-printable text from `datavoid` into a new `char[]` (`dcx`).
//   A '=' that is the very last character ends decoding.
//   '=' followed by two hexadecimal digits yields the byte with that value.
//   With `multiline` set, '=' followed only by TABs/spaces up to a line end is treated as a
//   line continuation: the '=' and the blanks are removed.
// NOTE(review): the declarations of `dcx`/`epos`/`ateol`, the handling of CR in the blank
// scan, the copying of ordinary characters and the final return are not visible here.
406 public char[] decodeQuotedPrintable(bool multiline
) (const(void)[] datavoid
) nothrow @trusted {
407 const(char)[] data
= cast(const(char)[])datavoid
;
409 dcx
.reserve(data
.length
);
410 while (data
.length
) {
411 if (data
[0] == '=') {
412 if (data
.length
== 1) break;
413 if (data
.length
>= 3 && digitInBase(data
.ptr
[1], 16) >= 0 && digitInBase(data
.ptr
[2], 16) >= 0) {
414 dcx
~= cast(char)(digitInBase(data
.ptr
[1], 16)*16+digitInBase(data
.ptr
[2], 16));
418 // check if it is followed by blanks up to the newline
419 // if it is so, then this is "line continuation" -- remove both '=' and blanks
420 static if (multiline
) {
423 while (epos
< data
.length
) {
424 immutable char ch
= data
.ptr
[epos
++];
425 if (ch
== 9 || ch
== 32) continue;
426 if (ch
== 10) { ateol
= true; break; }
428 if (epos
>= data
.length || data
.ptr
[epos
] != 10) { ateol
= true; break; }
434 if (ateol || epos
>= data
.length
) {
435 data
= data
[epos
..$];
447 // ////////////////////////////////////////////////////////////////////////// //
// Makes sure text declared as 7-bit really is 7-bit.
// If no byte of `s` is >= 128, `s` itself is returned. Otherwise a new array `dcx` of the
// same length is allocated and the high bit of every byte in it is cleared.
// NOTE(review): the `typeof(null)` branch body and the statement that copies `s` into `dcx`
// are not visible here.
448 public T
ensureProper7Bit(T
:const(char)[]) (T s
) nothrow @trusted {
449 static if (is(T
== typeof(null))) {
452 bool needwork
= false;
453 foreach (immutable char ch
; s
) if (ch
>= 128) { needwork
= true; break; }
454 if (!needwork
) return s
;
455 char[] dcx
= new char[s
.length
];
457 foreach (ref char ch
; dcx
) ch
&= 0x7f;
458 return cast(T
)dcx
; // it is safe to cast here
463 // ////////////////////////////////////////////////////////////////////////// //
464 // decode things like "=?UTF-8?B?Tm9yZGzDtnc=?="
// Decodes a header value that may contain encoded parts of the form "=?charset?X?payload?="
// (this looks like the MIME encoded-word syntax of RFC 2047 -- confirm).
// Without any "=?" the text only goes through `sanitizeStrLine` and `utf8ToUtf8`.
// Otherwise, for every encoded part:
//   text in front of "=?" is appended after `utf8ToUtf8`;
//   the charset name `enc` runs up to the next '?'; an empty name means "utf-8";
//   the payload `part` runs up to "?=";
//   blanks between two adjacent encoded parts are skipped;
//   encoding type 'Q'/'q' is decoded with `decodeQuotedPrintable!false`, 'B'/'b' with
//   `decodeBase64`;
//   unless the charset is utf-8/utf8/US-ASCII (case-insensitive), the payload is passed
//   through `recode`;
//   the payload is then cleaned with `sanitizeStrSubjPart` and `utf8ToUtf8` and appended.
// A malformed part, a base64 error or a `recode` exception makes the function return the
// original text `origs` after `sanitizeStrLine` and `utf8ToUtf8`.
// Leftover text is appended after `utf8ToUtf8`; the whole result goes through `sanitizeStrLine`.
// NOTE(review): the `typeof(null)` branch body, the declarations of `res`/`origs`/`ect`/`error`,
// the statements that advance `s` and the `try` keyword are not visible here.
465 public T
decodeSubj(T
:const(char)[]) (T s
) nothrow @trusted {
466 static if (is(T
== typeof(null))) {
469 if (s
.indexOf("=?") < 0) return s
.sanitizeStrLine
.utf8ToUtf8
;
471 // have to do some work
474 res
.reserve(s
.length
); // at least
476 while (s
.length
> 2) {
477 auto stqpos
= s
.indexOf("=?");
478 if (stqpos
< 0) break;
479 if (stqpos
> 0) res
~= s
[0..stqpos
].utf8ToUtf8
;
482 auto eepos
= s
.indexOf('?');
483 if (eepos
< 0) break;
484 auto enc
= s
[0..eepos
];
486 //conwriteln("ENCODING: '", enc, "'");
488 if (enc
.length
== 0) enc
= "utf-8";
489 if (s
.length
< 2 || s
.ptr
[1] != '?') return origs
.sanitizeStrLine
.utf8ToUtf8
;
493 eepos
= s
.indexOf("?=");
494 if (eepos
< 0) return origs
.sanitizeStrLine
.utf8ToUtf8
;
496 auto part
= s
[0..eepos
];
499 // several encoded parts may be separated with spaces; those spaces should be ignored
501 while (stqpos
< s
.length
&& s
.ptr
[stqpos
] <= ' ') ++stqpos
;
502 if (s
.length
-stqpos
>= 2 && s
.ptr
[stqpos
] == '=' && s
.ptr
[stqpos
+1] == '?') s
= s
[stqpos
..$];
505 if (ect
== 'Q' || ect
== 'q') {
507 part
= cast(T
)decodeQuotedPrintable
!false(part
); // it is safe to cast here
508 } else if (ect
== 'B' || ect
== 'b') {
511 part
= cast(T
)decodeBase64(part
, out error
); // it is safe to cast here
512 if (error
) { delete part
; return origs
.sanitizeStrLine
.utf8ToUtf8
; }
515 // reencode part if necessary
516 if (!enc
.strEquCI("utf-8") && !enc
.strEquCI("utf8") && !enc
.strEquCI("US-ASCII")) {
518 part
= recode(part
, "utf-8", enc
);
519 } catch (Exception e
) {
520 return origs
.sanitizeStrLine
.utf8ToUtf8
;
524 part
= part
.sanitizeStrSubjPart
.utf8ToUtf8
;
525 if (part
.length
) res
~= part
;
528 if (s
.length
) res
~= s
.utf8ToUtf8
;
529 return cast(T
)res
.sanitizeStrLine
; // it should be valid utf8 here; also, it is safe to cast here
534 // ////////////////////////////////////////////////////////////////////////// //
535 // decode content with the given encoding type
// Decodes `data` according to the encoding name `encoding`, compared case-insensitively
// with `strEquCI`:
//   "7bit" -- `ensureProper7Bit`;
//   "base64" -- `decodeBase64`;
//   "quoted-printable" -- `decodeQuotedPrintable!true` (multiline mode);
//   any other non-empty name -- a placeholder text starting with "<invalid encoding:".
// Empty data, an empty encoding name, "8bit" and "binary" share one branch.
// NOTE(review): the body of that shared branch (presumably `data` returned unchanged), the
// `typeof(null)` branch body, the declaration of `error` and the rest of the placeholder
// construction are not visible here.
536 public T
decodeContent(T
:const(char)[]) (T data
, const(char)[] encoding
) nothrow @trusted {
537 static if (is(T
== typeof(null))) {
540 if (data
.length
== 0 || encoding
.length
== 0 || encoding
.strEquCI("8bit") || encoding
.strEquCI("binary")) {
544 if (encoding
.strEquCI("7bit")) {
545 return cast(T
)ensureProper7Bit(data
); // it is safe to cast here
548 if (encoding
.strEquCI("base64")) {
550 return cast(T
)decodeBase64(data
, out error
); // it is safe to cast here
553 if (encoding
.strEquCI("quoted-printable")) {
554 return cast(T
)decodeQuotedPrintable
!true(data
); // it is safe to cast here
557 if (encoding
.length
!= 0) {
558 char[] res
= "<invalid encoding:".dup
;
561 return cast(T
)res
; // it is safe to cast here
569 // ////////////////////////////////////////////////////////////////////////// //
// Converts `data` from the character set `charset` to sanitized UTF-8.
//   Empty data is returned as is.
//   Data without any byte >= 128 only goes through `sanitizeStr`.
//   An empty charset, "utf-8", "utf8" or "US-ASCII" (case-insensitive) goes through `utf8ToUtf8`.
//   Anything else is converted with `recode` and then passed to `sanitizeStr`.
// An exception from the conversion is swallowed; in that case a placeholder starting with
// "<cannot decode '" is built; characters of `charset` that are <= 32 or >= 127 are skipped
// while building it.
// NOTE(review): the `typeof(null)` branch body, the declaration of `found`, the `try` keyword
// and the rest of the placeholder construction are not visible here.
570 public T
recodeToUtf8(T
:const(char)[]) (T data
, const(char)[] charset
) nothrow @trusted {
571 static if (is(T
== typeof(null))) {
574 if (data
.length
== 0) return data
;
576 foreach (immutable char ch
; data
) if (ch
>= 128) { found
= true; break; }
577 if (!found
) return sanitizeStr(data
);
578 if (charset
.length
== 0 || charset
.strEquCI("utf-8") || charset
.strEquCI("utf8") || charset
.strEquCI("US-ASCII")) {
579 return utf8ToUtf8(data
);
582 data
= recode(data
, "utf-8", charset
);
583 if (data
.length
== 0) return data
;
584 return data
.sanitizeStr
;
585 } catch (Exception e
) {}
586 char[] res
= "<cannot decode '".dup
;
587 foreach (char ch
; charset
) {
588 if (ch
<= 32 || ch
>= 127) continue;
592 return cast(T
)res
; // it is safe to cast here
597 // ////////////////////////////////////////////////////////////////////////// //
// Removes a surrounding pair of quote characters from `buf` and strips the remainder with
// `xstrip`. Recognised pairs: "...", <...>, `...' and '...'. The buffer must hold at least
// two characters for a pair to be removed.
// NOTE(review): whether this is repeated for nested quotes, and the returned value, are not
// visible here.
598 private T
mailNameUnquote (T
:const(char)[]) (T buf
) pure nothrow @trusted @nogc {
599 static if (is(T
== typeof(null))) {
603 if (buf
.length
>= 2) {
604 if ((buf
.ptr
[0] == '"' && buf
[$-1] == '"') ||
605 (buf
.ptr
[0] == '<' && buf
[$-1] == '>') ||
606 (buf
.ptr
[0] == '`' && buf
[$-1] == '\'') ||
607 (buf
.ptr
[0] == '\'' && buf
[$-1] == '\''))
609 buf
= buf
[1..$-1].xstrip
;
617 // ////////////////////////////////////////////////////////////////////////// //
618 // extract email from decoded "From" and "To" fields
// Extracts the address part from a decoded address field.
// Empty input is returned as is. When the text ends with '>', the part between the last '<'
// and that '>' is taken and stripped with `xstrip`; if there is no '<' at all, everything in
// front of the '>' is taken.
// With `doSanitize` set, the text is cut at the first space, provided that space is not the
// first character.
// NOTE(review): the `typeof(null)` branch body and the exact nesting of the final
// `toLowerStr` return (inside or outside the `doSanitize` branch) are not visible here.
619 public T
extractMail(bool doSanitize
=true, T
:const(char)[]) (T data
) nothrow @trusted {
620 static if (is(T
== typeof(null))) {
623 if (data
.length
== 0) return data
;
624 if (data
[$-1] == '>') {
625 usize pos
= data
.length
;
// walk back to just after the last '<' (or to 0 if there is none)
626 while (pos
> 0 && data
.ptr
[pos
-1] != '<') --pos
;
627 data
= data
[pos
..$-1].xstrip
;
631 static if (doSanitize
) {
632 // hack for idiotic LJ (those morons are breaking all possible standards)
633 auto sppos
= data
.indexOf(' ');
634 if (sppos
> 0) data
= data
[0..sppos
];
636 return data
.toLowerStr
;
641 // ////////////////////////////////////////////////////////////////////////// //
642 // strip email from decoded "From" and "To" fields
// Removes a trailing "<...>" part from a decoded address field.
// Empty input is returned as is. When the text ends with '>': without any '<' an empty slice
// is returned, otherwise the text in front of the last '<', stripped with `xstrip`.
// NOTE(review): the `typeof(null)` branch body and the result for text that does not end
// with '>' are not visible here.
643 public T
stripMail(T
:const(char)[]) (T data
) nothrow @trusted {
644 static if (is(T
== typeof(null))) {
647 if (data
.length
== 0) return data
;
648 if (data
[$-1] == '>') {
649 usize pos
= data
.length
;
650 while (pos
> 0 && data
.ptr
[pos
-1] != '<') --pos
;
651 if (pos
== 0) return data
[0..0];
652 return data
[0..pos
-1].xstrip
;
659 // ////////////////////////////////////////////////////////////////////////// //
660 // extract name from decoded "From" and "To" fields
661 // can construct name if there is none
662 // special hack for idiotic LJ
// Extracts a display name from a decoded address field.
// The address comes from `extractMail`, the name candidate from `stripMail`.
// Two sender-specific cases are handled:
//   addresses starting with "lj_dontreply@lj.rossia.org": the name is taken from the text
//   after " (" in the unsanitized address, minus a trailing ')'; a " - LJR Comment" suffix
//   is cut off and an empty remainder becomes "anonymous";
//   addresses starting with "lj-notify@livejournal.com": a " - LJ Comment" suffix is cut off
//   the name and an empty remainder becomes "anonymous".
// If a name has to be constructed, the part of the address in front of '@' is used: bytes
// <= 32, '.', '-' and '_' act as word separators, a letter at the start of a word goes
// through `toupper`, other letters through `tolower`. The address itself is returned when it
// has no usable part in front of '@' or the constructed name is empty.
// NOTE(review): the `typeof(null)` branch body, the values used for the exact
// "LJR Comment"/"LJ Comment" names, the condition that decides when a name must be
// constructed, the declaration of `res` and part of the construction loop are not visible here.
663 public T
extractName(T
:const(char)[]) (T data
) nothrow @trusted {
664 static if (is(T
== typeof(null))) {
667 if (data
.length
== 0) return data
;
668 auto origData
= data
;
669 T mail
= extractMail(data
);
670 data
= stripMail(data
);
671 // hack for idiotic LJ (those morons are breaking all possible standards)
672 if (mail
.startsWith("lj_dontreply@lj.rossia.org")) {
673 auto dd = extractMail
!false(origData
);
674 auto spos
= dd.indexOf(" (");
676 dd = dd[spos
+2..$-(dd[$-1] == ')' ?
1 : 0)].xstrip
;
677 if (dd == "LJR Comment") {
679 } else if (dd.endsWith(" - LJR Comment")) {
680 auto dpos
= dd.lastIndexOf('-');
681 dd = dd[0..dpos
].xstrip
;
682 if (dd.length
== 0) dd = "anonymous";
684 dd = dd.mailNameUnquote
;
685 if (dd.length
) return dd;
688 data
= data
.mailNameUnquote
;
690 if (mail
.startsWith("lj-notify@livejournal.com")) {
691 if (data
== "LJ Comment") {
693 } else if (data
.endsWith(" - LJ Comment")) {
694 auto dpos
= data
.lastIndexOf('-');
695 data
= data
[0..dpos
].xstrip
;
696 if (data
.length
== 0) data
= "anonymous";
701 // construct name from the mail
702 auto npos
= mail
.indexOf('@');
703 if (npos
<= 0) return mail
;
704 data
= mail
[0..npos
].xstrip
;
705 if (data
.length
== 0) return mail
;
707 res
.reserve(data
.length
);
708 foreach (char ch
; data
) {
709 if (ch
<= 32 || ch
== '.' || ch
== '-' || ch
== '_') ch
= 32;
// a separator is appended only after a non-separator, so runs collapse to one space
711 if (res
.length
&& res
[$-1] != 32) res
~= ch
;
713 if (res
.length
== 0 || res
[$-1] == 32) ch
= ch
.toupper
; else ch
= ch
.tolower
;
718 if (res
.length
== 0) return mail
;
719 return cast(T
)res
; // it is safe to cast here
724 // ////////////////////////////////////////////////////////////////////////// //
725 // encode string if it contains some non-ascii
726 // always returns new string, which is safe to `delete`
727 // passed string must be in UTF-8
728 // can return `null` for empty string
729 public char[] strEncodeQ (const(char)[] s
) nothrow @trusted {
730 static bool isSpecial (immutable char ch
) pure nothrow @safe @nogc {
740 if (s
.length
== 0) return null;
741 static immutable string hexd
= "0123456789abcdef";
742 bool needWork
= (s
[0] == '=' || s
[0] == '?');
743 if (!needWork
) foreach (char ch
; s
) if (isSpecial(ch
)) { needWork
= true; break; }
746 res
= new char[s
.length
];
749 res
.reserve(s
.length
*3+32);
750 res
~= "=?UTF-8?Q?"; // quoted printable
751 foreach (char ch
; s
) {
752 if (ch
<= ' ') ch
= '_';
753 if (!isSpecial(ch
) && ch
!= '=' && ch
!= '?') {
757 res
~= hexd
[(cast(ubyte)ch
)>>4];
758 res
~= hexd
[(cast(ubyte)ch
)&0x0f];