/*
 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
 * Understanding is not required. Only obedience.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License ONLY.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
17 module chibackend
.decode
is aliced
;
24 import iv
.utfutil
: utf8CodeLen
, utf8Valid
;
28 // ////////////////////////////////////////////////////////////////////////// //
31 public byte utf8CodeLen (char ch) pure nothrow @trusted @nogc {
32 //pragma(inline, true);
33 if (ch < 0x80) return 1;
34 if ((ch&0b1111_1110) == 0b1111_1100) return 6;
35 if ((ch&0b1111_1100) == 0b1111_1000) return 5;
36 if ((ch&0b1111_1000) == 0b1111_0000) return 4;
37 if ((ch&0b1111_0000) == 0b1110_0000) return 3;
38 if ((ch&0b1110_0000) == 0b1100_0000) return 2;
43 // ////////////////////////////////////////////////////////////////////////// //
44 public bool utf8Valid (const(void)[] buf) pure nothrow @trusted @nogc {
45 const(ubyte)* bp = cast(const(ubyte)*)buf.ptr;
46 auto left = buf.length;
48 auto len = utf8CodeLen(*bp++)-1;
49 if (len < 0 || len > left) return false;
51 while (len-- > 0) if (((*bp++)&0b1100_0000) != 0b1000_0000) return false;
58 // ////////////////////////////////////////////////////////////////////////// //
59 public bool isValidNickUniChar (immutable dchar ch
) pure nothrow @safe @nogc {
62 (ch
>= '0' && ch
<= '9') ||
63 (ch
>= 'A' && ch
<= 'Z') ||
64 (ch
>= 'a' && ch
<= 'z') ||
65 ch
== '-' || ch
== '_' || ch
== '.' ||
66 isValidCyrillicUni(ch
);
70 public bool isValidUTFNick (const(char)[] s
) pure nothrow @safe @nogc {
71 if (s
.length
== 0) return false;
73 foreach (immutable char ch
; s
) {
74 dc
.decode(cast(ubyte)ch
);
75 if (dc
.invalid
) return false;
76 if (dc
.complete
&& !isValidNickUniChar(dc
.codepoint
)) return false;
82 // ////////////////////////////////////////////////////////////////////////// //
83 public bool isGoodCtlChar (immutable char ch
) pure nothrow @safe @nogc {
85 return (ch
== '\t' || ch
== '\n');
89 // ////////////////////////////////////////////////////////////////////////// //
90 public bool isGoodText (const(char)[] buf
) pure nothrow @trusted @nogc {
91 foreach (immutable char ch
; buf
) {
92 if (ch
== 127 ||
(ch
< 32 && !isGoodCtlChar(ch
))) return false;
98 // ////////////////////////////////////////////////////////////////////////// //
99 private bool isGoodFileNameChar (immutable char ch
) pure nothrow @safe @nogc {
100 if (ch
<= 32 || ch
== 127) return false;
101 if (ch
>= 128) return true;
102 if (ch
== '/' || ch
== '\\') return false;
107 // ////////////////////////////////////////////////////////////////////////// //
108 // this also sanitizes it
109 public T
toLowerStr (T
:const(char)[]) (T s
) nothrow @trusted {
110 static if (is(T
== typeof(null))) {
113 bool needwork
= false;
114 foreach (immutable char ch
; s
) {
115 if (ch
== 127 ||
(ch
< 32 && !isGoodCtlChar(ch
)) ||
(ch
>= 'A' && ch
<= 'Z')) {
124 res
.reserve(s
.length
);
125 foreach (immutable idx
, char ch
; s
) {
126 if (ch
== 13) { if (idx
+1 >= s
.length || s
.ptr
[idx
+1] != 10) res
~= '\n'; }
127 else if (ch
< 32 && !isGoodCtlChar(ch
)) res
~= ' ';
128 else if (ch
== 127) res
~= '~';
129 else if (ch
>= 'A' && ch
<= 'Z') res
~= ch
.tolower
;
132 return cast(T
)res
; // it is safe to cast here
138 // ////////////////////////////////////////////////////////////////////////// //
139 // this also sanitizes it
140 public T
sanitizeFileNameStr (T
:const(char)[]) (T s
) nothrow @trusted {
141 static if (is(T
== typeof(null))) {
144 bool needwork
= false;
145 foreach (immutable char ch
; s
) if (!isGoodFileNameChar(ch
)) { needwork
= true; break; }
149 char[] res
= new char[s
.length
];
151 foreach (ref char ch
; res
) {
152 if (!isGoodFileNameChar(ch
)) ch
= '_';
154 return cast(T
)res
; // it is safe to cast here
160 // ////////////////////////////////////////////////////////////////////////// //
161 public T
sanitizeStr (T
:const(char)[]) (T s
) nothrow @trusted {
162 static if (is(T
== typeof(null))) {
169 res
.reserve(s
.length
);
170 foreach (immutable idx
, char ch
; s
) {
171 if (ch
== 13) { if (idx
+1 >= s
.length || s
.ptr
[idx
+1] != 10) res
~= '\n'; }
172 else if (ch
< 32 && !isGoodCtlChar(ch
)) res
~= ' ';
173 else if (ch
== 127) res
~= '~';
176 return cast(T
)res
; // it is safe to cast here
182 // ////////////////////////////////////////////////////////////////////////// //
183 public T
sanitizeStrLine (T
:const(char)[]) (T s
) nothrow @trusted {
184 static if (is(T
== typeof(null))) {
188 foreach (immutable idx
, char ch
; s
) {
189 if (ch
< 32 || ch
== 127) { found
= true; break; }
190 if (ch
== 32 && (idx
== 0 || s
.ptr
[idx
-1] <= 32)) { found
= true; break; }
196 res
.reserve(s
.length
);
197 foreach (char ch
; s
) {
198 if (ch
< 32 || ch
== 127) ch
= ' ';
199 if (ch
<= 32 && (res
.length
== 0 || res
[$-1] <= 32)) continue;
202 while (res
.length
&& res
[$-1] <= 32) res
= res
[0..$-1];
203 return cast(T
)res
; // it is safe to cast here
209 // ////////////////////////////////////////////////////////////////////////// //
210 // for decoded subject parts
211 public T
sanitizeStrSubjPart (T
:const(char)[]) (T s
) nothrow @trusted {
212 static if (is(T
== typeof(null))) {
216 foreach (immutable idx
, immutable char ch
; s
) {
217 if (ch
< 32 || ch
== 127 || ch
== '_') { found
= true; break; }
222 char[] res
= new char[s
.length
];
224 foreach (ref char ch
; res
) if (ch
< 32 || ch
== 127 || ch
== '_') ch
= ' ';
225 return cast(T
)res
; // it is safe to cast here
231 // ////////////////////////////////////////////////////////////////////////// //
232 // this also sanitizes it
233 public T
binaryToUtf8 (T
:const(char)[]) (T s
) nothrow @trusted {
234 static if (is(T
== typeof(null))) {
238 foreach (immutable char ch
; s
) {
239 if (ch
>= 127 ||
(ch
< 32 && !isGoodCtlChar(ch
))) { found
= true; break; }
244 import iv
.utfutil
: utf8Valid
;
245 if (utf8Valid(s
)) return sanitizeStr(s
);
250 foreach (immutable char ch
; s
) {
252 immutable int len
= utf8Encode(uc
[], cast(dchar)ch
);
258 foreach (immutable idx
, char ch
; s
) {
260 if (ch
== 13) { if (idx
+1 >= s
.length || s
.ptr
[idx
+1] != 10) res
~= '\n'; }
261 else if (ch
< 32 && !isGoodCtlChar(ch
)) res
~= ' ';
262 else if (ch
== 127) res
~= '~';
265 immutable int len
= utf8Encode(uc
[], cast(dchar)ch
);
270 return cast(T
)res
; // it is safe to cast here
276 // ////////////////////////////////////////////////////////////////////////// //
277 // this also sanitizes it
278 public T
utf8ToUtf8 (T
:const(char)[]) (T s
) nothrow @trusted {
279 static if (is(T
== typeof(null))) {
283 foreach (immutable char ch
; s
) {
284 if (ch
>= 127 ||
(ch
< 32 && !isGoodCtlChar(ch
))) { found
= true; break; }
289 import iv
.utfutil
: utf8Valid
;
290 if (utf8Valid(s
)) return sanitizeStr(s
);
293 res
.reserve(s
.length
);
295 foreach (immutable idx
, char ch
; s
) {
296 if (utfleft
) { --utfleft
; res
~= ch
; continue; }
298 if (ch
== 13) { if (idx
+1 >= s
.length || s
.ptr
[idx
+1] != 10) res
~= '\n'; }
299 else if (ch
< 32 && !isGoodCtlChar(ch
)) res
~= ' ';
300 else if (ch
== 127) res
~= '~';
303 immutable byte ulen
= utf8CodeLen(ch
);
304 if (ulen
< 1) { res
~= '?'; continue; }
305 if (s
.length
-idx
< ulen
) { res
~= '?'; break; }
306 if (!utf8Valid(s
[idx
..idx
+ulen
])) { res
~= '?'; continue; }
311 return cast(T
)res
; // it is safe to cast here
317 // ////////////////////////////////////////////////////////////////////////// //
318 public T
subjRemoveRe(T
:const(char)[]) (T s
) nothrow @trusted {
319 static if (is(T
== typeof(null))) {
324 if (s
.length
< 3) break;
325 if (s
.ptr
[0] != 'r' && s
.ptr
[0] != 'R') break;
326 if (s
.ptr
[1] != 'e' && s
.ptr
[1] != 'E') break;
328 while (pp
< s
.length
&& s
.ptr
[pp
] <= 32) ++pp
;
329 if (pp
>= s
.length || s
.ptr
[pp
] != ':') break;
337 // ////////////////////////////////////////////////////////////////////////// //
338 private static immutable string b64alphabet
= "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
340 private static immutable ubyte[256] b64dc
= () {
341 ubyte[256] res
= 0xff; // invalid
342 foreach (immutable idx
, immutable char ch
; b64alphabet
) {
343 res
[cast(ubyte)ch
] = cast(ubyte)idx
;
345 res
['='] = 0xfe; // padding
348 res
[127] = 0xf0; // just in case
352 public char[] decodeBase64(bool ignoreUnderscore
=false) (const(void)[] datavoid
, out bool error
) nothrow @trusted {
353 const(ubyte)[] data
= cast(const(ubyte)[])datavoid
;
355 bool inPadding
= false;
360 dcx
.reserve((data
.length
+3U)/4U*3U+8U);
363 bool decodeChunk () nothrow @trusted {
364 if (btspos
== 0) return true;
365 if (btspos
== 1) return false; //throw new Base64Exception("incomplete data in base64 decoder");
366 dcx
~= cast(char)((bts.ptr
[0]<<2)|
((bts.ptr
[1]&0x30)>>4)); // 2 and more
367 if (btspos
> 2) dcx
~= cast(char)(((bts.ptr
[1]&0x0f)<<4)|
((bts.ptr
[2]&0x3c)>>2)); // 3 and more
368 if (btspos
> 3) dcx
~= cast(char)(((bts.ptr
[2]&0x03)<<6)|
bts.ptr
[3]);
372 while (data
.length
) {
373 immutable ubyte cb
= b64dc
.ptr
[data
.ptr
[0]];
374 if (cb
== 0xff) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
376 if (cb
== 0xf0) continue; // empty
377 static if (ignoreUnderscore
) {
378 if (cb
== '_') continue;
383 if (!decodeChunk()) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
386 if (++btspos
== 4) { inPadding
= false; btspos
= 0; }
390 if (btspos
!= 0) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
393 bts.ptr
[btspos
++] = cb
;
395 if (!decodeChunk()) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
400 if (btspos
!= 0 && !inPadding
) {
401 // assume that it is not padded
402 if (!decodeChunk()) { error
= true; delete dcx
; return "<invalid base64 data>".dup
; }
409 // ////////////////////////////////////////////////////////////////////////// //
410 public char[] decodeQuotedPrintable(bool multiline
) (const(void)[] datavoid
) nothrow @trusted {
411 const(char)[] data
= cast(const(char)[])datavoid
;
413 dcx
.reserve(data
.length
);
414 while (data
.length
) {
415 if (data
[0] == '=') {
416 if (data
.length
== 1) break;
417 if (data
.length
>= 3 && digitInBase(data
.ptr
[1], 16) >= 0 && digitInBase(data
.ptr
[2], 16) >= 0) {
418 dcx
~= cast(char)(digitInBase(data
.ptr
[1], 16)*16+digitInBase(data
.ptr
[2], 16));
422 // check if it is followed by blanks up to the newline
423 // if it is so, then this is "line continuation" -- remove both '=' and blanks
424 static if (multiline
) {
427 while (epos
< data
.length
) {
428 immutable char ch
= data
.ptr
[epos
++];
429 if (ch
== 9 || ch
== 32) continue;
430 if (ch
== 10) { ateol
= true; break; }
432 if (epos
>= data
.length || data
.ptr
[epos
] != 10) { ateol
= true; break; }
438 if (ateol || epos
>= data
.length
) {
439 data
= data
[epos
..$];
451 // ////////////////////////////////////////////////////////////////////////// //
452 public T
ensureProper7Bit(T
:const(char)[]) (T s
) nothrow @trusted {
453 static if (is(T
== typeof(null))) {
456 bool needwork
= false;
457 foreach (immutable char ch
; s
) if (ch
>= 128) { needwork
= true; break; }
458 if (!needwork
) return s
;
459 char[] dcx
= new char[s
.length
];
461 foreach (ref char ch
; dcx
) ch
&= 0x7f;
462 return cast(T
)dcx
; // it is safe to cast here
467 // ////////////////////////////////////////////////////////////////////////// //
468 // decode things like "=?UTF-8?B?Tm9yZGzDtnc=?="
469 public T
decodeSubj(T
:const(char)[]) (T s
) nothrow @trusted {
470 static if (is(T
== typeof(null))) {
473 if (s
.indexOf("=?") < 0) return s
.sanitizeStrLine
.utf8ToUtf8
;
475 // have to do some work
478 res
.reserve(s
.length
); // at least
480 while (s
.length
> 2) {
481 auto stqpos
= s
.indexOf("=?");
482 if (stqpos
< 0) break;
483 if (stqpos
> 0) res
~= s
[0..stqpos
].utf8ToUtf8
;
486 auto eepos
= s
.indexOf('?');
487 if (eepos
< 0) break;
488 auto enc
= s
[0..eepos
];
490 //conwriteln("ENCODING: '", enc, "'");
492 if (enc
.length
== 0) enc
= "utf-8";
493 if (s
.length
< 2 || s
.ptr
[1] != '?') return origs
.sanitizeStrLine
.utf8ToUtf8
;
497 eepos
= s
.indexOf("?=");
498 if (eepos
< 0) return origs
.sanitizeStrLine
.utf8ToUtf8
;
500 auto part
= s
[0..eepos
];
503 // several encoded parts may be separated with spaces; those spaces should be ignored
505 while (stqpos
< s
.length
&& s
.ptr
[stqpos
] <= ' ') ++stqpos
;
506 if (s
.length
-stqpos
>= 2 && s
.ptr
[stqpos
] == '=' && s
.ptr
[stqpos
+1] == '?') s
= s
[stqpos
..$];
509 if (ect
== 'Q' || ect
== 'q') {
511 part
= cast(T
)decodeQuotedPrintable
!false(part
); // it is safe to cast here
512 } else if (ect
== 'B' || ect
== 'b') {
516 part
= cast(T
)decodeBase64
!true(part
, out error
); // it is safe to cast here
518 //conwriteln("CANNOT DECODE B64: ", xpart);
520 return origs
.sanitizeStrLine
.utf8ToUtf8
;
524 // reencode part if necessary
525 if (!enc
.strEquCI("utf-8") && !enc
.strEquCI("utf8") && !enc
.strEquCI("US-ASCII")) {
527 //conwriteln("RECODING: ", enc);
528 part
= recode(part
, "utf-8", enc
);
529 } catch (Exception e
) {
530 //conwriteln("RECODE ERROR: ", e.msg);
531 return origs
.sanitizeStrLine
.utf8ToUtf8
;
535 part
= part
.sanitizeStrSubjPart
.utf8ToUtf8
;
536 if (part
.length
) res
~= part
;
539 if (s
.length
) res
~= s
.utf8ToUtf8
;
540 return cast(T
)res
.sanitizeStrLine
; // it should be valid utf8 here; also, it is safe to cast here
545 // ////////////////////////////////////////////////////////////////////////// //
546 // decode content with the given encoding type
547 public T
decodeContent(T
:const(char)[]) (T data
, const(char)[] encoding
) nothrow @trusted {
548 static if (is(T
== typeof(null))) {
551 if (data
.length
== 0 || encoding
.length
== 0 || encoding
.strEquCI("8bit") || encoding
.strEquCI("binary")) {
555 if (encoding
.strEquCI("7bit")) {
556 return cast(T
)ensureProper7Bit(data
); // it is safe to cast here
559 if (encoding
.strEquCI("base64")) {
561 return cast(T
)decodeBase64(data
, out error
); // it is safe to cast here
564 if (encoding
.strEquCI("quoted-printable")) {
565 return cast(T
)decodeQuotedPrintable
!true(data
); // it is safe to cast here
568 if (encoding
.length
!= 0) {
569 char[] res
= "<invalid encoding:".dup
;
572 return cast(T
)res
; // it is safe to cast here
580 // ////////////////////////////////////////////////////////////////////////// //
581 public T
recodeToUtf8(T
:const(char)[]) (T data
, const(char)[] charset
) nothrow @trusted {
582 static if (is(T
== typeof(null))) {
585 if (data
.length
== 0) return data
;
587 foreach (immutable char ch
; data
) if (ch
>= 128) { found
= true; break; }
588 if (!found
) return sanitizeStr(data
);
589 if (charset
.length
== 0 || charset
.strEquCI("utf-8") || charset
.strEquCI("utf8") || charset
.strEquCI("US-ASCII")) {
590 return utf8ToUtf8(data
);
593 data
= recode(data
, "utf-8", charset
);
594 if (data
.length
== 0) return data
;
595 return data
.sanitizeStr
;
596 } catch (Exception e
) {}
597 char[] res
= "<cannot decode '".dup
;
598 foreach (char ch
; charset
) {
599 if (ch
<= 32 || ch
>= 127) continue;
603 return cast(T
)res
; // it is safe to cast here
608 // ////////////////////////////////////////////////////////////////////////// //
609 private T
mailNameUnquote (T
:const(char)[]) (T buf
) pure nothrow @trusted @nogc {
610 static if (is(T
== typeof(null))) {
614 if (buf
.length
>= 2) {
615 if ((buf
.ptr
[0] == '"' && buf
[$-1] == '"') ||
616 (buf
.ptr
[0] == '<' && buf
[$-1] == '>') ||
617 (buf
.ptr
[0] == '`' && buf
[$-1] == '\'') ||
618 (buf
.ptr
[0] == '\'' && buf
[$-1] == '\''))
620 buf
= buf
[1..$-1].xstrip
;
628 // ////////////////////////////////////////////////////////////////////////// //
629 // extract email from decoded "From" and "To" fields
630 public T
extractMail(bool doSanitize
=true, T
:const(char)[]) (T data
) nothrow @trusted {
631 static if (is(T
== typeof(null))) {
634 if (data
.length
== 0) return data
;
635 if (data
[$-1] == '>') {
636 usize pos
= data
.length
;
637 while (pos
> 0 && data
.ptr
[pos
-1] != '<') --pos
;
638 data
= data
[pos
..$-1].xstrip
;
642 static if (doSanitize
) {
643 // hack for idiotic LJ (those morons are breaking all possible standards)
644 auto sppos
= data
.indexOf(' ');
645 if (sppos
> 0) data
= data
[0..sppos
];
647 return data
.toLowerStr
;
652 // ////////////////////////////////////////////////////////////////////////// //
653 // strip email from decoded "From" and "To" fields
654 public T
stripMail(T
:const(char)[]) (T data
) nothrow @trusted {
655 static if (is(T
== typeof(null))) {
658 if (data
.length
== 0) return data
;
659 if (data
[$-1] == '>') {
660 usize pos
= data
.length
;
661 while (pos
> 0 && data
.ptr
[pos
-1] != '<') --pos
;
662 if (pos
== 0) return data
[0..0];
663 return data
[0..pos
-1].xstrip
;
670 // ////////////////////////////////////////////////////////////////////////// //
671 // extract name from decoded "From" and "To" fields
672 // can construct name if there is none
673 // special hack for idiotic LJ
674 public T
extractName(T
:const(char)[]) (T data
) nothrow @trusted {
675 static if (is(T
== typeof(null))) {
678 if (data
.length
== 0) return data
;
679 auto origData
= data
;
680 T mail
= extractMail(data
);
681 data
= stripMail(data
).decodeSubj
.xstrip
;
682 // hack for idiotic LJ (those morons are breaking all possible standards)
683 if (mail
.startsWith("lj_dontreply@lj.rossia.org")) {
684 auto dd = extractMail
!false(origData
);
685 auto spos
= dd.indexOf(" (");
687 dd = dd[spos
+2..$-(dd[$-1] == ')' ?
1 : 0)].xstrip
;
688 if (dd == "LJR Comment") {
690 } else if (dd.endsWith(" - LJR Comment")) {
691 auto dpos
= dd.lastIndexOf('-');
692 dd = dd[0..dpos
].xstrip
;
693 if (dd.length
== 0) dd = "anonymous";
695 dd = dd.mailNameUnquote
;
696 if (dd.length
) return dd;
699 data
= data
.mailNameUnquote
;
701 if (mail
.startsWith("lj-notify@livejournal.com")) {
702 if (data
== "LJ Comment") {
704 } else if (data
.endsWith(" - LJ Comment")) {
705 auto dpos
= data
.lastIndexOf('-');
706 data
= data
[0..dpos
].xstrip
;
707 if (data
.length
== 0) data
= "anonymous";
712 // construct name from the mail
713 auto npos
= mail
.indexOf('@');
714 if (npos
<= 0) return mail
;
715 data
= mail
[0..npos
].xstrip
;
716 if (data
.length
== 0) return mail
;
718 res
.reserve(data
.length
);
719 foreach (char ch
; data
) {
720 if (ch
<= 32 || ch
== '.' || ch
== '-' || ch
== '_') ch
= 32;
722 if (res
.length
&& res
[$-1] != 32) res
~= ch
;
724 if (res
.length
== 0 || res
[$-1] == 32) ch
= ch
.toupper
; else ch
= ch
.tolower
;
729 if (res
.length
== 0) return mail
;
730 return cast(T
)res
; // it is safe to cast here
735 // ////////////////////////////////////////////////////////////////////////// //
736 // encode string if it contains some non-ascii
737 // always returns new string, which is safe to `delete`
738 // passed string must be in UTF-8
739 // can return `null` for empty string
740 public dynstring
strEncodeQ (const(char)[] s
) nothrow @trusted {
741 static bool isSpecial (immutable char ch
) pure nothrow @safe @nogc {
752 if (s
.length
== 0) return res
;
753 static immutable string hexd
= "0123456789abcdef";
754 bool needWork
= (s
[0] == '=' || s
[0] == '?');
755 if (!needWork
) foreach (char ch
; s
) if (isSpecial(ch
)) { needWork
= true; break; }
759 res
.reserve(s
.length
*3+32);
760 res
~= "=?UTF-8?Q?"; // quoted printable
761 foreach (char ch
; s
) {
762 if (ch
<= ' ') ch
= '_';
763 if (!isSpecial(ch
) && ch
!= '=' && ch
!= '?') {
767 res
~= hexd
[(cast(ubyte)ch
)>>4];
768 res
~= hexd
[(cast(ubyte)ch
)&0x0f];