sqlite: yet another schema change: threading is tag-local now, and msgid into is...
[chiroptera.git] / sqbase_experiment / zsq_rebuild_support.d
blobcd0f52e24cedac0cf8bae15aa9cb08b45b80063e
1 // this rebuilds supplementary tables in SQLite mail database.
2 // it parses the messages, extract mime parts, and populates
3 // "headers", "content", and "attaches" tables.
4 // those tables are required for the main program to work properly,
5 // but they can be dropped and recreated at any time.
6 module zsq_rebuild is aliced;
8 import std.digest.ripemd;
10 import iv.encoding;
11 import iv.sq3;
12 import iv.strex;
13 import iv.timer;
14 import iv.vfs;
15 import iv.vfs.io;
17 import chiro_sqbase;
18 import chiro_decode;
19 import chiro_parse;
22 // ////////////////////////////////////////////////////////////////////////// //
23 uint parseMailDate (const(char)[] s) {
24 import std.datetime;
25 if (s.length == 0) return 0;
26 try {
27 return cast(uint)(parseRFC822DateTime(s).toUTC.toUnixTime);
28 } catch (Exception) {}
29 // sometimes this helps
30 s ~= "00";
31 try {
32 return cast(uint)(parseRFC822DateTime(s).toUTC.toUnixTime);
33 } catch (Exception) {}
34 return 0;
38 // ////////////////////////////////////////////////////////////////////////// //
39 void main (string[] args) {
40 // here, i don't have any threads at all
41 if (sqlite3_config(SQLITE_CONFIG_SINGLETHREAD) != SQLITE_OK) throw new Exception("cannot switch SQLite to multi-threaded mode");
43 string mailpath = "~/Mail";
44 int packlevel = ChiroDefaultPackLevel;
45 if (args.length > 1) {
46 for (usize f = 1; f < args.length; ++f) {
47 if (args[f].strEquCI("--nocompress")) packlevel = 0;
48 else if (args[f].strEquCI("--max")) packlevel = 666;
49 else if (args[f].length == 2 && args[f][0] == '-' && args[f][1] >= '0' && args[f][1] <= '9') packlevel = args[f][1]-'0';
50 else if (args[f].strEquCI("--mailpath")) { mailpath = args[f+1]; ++f; }
54 auto dbstore = chiroOpenStorageDB(readonly:true);
56 writeln("creating message support db (compression level ", packlevel, ")...");
57 auto db = chiroRecreateSupportDB();
59 uint total = 0;
60 foreach (auto row; dbstore.persistentStatement(`SELECT count(uid) AS total FROM messages WHERE tags <> '';`).range) {
61 total = row.total!uint;
62 break;
64 writeln("found ", total, " messages to process.");
66 // this monstrosity selects all unique tags
67 auto stAllTags = dbstore.persistentStatement(`
68 WITH RECURSIVE tagtable(tag, rest) AS (
69 VALUES('', (SELECT group_concat(tags,'|') FROM (SELECT DISTINCT(tags) AS tags FROM messages WHERE tags <> ''))||'|')
70 UNION ALL
71 SELECT
72 SUBSTR(rest, 0, INSTR(rest, '|')),
73 SUBSTR(rest, INSTR(rest, '|')+1)
74 FROM tagtable
75 WHERE rest <> '')
76 SELECT DISTINCT(tag) AS tag
77 FROM tagtable
78 WHERE tag <> ''
79 ORDER BY tag;
80 `);
82 auto stInsTagName = db.persistentStatement("INSERT INTO tagnames (tag, hidden) VALUES(:tag,:hidden);");
84 // insert all unique tags
85 foreach (auto row; stAllTags.range) {
86 auto tag = row.tag!SQ3Text;
87 assert(tag.length);
88 stInsTagName
89 .bindText(":tag", tag, transient:false)
90 // all non-folder tags are hidden
91 .bind(":hidden", (tag[0] != '/' ? 1 : 0))
92 .doAll();
95 auto stInsInfo = db.persistentStatement(`
96 INSERT INTO info
97 ( uid, from_name, from_mail, subj, to_name, to_mail)
98 VALUES(:uid,:from_name,:from_mail,:subj,:to_name,:to_mail);
99 `);
101 auto stInsMsgId = db.persistentStatement(`
102 INSERT INTO msgids
103 ( uid, msgid, time)
104 VALUES(:uid,:msgid,:time);
107 auto stInsMsgRefId = db.persistentStatement(`
108 INSERT INTO refids
109 ( uid, idx, msgid)
110 VALUES(:uid,:idx,:msgid);
113 auto stInsThreads = db.persistentStatement(`
114 INSERT INTO threads
115 ( uid, tagid, time, nntpidx, appearance)
116 VALUES(:uid,(SELECT uid FROM tagnames WHERE tag=:tagname LIMIT 1),:time,:nntpidx, 1);
119 auto stInsContent = db.persistentStatement(`
120 INSERT INTO content
121 ( uid, mime, format, content)
122 VALUES(:uid,:mime,:format, ChiroPack(:content, :dopack));
125 auto stInsAttach = db.persistentStatement(`
126 INSERT INTO attaches
127 ( uid, mime, name, content)
128 VALUES(:uid,:mime,:name, ChiroPack(:content, :dopack));
131 version(none) {
132 // this unpacks twice, but meh
133 auto stAllMsgs = dbstore.persistentStatement(`
134 SELECT
135 uid AS uid
136 , ChiroExtractHeaders(ChiroUnpack(data)) AS headers
137 , ChiroExtractBody(ChiroUnpack(data)) AS body
138 , tags AS tags
139 FROM messages
140 WHERE tags <> ''
141 ORDER BY uid;`);
143 //stAllMsgs.doAll();
144 foreach (auto row; stAllMsgs.range) {
145 //writeln("COLCOUNT: ", row.colcount_); foreach (int cn; 0..row.colcount_) writeln(" ", cn, ": ", row.colname_(cn));
146 writeln(row.addr!SQ3Text, ": ", row.opcode!SQ3Text,
147 "; p1=", row.p1!SQ3Text, "; p2=", row.p2!SQ3Text,
148 "; p3=", row.p3!SQ3Text, "; p4=", row.p4!SQ3Text, "; p5=", row.p5!SQ3Text,
149 "; comment=", row.comment!SQ3Text);
150 //writeln("id=", row.id!SQ3Text, "; parent=", row.parent!SQ3Text, "; notused=", row.notused!SQ3Text, "; detail=", row.detail!SQ3Text);
152 return;
154 } else {
155 // this should cache unpack results
156 auto stAllMsgs = dbstore.persistentStatement(`
157 WITH msgunpacked(msguid, msgdata, msgtags) AS (
158 SELECT uid AS msguid, ChiroUnpack(data) AS msgdata, tags AS msgtags
159 FROM messages
160 WHERE tags <> ''
161 ORDER BY uid
163 SELECT
164 msguid AS uid
165 , ChiroExtractHeaders(msgdata) AS headers
166 , ChiroExtractBody(msgdata) AS body
167 , msgtags AS tags
168 FROM msgunpacked;`);
171 Timer ctm = Timer(true);
173 db.execute("begin transaction;");
174 uint count = 0;
175 foreach (auto row; stAllMsgs.range) {
176 //import std.conv : to;
177 //writeln("type=", row.text!DBFieldType, "; val=<", row.text!SQLStringc, ">");
178 auto hdrs = row.headers!SQ3Text;
179 auto body = row.body!SQ3Text;
180 auto tags = row.tags!SQ3Text;
181 uint uid = row.uid!uint;
182 assert(tags.length);
184 version(none) {
185 writeln("---------------- ", uid, " ----------------");
186 writeln(row.data!SQ3Text);
187 writeln("---");
188 writeln(hdrs);
189 writeln("---");
190 writeln(body);
191 writeln("============================================");
194 // insert from/to/subj info
196 auto subj = findHeaderField(hdrs, "Subject").decodeSubj.subjRemoveRe;
197 auto from = findHeaderField(hdrs, "From").decodeSubj;
198 auto to = findHeaderField(hdrs, "To").decodeSubj;
199 stInsInfo
200 .bind(":uid", uid)
201 .bind(":from_name", from.extractName)
202 .bind(":from_mail", from.extractMail)
203 .bind(":subj", subj)
204 .bind(":to_name", to.extractName)
205 .bind(":to_mail", to.extractMail)
206 .doAll();
209 // insert references
211 uint refidx = 0;
212 auto inreplyfld = findHeaderField(hdrs, "In-Reply-To");
213 while (inreplyfld.length) {
214 auto id = getNextFieldValue(inreplyfld);
215 if (id.length) {
216 stInsMsgRefId
217 .bind(":uid", uid)
218 .bind(":idx", refidx++)
219 .bind(":msgid", id)
220 .doAll();
224 inreplyfld = findHeaderField(hdrs, "References");
225 while (inreplyfld.length) {
226 auto id = getNextFieldValue(inreplyfld);
227 if (id.length) {
228 stInsMsgRefId
229 .bind(":uid", uid)
230 .bind(":idx", refidx++)
231 .bind(":msgid", id)
232 .doAll();
237 // will be used to mark msgids
238 uint msgtime = 0;
240 // basic thread info
242 // nntp index
243 uint nntpidx = 0;
245 auto nntpidxfld = findHeaderField(hdrs, "NNTP-Index");
246 if (nntpidxfld.length) {
247 auto id = nntpidxfld.getFieldValue;
248 if (id.length) {
249 foreach (immutable ch; id) {
250 if (ch < '0' || ch > '9') { nntpidx = 0; break; }
251 if (nntpidx == 0 && ch == '0') continue;
252 immutable uint nn = nntpidx*10u+(ch-'0');
253 if (nn <= nntpidx) nntpidx = 0x7fffffff; else nntpidx = nn;
259 // time
260 uint time = 0;
262 auto datefld = findHeaderField(hdrs, "Injection-Date");
263 if (datefld.length != 0) {
264 auto v = datefld.getFieldValue;
265 try {
266 time = parseMailDate(v);
267 } catch (Exception) {
268 writeln("UID=", uid, ": FUCKED INJECTION-DATE: |", v, "|");
269 time = 0; // just in case
272 if (!time) {
273 // obsolete NNTP date field, because why not?
274 datefld = findHeaderField(hdrs, "NNTP-Posting-Date");
275 if (datefld.length != 0) {
276 auto v = datefld.getFieldValue;
277 try {
278 time = parseMailDate(v);
279 } catch (Exception) {
280 writeln("UID=", uid, ": FUCKED NNTP-POSTING-DATE: |", v, "|");
281 time = 0; // just in case
285 if (!time) {
286 datefld = findHeaderField(hdrs, "Date");
287 if (datefld.length != 0) {
288 auto v = datefld.getFieldValue;
289 try {
290 time = parseMailDate(v);
291 } catch (Exception) {
292 writeln("UID=", uid, ": FUCKED DATE: |", v, "|");
293 time = 0; // just in case
299 // finally, try to get time from "Received:"
300 //Received: from dns9.fly.us ([131.103.96.154]) by np5-d2.fly.us with Microsoft SMTPSVC(5.0.2195.6824); Tue, 21 Mar 2017 17:35:54 -0400
301 if (!time) {
302 //writeln("!!! --- !!!");
303 uint lowesttime = uint.max;
304 foreach (uint fidx; 0..uint.max) {
305 auto recvfld = findHeaderField(hdrs, "Received", fidx);
306 if (recvfld.length == 0) break;
307 auto lsemi = recvfld.lastIndexOf(';');
308 if (lsemi >= 0) recvfld = recvfld[lsemi+1..$].xstrip;
309 if (recvfld.length != 0) {
310 auto v = recvfld.getFieldValue;
311 uint tm = 0;
312 try {
313 tm = parseMailDate(v);
314 } catch (Exception) {
315 writeln("UID=", uid, ": FUCKED RECV DATE: |", v, "|");
316 tm = 0; // just in case
318 //writeln(tm, " : ", lowesttime);
319 if (tm && tm < lowesttime) lowesttime = tm;
322 if (lowesttime != uint.max) time = lowesttime;
324 msgtime = time;
326 // create thread record for each tag
327 while (tags.length) {
328 auto eep = tags.indexOf('|');
329 auto tagname = (eep >= 0 ? tags[0..eep] : tags[0..$]);
330 tags = (eep >= 0 ? tags[eep+1..$] : tags[0..0]);
331 if (tagname.length == 0) continue;
332 stInsThreads
333 .bind(":uid", uid)
334 .bindText(":tagname", tagname, transient:false)
335 .bind(":time", time)
336 .bind(":nntpidx", nntpidx)
337 .doAll();
341 // insert msgid
343 bool hasmsgid = false;
344 auto msgidfield = findHeaderField(hdrs, "Message-Id");
345 if (msgidfield.length) {
346 auto id = msgidfield.getFieldValue;
347 if (id.length) {
348 hasmsgid = true;
349 stInsMsgId
350 .bind(":uid", uid)
351 .bind("time", msgtime)
352 .bindText(":msgid", id, transient:false)
353 .doAll();
356 // if there is no msgid, create one
357 if (!hasmsgid) {
358 RIPEMD160 hash;
359 hash.start();
360 hash.put(cast(const(ubyte)[])hdrs);
361 hash.put(cast(const(ubyte)[])body);
362 ubyte[20] digest = hash.finish();
363 char[20*2+2] buf;
364 import core.stdc.stdio : snprintf;
365 foreach (immutable idx, ubyte b; digest[]) snprintf(buf.ptr+idx*2, 3, "%02x", b);
366 stInsMsgId
367 .bind(":uid", uid)
368 .bind("time", msgtime)
369 .bindText(":msgid", buf[0..20*2], transient:false)
370 .doAll();
374 // insert base content and attaches
376 Content[] content;
377 parseContent(ref content, hdrs, body);
378 foreach (const ref Content cc; content) {
379 if (cc.name.length) {
380 stInsAttach
381 .bind(":uid", uid)
382 .bindText(":mime", cc.mime)
383 .bindText(":name", cc.name)
384 .bindBlob(":content", cc.data)
385 .bind(":dopack", packlevel)
386 .doAll();
387 } else {
388 stInsContent
389 .bind(":uid", uid)
390 .bindText(":mime", cc.mime)
391 .bindText(":format", cc.format)
392 .bindBlob(":content", cc.data)
393 .bind(":dopack", packlevel)
394 .doAll();
399 version(none) {
400 writeln("==== UID: ", row.uid!uint, " ===");
401 auto msgidfield = findHeaderField(hdrs, "Message-Id");
402 if (msgidfield.length) {
403 auto id = msgidfield.getFieldValue;
404 writeln("MSGID: {", id, "}");
406 auto subj = findHeaderField(hdrs, "Subject").decodeSubj.subjRemoveRe;
407 auto from = findHeaderField(hdrs, "From").decodeSubj;
408 auto to = findHeaderField(hdrs, "To").decodeSubj;
409 writeln("FROM: |", from.recodeToKOI8, "| MAIL=|", from.extractMail.recodeToKOI8, "| NAME=|", from.extractName.recodeToKOI8, "|");
410 writeln(" TO: |", to.recodeToKOI8, "| MAIL=|", to.extractMail.recodeToKOI8, "| NAME=|", to.extractName.recodeToKOI8, "|");
411 writeln("SUBJ: |", subj.recodeToKOI8, "|");
414 ++count;
415 if (count%1024 == 1) {
416 write(" ", count, " ", count*100u/total, "% ", row.tags!SQ3Text, "\x1b[K\r");
417 db.execute("commit transaction;");
418 db.execute("begin transaction;");
419 //if (count > 1) break;
422 db.execute("commit transaction;");
424 ctm.stop;
425 writeln("time: ", ctm.toString, "\x1b[K");
427 writeln(count, " messages processed.\x1b[K");
429 writeln("closing the db");
430 db.close();
432 dbstore.close();