update changes
[QuestHelper.git] / Development / gmail.py
blobf7c17ae4e37eef917f5ea015401346cf3125da79
1 #!/usr/bin/python
import commands
import hashlib
import md5
import os
import re
import subprocess
import sys
import time

#Note: We have a custom version of libgmail to fix a bug involving binary attachments (which obviously we have.)
import libgmail

import passwords
# Remove any rawdata_* temp files in the working directory before starting.
os.system("rm rawdata_*")

# Maps the 32-hex-digit hash found in each S3 object name to whatever sits
# between that hash and the trailing ".bz2" (normally the file extension).
filehashdict = {}

ct = 0
tregex = re.compile(r"rawdata_([0-9a-f]{32,32})(.*)\.bz2")
outp = commands.getoutput("s3cmd ls s3://questhelper_data/rawdata_")
print("S3 listing snagged")
for line in outp.split('\n'):
    if line == "Bucket 'questhelper_data':":
        continue
    serch = tregex.search(line)
    if not serch:
        # Not a rawdata object line: echo it for the operator and move on.
        # Without this `continue` the .group() calls below would be made on
        # None and abort the whole run with AttributeError.
        print(line)
        continue
    toki = serch.group(1)
    ext = serch.group(2)
    filehashdict[toki] = ext
print("Filenames isolated: %d" % len(filehashdict))
# Log in to Gmail with the credentials kept in the local `passwords` module.
ga = libgmail.GmailAccount(passwords.gmail_username, passwords.gmail_password)
ga.login()

# Directory that receives the downloaded attachments.
destination = "./LocalInput/"
# Label applied to threads once their attachments have been handled.
label = passwords.gmail_label

# The query used to be "!label:<label> has:attachment", i.e. it skipped
# threads already carrying `label`. That assignment was immediately
# overwritten, so only the effective query is kept: every message with an
# attachment is fetched.
argument = "has:attachment"
inbox = ga.getMessagesByQuery(argument)
# Running count of messages processed.
i = 0

print("%d messages" % len(inbox))
# Seconds to wait before retrying a failed S3 download.
S3_RETRY_DELAY = 30


def _run(argv, stdout_path=None):
    """Run argv directly (no shell) and return its exit status.

    When stdout_path is given, the command's standard output is written to
    that file, which replaces a shell ``>`` redirection.
    """
    if stdout_path is None:
        return subprocess.call(argv)
    out = open(stdout_path, "wb")
    try:
        return subprocess.call(argv, stdout=out)
    finally:
        out.close()


def _run_checked(argv, stdout_path=None):
    """Run argv like _run, raising RuntimeError on a non-zero exit status."""
    status = _run(argv, stdout_path)
    if status != 0:
        raise RuntimeError("command failed with status %r: %r" % (status, argv))


# Main loop: for every attachment of every message returned by the query,
# save it locally under its md5 hash, then either upload it to S3 (first
# sighting) or download the stored copy and verify it matches (later
# sightings). A message is trashed only once all of its attachments have been
# verified against S3, after which the query is re-run.
#
# NOTE(review): a message with an attachment whose content is None is never
# trashed, so if it keeps matching the query this loop will not finish on its
# own -- confirm that is acceptable.
while len(inbox) > 0:
    for thread in inbox:
        for message in thread:
            # mark:  add `label` to the thread afterwards (skipped when the
            #        thread already carries it or an attachment is broken).
            # clear: trash this message afterwards.
            mark = thread.getLabels().count(label) == 0
            clear = True
            print("message %d id: %s" % (i, message.id))

            # We used to process only messages without the label; now every
            # message is processed.
            print("\t%d attachments" % len(message.attachments))
            for a in message.attachments:
                # Attachment names come from email and are untrusted: drop
                # non-ASCII characters and neutralise '*' and '/' so the name
                # cannot glob or escape the destination directory.
                a.filename = (a.filename.encode('ascii', 'ignore')
                              .replace('*', '_').replace('/', '_'))
                print("\t\t filename: %s" % a.filename)

                cont = a.content
                if cont is None:
                    print("foobared attachment")
                    mark = False
                    clear = False
                    continue

                # Local name is "<md5 of content><everything from the first '.'>".
                pre = hashlib.md5(cont).hexdigest()
                tup = a.filename.partition(".")
                suffix = tup[1] + tup[2]
                name = pre + suffix
                local_path = destination + name

                # Attachments are binary, so write bytes, and always close.
                f = open(local_path, "wb")
                try:
                    f.write(cont)
                finally:
                    f.close()
                print("\t\t saved")

                s3name = "rawdata_" + name + ".bz2"
                if pre not in filehashdict:
                    # okay, that's cool. Now we S3 it.
                    _run_checked(["bzip2", "-k", "--best", "-c", local_path],
                                 stdout_path=s3name)
                    _run_checked(["s3cmd", "put", s3name, "s3://questhelper_data"])
                    if os.system("rm rawdata_*") != 0:
                        raise RuntimeError("could not remove rawdata_* temp files")
                    print("\t\t S3 saved")
                    # We only look at the first page of emails, over and
                    # over. Recording the hash here means the second pass
                    # through that page will download-and-verify, then trash,
                    # instead of re-uploading forever.
                    filehashdict[pre] = suffix
                    clear = False
                else:
                    s3oldname = "rawdata_" + pre + filehashdict[pre] + ".bz2"
                    if s3oldname != s3name:
                        print("\t\t WARNING: Name mismatch! %s vs %s" % (s3name, s3oldname))
                    s3get = ["s3cmd", "--force", "get",
                             "s3://questhelper_data/" + s3oldname, s3oldname]
                    # Keep retrying the download until s3cmd succeeds.
                    while _run(s3get) != 0:
                        print("\t\t s3cmd failed, sleeping for %d seconds . . ." % S3_RETRY_DELAY)
                        time.sleep(S3_RETRY_DELAY)
                    # Decompress the stored copy and require it to be
                    # identical to what we just downloaded from Gmail.
                    _run_checked(["bunzip2", "-c", s3oldname],
                                 stdout_path="rawdata_temptest")
                    _run_checked(["diff", "-q", "rawdata_temptest", local_path])
                    os.remove("rawdata_temptest")
                    os.remove(s3oldname)

            if clear:
                print("\t Trashing")
                ga.trashMessage(message)
            i = i + 1
            # NOTE(review): marking is done per message here; confirm it was
            # not meant to run once per thread after all its messages.
            if mark:
                print("\t Marking")
                thread.addLabel(label)
    # Re-run the query: trashed messages drop out, the rest come back.
    inbox = ga.getMessagesByQuery(argument)

print("%d messages examined and saved" % i)

# Final sweep of any rawdata_* temp files left in the working directory.
os.system("rm rawdata_*")