Small update
[Python-Scripts.git] / MiniScripts / motherless-dl.py
blobbe72639aa978b071c86cfd86efe8c5a00849eb53
1 #!/usr/bin/env python
3 '''
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the Revised BSD License.
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 Revised BSD License for more details.
12 Copyright 2013 Cool Dude 2k - http://idb.berlios.de/
13 Copyright 2013 Game Maker 2k - http://intdb.sourceforge.net/
14 Copyright 2013 Kazuki Przyborowski - https://github.com/KazukiPrzyborowski
16 $FileInfo: motherless-dl.py - Last Update: 11/10/2013 Ver. 1.6.7 RC 1 - Author: cooldude2k $
17 '''
19 from __future__ import absolute_import, division, print_function
21 import argparse
22 import datetime
23 import gzip
24 import os
25 import re
26 import sys
27 import time
28 import urllib
30 import cookielib
31 import StringIO
32 import urllib2
33 import urlparse
if __name__ == "__main__":
    # When run as a script, suppress traceback frames so that failures are
    # reported as a single error line.
    sys.tracebacklimit = 0

# Version tuple: (major, minor, patch, release tag or None).
__version_info__ = (1, 6, 5, "RC 3")
__version_date__ = "2013.10.23"

# Human readable version: "major.minor.patch", followed by the release tag
# (separated by one space) when a tag is present.
__version__ = ".".join(str(number) for number in __version_info__[:3])
if __version_info__[3] is not None:
    __version__ = __version__ + " " + str(__version_info__[3])
# Command line interface.  Every "--get-*" switch is a boolean flag; the
# parsed namespace is stored in ``getargs`` and read by the rest of the file.
parser = argparse.ArgumentParser(
    description="get urls of images/videos from motherless.com",
    conflict_handler="resolve",
    add_help=True)
parser.add_argument("url", nargs='*', help='motherless url')
parser.add_argument('-v', '--version', action='version', version=__version__)
# NOTE: both page options use nargs="*", so their values arrive as lists
# (or None when the option is absent); consumers read element [0].
parser.add_argument("--pages-start", nargs="*", help="start at page number")
parser.add_argument("--pages-end", nargs="*", help="end at page number")
parser.add_argument(
    "--update",
    action='store_true',
    help="update this program to latest version. Make sure that you have "
         "sufficient permissions (run with sudo if needed)")
parser.add_argument(
    "--dump-user-agent",
    action='store_true',
    help="display the current browser identification")
parser.add_argument(
    "--user-agent",
    nargs="?",
    default="Mozilla/5.0 (Windows NT 7.0; rv:25.0) Gecko/20100101 Firefox/25.0",
    help="specify a custom user agent")
# The original help text stopped mid-sentence ("use if the video access").
parser.add_argument(
    "--referer",
    nargs="?",
    default="http://motherless.com/",
    help="specify a custom referer, use if the video access is restricted "
         "to one domain")
parser.add_argument(
    "--proxy",
    nargs="?",
    default=None,
    help="Use the specified HTTP/HTTPS proxy")
parser.add_argument(
    "--id",
    action='store_true',
    help="use only video ID in file name")
parser.add_argument(
    "--get-url",
    action='store_true',
    help="simulate, quiet but print URL")
# The original help text was a verbatim copy of --get-url's.
parser.add_argument(
    "--get-pageurl",
    action='store_true',
    help="simulate, quiet but print page URL")
parser.add_argument(
    "--get-title",
    action='store_true',
    help="simulate, quiet but print title")
parser.add_argument(
    "--get-posts",
    action='store_true',
    help="simulate, quiet but print user posts")
parser.add_argument(
    "--get-id",
    action='store_true',
    help="simulate, quiet but print id")
parser.add_argument(
    "--get-thumbnail",
    action='store_true',
    help="simulate, quiet but print thumbnail URL")
parser.add_argument(
    "--get-filename",
    action='store_true',
    help="simulate, quiet but print output filename")
parser.add_argument(
    "--get-format",
    action='store_true',
    help="simulate, quiet but print file format")
parser.add_argument(
    "--get-type",
    action='store_true',
    help="simulate, quiet but print file type")
parser.add_argument(
    "--get-username",
    action='store_true',
    help="simulate, quiet but print uploaders username")
parser.add_argument(
    "--get-bbcode",
    action='store_true',
    help="simulate, quiet but print bbcode")
parser.add_argument(
    "--get-html",
    action='store_true',
    help="simulate, quiet but print html code")
parser.add_argument(
    "--get-dimensions",
    action='store_true',
    help="simulate, quiet but print dimensions (width x height)")
parser.add_argument(
    "--get-width",
    action='store_true',
    help="simulate, quiet but print width")
parser.add_argument(
    "--get-height",
    action='store_true',
    help="simulate, quiet but print height")
parser.add_argument(
    "--get-views",
    action='store_true',
    help="simulate, quiet but print number of views")
parser.add_argument(
    "--get-favorites",
    action='store_true',
    help="simulate, quiet but print number of favorites")
parser.add_argument(
    "--verbose",
    action='store_true',
    help="print various debugging information")
getargs = parser.parse_args()
if getargs.update:
    # Self-update: download the current copy of this script, compare its
    # version and date with the running copy and, when the download is newer,
    # overwrite this file with it.  The process always exits afterwards.
    import zlib
    from distutils.version import LooseVersion as VerCheck
    fakeua = getargs.user_agent
    proxycfg = None
    if getargs.proxy is not None:
        # The update URL is https, so the proxy has to be registered for
        # both schemes; with only "http" it was silently bypassed.
        proxycfg = urllib2.ProxyHandler(
            {"http": getargs.proxy, "https": getargs.proxy})
    geturls_cj = cookielib.CookieJar()
    if proxycfg is None:
        geturls_opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(geturls_cj))
    else:
        geturls_opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(geturls_cj), proxycfg)
    geturls_opener.addheaders = [
        ("Referer", "https://github.com/GameMaker2k/Python-Scripts/"),
        ("User-Agent", fakeua),
        ("Accept-Encoding", "gzip, deflate"),
        ("Accept-Language", "en-US,en;q=0.8,en-CA,en-GB;q=0.6"),
        ("Accept-Charset", "ISO-8859-1,ISO-8859-15,utf-8;q=0.7,*;q=0.7"),
        ("Accept",
         "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        ("Connection", "close")]
    urllib2.install_opener(geturls_opener)
    geturls_text = geturls_opener.open(
        "https://raw.github.com/GameMaker2k/Python-Scripts/master/MiniScripts/motherless-dl.py")
    content_encoding = geturls_text.info().get("Content-Encoding")
    pyfile_text = geturls_text.read()
    if content_encoding == "gzip":
        pyfile_text = gzip.GzipFile(
            fileobj=StringIO.StringIO(pyfile_text)).read()
    elif content_encoding == "deflate":
        # "deflate" bodies are zlib streams (some servers send raw deflate);
        # GzipFile cannot read either, which is what the old code attempted.
        try:
            pyfile_text = zlib.decompress(pyfile_text)
        except zlib.error:
            pyfile_text = zlib.decompress(pyfile_text, -zlib.MAX_WBITS)
    regex_finddate_text = re.escape(
        "__version_date__ = \"") + "([0-9\\.]+)" + re.escape("\"")
    finddate_text = re.findall(regex_finddate_text, pyfile_text)
    # The trailing ";" is optional: this very file declares
    # __version_info__ without one, so requiring it could never match.
    regex_findver_text = (
        re.escape("__version_info__ = (") + "([0-9]+)" +
        re.escape(", ") + "([0-9]+)" +
        re.escape(", ") + "([0-9]+)" +
        re.escape(", \"") + "([A-Z0-9 ]+)" +
        re.escape("\")") + ";?")
    findver_text = re.findall(regex_findver_text, pyfile_text)
    if not finddate_text or not findver_text:
        # Previously this fell through to an IndexError on [0].
        sys.exit("motherless-dl: could not find version information in the "
                 "downloaded script; not updating")
    # The release tag may be None (see the __version__ construction).
    ProVerTag = ""
    if __version_info__[3] is not None:
        ProVerTag = __version_info__[3].replace(" ", "").lower()
    ProVerStr = ".".join(
        str(number) for number in __version_info__[:3]) + ProVerTag
    ProVerCheck = VerCheck(ProVerStr)
    ProDateCheck = VerCheck(__version_date__)
    NewVerStr = ".".join(findver_text[0][:3]) + \
        findver_text[0][3].replace(" ", "").lower()
    NewVerCheck = VerCheck(NewVerStr)
    NewDateCheck = VerCheck(finddate_text[0])
    # Compare version objects on both sides (the old code compared the raw
    # string ProVerStr against a LooseVersion).
    if ProVerCheck < NewVerCheck and ProDateCheck <= NewDateCheck:
        with open(__file__, "w+") as fileopen:
            fileopen.write(pyfile_text)
    # NOTE(review): the source's indentation was lost; the blank line is
    # assumed to be printed on every --update run, just before exiting.
    print()
    sys.exit()
# --dump-user-agent: print the user agent that would be sent, then stop.
if getargs.dump_user_agent:
    print(getargs.user_agent)
    sys.exit()
# Without any URL argument there is nothing to do: show the usage text.
if not getargs.url:
    parser.print_help()
    sys.exit()
# Root every request is made against.
_MTLESS_SITE = "http://motherless.com"
# Any absolute spelling of the site root: http or https, with or without www.
_MTLESS_HOST_RE = "https?" + re.escape("://") + \
    "(?:www\\.)?" + re.escape("motherless.com")
# Captures the path (and query) that follows the site root in a URL.
_MTLESS_PATH_RE = _MTLESS_HOST_RE + re.escape("/") + "([\\w\\/\\?\\&\\=]+)"
# Site prefix as a user may type it, including scheme-less forms.  A single
# pattern avoids the old ordering bug where "motherless.com/" was removed
# before "www.motherless.com/", leaving a stray "www." behind.
_MTLESS_INPUT_PREFIX_RE = "(?:https?" + re.escape("://") + ")?" + \
    "(?:www\\.)?" + re.escape("motherless.com/")
# Pagination links; the first group of the last match is used as page count.
_MTLESS_PAGER_RE = re.escape("class=\"pop\" rel=\"") + "([0-9]+)" + \
    re.escape("\">") + "([0-9]+)" + re.escape("</a>")
# Thumbnail links on listing pages.
_MTLESS_THUMB_LINK_RE = "([\\w\\/]+)" + \
    re.escape("\" class=\"img-container\" target=\"_self\">")
# Links on "V..." pages after their markers were rewritten (see
# _mtless_list_links).
_MTLESS_MARKED_LINK_RE = re.escape("<a href=\"") + "([\\w\\/]+)" + \
    re.escape("\" title=\"motherless link\">")


def _mtless_site_path(url):
    """Return the path following the site root in *url*, or None."""
    found = re.findall(_MTLESS_PATH_RE, url)
    if found:
        return found[0]
    return None


def _mtless_read_page(response):
    """Read *response*, undo its Content-Encoding and drop site prefixes.

    Absolute links to the site are turned into site-relative ones so the
    scraping patterns only have to deal with paths.
    """
    encoding = response.info().get("Content-Encoding")
    body = response.read()
    if encoding == "gzip":
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    elif encoding == "deflate":
        # "deflate" is a zlib stream (or, from some servers, raw deflate);
        # GzipFile, which the old code used here, cannot decode it.
        import zlib
        try:
            body = zlib.decompress(body)
        except zlib.error:
            body = zlib.decompress(body, -zlib.MAX_WBITS)
    return re.sub(_MTLESS_HOST_RE, "", body)


def _mtless_fetch_text(opener, url):
    """Open *url* with *opener* and return the cleaned page text."""
    return _mtless_read_page(opener.open(url))


def _mtless_page_arg(values):
    """Return the page number held in a --pages-* value, or None.

    *values* is None (option absent) or a list of strings; an empty list or
    a non-numeric first element yields None instead of an exception.
    """
    if not values:
        return None
    if not values[0].isdigit():
        return None
    return int(values[0])


def _mtless_page_window(mtlessgetargs, numpages):
    """Return (first, last) page to visit given *numpages* detected pages."""
    first = 1
    start = _mtless_page_arg(mtlessgetargs["pages_start"])
    if start is not None and 1 <= start <= numpages:
        first = start
    end = _mtless_page_arg(mtlessgetargs["pages_end"])
    if end is not None:
        # Clamp instead of overriding: the old code replaced the detected
        # page count with --pages-end even when it was larger.
        numpages = min(numpages, end)
    return first, numpages


def _mtless_page_url(list_path, page, extra_query=""):
    """Build the URL of page *page* of the listing at *list_path*.

    *extra_query* (already starting with "&") is appended after the page
    parameter.  When it is empty, a query already present in *list_path* is
    kept after the page parameter instead of producing a second "?".
    """
    parsed = urlparse.urlparse(list_path)
    url = _MTLESS_SITE + parsed.path + "?page=" + str(page) + extra_query
    if parsed.query and not extra_query:
        url = url + "&" + parsed.query
    return url


def _mtless_list_links(opener, list_path, mtlessgetargs, extra_query="",
                       marked=False):
    """Collect the links found on every requested page of a listing.

    With *marked* true, two link markers in the page are first rewritten to
    a common title attribute and links are matched on that attribute;
    otherwise thumbnail links are matched directly.
    """
    links = []
    page_html = _mtless_fetch_text(
        opener, _mtless_page_url(list_path, 1, extra_query))
    pager = re.findall(_MTLESS_PAGER_RE, page_html)
    numpages = int(pager[-1][0]) if pager else 1
    curpage, lastpage = _mtless_page_window(mtlessgetargs, numpages)
    while curpage <= lastpage:
        if curpage > 1:
            # Page 1 is already in hand from the page-count request.
            page_html = _mtless_fetch_text(
                opener, _mtless_page_url(list_path, curpage, extra_query))
        if marked:
            page_html = page_html.replace(
                "class=\"img-container\" target=\"_self\"",
                "title=\"motherless link\"")
            page_html = page_html.replace(
                "class=\"pop plain\" target=\"_blank\"",
                "title=\"motherless link\"")
            found = re.findall(_MTLESS_MARKED_LINK_RE, page_html)
        else:
            found = re.findall(_MTLESS_THUMB_LINK_RE, page_html)
        for link in found:
            links.append(link)
            if mtlessgetargs["verbose"]:
                print(link)
        curpage = curpage + 1
    return links


def _mtless_is_gallery_index(parts):
    """Tell whether split path *parts* names a page listing galleries."""
    if len(parts) != 4:
        return False
    if parts[1].startswith("galleries"):
        return True
    return (parts[1].startswith(("f", "term")) and
            parts[2].startswith("galleries"))


def _mtless_is_listing(parts):
    """Tell whether split path *parts* names a page listing media items.

    Lengths are tested before deeper segments are indexed, so short paths
    can no longer raise IndexError.
    """
    count = len(parts)
    head = parts[1]
    if count == 2:
        return head.startswith(("G", "H", "V", "live"))
    if count == 3:
        if head.startswith(("g", "u")):
            return True
        if head.startswith("live"):
            return parts[2].startswith(("images", "videos"))
        if head.startswith(("images", "videos")):
            return parts[2].startswith(
                ("favorited", "viewed", "commented", "popular"))
        return False
    if count == 4:
        if head.startswith("term") and \
                parts[2].startswith(("videos", "images")):
            return True
        return head.startswith("f") and \
            parts[3].startswith(("videos", "images"))
    return False


def _mtless_is_media(parts):
    """Tell whether split path *parts* names a single media page."""
    count = len(parts)
    head = parts[1]
    if count == 2:
        return re.match("([0-9A-F]+)", head) is not None
    if count == 3:
        return (head.startswith("G") and
                re.match("([0-9A-F]+)", parts[2]) is not None)
    if count == 4:
        return head.startswith("g")
    return False


def _mtless_expand_argument(opener, mlessvid, mtlessgetargs):
    """Turn one command line URL into a list of site paths to process."""
    verbose = mtlessgetargs["verbose"]
    # hostname is None for scheme-less input; re.match() needs a string.
    hostname = urlparse.urlparse(mlessvid).hostname or ""
    if re.match("^s([0-9]+)" + re.escape(".motherlessmedia.com"), hostname):
        # Media-server URL: ask the site to redirect to the owning page.
        redirected = opener.open(
            "http://motherless.com/mogile_api.php?path=" +
            urllib.quote_plus(mlessvid) +
            "&redirect=1")
        mlessvid = redirected.geturl()
        site_path = _mtless_site_path(mlessvid)
        if site_path:
            mlessvid = "http://motherless.com/" + site_path
        if verbose:
            print(mlessvid)
    hostname = urlparse.urlparse(mlessvid).hostname or ""
    if re.match("^" + re.escape("thumbs.motherlessmedia.com"), hostname):
        # Thumbnail URL: the third path segment carries the media id.
        mlessvid = mlessvid.replace("-zoom", "").replace("-strip", "")
        mlessvid = "http://motherless.com/" + \
            urlparse.urlparse(mlessvid).path.split("/")[2]
        site_path = _mtless_site_path(mlessvid)
        if site_path:
            mlessvid = "http://motherless.com/" + site_path
    # Normalise whatever was given to a path starting with "/".
    mlessvid = re.sub(_MTLESS_INPUT_PREFIX_RE, "", mlessvid)
    mlessvid = re.sub("^" + re.escape("/"), "", mlessvid)
    mlessvid = "http://motherless.com/" + mlessvid
    site_path = _mtless_site_path(mlessvid)
    if site_path:
        mlessvid = "/" + site_path
    parts = urlparse.urlparse(mlessvid).path.split("/")
    mlessgallist = []
    if parts[1].startswith("random") and (
            len(parts) == 2 or
            (len(parts) == 3 and parts[2].startswith(("image", "video")))):
        # "random" paths redirect; continue with the page they land on.
        redirected = opener.open(_MTLESS_SITE + mlessvid)
        mlessvid = redirected.geturl()
        site_path = _mtless_site_path(mlessvid)
        if site_path:
            mlessvid = site_path
        if verbose:
            print(mlessvid)
    if _mtless_is_gallery_index(parts):
        mlessgallist.extend(
            _mtless_list_links(opener, mlessvid, mtlessgetargs))
    # As in the original, this test uses the path as it was before any
    # "random" redirect.
    if (not parts[1].startswith("galleries") or
            len(parts) < 4 or len(parts) > 5):
        mlessgallist.append(mlessvid)
    return mlessgallist


def _mtless_expand_listing(opener, mlessvid, mtlessgetargs):
    """Return the media page paths reachable from site path *mlessvid*."""
    if not mlessvid.startswith("/"):
        mlessvid = "/" + mlessvid
    mlessvidqstr = urlparse.parse_qs(urlparse.urlparse(mlessvid).query)
    parts = urlparse.urlparse(mlessvid).path.split("/")
    mlessurllist = []
    if _mtless_is_listing(parts):
        tvaradd = ""
        if parts[1].startswith("u") and len(parts) == 3:
            # Carry a "t" filter of "i" or "v" over to every page request.
            tvalue = mlessvidqstr.get("t", [""])[0]
            if tvalue == "i" or tvalue == "v":
                tvaradd = "&t=" + tvalue
        mlessurllist.extend(_mtless_list_links(
            opener, mlessvid, mtlessgetargs, tvaradd,
            marked=parts[1].startswith("V")))
    if _mtless_is_media(parts):
        mlessurllist.append(mlessvid)
    return mlessurllist


def _mtless_scrape_posts(subout_text):
    """Return the comments found in a media page as a list of dicts."""
    regex_postdata = (
        re.escape("<div class=\"media-comment-contents\">") + "\n\t+" +
        re.escape("<h4>") + "\n\t+" +
        re.escape("<a href=\"/m/") + "([\\w]+)" +
        re.escape("\" class=\"pop plain\" target=\"_blank\">") +
        "\n\t+([^\t]+)\t+" +
        re.escape("</a>") + "\n\t+" +
        re.escape("</h4>") + "\n\t+" +
        re.escape("<div class=\"media-comment-meta\">") +
        "\n\t+([^\t]+)\t+" +
        re.escape("</div>") + "\n\t+" +
        re.escape("<div style=\"text-align: justify;\">") +
        "\n\t+([^\t]+)\t+" +
        re.escape("</div>"))
    # Tag stripper, from Amber @ http://stackoverflow.com/a/9662362
    tag_re = re.compile(r'<[^>]+>')
    mlesspostlist = []
    for postdata in re.findall(regex_postdata, subout_text):
        newpostext = postdata[3]
        for linebreak in ("<br>", "<br/>", "<br />"):
            newpostext = newpostext.replace(linebreak, "\n")
        newpostext = tag_re.sub('', newpostext)
        # Site-relative links in the text become absolute again.
        newpostext = re.sub(
            re.escape("/") + "([\\w\\/]+)",
            r"http://motherless.com/\1",
            newpostext)
        mlesspostlist.append({
            "username": postdata[0],
            "avatar": "http://avatars.motherlessmedia.com/avatars/member/" +
            postdata[0] + ".jpg",
            "smallavatar":
            "http://avatars.motherlessmedia.com/avatars/member/" +
            postdata[0] + "-small.jpg",
            "post": newpostext})
    return mlesspostlist


def _mtless_scrape_item(opener, item_path, use_id, verbose):
    """Scrape one media page and return its description dict.

    Returns None when the page request fails with an HTTP error or when the
    page carries no file URL.  Other missing page elements raise IndexError,
    as before.
    """
    try:
        response = opener.open(_MTLESS_SITE + item_path)
    except urllib2.HTTPError:
        return None
    subout_text = _mtless_read_page(response)
    title_text = re.findall(
        re.escape("<title>") + "(.*)" + re.escape("</title>"), subout_text)
    mlesstitle = re.sub(re.escape(" - MOTHERLESS.COM"), "", title_text[0])
    thumb_text = re.findall(
        re.escape("src=&quot;") + "(.*)" + re.escape("&quot;"), subout_text)
    mlessthumb = thumb_text[0]
    post_text = re.findall(
        re.escape("__fileurl = '") + "(.*)" + re.escape("';"), subout_text)
    img_text = re.findall(
        re.escape("<meta property=\"og:image\" content=\"") + "(.*)" +
        re.escape("\">"), subout_text)
    mlessimg = img_text[0]
    mediatype_text = re.findall(
        re.escape("__mediatype = '") + "(.*)" + re.escape("',"), subout_text)
    usrname_text = re.findall(
        re.escape("<a href=\"/u/") + "([\\w]+)" + re.escape(
            "\" class=\"pop plain thumb-member-link-uploads\">Uploads</a>"),
        subout_text)
    mlessusrname = usrname_text[0]
    mlessid = re.sub("^" + re.escape("/"), "", item_path)
    mlesspurl = _MTLESS_SITE + item_path
    numviews_text = re.findall(
        re.escape("<strong>Views</strong>") + "\n+\t+([^\t]+)\t+" +
        re.escape("</h2>"), subout_text)
    mlessnumviews = numviews_text[0].replace(",", "")
    numfavs_text = re.findall(
        re.escape("<strong>Favorited</strong>") + "\n+\t+([^\t]+)\t+" +
        re.escape("</h2>"), subout_text)
    mlessnumfavs = numfavs_text[0].replace(",", "")
    servsecs_text = re.findall(
        re.escape("Served by web") + "([0-9]+)" + re.escape(" in ") +
        "([0-9\\.]+)" + re.escape(" seconds"), subout_text)
    servname = "web" + servsecs_text[0][0]
    servsecs = float(servsecs_text[0][1])
    mlesspostlist = _mtless_scrape_posts(subout_text)
    if not post_text:
        # The old test was "post_text > 0", a list/int comparison that is
        # always true on Python 2, so this case crashed on post_text[0].
        return None
    mlesslink = post_text[0]
    mlessext = os.path.splitext(urlparse.urlparse(mlesslink).path)[1]
    mlessext = mlessext.replace(".", "").lower()
    if mtlessgetargs_id_requested(use_id):
        mlessfname = mlessid.replace("/", "_") + "." + mlessext
    else:
        mlessfname = urlparse.urlsplit(mlesslink).path.split("/")[-1]
    is_video = mlessext == "mp4" or mlessext == "flv"
    if is_video:
        mlesslink = mlesslink + "?start=0"
        post_vi_file = re.findall(
            re.escape("\"file\" : \"") + "(.*)" + re.escape("\","),
            subout_text)
        post_vi_image = re.findall(
            re.escape("\"image\" : \"") + "(.*)" + re.escape("\","),
            subout_text)
        post_vi_height = re.findall(
            re.escape("\"height\" : ") + "([0-9]+)" + re.escape(","),
            subout_text)
        post_vi_width = re.findall(
            re.escape("\"width\" : ") + "([0-9]+)" + re.escape(","),
            subout_text)
        post_vi_filethumb = re.findall(
            re.escape("\"file\": ") + "(.*)" + re.escape(","), subout_text)
        post_vi_kind = re.findall(
            re.escape("\"kind\": \"") + "(.*)" + re.escape("\""),
            subout_text)
        mediainfo = {
            "file": post_vi_file[0],
            "image": post_vi_image[0],
            "width": int(post_vi_width[0]),
            "height": int(post_vi_height[0]),
            "views": int(mlessnumviews),
            "favorites": int(mlessnumfavs),
            "filethumb": post_vi_filethumb[0],
            "thumbstrip": "http://thumbs.motherlessmedia.com/thumbs/" +
            mlessid + "-strip.jpg",
            "kind": post_vi_kind[0]}
    else:
        post_ii_dimensions = re.findall(
            re.escape("style=\"width: ") + "([0-9]+)" +
            re.escape("px; height: ") + "([0-9]+)" +
            re.escape("px; border: none;\""), subout_text)
        # Group 1 is the width and group 2 the height; the old code stored
        # them the other way round.
        mediainfo = {
            "width": int(post_ii_dimensions[0][0]),
            "height": int(post_ii_dimensions[0][1]),
            "views": int(mlessnumviews),
            "favorites": int(mlessnumfavs)}
    if verbose:
        print(mlesslink)
    return {
        "id": mlessid,
        "title": mlesstitle,
        "format": mlessext,
        "filename": mlessfname,
        "thumbnail": mlessthumb,
        "servername": servname,
        "servingtime": servsecs,
        "mediatype": mediatype_text[0],
        # Videos expose their preview image, images expose the file itself.
        "vidpic": mlessimg if is_video else mlesslink,
        "type": "video" if is_video else "image",
        "info": mediainfo,
        "dimensions": str(mediainfo["width"]) + "x" +
        str(mediainfo["height"]),
        "width": mediainfo["width"],
        "height": mediainfo["height"],
        "views": mediainfo["views"],
        "favorites": mediainfo["favorites"],
        "username": mlessusrname,
        "avatar": "http://avatars.motherlessmedia.com/avatars/member/" +
        mlessusrname + ".jpg",
        "smallavatar": "http://avatars.motherlessmedia.com/avatars/member/" +
        mlessusrname + "-small.jpg",
        "posts": mlesspostlist,
        "pageurl": mlesspurl,
        "url": mlesslink}


def mtlessgetargs_id_requested(use_id):
    """Return True when the "id" option asks for id-based file names."""
    return bool(use_id)


def motherless_dl(mtlessgetargs=vars(getargs)):
    """Resolve motherless.com URLs into descriptions of their media files.

    Parameters:
        mtlessgetargs: option dictionary; defaults to the parsed command
            line.  Keys read: "url" (list of URLs), "user_agent", "referer",
            "proxy", "id", "verbose", "pages_start" and "pages_end" (each a
            list of strings or None).

    Returns:
        A list with one dict per media page found across *all* given URLs
        (an empty list when nothing was found).  Each dict holds the keys
        "id", "title", "format", "filename", "thumbnail", "servername",
        "servingtime", "mediatype", "vidpic", "type", "info", "dimensions",
        "width", "height", "views", "favorites", "username", "avatar",
        "smallavatar", "posts", "pageurl" and "url".

    Raises:
        urllib2.URLError for failed listing requests and IndexError when a
        media page lacks an element the scraper expects.  Media pages that
        answer with an HTTP error are skipped.

    Side effect: the opener built here is installed globally through
    urllib2.install_opener().
    """
    fakeua = mtlessgetargs["user_agent"]
    handlers = [urllib2.HTTPCookieProcessor(cookielib.CookieJar())]
    if mtlessgetargs["proxy"] is not None:
        handlers.append(
            urllib2.ProxyHandler({"http": mtlessgetargs["proxy"]}))
    geturls_opener = urllib2.build_opener(*handlers)
    geturls_opener.addheaders = [
        ("Referer", mtlessgetargs["referer"]),
        ("User-Agent", fakeua),
        ("Accept-Encoding", "gzip, deflate"),
        ("Accept-Language", "en-US,en;q=0.8,en-CA,en-GB;q=0.6"),
        ("Accept-Charset", "ISO-8859-1,ISO-8859-15,utf-8;q=0.7,*;q=0.7"),
        ("Accept",
         "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        ("Connection", "close")]
    urllib2.install_opener(geturls_opener)
    # Pauses (in seconds) between listings and between media pages.
    per_gal_sleep = 0
    per_url_sleep = 0
    # Created once so results accumulate over every URL and listing; it used
    # to be re-created per listing, which dropped all but the last one and
    # left the name undefined when nothing was expanded.
    mlessoutlist = []
    for mlessarg in mtlessgetargs["url"]:
        mlessgallist = _mtless_expand_argument(
            geturls_opener, mlessarg, mtlessgetargs)
        lastgal = len(mlessgallist) - 1
        for galindex, mlessgal in enumerate(mlessgallist):
            mlessurllist = _mtless_expand_listing(
                geturls_opener, mlessgal, mtlessgetargs)
            lasturl = len(mlessurllist) - 1
            for urlindex, mlessitem in enumerate(mlessurllist):
                mlesslistitms = _mtless_scrape_item(
                    geturls_opener, mlessitem,
                    mtlessgetargs["id"], mtlessgetargs["verbose"])
                if mlesslistitms is None:
                    continue
                mlessoutlist.append(mlesslistitms)
                if urlindex < lasturl:
                    time.sleep(per_url_sleep)
            if galindex < lastgal:
                time.sleep(per_gal_sleep)
    return mlessoutlist
if __name__ == "__main__":
    # Resolve every URL from the command line, then print the requested
    # fields of each result, one field per line.
    mtlesslinks = motherless_dl()
    # Any of these flags replaces the default output (the media URL).
    mtlessfieldflags = (
        getargs.get_id, getargs.get_title, getargs.get_posts,
        getargs.get_format, getargs.get_filename, getargs.get_thumbnail,
        getargs.get_username, getargs.get_pageurl, getargs.get_bbcode,
        getargs.get_html, getargs.get_dimensions, getargs.get_width,
        getargs.get_height, getargs.get_views, getargs.get_favorites,
        getargs.get_type)
    for mtlessentry in mtlesslinks:
        if getargs.get_id:
            print(mtlessentry["id"])
        if getargs.get_title:
            print(mtlessentry["title"])
        if getargs.get_posts:
            for mtlesspost in mtlessentry["posts"]:
                print(mtlesspost["username"] + ": " + mtlesspost["post"])
        if getargs.get_format:
            print(mtlessentry["format"])
        if getargs.get_type:
            print(mtlessentry["type"])
        if getargs.get_filename:
            print(mtlessentry["filename"])
        if getargs.get_thumbnail:
            print(mtlessentry["thumbnail"])
            # NOTE(review): nesting inferred (the source's indentation was
            # lost); videos are assumed to print their preview image only
            # together with the thumbnail.
            if mtlessentry["format"] in ("mp4", "flv"):
                print(mtlessentry["vidpic"])
        if getargs.get_username:
            print(mtlessentry["username"])
        if getargs.get_pageurl:
            print(mtlessentry["pageurl"])
        if getargs.get_bbcode:
            print("[URL=" + mtlessentry["pageurl"] + "][IMG]" +
                  mtlessentry["thumbnail"] + "[/IMG][/URL]")
        if getargs.get_html:
            print("<a href=\"" + mtlessentry["pageurl"] +
                  "\"><img src=\"" + mtlessentry["thumbnail"] + "\"></a>")
        if getargs.get_dimensions:
            print(mtlessentry["dimensions"])
        if getargs.get_width:
            print(str(mtlessentry["width"]))
        if getargs.get_height:
            print(str(mtlessentry["height"]))
        if getargs.get_views:
            print(mtlessentry["views"])
        if getargs.get_favorites:
            print(mtlessentry["favorites"])
        # The media URL is printed on request, or when no field was asked for.
        if getargs.get_url or not any(mtlessfieldflags):
            print(mtlessentry["url"])