From 4953134f5a52df03e76e9496801734c25b1199f7 Mon Sep 17 00:00:00 2001 From: ECHibiki Date: Tue, 2 Apr 2019 23:18:22 -0400 Subject: [PATCH] Handle 404's on --raw --- py-cmd/regexscraper.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/py-cmd/regexscraper.py b/py-cmd/regexscraper.py index 1fc9e5b..0c7139f 100644 --- a/py-cmd/regexscraper.py +++ b/py-cmd/regexscraper.py @@ -6,6 +6,7 @@ import re import string import json import requests +import traceback browser = None nojs = False @@ -48,8 +49,14 @@ def raw_scrape_pages(site_list, pattern): site_list_arr = site_list.split(',') matches = [] for site in site_list_arr: - page_text = requests.get(site).text - matches = matches + re.findall(pattern, page_text) + try: + page_text = requests.get(site, timeout=15.00).text + matches = matches + re.findall(pattern, page_text) + except Exception as err: + print(site + " " + "Not found") + with open("err_log.txt", "a+") as log: + log.write(str(traceback.format_exc()) + "\n\n") + log.close return matches if __name__ == "__main__": -- 2.11.4.GIT