modified: src1/input.c
[GalaxyCodeBases.git] / python / etc / giga.py
blob e3465584162f85aa51203a9c42cbf153ac2e0535
#!/usr/bin/env python3
"""Collect GigaScience article metadata into a SQLite database.

For every article row in ``giga.tsv.bz2`` the script downloads the article
page from academic.oup.com (looked up by DOI), extracts the JSON-LD author
list, the reference-list markup and the issue section, and inserts one row
into the ``PubDat`` table of ``giga.authors.sqlite``.
"""

import os
import sys
import bz2
import csv
import re
import json
import sqlite3
import urllib.request
import pprint

myUA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0'

DB_PATH = 'giga.authors.sqlite'
TSV_PATH = 'giga.tsv.bz2'
LOOKUP_URL = 'https://academic.oup.com/gigascience/article-lookup/doi/'
REFERER = 'https://academic.oup.com/gigascience/'

# Columns of the TSV that the script reads.
TITLE_COLUMN = 0
DOI_COLUMN = 16

# Literal markers searched for in the downloaded page, one line at a time.
JSONLD_MARKER = '<script type="application/ld+json">'
REFLIST_MARKER = '<div class="ref-list">'
SECTION_MARKER = 'Issue Section'
# Text of a link, taken from the line that follows SECTION_MARKER.
SECTION_LINK_RE = re.compile(r'>([^<>]+)</a>')

CREATE_SQL = 'CREATE TABLE PubDat (DOI TEXT, Title TEXT, Type TEXT, Authors TEXT, RefList TEXT)'
INSERT_SQL = 'INSERT INTO PubDat ( DOI,Title,Type,Authors,RefList ) VALUES ( ?,?,?,?,? )'


def extract_article_fields(htm):
    """Extract author, reference-list and section data from an article page.

    The page is scanned line by line; some fields live on the line *after*
    the marker that announces them, so every line is examined together with
    its successor.

    Args:
        htm: Full text of the article page.

    Returns:
        A dict with the keys ``'strAuthors'`` (JSON text of the JSON-LD
        ``author`` entry), ``'reflist'`` (the stripped line containing the
        reference-list ``div``) and ``'tocSections'`` (link text following
        the "Issue Section" marker). A field that is not found, or whose
        JSON cannot be used, keeps the placeholder ``'NA'``.
    """
    data = {'strAuthors': 'NA', 'reflist': 'NA', 'tocSections': 'NA'}
    lines = htm.split('\n')
    # Pair each line with the next one (the last line is paired with '').
    # Consuming the lines two at a time instead would only ever test every
    # other line for a marker and would drop the last line of an odd-length
    # page entirely.
    for line, secline in zip(lines, lines[1:] + ['']):
        if JSONLD_MARKER in line:
            try:
                data['strAuthors'] = json.dumps(json.loads(secline)['author'])
            except (ValueError, KeyError, TypeError) as err:
                # Keep the placeholder rather than aborting the whole run
                # because of one unparsable or author-less JSON-LD block.
                print('warning: unusable JSON-LD block: %r' % (err,),
                      file=sys.stderr)
        if REFLIST_MARKER in line:
            data['reflist'] = line.strip('\t\r \n')
        if SECTION_MARKER in line:
            m = SECTION_LINK_RE.search(secline)
            if m:
                data['tocSections'] = m.group(1)
    return data


def fetch_article_html(doi):
    """Download the article page looked up by *doi* and return it as text.

    Errors raised by ``urllib.request.urlopen`` and by the UTF-8 decoding
    are not caught here.
    """
    req = urllib.request.Request(LOOKUP_URL + doi)
    req.add_header('Referer', REFERER)
    req.add_header('User-Agent', myUA)
    with urllib.request.urlopen(req) as r:
        return r.read().decode('utf-8')


def main():
    """Read the article TSV, scrape every article and fill the database."""
    conn = sqlite3.connect(DB_PATH)
    try:
        # No "IF NOT EXISTS": running against a database that already has
        # the table fails here instead of appending duplicate rows.
        conn.execute(CREATE_SQL)
        # newline='' lets the csv module do its own line-ending handling.
        with bz2.open(TSV_PATH, 'rt', encoding='utf-8', newline='') as tsvin:
            for row in csv.reader(tsvin, delimiter='\t'):
                # Test for emptiness first: indexing an empty row raises.
                if not row or row[TITLE_COLUMN] == 'Title':
                    continue
                print((len(row), row[TITLE_COLUMN]))
                if len(row) <= DOI_COLUMN:
                    print('warning: row has no DOI column, skipped: %r'
                          % (row[TITLE_COLUMN],), file=sys.stderr)
                    continue
                doi = row[DOI_COLUMN]
                data = extract_article_fields(fetch_article_html(doi))
                conn.execute(
                    INSERT_SQL,
                    (doi, row[TITLE_COLUMN], data['tocSections'],
                     data['strAuthors'], data['reflist']),
                )
                # Commit per article so an interrupted run keeps the rows
                # that were already fetched.
                conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    main()