import bz2
import csv
import json
import os
import re
import sqlite3
import sys
import urllib.request
# Scrape per-article metadata from the GigaScience site into SQLite.
#
# For every data row of giga.tsv.bz2 (tab-separated), the value in column
# DOI_COLUMN is appended to the article-lookup URL, the resulting page is
# downloaded, and the author list, reference list and issue section found in
# the HTML are stored in the PubDat table of giga.authors.sqlite.

# User-Agent header value sent with every request.
myUA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0'

# Index of the TSV column that is appended to the lookup URL and stored as DOI.
DOI_COLUMN = 16

# Literal markers looked for in the page, one HTML source line at a time.
LD_JSON_MARKER = '<script type="application/ld+json">'
REF_LIST_MARKER = '<div class="ref-list">'
ISSUE_SECTION_MARKER = 'Issue Section'
# Captures the text immediately preceding a closing </a> tag.
ANCHOR_TEXT_RE = re.compile(r'>([^<>]+)</a>')

conn = sqlite3.connect('giga.authors.sqlite')
conn.execute('CREATE TABLE PubDat (DOI TEXT, Title TEXT, Type TEXT, Authors TEXT, RefList TEXT)')


def _extract_article_data(htm):
    """Extract author, reference-list and issue-section data from page HTML.

    The page is scanned line by line; some values live on the line that
    follows their marker, so every line is examined together with its
    successor.  When a marker occurs more than once, the last usable
    occurrence wins.

    Args:
        htm: Decoded HTML text of one article page.

    Returns:
        A dict with the keys ``'strAuthors'`` (JSON-encoded ``author`` entry
        of the JSON-LD object), ``'reflist'`` (the stripped line containing
        the reference-list ``div``) and ``'tocSections'`` (anchor text on the
        line after ``Issue Section``).  Any value that cannot be found is the
        string ``'NA'``.
    """
    data = {'strAuthors': 'NA', 'reflist': 'NA', 'tocSections': 'NA'}
    lines = htm.split('\n')
    # Overlapping (line, next line) pairs; the final line is paired with ''
    # so that it is still inspected as `line`.
    for line, secline in zip(lines, lines[1:] + ['']):
        if LD_JSON_MARKER in line:
            # The JSON-LD payload is expected on the line after the tag.
            try:
                datAuthors = json.loads(secline)
            except ValueError:
                datAuthors = None
            if isinstance(datAuthors, dict) and 'author' in datAuthors:
                data['strAuthors'] = json.dumps(datAuthors['author'])
        if REF_LIST_MARKER in line:
            data['reflist'] = line.strip('\t\r \n')
        if ISSUE_SECTION_MARKER in line:
            m = ANCHOR_TEXT_RE.search(secline)
            if m:
                data['tocSections'] = m.group(1)
    return data


with bz2.open('giga.tsv.bz2', 'rt', encoding='utf-8') as tsvin:
    reader = csv.reader(tsvin, delimiter='\t')
    for row in reader:
        # Skip blank rows and the header row; emptiness is tested first so
        # that row[0] is never evaluated on an empty row.
        if not row or row[0] == 'Title':
            continue
        print((len(row), row[0]))
        theurl = 'https://academic.oup.com/gigascience/article-lookup/doi/' + row[DOI_COLUMN]
        req = urllib.request.Request(theurl)
        req.add_header('Referer', 'https://academic.oup.com/gigascience/')
        req.add_header('User-Agent', myUA)
        with urllib.request.urlopen(req) as r:
            htm = r.read().decode('utf-8')
        data = _extract_article_data(htm)
        conn.execute(
            'INSERT INTO PubDat ( DOI,Title,Type,Authors,RefList ) VALUES ( ?,?,?,?,? )',
            (row[DOI_COLUMN], row[0], data['tocSections'], data['strAuthors'], data['reflist']),
        )
        # Commit per row so already-scraped articles survive a later failure.
        conn.commit()