modified: Makefile
[GalaxyCodeBases.git] / perl / etc / justonce / giga.pl
blobc21640c2da2e3ce8c31c9eddc21a4c37f1f1a732
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use LWP;
5 use LWP::UserAgent;
6 use JSON qw( decode_json );
7 use IO::Handle;
8 use Data::Dumper;
9 use DBI;
# --- Database setup --------------------------------------------------------
# Results go into a local SQLite file. AutoCommit is off, so every change
# must be committed explicitly; errors are printed by DBI (PrintError) and
# turned into fatal errors by the `or die` checks below.
my $dbh = DBI->connect(
    'dbi:SQLite:dbname=giga.authors.sqlite',
    '',
    '',
    {
        RaiseError => 0,
        PrintError => 1,
        AutoCommit => 0,
    },
) or die $DBI::errstr;

# A plain CREATE TABLE is used on purpose instead of CREATE TABLE IF NOT
# EXISTS: the script does not yet check for entries fetched by a previous
# run, so the statement fails (and the script stops) when the table is
# already there.
my $create_table_sql = 'CREATE TABLE PubDat (DOI TEXT, Title TEXT, Type TEXT, Authors TEXT, RefList TEXT)';
$dbh->do($create_table_sql) or die $dbh->errstr;
$dbh->commit;

# Prepared once, executed once per successfully fetched article.
my $insert_sql = 'INSERT INTO PubDat ( DOI,Title,Type,Authors,RefList ) VALUES ( ?,?,?,?,? )';
my $sthi = $dbh->prepare($insert_sql);

# --- HTTP client -----------------------------------------------------------
# Requests are sent with a desktop Firefox User-Agent string.
my $ua = LWP::UserAgent->new(
    agent => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0',
);
# openfile($filename)
#
# Open $filename for reading and return the filehandle. Files whose name
# ends in ".xz", ".gz" or ".bz2" are read through the matching external
# decompressor ("xz -dc", "gzip -dc", "bzip2 -dc"); anything else is opened
# directly. Dies with "Error opening <file>: <reason>" if the open fails.
sub openfile($) {
    my ($filename) = @_;

    # Suffix => decompressor program. The dot is escaped so that only a real
    # extension matches (a name such as "fooxz" is a plain file).
    my @decompressors = (
        [ qr/\.xz\z/,  'xz'    ],
        [ qr/\.gz\z/,  'gzip'  ],
        [ qr/\.bz2\z/, 'bzip2' ],
    );

    for my $entry (@decompressors) {
        my ($suffix_re, $program) = @$entry;
        next unless $filename =~ $suffix_re;

        # List-form pipe open: the filename is handed to the program as a
        # single argument and never passes through a shell, so spaces and
        # shell metacharacters in the name are harmless. "--" stops option
        # parsing for names that start with a dash.
        open(my $pipe, '-|', $program, '-dc', '--', $filename)
            or die "Error opening $filename: $!\n";
        return $pipe;
    }

    open(my $infile, '<', $filename) or die "Error opening $filename: $!\n";
    return $infile;
}
# getthings($cnt)
#
# Extract three pieces of information from the text of an article page:
#   [0] the issue section: text of the first <a>...</a> on the line that
#       follows "Issue Section:" ('' if not found);
#   [1] the decoded JSON taken from the line right after the
#       <script type="application/ld+json"> line (empty hash ref if the
#       block is missing, unparsable, or not a JSON object);
#   [2] the whole line containing <div class="ref-list">, without its
#       trailing newline ('' if not found).
# Scanning stops at the "Issue Section:" line, so anything after it is
# ignored. Always returns an array ref [$section, $json_hashref, $reflist]
# and never dies on bad input.
sub getthings($) {
    my ($cnt) = @_;
    $cnt = '' unless defined $cnt;

    my ($tocSections, $reflist, $jsonData) = ('', '', undef);
    my @lines = split(/^/m, $cnt);

    for my $i (0 .. $#lines) {
        my $line = $lines[$i];
        # The line after the current one, or undef on the last line.
        my $next = $i < $#lines ? $lines[$i + 1] : undef;

        if ($line =~ m{<script type="application/ld\+json">}) {
            $jsonData = $next;
        }
        if ($line =~ /<div class="ref-list">/) {
            $reflist = $line;
            chomp($reflist);
        }
        if ($line =~ /Issue Section:/) {
            # Only trust $1 when this very match succeeded.
            if (defined $next and $next =~ />([^<>]+)<\/a>/) {
                $tocSections = $1;
            }
            last;
        }
    }

    # Callers index the result as a hash, so fall back to an empty hash
    # instead of dying when the metadata is absent or broken.
    my $decoded_json = {};
    if (defined $jsonData) {
        my $parsed = eval { decode_json($jsonData) };
        if (ref $parsed eq 'HASH') {
            $decoded_json = $parsed;
        }
        else {
            my $why = $@ ? $@ : "not a JSON object\n";
            warn "getthings: ignoring JSON-LD block: $why";
        }
    }

    return [$tocSections, $decoded_json, $reflist];
}
# fetchURL($URL, $times)
#
# GET $URL with the shared user agent, trying at most $times times.
# On the first successful response the page content is handed to
# getthings() and its result is returned. After each failed attempt the
# HTTP status line is printed and, unless it was the last attempt, the
# function sleeps for as many seconds as attempts made so far.
# If every attempt fails the return value is ["\n", $status_line]; the
# leading "\n" is the marker callers test for to recognise an error.
sub fetchURL($$) {
    my ($URL, $times) = @_;
    my $req = HTTP::Request->new(GET => $URL);

    # Returned as-is when $times < 1, so callers always get an array ref.
    my $ret = ["\n", 'No request attempted'];

    for my $i (1 .. $times) {
        my $res = $ua->request($req);
        return getthings($res->content) if $res->is_success;

        print $res->status_line, " <<<--- $i of $times\n";
        $ret = ["\n", $res->status_line];

        # Back off before retrying; waiting after the final failure would
        # only delay the caller.
        sleep $i if $i < $times;
    }
    return $ret;
}
# --- Main loop -------------------------------------------------------------
# Read the tab-separated article list (title in column 0, DOI in column 16),
# look every DOI up on the GigaScience site, and record the result both in
# an INI-style text file and in the PubDat table.
my $fh = openfile('giga.tsv.bz2');

# `or` (not `||`) so that the check applies to open() itself rather than to
# the filename string.
open(my $out, '>', 'giga.authors.ini') or die("[x]Cannot Open Output File: $!");
binmode($out, ":utf8");

scalar <$fh>;    # skip the header line

while (my $row = <$fh>) {
    chomp $row;
    my @dat = split /\t/, $row;

    # split drops trailing empty fields, so short rows leave these undefined.
    my $title = defined $dat[0]  ? $dat[0]  : '';
    my $doi   = defined $dat[16] ? $dat[16] : '';
    print join(" | ", $title, $doi), "\n";

    # A usable DOI always contains a slash.
    unless ($doi =~ m{/}) {
        print {$out} "[$doi]\nTitle=\"$title\"\nType=\"Missing or Misformatted DOI !\"\n\n";
        next;
    }

    my $url = 'https://academic.oup.com/gigascience/article-lookup/doi/' . $doi;
    my $ret = fetchURL($url, 5);

    # $ret->[0] is the issue section; '' means the page had none and "\n"
    # marks a failed download whose status line is in $ret->[1].
    my $type = defined $ret->[0] ? $ret->[0] : '';
    if ($type eq '') {
        print {$out} "[$doi]\nTitle=\"$title\"\nType=\"Wrong DOI !\"\n\n";
        next;
    }
    elsif ($type eq "\n") {
        print {$out} "[$doi]\nTitle=\"$title\"\nType=\"Error: $ret->[1]\"\n\n";
        next;
    }

    print {$out} "[$doi]\nTitle=\"$title\"\nType=\"$type\"\nAuthors={\n";

    # Accept a list of authors, a single author object, or nothing at all.
    my $meta    = ref $ret->[1] eq 'HASH' ? $ret->[1] : {};
    my $authors = $meta->{'author'};
    my @author_list = ref $authors eq 'ARRAY' ? @$authors
                    : ref $authors eq 'HASH'  ? ($authors)
                    :                           ();

    my $AuthorStr = '';
    for my $author (@author_list) {
        my $name        = defined $author->{'name'}        ? $author->{'name'}        : '';
        my $affiliation = defined $author->{'affiliation'} ? $author->{'affiliation'} : '';
        # Yields: "name"="affiliation"\n
        my $entry = join('"', '', $name, "=", $affiliation, "\n");
        print {$out} "\t", $entry;
        $AuthorStr .= $entry;
    }

    my $reflist = defined $ret->[2] ? $ret->[2] : '';
    print {$out} "}\nRefList={" . $reflist . "}\n\n";
    $out->flush();

    # One commit per article, so an interrupted run keeps what it has.
    $sthi->execute($doi, $title, $type, $AuthorStr, $reflist) or die $sthi->errstr;
    $dbh->commit or die $dbh->errstr;
}

close $out or die "[x]Cannot close giga.authors.ini: $!";
close $fh  or warn "[!]Problem closing giga.tsv.bz2 (decompressor status $?): $!\n";
$dbh->disconnect;
115 __END__
119 [10.1186/s13742-015-0066-5]
120 Title="The ocean sampling day consortium"
121 Type="Commentary"
122 Authors={
123 "Kopf, Anna"="1 Max Planck Institute for Marine Microbiology, Celsiusstrasse 1, D-28359Bremen, Germany 2 Jacobs University Bremen gGmbH, Campus Ring 1, D-28759 Bremen, Germany"
124 "Bicak, Mesude"="3 University of Oxford, 7 Keble Road, OX1 3QG Oxford, Oxfordshire, UK"
125 "Kottmann, Renzo"="1 Max Planck Institute for Marine Microbiology, Celsiusstrasse 1, D-28359Bremen, Germany"
126 "Schnetzer, Julia"="1 Max Planck Institute for Marine Microbiology, Celsiusstrasse 1, D-28359Bremen, Germany 2 Jacobs University Bremen gGmbH, Campus Ring 1, D-28759 Bremen, Germany"
127 "Øvreås, Lise"="26 Department of Biology, University of Bergen, Thormøhlensgate 53 B, 5020 Bergen, Norway"
128 "Glöckner, Frank Oliver"="1 Max Planck Institute for Marine Microbiology, Celsiusstrasse 1, D-28359Bremen, Germany 2 Jacobs University Bremen gGmbH, Campus Ring 1, D-28759 Bremen, Germany"