modified: myjupyterlab.sh
[GalaxyCodeBases.git] / perl / etc / crep / fetch_crep_anno.pl
blob71d83ca7c559e09084d821d75fd27ed6239ffdd1
1 #!/usr/bin/perl -w
2 use strict;
3 use warnings;
4 #use IO::Handle;
5 use Time::HiRes qw ( gettimeofday tv_interval );
6 use Getopt::Std;
7 use Term::ANSIColor qw(:constants);
8 use LWP::UserAgent;
9 use LWP::Simple qw ( get );
10 use HTTP::Cookies;
11 use Text::CSV_XS;
12 use HTML::TableExtract;
13 use Data::Dump;
15 $main::VERSION=1.3.2;
17 if (@ARGV == 0) {
18 &VERSION_MESSAGE();
19 &HELP_MESSAGE();
20 die "\n";
22 our($opt_p, $opt_o, $opt_n, $opt_s);
24 $Getopt::Std::STANDARD_HELP_VERSION=1;
25 getopts('p:o:s:');
26 #print "$opt_i, $opt_o, $opt_e\n";
28 sub HELP_MESSAGE() {
29 #my ($scr) = ($0 =~ m,([^/\\]+)$,);
30 my $help=<<EOH;
31 \t-p NetAffx Rice Probe Set Information (.psi) file [./Rice.psi]
32 \t-o Output file (crep_anno_tsv.txt)
33 \t-s To Start at . (for Rice 1..57381)
34 EOH
35 print STDERR <<EOH;
36 \nUsage: \033[0;1m$0\033[0;0m [-OPTIONS [-MORE_OPTIONS]] [--] [PROGRAM_ARG1 ...]
38 The following single-character options are accepted:
39 \033[32;1m$help\033[0;0m
40 Options may be merged together. -- stops processing of options.
41 Space is not required between options and their arguments.
42 EOH
44 sub VERSION_MESSAGE() {
45 my $perlv = $];
46 $perlv = sprintf "%vd", $^V if $] >= 5.006;
47 my $ver = sprintf "%vd", $main::VERSION;
48 my ($scr) = ($0 =~ m,([^/\\]+)$,);
49 print STDERR <<EOH;
50 $scr version \033[0;1m$ver\033[0;0m, running under Perl version $perlv.
51 EOH
54 $opt_p='./Rice.psi' if ! defined $opt_p;
55 $opt_o='crep_anno_tsv.txt' if ! defined $opt_o;
56 $opt_s=1 if ! defined $opt_s;
57 $opt_s=int($opt_s);
58 $opt_s=1 if $opt_s < 1;
59 &VERSION_MESSAGE();
60 warn "\nInput PSI: \033[32;1m$opt_p\033[0;0m\nOutput: \033[32;1m$opt_o\033[0;0m\n\n";
61 warn "Start at \033[32;1m$opt_s\033[0;0m\n";
62 #warn 'Check fasta id matches: ',GREEN,BOLD,$opt_c?'Enabled':'Disabled',RESET,"\n";
64 my $start_time = [gettimeofday];
65 ################### MAIN ####################
66 my (@psid,$tmp)=();
67 open PSI,'<',$opt_p or die "Error: $!\n";
68 while (<PSI>) {
69 next if /^#/;
70 push @psid,(split /\t/,$_)[1];
71 #print "[$tmp_line]\n";
73 close PSI;
74 #for (@psid) {
75 # print "[$_]\t";
77 $tmp = $#psid + 1;
78 warn GREEN,BOLD,$tmp,RESET," ProbeSets loaded.\n";
79 $tmp = $opt_s-1;
80 if ($#psid >= $tmp-1) {
81 splice(@psid,0,$tmp);
83 $tmp = $#psid + 1;
84 warn "Remains: \033[32;1m$tmp\033[0;0m.\n\n";
85 ################### Login ####################
86 my $ua = LWP::UserAgent->new;
87 my ($req,$res,$count);
88 $ua->cookie_jar(HTTP::Cookies->new(file => "lwpcookies2.txt", ignore_discard => 1,
89 autosave => 1));
91 $req = HTTP::Request->new(GET => 'http://crep.ncpgr.cn/crep-cgi/login2.pl?User_Name=guest&amp;Password=57510426775c5b0f');
92 # send request
93 $res = $ua->request($req);
94 # check the outcome
95 if (! $res->is_success) {
96 die "Error: Login as Guest failed !\n";
99 ################### Retrieve ####################
100 # http://crep.ncpgr.cn/crep-cgi/elements_details.pl?unique_id=Os.24470.1.A1_at
101 $req = HTTP::Request->new(POST => 'http://crep.ncpgr.cn/crep-cgi/elements_details.pl');
102 $req->content_type('application/x-www-form-urlencoded');
103 my $ps_count = $#psid + 1;
104 my ($with_head,$to_retrieve,$csv_data_ref,$percent)=(1);
105 if ($opt_s==1) {
106 open OUTPUT,'>',$opt_o or die "Error: $!\n";
107 } else {
108 open OUTPUT,'>>',$opt_o or die "Error: $!\n";
109 $tmp=tell OUTPUT;
110 warn "$tmp";
111 if ($tmp > 1) {
112 $with_head=0;
115 my $csv_subject = Text::CSV_XS->new;
116 print STDERR '== 0.00 %';
117 $tmp=-1;
118 while ($#psid >= 0) {
119 $to_retrieve = shift(@psid);
120 $req = HTTP::Request->new(GET => 'http://crep.ncpgr.cn/crep-cgi/elements_details.pl?unique_id='.$to_retrieve);
121 #$req->content_type('application/x-www-form-urlencoded');
123 $res = $ua->request($req);
124 ($csv_data_ref,$count)=&crep_retrieve(\$res,$with_head,$to_retrieve);
125 die "Error: $count <> 1, Server Overflow !\n" if $count != 1;
126 $with_head=0;
127 print OUTPUT $$csv_data_ref;
128 #print "$$csv_data_ref";
129 $percent=sprintf "%5.2f",($ps_count-$#psid-1)*100/$ps_count;
130 print STDERR "\b"x10, ($#psid % 2) ? ('>='):('=>')," $percent %";
131 $tmp=1;
133 if ($#psid >= 0) {
134 die;
136 close OUTPUT;
138 sub crep_retrieve () {
139 my ($res_ref,$with_head,$to_retrieve)=@_;
140 my $tmp_html = $$res_ref->content;
141 =pod
142 <tr bgcolor="#FFFFFF"><th align="left" valign="center" width="20%">Associated TIGR Gene Locus <a class="descrp" onclick="help_window('/crep-cgi/help.pl?Element_details','Help','450','400')" href="javascript:void(0);"><img height="23" border="0" src="/images/question_s.gif" width="8" /></a></th><td align="left"><table border="0" cellspacing="1" cellpadding="5" width="95%"><tr><th align="center" valign="center" width="20%">TIGR Locus</th><th align="center" valign="center" width="20%">Matching Probes</th><th align="center" valign="center" width="60%">Description</th></tr><tr><td align="center" valign="center">NONE</td><td align="left" valign="center" width="20%">NONE</td><td align="left" valign="center" width="60%">NONE</td></tr></table></td></tr>
144 <tr bgcolor="#FFFFFF"><th align="left" valign="center" width="20%">Associated TIGR Gene Locus <a class="descrp" onclick="help_window('/crep-cgi/help.pl?Element_details','Help','450','400')" href="javascript:void(0);"><img height="23" border="0" src="/images/question_s.gif" width="8" /></a></th><td align="left"><table border="0" cellspacing="1" cellpadding="5" width="95%"><tr><th align="center" valign="center" width="20%">TIGR Locus</th><th align="center" valign="center" width="20%">Matching Probes</th><th align="center" valign="center" width="60%">Description</th></tr><tr><td align="center" valign="center" width="20%"><a href='http://rice.plantbiology.msu.edu/cgi-bin/ORF_infopage.cgi?db=osa1r5&orf=LOC_Os12g17920' target='_blank'>LOC_Os12g17920</a></td><td align="center" valign="center" width="20%">1/11</td><td align="center" valign="center" width="60%">hypothetical protein</td></tr><tr><td align="center" valign="center" width="20%"><a href='http://rice.plantbiology.msu.edu/cgi-bin/ORF_infopage.cgi?db=osa1r5&orf=LOC_Os11g17770' target='_blank'>LOC_Os11g17770</a></td><td align="center" valign="center" width="20%">1/11</td><td align="center" valign="center" width="60%">transposon protein, putative, Mutator sub-class</td></tr></table></td></tr>
145 =cut
146 $tmp_html = (grep /Associated TIGR Gene Locus/,(split /\n/,$tmp_html))[0];
147 #$tmp_html =~ m#^<tr bgcolor=\"\#FFFFFF\"><th[^>]*>Associated TIGR Gene Locus.+(<table.*?</table>)#s;
148 #print "\n\n[$tmp_html]\n";
149 my $te = HTML::TableExtract->new( headers => [('TIGR Locus', 'Matching Probes', 'Description')] );
150 $te->parse("$tmp_html");
151 my $ts = $te->first_table_found();
152 my ($to_return,$item_count,@dat) = ('',1);
153 unless ($ts) {
154 warn "\n[!Table]$tmp_html\n";
155 $item_count = 0;
157 #ddx $ts;
158 foreach my $row ($ts->rows) {
159 push @dat,join("|",@$row);
161 $to_return = join("\t",$to_retrieve,@dat)."\n";
162 #print "$to_return\n";
163 return (\$to_return,$item_count);
166 my $stop_time = [gettimeofday];
167 print STDERR "\n Time Elapsed:\t",tv_interval( $start_time, $stop_time )," second(s)\n";
169 __END__