modified: Makefile
[GalaxyCodeBases.git] / perl / etc / crep / fetch_ricexpro_anno.pl
blobd1c3805a7d5cbe023f64d5143e4b7cd23ec98960
#!/usr/bin/perl -w
# Look up every ID listed in resLOC.lst on the RiceXPro gene-search page and
# write one tab-separated annotation line per ID to resLOC.anno.
#
# Output columns: query keyword, locus IDs, feature numbers, accessions and
# de-duplicated descriptions.  Multiple values in one column are joined
# with '|'.  Progress and per-cell diagnostics are printed to STDOUT.
use strict;
use warnings;
#use IO::Handle;
#use Time::HiRes qw ( gettimeofday tv_interval );
#use Term::ANSIColor qw(:constants);
use LWP::UserAgent;
#use HTTP::Request::Common qw(POST);
use HTML::TreeBuilder::XPath;
#use LWP::Simple qw ( get );
#use HTTP::Cookies;
#use Text::CSV_XS;
#use HTML::TableExtract;
use Data::Dump;

my $SEARCH_URL = 'http://ricexpro.dna.affrc.go.jp/RXP_1006/gene-search.php';
my $LIST_FILE  = 'resLOC.lst';
my $ANNO_FILE  = 'resLOC.anno';

# Sample of the result section returned by the search page:
#
# <div id="result">
# <table border="1" align="center" width="1000"><tr><th>Locus ID / Links</th><th>Locus<br>Select</th><th>FeatureNum<br>(Link to graph)</th><th>Feature<br>Select</th><th>Accession</th><th>Probe Sequence ID<br>(Link to SeqInfo)</th><th style="width:400px;">Description</th><th>MSU ID</th></tr>
# <tr><td valign="top" rowspan="1"><span class="locus-link" tos17="1">Os01g0100500</span></td><td valign="top" rowspan="1"><input type="checkbox" class="locus-select" name="Os01g0100500"></td><td><a class="graph-link" barimg="images/barplot/RXP_1006-Os01g0100500-12943_bar.png" lineimg="images/lineplot/RXP_1006-Os01g0100500-12943_line.png" href="graph-view.php?featurenum=12943" target="_blank">12943</a></td>
# <td><input type="checkbox" class="feature-select Os01g0100500-feature" name="12943"></td>
# <td>AK067316</td><td><a href="probe-seq-info.php?seqid=S-10941" target="_blank">S-10941</a> (unique)</td><td><span class="desc descinfo">Similar to Pectinesterase-like protein.</span></td><td><a href="http://rice.plantbiology.msu.edu/cgi-bin/ORF_infopage.cgi?orf=LOC_Os01g01040" target="_blank">LOC_Os01g01040</a><br/><a href="http://rice.plantbiology.msu.edu/cgi-bin/ORF_infopage.cgi?orf=LOC_Os01g01030" target="_blank">LOC_Os01g01030</a><br/></td></tr>
# </table>
# </div>
#
# Sample of the individual <td> cells as re-serialised by as_HTML (this is
# the text the classification regexes below are matched against):
#
# <td rowspan="2" valign="top"><span class="locus-link" tos17="0">Os01g0103100</span></td> ---
# <td rowspan="2" valign="top"><input class="locus-select" name="Os01g0103100" type="checkbox" /></td> ---
# <td><a barimg="images/barplot/RXP_1006-Os01g0103100-07015_bar.png" class="graph-link" href="graph-view.php?featurenum=7015" lineimg="images/lineplot/RXP_1006-Os01g0103100-07015_line.png" target="_blank">7015</a></td> ---
# <td><input class="feature-select Os01g0103100-feature" name="7015" type="checkbox" /></td> ---
# <td>AK070557</td> ---
# <td><a href="probe-seq-info.php?seqid=S-5924" target="_blank">S-5924</a> (unique)</td> ---
# <td><span class="desc descinfo">TGF-beta receptor, type I/II extracellular region family protein.</span></td> ---
# <td><a href="http://rice.plantbiology.msu.edu/cgi-bin/ORF_infopage.cgi?orf=LOC_Os01g01360" target="_blank">LOC_Os01g01360</a><br /></td> ---
# <td><a barimg="images/barplot/RXP_1006-Os01g0103100-36463_bar.png" class="graph-link" href="graph-view.php?featurenum=36463" lineimg="images/lineplot/RXP_1006-Os01g0103100-36463_line.png" target="_blank">36463</a></td> ---
# <td><input class="feature-select Os01g0103100-feature" name="36463" type="checkbox" /></td> ---
# <td>AK058723</td> ---
# <td><a href="probe-seq-info.php?seqid=S-29658" target="_blank">S-29658</a> (unique)</td> ---
# <td><span class="desc descinfo">TGF-beta receptor, type I/II extracellular region family protein.</span></td> ---
# <td><a href="http://rice.plantbiology.msu.edu/cgi-bin/ORF_infopage.cgi?orf=LOC_Os01g01360" target="_blank">LOC_Os01g01360</a><br /></td> ---

# Return the given strings with duplicates removed, keeping the position of
# each string's first occurrence.
sub unique_in_order {
    my %seen;
    return grep { !$seen{$_}++ } @_;
}

# Parse one search-result page and collect the values found in the <td>
# cells under <div id="result">.
#   $html - page content as a string
# Returns four array refs: locus IDs, feature numbers, accessions and
# descriptions, each in document order.  Every value found is also echoed
# to STDOUT.
sub parse_result_page {
    my ($html) = @_;
    my (@loci, @feature_nums, @accessions, @descs);

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse_content($html);

    for my $cell ($tree->findnodes('//div[@id="result"]/table/tr/td')) {
        my $cell_html = $cell->as_HTML;
        #print $cell_html," ---\n";
        #print $cell->as_trimmed_text,"\n";
        if ($cell_html =~ /"locus-link".+\>(\w+)\<\//) {
            print "Locus=$1\n";
            push @loci, $1;
        }
        elsif ($cell_html =~ / barimg.+\>(\d+)\<\//) {
            print "FeatureNum=$1\n";
            push @feature_nums, $1;
        }
        elsif ($cell_html =~ /\<td\>(\w+)\<\/td\>/) {
            print "Accession=$1\n";
            push @accessions, $1;
        }
        elsif (my $desc_node = $cell->look_down(class => 'desc descinfo')) {
            # Read the text from the parsed element rather than from the
            # as_HTML string: as_HTML entity-encodes characters such as
            # & ' and ", which would leak "&amp;" / "&#39;" into the output.
            my $desc = $desc_node->as_trimmed_text;
            if (length $desc) {
                print "Desc=$desc\n";
                push @descs, $desc;
            }
        }
    }

    # Explicit delete as in the original script, to release the parse tree.
    $tree->delete;
    return (\@loci, \@feature_nums, \@accessions, \@descs);
}

open my $list_fh, '<', $LIST_FILE
    or die "Cannot open $LIST_FILE for reading: $!";
# The page is decoded to characters below, so encode again on the way out.
open my $anno_fh, '>:encoding(UTF-8)', $ANNO_FILE
    or die "Cannot open $ANNO_FILE for writing: $!";
binmode STDOUT, ':encoding(UTF-8)';

my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0");

while (my $keyword = <$list_fh>) {
    $keyword =~ s/\r?\n\z//;          # tolerate CRLF line endings
    next unless length $keyword;      # a blank line is not a query
    print "$keyword\n";

    # Passing a form hash lets LWP URL-encode the value and set the
    # application/x-www-form-urlencoded content type itself.
    my $res = $ua->post($SEARCH_URL, { keyword => $keyword });
    if ($res->is_success) {
        # decoded_content applies the response charset; it returns undef
        # when decoding fails, in which case fall back to the raw body.
        my $html = $res->decoded_content;
        $html = $res->content unless defined $html;

        my ($loci, $feature_nums, $accessions, $descs) = parse_result_page($html);
        my $line = join("\t",
            $keyword,
            join('|', @$loci),
            join('|', @$feature_nums),
            join('|', @$accessions),
            join('|', unique_in_order(@$descs)),
        );
        print {$anno_fh} $line, "\n";
        print '-' x 5, "$line\n";
    }
    else {
        print "Error: " . $res->status_line . "\n";
    }
}

close $list_fh;
# Buffered write errors only surface at close time, so check it.
close $anno_fh or die "Cannot close $ANNO_FILE: $!";

#my $tmp_html = $res->content;
#ddx $res;

__END__
awk '{print $1}' crep_all_tsv_new.txt.up2*.txt |sort|uniq > resLOC.lst