remove erroneously-commited next i had as part of my FISH loader debugging
[sgn.git] / lib / CXGN / Bulk / UnigeneIDUnigene.pm
blob73e435767ec8619f6617fe86c1a79af448494dc0
1 # Unigene ID's of Unigene download script for SGN database
3 # This bulk download option handles the query
4 # Of Array Spot of type EST.
5 # Many of its methods are in the Bulk object.
7 =head1 NAME
9 /CXGN/Bulk/UnigeneIDUnigene.pm
10 (A subclass of Bulk)
12 =head1 DESCRIPTION
14 This perl script is used on the bulk download page. The script collects
15 identifiers submitted by the user and returns information based on the
16 Unigene ID's for Unigene entered. It then determines the information the
17 user is searching for (SGN_U, Build Number, Automatic Annotation and
18 Unigene Sequence) and performs the appropriate querying of the
19 database. The results of the database query are formated and presented
20 to the user on a separate page. Options of viewing or downloading
21 in text or fasta are available.
23 =cut
25 package CXGN::Bulk::UnigeneIDUnigene;
26 use strict;
27 use warnings;
29 use Scalar::Util qw/ blessed /;
31 use Bio::PrimarySeq;
33 use CXGN::Bulk;
34 use CXGN::Transcript::Unigene;
35 use CXGN::Transcript::CDS;
36 use CXGN::Phenome::Locus;
38 use base "CXGN::Bulk";
40 sub new
42 my $class = shift;
43 my $self = $class->SUPER::new(@_);
45 return $self;
48 =head2 process_parameters
50 Desc:
51 Args: none
52 Ret : 1 if the parameters were OK, 0 if not
54 Modifies some of the parameters received set in get_parameters. Preparing
55 data for the database query.
57 =cut
59 sub process_parameters
61 my $self = shift;
63 my %links = (SGN_U => "/search/unigene.pl?unigene_id=",);
65 $self->{links} = \%links;
66 my @output_fields = ();
68 $self->debug("Type of identifier: ".($self->{idType})."");
70 # @output_fields is the sub-set of fields that will actually be output.
72 my @output_list = qw(
73 automatic_annotation
74 evalue
75 best_genbank_match
76 best_arabidopsis_match
77 associated_loci
80 if(my $value = $self->{convert_to_current})
82 if ($value eq "on")
84 push @output_fields, 'input_unigene';
88 #check condition for SGN_U
89 if(my $value = $self->{SGN_U_U})
91 if ($value eq "on")
93 push @output_fields, 'SGN_U';
97 #then check for rest of fields
98 foreach my $o (@output_list)
100 if (my $value = $self->{$o}) {
101 if ($value eq "on")
103 push @output_fields, $o;
108 if ($self->{uni_seq} eq "on") {push @output_fields, $self->{seq_mode}; }
110 $self->{output_list} = \@output_list;
111 $self->{output_fields} = \@output_fields;
113 my @ids = $self ->check_ids();
114 $self->debug("IDs to be processed:");
115 my $has_valid_id = 0;
117 foreach my $i (@ids)
119 $self->debug($i);
120 if ($self -> {idType} =~ /unigene/)
122 $i =~ s/^.*?(\d+).*?$/$1/;
124 if(!($i =~ m/\d+/))
126 $i = "";
128 if ($i ne "")
130 $has_valid_id = 1;
133 if(!$has_valid_id) {
134 return 0;
136 $self->{ids} = \@ids;
138 return 1; #params were OK if we got here
141 =head2 process_ids
143 Desc: sub process_ids
144 Args: default;
145 Ret : data from database printed to a file;
147 Queries database using Persistent (see perldoc Persistent) and
148 object oriented perl to obtain data on Bulk Objects using formatted
149 IDs.
151 =cut
153 sub process_ids {
154 my $self = shift;
156 my $db = $self->{db};
157 my @output_fields = @{$self -> {output_fields}};
158 my @notfound = ();
159 my ($dump_fh, $notfound_fh) = $self -> create_dumpfile();
160 my $current_time= time(); # - $self -> {query_start_time};
161 $self->debug("Time point 6: $current_time");
163 $self -> {query_start_time} = time();
165 my @u_ids = sort {$a<=>$b} @{$self->{ids}};
166 if( $self->{convert_to_current} ) {
167 @u_ids = $self->convert_to_current( \@u_ids, $notfound_fh );
170 for my $u_id ( @u_ids ) {
172 my $input_unigene; #< only used if converting to current
173 if( ref $u_id ) {
174 ( $u_id, $input_unigene ) = @$u_id;
177 (print $notfound_fh "$u_id\n" and next) if($u_id > 2147483647);
179 my $unigene = CXGN::Transcript::Unigene->new($db, $u_id) or (print $notfound_fh "$u_id\t(not found in database)\n" and next);
181 my $cds = CXGN::Transcript::CDS->new_with_unigene_id($db, $u_id);
183 my @return_data;
184 for my $field (@output_fields){
185 if($field eq "SGN_U"){
186 push (@return_data,"SGN-U".$unigene->get_unigene_id());
188 elsif($field eq "automatic_annotation"){
189 my @annotations = $unigene->get_annotations(1);
190 if(@annotations){
191 my @temp;
192 for my $index (0..4){
193 if($annotations[$index]){
194 for my $anno ($annotations[$index]){
195 my @annotation_list = @$anno;
196 my($evalue, $annotation) = @annotation_list[2,7];
197 if($index == 0){
198 push(@temp, "MATCHED ".$annotation."(evalue: $evalue )");
200 else {
201 push(@temp, "AND MATCHED ".$annotation." (evalue: $evalue)");
206 push(@return_data, join(" ", @temp));
207 }else{
208 push(@return_data, "None.");
211 elsif($field eq "best_genbank_match"){
212 my @annotations = $unigene->get_genbank_annotations();
213 if(@annotations){
214 my($blast_db_id, $seq_id, $evalue, $score, $identities, $start_coord, $end_coord, $annotation) = @{$annotations[0]};
215 push(@return_data, $seq_id." (".$evalue.")");
216 }else{
217 push(@return_data, "None.");
220 elsif($field eq "best_arabidopsis_match"){
221 my @annotations = $unigene->get_arabidopsis_annotations();
222 if (@annotations) {
223 my ($blast_db_id, $seq_id, $evalue, $score, $identities, $start_coord, $end_coord, $annotation) = @{$annotations[0]};
224 push(@return_data, $seq_id . " (".$evalue.")");
225 }else{
226 push(@return_data, "None.");
229 elsif($field eq "associated_loci"){
230 if( my @associated_loci = $unigene->get_associated_loci()) {
231 my @loci;
232 for my $locus (@associated_loci) {
233 push(@loci, $locus->get_locus_symbol());
235 push(@return_data, join " ", @loci);
237 else {push(@return_data, "None.");}
239 elsif($field eq "unigene_seq"){
240 push(@return_data, $unigene->get_sequence()) or push(@return_data, "None.");
242 elsif($field eq "estscan_seq"){
243 my $cds_seq = $cds->get_protein_seq();
244 push(@return_data, $cds_seq) or push(@return_data, "None.");
247 # TODO: This needs to be fixed up.
248 elsif($field eq "longest6frame_seq") {
249 my ($longest_orf) = $self->_find_orfs( $unigene->get_sequence );
250 push @return_data, $longest_orf || "no ORFs found";
252 elsif($field eq "preferred_protein_seq"){
253 if(my $cds_id = $unigene->get_preferred_protein()){
254 my $cds2 = CXGN::Transcript::CDS->new($db, $cds_id);
255 my $preferred = $cds2->get_protein_seq();
256 push(@return_data, $preferred);
257 }else{
258 push(@return_data, "None.");
261 elsif( $field eq 'input_unigene' ) {
262 push @return_data, "SGN-U$input_unigene";
265 print $dump_fh (join "\t", @return_data)."\n";
267 close($dump_fh);
268 close($notfound_fh);
270 $self->{query_time} = time() - $self -> {query_start_time};
273 # given a sequence, return all the ORFs in descending order of length
274 sub _find_orfs {
275 my ($self,$sequence) = @_;
277 # convert to a Bio::PrimarySeq if necessary
278 $sequence = Bio::PrimarySeq->new( -id => 'fake_id', -seq => "$sequence" )
279 unless blessed( $sequence ) && $sequence->can('translate') && $sequence->can('seq');
281 # translate in 3 frames, regex to find ORFs, sort by ORF length
282 # descending
283 return
284 sort { length($b) <=> length($a) }
285 map /(M[^\*]+(?:\*|$))/g,
286 map $sequence->translate(-frame => $_)->seq,
287 0..2;
291 # returns list as [ new_unigene_id, old_unigene_id ], ...
292 # if the unigene is current, old and new will be the same id
293 sub convert_to_current {
294 my ( $self, $uids, $notfound_fh ) = @_;
296 my @current_uids;
297 foreach my $uid (@$uids) {
298 if( my $unigene = CXGN::Transcript::Unigene->new( $self->{db}, $uid) ) {
299 my $unigene_build = CXGN::Transcript::UnigeneBuild->new( $self->{db}, $unigene->get_build_id );
300 if( $unigene_build->get_status eq 'C' ) {
301 push @current_uids, [$uid,$uid];
302 } else {
303 if( my @curr = $unigene->get_current_unigene_ids ) {
304 push @current_uids, map [$_,$uid], @curr;
305 } else {
306 print $notfound_fh "$uid\t(no equivalent in current build)\n";
312 return @current_uids;
315 =head2 get_query
317 Desc:
318 Args: default;
319 Ret : data from database printed to a file;
321 Queries database using SQL to obtain data on Bulk Objects using formatted
322 IDs.
324 =cut
326 # sub get_query
328 # my $in_ids = shift;
329 # return <<EOSQL
330 # SELECT DISTINCT ON (unigene.unigene_id)
331 # unigene.unigene_id as SGN_U,
332 # unigene_build.build_nr as build_nr,
333 # unigene_consensi.seq as unigene_seq,
334 # est.seq as est_seq,
335 # cds.protein_seq,
336 # (SELECT array_to_string(array(SELECT 'MATCHED '
337 # || dl.defline
338 # || ' (evalue:'
339 # || bh.evalue
340 # || ')'
341 # FROM blast_annotations as ba
342 # JOIN blast_hits as bh USING(blast_annotation_id)
343 # JOIN blast_defline as dl USING(defline_id)
344 # WHERE ba.apply_id=unigene.unigene_id
345 # AND ba.blast_target_id=1
346 # AND ba.apply_type=15
347 # LIMIT 5
348 # ),
349 # ' AND ')
350 # ) as automatic_annotation,
351 # (SELECT target_db_id FROM blast_annotations join blast_hits using(blast_annotation_id) WHERE (blast_target_id=1) AND (apply_id=unigene.unigene_id) AND blast_annotations.apply_type=15 order by score desc limit 1) as best_genbank_match,
352 # (SELECT target_db_id FROM blast_annotations join blast_hits using(blast_annotation_id) WHERE (blast_target_id=2) AND (apply_id=unigene.unigene_id) AND blast_annotations.apply_type=15 order by score desc limit 1) as best_arabidopsis_match
355 # FROM unigene
356 # LEFT JOIN unigene_consensi USING(consensi_id)
357 # LEFT JOIN unigene_build USING(unigene_build_id)
358 # LEFT JOIN unigene_member USING(unigene_id)
359 # LEFT JOIN cds ON (unigene.unigene_id=cds.unigene_id)
360 # LEFT JOIN est USING(est_id)
361 # WHERE unigene.unigene_id $in_ids
362 # ORDER BY unigene.unigene_id
363 # EOSQL