lib/CXGN/Bulk/UnigeneIDUnigene.pm

   1 # Unigene ID's of  Unigene download script for SGN database
   2
   3 # This bulk download option handles the query
   4 # Of Array Spot of type EST.
   5 # Many of its methods are in the Bulk object.
   6
   7 =head1 NAME
   8
   9   /CXGN/Bulk/UnigeneIDUnigene.pm
  10   (A subclass of Bulk)
  11
  12 =head1 DESCRIPTION
  13
  14   This perl script is used on the bulk download page. The script collects
  15   identifiers submitted by the user and returns information based on the
  16   Unigene ID's for  Unigene entered. It then determines the information the
  17   user is searching for (SGN_U, Build Number, Automatic Annotation and
  18   Unigene Sequence) and performs the appropriate querying of the
  19   database. The results of the database query are formated and presented
  20   to the user on a separate page. Options of viewing or downloading
  21   in text or fasta are available.
  22
  23 =cut
  24
  25 package CXGN::Bulk::UnigeneIDUnigene;
  26 use strict;
  27 use warnings;
  28
  29 use Scalar::Util qw/ blessed /;
  30
  31 use Bio::PrimarySeq;
  32
  33 use CXGN::Bulk;
  34 use CXGN::Transcript::Unigene;
  35 use CXGN::Transcript::CDS;
  36 use CXGN::Phenome::Locus;
  37
  38 use base "CXGN::Bulk";
  39
  40 sub new
  41 {
  42     my $class = shift;
  43     my $self = $class->SUPER::new(@_);
  44
  45     return $self;
  46 }
  47
  48 =head2 process_parameters
  49
  50   Desc:
  51   Args: none
  52   Ret : 1 if the parameters were OK, 0 if not
  53
  54   Modifies some of the parameters received set in get_parameters. Preparing
  55   data for the database query.
  56
  57 =cut
  58
  59 sub process_parameters
  60 {
  61     my $self = shift;
  62
  63     my %links = (SGN_U  => "/search/unigene.pl?unigene_id=",);
  64
  65     $self->{links} = \%links;
  66     my @output_fields = ();
  67
  68     $self->debug("Type of identifier: ".($self->{idType})."");
  69
  70     # @output_fields is the sub-set of fields that will actually be output.
  71
  72     my @output_list = qw(
  73                          automatic_annotation
  74                          evalue
  75                          best_genbank_match
  76                          best_arabidopsis_match
  77                          associated_loci
  78                         );
  79
  80     if(my $value = $self->{convert_to_current})
  81     {
  82         if ($value eq "on")
  83         {
  84             push @output_fields, 'input_unigene';
  85         }
  86     }
  87
  88     #check condition for SGN_U
  89     if(my $value = $self->{SGN_U_U})
  90     {
  91         if ($value eq "on")
  92         {
  93             push @output_fields, 'SGN_U';
  94         }
  95     }
  96
  97     #then check for rest of fields
  98     foreach my $o (@output_list)
  99     {
 100         if (my $value =  $self->{$o}) {
 101             if ($value eq "on")
 102             {
 103                 push @output_fields, $o;
 104             }
 105         }
 106     }
 107
 108     if ($self->{uni_seq} eq "on") {push @output_fields, $self->{seq_mode}; }
 109
 110     $self->{output_list} = \@output_list;
 111     $self->{output_fields} = \@output_fields;
 112
 113     my @ids = $self ->check_ids();
 114     $self->debug("IDs to be processed:");
 115     my $has_valid_id = 0;
 116
 117     foreach my $i (@ids)
 118     {
 119       $self->debug($i);
 120       if ($self -> {idType} =~ /unigene/)
 121       {
 122         $i =~ s/^.*?(\d+).*?$/$1/;
 123       }
 124       if(!($i =~ m/\d+/))
 125       {
 126           $i = "";
 127       }
 128       if ($i ne "")
 129       {
 130           $has_valid_id = 1;
 131       }
 132     }
 133     if(!$has_valid_id) {
 134         return 0;
 135     }
 136     $self->{ids} = \@ids;
 137
 138     return 1; #params were OK if we got here
 139 }
 140
 141 =head2 process_ids
 142
 143   Desc: sub process_ids
 144   Args: default;
 145   Ret : data from database printed to a file;
 146
 147   Queries database using Persistent (see perldoc Persistent) and
 148   object oriented perl to obtain data on Bulk Objects using formatted
 149   IDs.
 150
 151 =cut
 152
 153 sub process_ids {
 154     my $self = shift;
 155
 156     my $db = $self->{db};
 157     my @output_fields = @{$self -> {output_fields}};
 158     my @notfound = ();
 159     my ($dump_fh, $notfound_fh) = $self -> create_dumpfile();
 160     my $current_time= time(); # - $self -> {query_start_time};
 161     $self->debug("Time point 6: $current_time");
 162
 163     $self -> {query_start_time} = time();
 164
 165     my @u_ids = sort {$a<=>$b} @{$self->{ids}};
 166     if( $self->{convert_to_current} ) {
 167         @u_ids = $self->convert_to_current( \@u_ids, $notfound_fh );
 168     }
 169
 170     for my $u_id ( @u_ids ) {
 171
 172         my $input_unigene; #< only used if converting to current
 173         if( ref $u_id ) {
 174             ( $u_id, $input_unigene ) = @$u_id;
 175         }
 176
 177         (print $notfound_fh "$u_id\n" and next) if($u_id > 2147483647);
 178
 179         my $unigene = CXGN::Transcript::Unigene->new($db, $u_id) or (print $notfound_fh "$u_id\t(not found in database)\n" and next);
 180
 181         my $cds = CXGN::Transcript::CDS->new_with_unigene_id($db, $u_id);
 182
 183         my @return_data;
 184         for my $field (@output_fields){
 185             if($field eq "SGN_U"){
 186                 push (@return_data,"SGN-U".$unigene->get_unigene_id());
 187             }
 188             elsif($field eq "automatic_annotation"){
 189                 my @annotations = $unigene->get_annotations(1);
 190                 if(@annotations){
 191                     my @temp;
 192                     for my $index (0..4){
 193                         if($annotations[$index]){
 194                             for my $anno ($annotations[$index]){
 195                                 my @annotation_list = @$anno;
 196                                 my($evalue, $annotation) = @annotation_list[2,7];
 197                                 if($index == 0){
 198                                     push(@temp, "MATCHED ".$annotation."(evalue: $evalue )");
 199                                 }
 200                                 else {
 201                                     push(@temp, "AND MATCHED ".$annotation." (evalue: $evalue)");
 202                                 }
 203                             }
 204                         }
 205                     }
 206                     push(@return_data, join(" ", @temp));
 207                 }else{
 208                     push(@return_data, "None.");
 209                 }
 210             }
 211             elsif($field eq "best_genbank_match"){
 212                 my @annotations = $unigene->get_genbank_annotations();
 213                 if(@annotations){
 214                     my($blast_db_id, $seq_id, $evalue, $score, $identities, $start_coord,  $end_coord,  $annotation) = @{$annotations[0]};
 215                     push(@return_data, $seq_id." (".$evalue.")");
 216                 }else{
 217                     push(@return_data, "None.");
 218                 }
 219             }
 220             elsif($field eq "best_arabidopsis_match"){
 221                 my @annotations = $unigene->get_arabidopsis_annotations();
 222                 if (@annotations) {
 223                     my ($blast_db_id, $seq_id, $evalue, $score, $identities, $start_coord,  $end_coord,  $annotation) = @{$annotations[0]};
 224                     push(@return_data, $seq_id . " (".$evalue.")");
 225                 }else{
 226                     push(@return_data, "None.");
 227                 }
 228             }
 229             elsif($field eq "associated_loci"){
 230                 if( my @associated_loci = $unigene->get_associated_loci()) {
 231                     my @loci;
 232                     for my $locus (@associated_loci) {
 233                         push(@loci, $locus->get_locus_symbol());
 234                     }
 235                     push(@return_data, join " ", @loci);
 236                 }
 237                 else {push(@return_data, "None.");}
 238             }
 239             elsif($field eq "unigene_seq"){
 240                 push(@return_data, $unigene->get_sequence()) or push(@return_data, "None.");
 241             }
 242             elsif($field eq "estscan_seq"){
 243                 my $cds_seq = $cds->get_protein_seq();
 244                 push(@return_data, $cds_seq) or push(@return_data, "None.");
 245             }
 246
 247             # TODO: This needs to be fixed up.
 248             elsif($field eq "longest6frame_seq") {
 249                 my ($longest_orf) = $self->_find_orfs( $unigene->get_sequence );
 250                 push @return_data, $longest_orf || "no ORFs found";
 251             }
 252             elsif($field eq "preferred_protein_seq"){
 253                 if(my $cds_id = $unigene->get_preferred_protein()){
 254                     my $cds2 = CXGN::Transcript::CDS->new($db, $cds_id);
 255                     my $preferred = $cds2->get_protein_seq();
 256                     push(@return_data, $preferred);
 257                 }else{
 258                     push(@return_data, "None.");
 259                 }
 260             }
 261             elsif( $field eq 'input_unigene' ) {
 262                 push @return_data, "SGN-U$input_unigene";
 263             }
 264         }
 265         print $dump_fh (join "\t", @return_data)."\n";
 266     }
 267     close($dump_fh);
 268     close($notfound_fh);
 269
 270     $self->{query_time} = time() - $self -> {query_start_time};
 271 }
 272
 273 # given a sequence, return all the ORFs in descending order of length
 274 sub _find_orfs {
 275     my ($self,$sequence) = @_;
 276
 277     # convert to a Bio::PrimarySeq if necessary
 278     $sequence = Bio::PrimarySeq->new( -id => 'fake_id', -seq => "$sequence" )
 279         unless blessed( $sequence ) && $sequence->can('translate') && $sequence->can('seq');
 280
 281     # translate in 3 frames, regex to find ORFs, sort by ORF length
 282     # descending
 283     return
 284         sort { length($b) <=> length($a)  }
 285         map /(M[^\*]+(?:\*|$))/g,
 286         map $sequence->translate(-frame => $_)->seq,
 287         0..2;
 288 }
 289
 290
 291 # returns list as [ new_unigene_id, old_unigene_id ], ...
 292 # if the unigene is current, old and new will be the same id
 293 sub convert_to_current {
 294     my ( $self, $uids, $notfound_fh ) = @_;
 295
 296     my @current_uids;
 297     foreach my $uid (@$uids) {
 298         if( my $unigene = CXGN::Transcript::Unigene->new( $self->{db}, $uid) ) {
 299             my $unigene_build = CXGN::Transcript::UnigeneBuild->new( $self->{db}, $unigene->get_build_id );
 300             if( $unigene_build->get_status eq 'C' ) {
 301                 push @current_uids, [$uid,$uid];
 302             } else {
 303                 if( my @curr = $unigene->get_current_unigene_ids ) {
 304                     push @current_uids, map [$_,$uid], @curr;
 305                 } else {
 306                     print $notfound_fh "$uid\t(no equivalent in current build)\n";
 307                 }
 308             }
 309         }
 310     }
 311
 312     return @current_uids;
 313 }
 314
 315 =head2 get_query
 316
 317   Desc:
 318   Args: default;
 319   Ret : data from database printed to a file;
 320
 321   Queries database using SQL to obtain data on Bulk Objects using formatted
 322   IDs.
 323
 324 =cut
 325
 326 # sub get_query
 327 # {
 328 #        my $in_ids = shift;
 329 #        return <<EOSQL
 330 # SELECT DISTINCT ON (unigene.unigene_id)
 331 #         unigene.unigene_id as SGN_U,
 332 #       unigene_build.build_nr as build_nr,
 333 #       unigene_consensi.seq as unigene_seq,
 334 #         est.seq as est_seq,
 335 #       cds.protein_seq,
 336 #         (SELECT array_to_string(array(SELECT 'MATCHED '
 337 #                                              || dl.defline
 338 #                                              || ' (evalue:'
 339 #                                              || bh.evalue
 340 #                                              || ')'
 341 #                                       FROM blast_annotations as ba
 342 #                                       JOIN blast_hits as bh USING(blast_annotation_id)
 343 #                                       JOIN blast_defline as dl USING(defline_id)
 344 #                                       WHERE ba.apply_id=unigene.unigene_id
 345 #                                       AND ba.blast_target_id=1
 346 #                                         AND ba.apply_type=15
 347 #                                     LIMIT 5
 348 #                                      ),
 349 #                                 ' AND ')
 350 #         ) as automatic_annotation,
 351 #       (SELECT target_db_id  FROM blast_annotations  join blast_hits using(blast_annotation_id) WHERE (blast_target_id=1) AND  (apply_id=unigene.unigene_id) AND blast_annotations.apply_type=15 order by score desc limit 1) as best_genbank_match,
 352 #       (SELECT target_db_id FROM blast_annotations join blast_hits using(blast_annotation_id) WHERE (blast_target_id=2) AND (apply_id=unigene.unigene_id) AND blast_annotations.apply_type=15 order by score desc limit 1) as best_arabidopsis_match
 353
 354
 355 # FROM unigene
 356 # LEFT JOIN unigene_consensi USING(consensi_id)
 357 # LEFT JOIN unigene_build USING(unigene_build_id)
 358 # LEFT JOIN unigene_member USING(unigene_id)
 359 # LEFT JOIN cds ON (unigene.unigene_id=cds.unigene_id)
 360 # LEFT JOIN est USING(est_id)
 361 # WHERE unigene.unigene_id $in_ids
 362 # ORDER BY unigene.unigene_id
 363 # EOSQL
 364 # }
 365
 366 1;