lib/CXGN/Tools/Parse/Fasta.pm

   1
   2 package CXGN::Tools::Parse::Fasta;
   3 use strict;
   4
   5 use base qw/CXGN::Tools::Parse/;
   6
   7 =head1 CXGN::Tools::Parse::Fasta
   8
   9  Two reasons for SGN to have its own Fasta parser:
  10  1) CXGN Identifier/Species convention: i.e.: >AT1G01010.1 / Arabidopsis T.
  11  2) BioPerl sucks
  12
  13
  14 =head1 Author
  15
  16 C. Carpita
  17
  18 =cut
  19
  20
  21 sub next {
  22         my $self = shift;
  23         my $data = $self->{data_to_parse};
  24         my $fh = $self->{fh};
  25         my $entry = { id => '', species => '', seq => '' };
  26
  27         if($fh){
  28                 my $entry_filled = 0;
  29                 until($entry_filled){
  30                         my $line = "";
  31                         if($self->{previous_line}){
  32                                 $line = $self->{previous_line};
  33                                 $self->{previous_line} = "";
  34                         }
  35                         else{
  36                                 $line = <$fh>;
  37                                 last unless $line;
  38                         }
  39                         next if $line =~ /^\s*$/;
  40                         unless ($entry->{id}) {
  41                                 chomp $line;
  42                                 my ($id) = $line =~ /^>([^\/\s]+)/;
  43                                 my ($species) = $line =~ /^>\Q$id\E\s*\/\s*([^|]*)/;
  44                                 my $annotation = " ";
  45                                 ($annotation) = $line =~ /^>\Q$id\E\s*(.*)$/;
  46 #print "annotation [$annotation] \n";
  47                                 $annotation =~ s/\s*\/\s*\Q$species\E// if($annotation  and  $species); # don't do if empty string - to avoid warning messages
  48                                 $entry->{id} = $id if $id;
  49                                 $entry->{species} = $species if $species;
  50                                 $entry->{annotation} = $annotation if $annotation;
  51                                 $entry->{defline} = $line;
  52                         }
  53                         else {
  54                                 chomp $line;
  55                                 if ($line =~ /^>/){
  56                                         $self->{previous_line} = $line;
  57                                         $entry_filled = 1;
  58                                 }
  59                                 else {
  60                                         $line =~ s/\s+//g;
  61                                         $line =~ s/\*//g; #ends protein sequences sometimes...
  62                                         $entry->{seq} .= $line;  #eehhh, probably a sequence
  63                                 }
  64                         }
  65                 }
  66         }
  67         elsif($data){
  68                 my ($id) = $data =~ /^.*>([^\/\s]+)/;
  69 # print "In fasta next. data $data id [$id]\n";
  70                 my ($species) = $data =~ /^.*>\Q$id\E\s*\/\s*(.*?)\n/;
  71 # print "In fasta next. data $data species [$species]\n";
  72                 $data =~ s/^.*>\Q$id\E.*?\n//;
  73                 return unless $id;
  74                 $entry->{id} = $id if $id;
  75                 $entry->{species} = $species if $species;
  76                 my ($seq) = $data =~ /([\w\n\-\*]+)/;
  77                 $seq =~ s/\n//g;
  78                 $seq =~ s/\*//g;
  79                 $data =~ s/[\w\n\-\*]+//;
  80                 $entry->{seq} = $seq;
  81                 $entry->{data_to_parse} = $data;
  82         }
  83         return $entry if ($entry->{id} && $entry->{seq});
  84 }
  85
  86 sub parse_all_data {
  87         return undef;
  88 }
  89
  90
  91 1;