2 package CXGN
::Tools
::Parse
::Fasta
;
5 use base qw
/CXGN::Tools::Parse/;
7 =head1 CXGN::Tools::Parse::Fasta
9 Two reasons for SGN to have its own Fasta parser:
10 1) CXGN Identifier/Species convention, e.g.: >AT1G01010.1 / Arabidopsis T.
23 my $data = $self->{data_to_parse
};
25 my $entry = { id
=> '', species
=> '', seq
=> '' };
31 if($self->{previous_line
}){
32 $line = $self->{previous_line
};
33 $self->{previous_line
} = "";
39 next if $line =~ /^\s*$/;
40 unless ($entry->{id
}) {
42 my ($id) = $line =~ /^>([^\/\s
]+)/;
43 my ($species) = $line =~ /^>\Q$id\E\s*\/\s
*([^|]*)/;
45 ($annotation) = $line =~ /^>\Q$id\E\s*(.*)$/;
46 #print "annotation [$annotation] \n";
47 $annotation =~ s/\s*\/\s*\Q$species\E// if($annotation and $species); # don't do if empty string - to avoid warning messages
48 $entry->{id
} = $id if $id;
49 $entry->{species
} = $species if $species;
50 $entry->{annotation
} = $annotation if $annotation;
51 $entry->{defline
} = $line;
56 $self->{previous_line
} = $line;
61 $line =~ s/\*//g; #ends protein sequences sometimes...
62 $entry->{seq
} .= $line; #eehhh, probably a sequence
68 my ($id) = $data =~ /^.*>([^\/\s
]+)/;
69 # print "In fasta next. data $data id [$id]\n";
70 my ($species) = $data =~ /^.*>\Q$id\E\s*\/\s
*(.*?
)\n/;
71 # print "In fasta next. data $data species [$species]\n";
72 $data =~ s/^.*>\Q$id\E.*?\n//;
74 $entry->{id
} = $id if $id;
75 $entry->{species
} = $species if $species;
76 my ($seq) = $data =~ /([\w\n\-\*]+)/;
79 $data =~ s/[\w\n\-\*]+//;
81 $entry->{data_to_parse
} = $data;
83 return $entry if ($entry->{id
} && $entry->{seq
});