change rules for cluster accessible dirs.
[cxgn-corelibs.git] / lib / CXGN / Tools / Parse / Fasta.pm
blobfdcd727ad504423ea5cd12369a86ec16091b4902
2 package CXGN::Tools::Parse::Fasta;
3 use strict;
5 use base qw/CXGN::Tools::Parse/;
7 =head1 CXGN::Tools::Parse::Fasta
9 Two reasons for SGN to have its own Fasta parser:
10 1) CXGN Identifier/Species convention: i.e.: >AT1G01010.1 / Arabidopsis T.
11 2) BioPerl sucks
14 =head1 Author
16 C. Carpita
18 =cut
21 sub next {
22 my $self = shift;
23 my $data = $self->{data_to_parse};
24 my $fh = $self->{fh};
25 my $entry = { id => '', species => '', seq => '' };
27 if($fh){
28 my $entry_filled = 0;
29 until($entry_filled){
30 my $line = "";
31 if($self->{previous_line}){
32 $line = $self->{previous_line};
33 $self->{previous_line} = "";
35 else{
36 $line = <$fh>;
37 last unless $line;
39 next if $line =~ /^\s*$/;
40 unless ($entry->{id}) {
41 chomp $line;
42 my ($id) = $line =~ /^>([^\/\s]+)/;
43 my ($species) = $line =~ /^>\Q$id\E\s*\/\s*([^|]*)/;
44 my $annotation = " ";
45 ($annotation) = $line =~ /^>\Q$id\E\s*(.*)$/;
46 #print "annotation [$annotation] \n";
47 $annotation =~ s/\s*\/\s*\Q$species\E// if($annotation and $species); # don't do if empty string - to avoid warning messages
48 $entry->{id} = $id if $id;
49 $entry->{species} = $species if $species;
50 $entry->{annotation} = $annotation if $annotation;
51 $entry->{defline} = $line;
53 else {
54 chomp $line;
55 if ($line =~ /^>/){
56 $self->{previous_line} = $line;
57 $entry_filled = 1;
59 else {
60 $line =~ s/\s+//g;
61 $line =~ s/\*//g; #ends protein sequences sometimes...
62 $entry->{seq} .= $line; #eehhh, probably a sequence
67 elsif($data){
68 my ($id) = $data =~ /^.*>([^\/\s]+)/;
69 # print "In fasta next. data $data id [$id]\n";
70 my ($species) = $data =~ /^.*>\Q$id\E\s*\/\s*(.*?)\n/;
71 # print "In fasta next. data $data species [$species]\n";
72 $data =~ s/^.*>\Q$id\E.*?\n//;
73 return unless $id;
74 $entry->{id} = $id if $id;
75 $entry->{species} = $species if $species;
76 my ($seq) = $data =~ /([\w\n\-\*]+)/;
77 $seq =~ s/\n//g;
78 $seq =~ s/\*//g;
79 $data =~ s/[\w\n\-\*]+//;
80 $entry->{seq} = $seq;
81 $entry->{data_to_parse} = $data;
83 return $entry if ($entry->{id} && $entry->{seq});
86 sub parse_all_data {
87 return undef;