Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / SeqIO / tigr.pm
blob3dc266e8245af542cf8c3fad94baeee2942ebf9e
1 # BioPerl module for Bio::SeqIO::tigr
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Josh Lauricha (laurichj@bioinfo.ucr.edu)
7 # Copyright Josh Lauricha
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqIO::tigr - TIGR XML sequence input/output stream
17 =head1 SYNOPSIS
19 Do not use this module directly. Use it via the Bio::SeqIO class.
21 =head1 DESCRIPTION
23 This object can transform Bio::Seq objects to and from efa flat
24 file databases.
26 =head1 FEEDBACK
28 =head2 Mailing Lists
30 User feedback is an integral part of the evolution of this and other
31 Bioperl modules. Send your comments and suggestions preferably to one
32 of the Bioperl mailing lists. Your participation is much appreciated.
34 bioperl-l@bioperl.org - General discussion
35 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
37 =head2 Support
39 Please direct usage questions or support issues to the mailing list:
41 I<bioperl-l@bioperl.org>
43 rather than to the module maintainer directly. Many experienced and
44 reponsive experts will be able look at the problem and quickly
45 address it. Please include a thorough description of the problem
46 with code and data examples if at all possible.
48 =head2 Reporting Bugs
50 Report bugs to the Bioperl bug tracking system to help us keep track
51 the bugs and their resolution.
52 Bug reports can be submitted via the web:
54 https://github.com/bioperl/bioperl-live/issues
56 =head1 AUTHORS - Josh Lauricha
58 Email: laurichj@bioinfo.ucr.edu
61 =head1 APPENDIX
63 The rest of the documentation details each of the object
64 methods. Internal methods are usually preceded with a _
66 =cut
68 # TODO:
69 # - Clean up code
70 # - Find and fix bugs ;)
72 # Let the code begin...
73 package Bio::SeqIO::tigr;
75 use strict;
77 use Bio::Seq::RichSeq;
78 use Bio::Species;
79 use Bio::Annotation::Comment;
80 use Bio::SeqFeature::Generic;
81 use Bio::Seq::SeqFactory;
82 use Bio::Seq::RichSeq;
83 use Data::Dumper;
84 use Error qw/:try/;
86 use base qw(Bio::SeqIO);
88 sub _initialize
90 my($self, @args) = @_;
92 $self->SUPER::_initialize(@args);
93 $self->sequence_factory(Bio::Seq::SeqFactory->new(
94 -type => 'Bio::Seq::RichSeq')
97 # Parse the document
98 $self->_process();
101 =head2 next_seq
103 Title : next_seq
104 Usage : $seq = $stream->next_seq()
105 Function: returns the next sequence in the stream
106 Returns : Bio::Seq object
107 Args : NONE
109 =cut
111 sub next_seq()
113 my ($self) = @_;
115 # Check for any more sequences
116 return if !defined($self->{_sequences}) or scalar(@{$self->{_sequences}}) < 1;
118 # get the next sequence
119 my $seq = shift(@{ $self->{_sequences} } );
121 # Get the 5' and 3' ends
122 my ($source) = grep { $_->primary_tag() eq 'source' } $seq->get_SeqFeatures();
123 my ($end5) = $source->get_tag_values('end5');
124 my ($end3) = $source->get_tag_values('end3');
126 # Sort the 5' and 3':
127 my ($start, $end) = ( $end5 < $end3 ? ( $end5, $end3 ) : ( $end3, $end5 ) );
129 # make the start a perl index
130 $start -= 1;
132 # Figure out the length
133 my $length = $end - $start;
135 # check to make sure $start >= 0 and $end <= length(assembly_seq)
136 if($start < 0) {
137 throw Bio::Root::OutOfRange("the sequence start is $start < 0");
138 } elsif($end > length($self->{_assembly}->{seq})) {
139 throw Bio::Root::OutOfRange("the sequence end is $end < " . length($self->{_assembly}->{seq}));
140 } elsif($start >= $end) {
141 throw Bio::Root::OutOfRange("the sequence start is after end $start >= $end");
144 # Get and set the real sequence
145 $seq->seq(substr($self->{_assembly}->{seq}, $start, $length));
147 if( $end5 > $end3 ) {
148 # Reverse complement the sequence
149 $seq->seq( $seq->primary_seq()->revcom()->seq() );
152 # add the translation to each CDS
153 foreach my $feat ($seq->get_SeqFeatures()) {
154 next if $feat->primary_tag() ne "CDS";
156 # Check for an invalid protein
157 try {
158 # Get the subsq
159 my $cds = Bio::PrimarySeq->new(
160 -strand => 1,
161 -id => $seq->accession_number(),
162 -seq => $seq->subseq($feat->location())
165 # Translate it
166 my $trans = $cds->translate(undef, undef, undef, undef, 1, 1)->seq();
168 # Add the tag
169 $feat->add_tag_value(translation => $trans);
170 } catch Bio::Root::Exception with {
171 print STDERR 'TIGR strikes again, the CDS is not a valid protein: ', $seq->accession_number(), "\n"
172 if $self->verbose() > 0;
176 # Set the display id to the accession number if there
177 # is no display id
178 $seq->display_id( $seq->accession_number() ) unless $seq->display_id();
180 return $seq;
183 sub _process
185 my($self) = @_;
186 my $line;
187 my $tu = undef;
189 $line = $self->_readline();
190 do {
191 if($line =~ /<\?xml\s+version\s+=\s+"\d+\.\d+"\?>/o) {
192 # do nothing
193 } elsif ($line =~ /<!DOCTYPE (\w+) SYSTEM "[\w\.]+">/o) {
194 $self->throw("DOCTYPE of $1, not TIGR!")
195 if $1 ne "TIGR" ;
196 } elsif ($line =~ /<TIGR>/o) {
197 $self->_pushback($line);
198 $self->_process_tigr();
199 } elsif ($line =~ /<ASSEMBLY.*?>/o) {
200 $self->_pushback($line);
201 $self->_process_assembly();
202 } elsif ($line =~ /<\/TIGR>/o) {
203 $self->{'eof'} = 1;
204 return;
205 } else {
206 $self->throw("Unknown or Invalid process directive:",
207 join('', ($line =~ /^\s*(<[^>]+>)/o)));
209 $line = $self->_readline();
210 } while( defined( $line ) );
213 sub _process_tigr
215 my($self) = @_;
216 my $line;
218 $line = $self->_readline();
219 if($line !~ /<TIGR>/o) {
220 $self->throw("Bio::SeqIO::tigr::_process_tigr called but no ",
221 "<TIGR> found in stream");
224 $line = $self->_readline();
225 if($line =~ /<PSEUDOCHROMOSOME>/o) {
226 $self->_pushback($line);
227 $self->_process_pseudochromosome();
228 } elsif ($line =~ /<ASSEMBLY.*?>/o) {
229 $self->_pushback($line);
230 $self->_process_assembly();
234 sub _process_pseudochromosome
236 my($self) = @_;
237 my $line;
239 $line = $self->_readline();
240 return if $line !~ /<PSEUDOCHROMOSOME>/o;
242 $line = $self->_readline();
244 if($line =~ /<SCAFFOLD>/o) {
245 $self->_pushback($line);
246 $self->_process_scaffold();
247 $line = $self->_readline();
248 } else {
249 $self->warn( "No Scaffold found in <PSUEDOCHROMOSOME> this " .
250 "is a violation of the TIGR dtd, but we ignore " .
251 "it so we are ignoring the error\n"
255 if($line =~ /<ASSEMBLY.*>/o) {
256 $self->_pushback($line);
257 $self->_process_assembly();
258 $line = $self->_readline();
259 } else {
260 $self->throw("Missing required ASSEMBLY in <PSEUDOCHROMOSOME>");
263 if($line =~ /<\/PSEUDOCHROMOSOME>/) {
264 return;
267 $self->throw("Reached end of _process_pseudochromosome");
270 sub _process_assembly
272 my($self) = @_;
273 my $line;
275 $line = $self->_readline();
276 if($line !~ /<ASSEMBLY([^>]*)>/o) {
277 $self->throw("Bio::SeqIO::tigr::_process_assembly called ",
278 "but no <ASSEMBLY> found in stream");
281 my %attribs = ($1 =~ /(\w+)\s*=\s+"(.*?)"/og);
282 $self->{_assembly}->{date} = $attribs{CURRENT_DATE};
283 $self->{_assembly}->{db} = $attribs{DATABASE};
284 $self->{_assembly}->{chromosome} = $attribs{CHROMOSOME};
286 $line = $self->_readline();
287 my($attr, $val);
288 if(($attr, $val) = ($line =~ /<ASMBL_ID([^>]*)>([^<]*)<\/ASMBL_ID>/o)) {
289 %attribs = ($attr =~ /(\w+)\s*=\s+"(.*?)"/og);
290 $self->{_assembly}->{clone_name} = $attribs{CLONE_NAME};
291 $self->{_assembly}->{clone} = $val;
292 $line = $self->_readtag();
293 } else {
294 $self->throw("Required <ASMBL_ID> missing");
297 if($line =~ /<COORDSET>/o) {
298 $self->_pushback($line);
299 my $cs = $self->_process_coordset();
301 $self->{_assembly}->{end5} = $cs->{end5};
302 $self->{_assembly}->{end3} = $cs->{end3};
304 $line = $self->_readline();
305 } else {
306 $self->throw("Required <COORDSET> missing");
309 if($line =~ /<HEADER>/o) {
310 $self->_pushback($line);
311 $self->_process_header();
312 $line = $self->_readline();
313 } else {
314 $self->throw("Required <HEADER> missing");
317 if($line =~ /<TILING_PATH>/o) {
318 $self->_pushback($line);
319 $self->_process_tiling_path();
320 $line = $self->_readline();
323 if($line =~ /<GENE_LIST>/o) {
324 $self->_pushback($line);
325 $self->_process_gene_list();
326 $line = $self->_readline();
327 } else {
328 $self->throw("Required <GENE_LIST> missing");
331 if($line =~ /<MISC_INFO>/o) {
332 $self->_pushback($line);
333 $self->_process_misc_info();
334 $line = $self->_readline();
337 if($line =~ /<REPEAT_LIST>/o) {
338 $self->_pushback($line);
339 $self->_process_repeat_list();
340 $line = $self->_readline();
343 if($line =~ /<ASSEMBLY_SEQUENCE>/o) {
344 $self->_pushback($line);
345 $self->_process_assembly_seq();
346 $line = $self->_readline();
347 } else {
348 $self->throw("Required <ASSEMBLY_SEQUENCE> missing");
351 if($line =~ /<\/ASSEMBLY>/o) {
352 return;
354 $self->throw("Reached the end of <ASSEMBLY>");
357 sub _process_assembly_seq()
359 my ($self) = @_;
360 my $line;
362 $line = $self->_readline();
363 if($line !~ /<ASSEMBLY_SEQUENCE>/o) {
364 $self->throw("Bio::SeqIO::tigr::_process_assembly_seq called ".
365 "with no <ASSEMBLY_SEQUENCE> in the stream");
368 # Protect against lots of smaller lines
369 my @chunks;
371 do {
372 $line = $self->_readline();
373 last unless $line;
375 my $seq;
376 if (($seq) = ($line =~ /^\s*(\w+)\s*$/o)) {
377 push(@chunks, $seq);
378 } elsif( ($seq) = ( $line =~ /^\s*(\w+)<\/ASSEMBLY_SEQUENCE>\s*$/o) ) {
379 push(@chunks, $seq);
380 $self->{_assembly}->{seq} = join('', @chunks);
381 return;
383 } while( $line );
385 $self->throw("Reached end of _proces_assembly");
388 sub _process_coordset($)
390 my ($self) = @_;
391 my $line;
392 my $h;
394 $line = $self->_readline();
395 if($line =~ /<COORDSET>/o) {
396 $self->_pushback($line);
397 $line = $self->_readtag();
398 ($h->{end5}, $h->{end3}) = ($line =~ /<COORDSET>\s*<END5>\s*(\d+)\s*<\/END5>\s*<END3>\s*(\d+)\s*<\/END3>/os);
399 if(!defined($h->{end5}) or !defined($h->{end3})) {
400 $self->throw("Invalid <COORDSET>: $line");
402 return $h;
403 } else {
404 $self->throw("Bio::SeqIO::tigr::_process_coordset() called ",
405 "but no <COORDSET> found in stream");
409 sub _process_header
411 my ($self) = @_;
412 my $line = $self->_readline();
414 if($line !~ /<HEADER>/o) {
415 $self->throw("Bio::SeqIO::tigr::_process_header called ",
416 "but no <HEADER> found in stream");
419 $line = $self->_readtag();
420 if($line =~ /<CLONE_NAME>([^>]+)<\/CLONE_NAME>/o) {
421 $self->{_assembly}->{clone_name} = $1;
422 $line = $self->_readtag();
423 } else {
424 $self->throw("Required <CLONE_NAME> missing");
427 if($line =~ /<SEQ_LAST_TOUCHED>/o) {
428 # Ignored for now
429 $line = $self->_readtag();
430 } else {
431 $self->throw("Reqired <SEQ_LAST_TOUCHED> missing");
434 if($line =~ /<GB_ACCESSION>([^<]*)<\/GB_ACCESSION>/o) {
435 $self->{_assembly}->{gb} = $1;
436 $line = $self->_readtag();
437 } else {
438 $self->throw("Required <GB_ACCESSION> missing");
441 if($line =~ /<ORGANISM>\s*(.+)\s*<\/ORGANISM>/o) {
442 my( $genus, $species, @ss ) = split(/\s+/o, $1);
443 $self->{_assembly}->{species} = Bio::Species->new();
444 $self->{_assembly}->{species}->genus($genus);
445 $self->{_assembly}->{species}->species($species);
446 $self->{_assembly}->{species}->sub_species(join(' ', @ss)) if scalar(@ss) > 0;
448 $line = $self->_readtag();
449 } else {
450 $self->throw("Required <ORGANISM> missing");
453 if($line =~ /<LINEAGE>([^<]*)<\/LINEAGE>/o) {
454 $self->{_assembly}->{species}->classification(
455 $self->{_assembly}->{species}->species(),
456 reverse(split(/\s*;\s*/o, $1))
458 $line = $self->_readtag();
459 } else {
460 $self->throw("Required <LINEAGE> missing");
463 if($line =~ /<SEQ_GROUP>([^<]*)<\/SEQ_GROUP>/o) {
464 # ingnored
465 $line = $self->_readtag();
466 } else {
467 $self->throw("Required <SEQ_GROUP> missing");
470 while($line =~ /<KEYWORDS>[^<]*<\/KEYWORDS>/o) {
471 push(@{$self->{_assembly}->{keywords}}, $1);
472 $line = $self->_readtag();
475 while($line =~ /<GB_DESCRIPTION>([^<]+)<\/GB_DESCRIPTION>/o) {
476 push(@{$self->{_assembly}->{gb_desc}},$1);
477 $line = $self->_readtag();
480 while($line =~ /<GB_COMMENT>([^<]+)<\/GB_COMMENT>/o) {
481 push(@{$self->{_assembly}->{gb_comment}}, $1);
482 $line = $self->_readtag();
485 if(my %h = ($line =~ /<AUTHOR_LIST(?:\s*(\w+)\s*=\s*"([^"]+)"\s*)*>/o)) {
486 #$header->{'AUTHOR_LIST'}=$h{'CONTACT'};
487 # Ignored
488 while($line !~ /<\/AUTHOR_LIST>/o) {
489 $self->_readtag();
491 $line = $self->_readline();
492 } else {
493 $self->throw("Required <AUTHOR_LIST> missing");
496 if($line =~ /<\/HEADER>/o) {
497 return;
500 $self->throw("Reached end of header\n");
503 sub _process_gene_list
505 my($self) = @_;
506 my $line;
508 $line = $self->_readline();
509 if($line !~ /<GENE_LIST>/o) {
510 $self->throw("Bio::SeqIO::tigr::_process_gene_list called ",
511 "but no <GENE_LIST> in the stream");
514 $line = $self->_readline();
515 if($line =~ /<PROTEIN_CODING>/o) {
516 $self->_pushback($line);
517 $self->_process_protein_coding();
518 $line = $self->_readline();
519 } else {
520 $self->throw("Required <PROTEIN_CODING> missing");
523 if($line =~ /<RNA_GENES>/o) {
524 $self->_pushback($line);
525 $self->_process_rna_genes();
526 $line = $self->_readline();
527 } else {
528 $self->throw("Required <RNA_GENES> missing");
531 if($line =~ /<\/GENE_LIST>/o) {
532 return;
535 $self->throw("Reached end of _process_gene_list");
538 sub _process_protein_coding
540 my ($self) = @_;
541 my $line = $self->_readline();
543 if($line !~ /<PROTEIN_CODING>/o) {
544 $self->throw("Bio::SeqIO::tigr::_process_protein_coding called"
545 . "but no <GENE_LIST> in the stream");
548 $line = $self->_readline();
549 while($line and $line =~ /<TU>/o) {
550 $self->_pushback($line);
551 $self->_process_tu();
552 $line = $self->_readline();
555 # Sort the sequences
556 @{$self->{_sequences}} = sort {
557 my($one, $two) = ( $a, $b );
558 ($one) = grep { $_->primary_tag() eq 'source' } $one->get_SeqFeatures();
559 ($two) = grep { $_->primary_tag() eq 'source' } $two->get_SeqFeatures();
560 return 0 unless defined $one and defined $two;
561 ($one) = sort { $a <=> $b } $one->get_tagset_values(qw/end5 end3/);
562 ($two) = sort { $a <=> $b } $two->get_tagset_values(qw/end5 end3/);
563 return $one <=> $two;
564 } @{$self->{_sequences}};
566 if($line =~ /<\/PROTEIN_CODING>/o) {
567 return;
570 $self->throw("Reached end of _process_protein_coding");
574 sub _process_rna_genes
576 my ($self) = @_;
577 my $line = $self->_readline();
579 if($line =~ /<RNA_GENES>/o) {
580 while($line !~ /<\/RNA_GENES>/o) {
581 $line = $self->_readline();
583 } else {
584 $self->throw("Bio::SeqIO::tigr::_process_rna_genes called ",
585 "but no <RNA_GENES> in the stream");
589 sub _process_misc_info
591 my ($self) = @_;
592 my $line = $self->_readline();
594 if($line =~ /<MISC_INFO>/o) {
595 while($line !~ /<\/MISC_INFO>/o) {
596 $line = $self->_readline();
598 } else {
599 $self->throw("Bio::SeqIO::tigr::_process_misc_info called ",
600 "but no <MISC_INFO> in the stream");
604 sub _process_repeat_list
606 my ($self) = @_;
607 my $line = $self->_readline();
609 if($line =~ /<REPEAT_LIST>/o) {
610 while($line !~ /<\/REPEAT_LIST>/o) {
611 $line = $self->_readline();
613 } else {
614 $self->throw("Bio::SeqIO::tigr::_process_repeat_list called ",
615 "but no <MISC_INFO> in the stream");
619 sub _process_tiling_path
621 my($self) = @_;
622 my $line = $self->_readline();
625 if($line =~ /<TILING_PATH>/o) {
626 while($line !~ /<\/TILING_PATH>/o) {
627 $line = $self->_readline();
629 } else {
630 $self->throw("Bio::SeqIO::tigr::_process_repeat_list called ",
631 "but no <MISC_INFO> in the stream");
635 sub _process_scaffold
637 my ($self) = @_;
638 my $line;
640 # for now we just skip them
641 $line = $self->_readline();
642 return if $line !~ /<SCAFFOLD>/o;
643 do {
644 $line = $self->_readline();
645 } while(defined($line) && $line !~ /<\/SCAFFOLD>/o);
648 sub _process_tu
650 my($self) = @_;
651 my $line = $self->_readline();
653 try {
654 my $tu = Bio::Seq::RichSeq->new(-strand => 1);
655 $tu->species( $self->{_assembly}->{species} );
657 # Add the source tag, so we can add the GO annotations to it
658 $tu->add_SeqFeature(Bio::SeqFeature::Generic->new(-source_tag => 'TIGR', -primary_tag => 'source'));
660 if($line !~ /<TU>/o) {
661 $self->throw("Process_tu called when no <TU> tag");
664 $line = $self->_readtag();
665 if ($line =~ /<FEAT_NAME>([\w\.]+)<\/FEAT_NAME>/o) {
666 $tu->accession_number($1);
667 $tu->add_secondary_accession($1);
668 $line = $self->_readtag();
669 } else {
670 $self->throw("Invalid Feat_Name");
673 while($line =~ /<GENE_SYNONYM>/o) {
674 # ignore
675 $line = $self->_readtag();
678 while($line =~ /<CHROMO_LINK>\s*([\w\.]+)\s*<\/CHROMO_LINK>/o) {
679 $tu->add_secondary_accession($1);
680 $line = $self->_readtag();
683 if ($line =~ /<DATE>([^>]*)<\/DATE>/o) {
684 $tu->add_date($1) if $1 and $1 !~ /^\s*$/o;
685 $line = $self->_readline();
686 } else {
687 #$self->throw("Invalid Date: $line");
690 if ($line =~ /<GENE_INFO>/o) {
691 $self->_pushback($line);
692 $self->_process_gene_info($tu);
693 $line = $self->_readline();
694 } else {
695 $self->throw("Invalid Gene_Info");
698 my $source;
699 my $end5;
700 my $end3;
701 if($line =~ /<COORDSET>/o) {
702 $self->_pushback($line);
703 my $cs = $self->_process_coordset();
705 $end5 = $cs->{end5};
706 $end3 = $cs->{end3};
708 my $length = $end3 - $end5;
709 my $strand = $length <=> 0;
710 $length = $length * $strand;
711 $length++; # Correct for starting at 1, not 0
713 # Add X filler sequence
714 $tu->seq('X' x $length);
716 # Get the source tag:
717 my($source) = grep { $_->primary_tag() eq 'source' } $tu->get_SeqFeatures();
719 # Set the start and end values
720 $source->start(1);
721 $source->end($length);
722 $source->strand(1);
724 # Add a bunch of tags to it
725 $source->add_tag_value(clone => $self->{_assembly}->{clone});
726 $source->add_tag_value(clone_name => $self->{_assembly}->{clone_name});
727 $source->add_tag_value(end5 => $end5);
728 $source->add_tag_value(end3 => $end3);
729 $source->add_tag_value(chromosome => $self->{_assembly}->{chromosome});
730 $source->add_tag_value(strand => ( $strand == 1 ? 'positive' : 'negative' ));
732 $line = $self->_readline();
733 } else {
734 $self->throw("Invalid Coordset");
737 if($line =~ /<MODEL[^>]*>/o) {
738 do {
739 $self->_pushback($line);
740 $self->_process_model($tu, $end5, $end3);
741 $line = $self->_readline();
742 } while($line =~ /<MODEL[^>]*>/o);
743 $self->_pushback($line);
744 $line = $self->_readtag();
745 } else {
746 $self->throw("Expected <MODEL> not found");
749 if($line =~ /<TRANSCRIPT_SEQUENCE>/o) {
750 my @chunks;
751 $line = $self->_readline();
752 while ($line =~ /^\s*([ACGT]+)\s*$/o) {
753 push( @chunks, $1 );
754 $line = $self->_readline();
756 # $line = $self->_readline();
759 if($line =~ /<GENE_EVIDENCE>/o) {
760 $line = $self->_readtag();
763 while($line =~ /<URL[^>]*>[^<]*<\/URL>/o) {
764 $line = $self->_readtag();
767 if($line =~ /<\/TU>/o) {
768 push(@{$self->{_sequences}}, $tu);
769 return;
770 } else {
771 $self->throw("Expected </TU> not found: $line");
773 } catch Bio::Root::OutOfRange with {
774 my $E = shift;
775 $self->warn(sprintf("One sub location of a sequence is invalid near line $.\: %s", $E->text()));
776 $line = $self->_readline() until $line =~ /<\/TU>/o;
777 return;
781 sub _process_gene_info
783 my($self, $tu) = @_;
784 my $line = $self->_readline();
786 $self->throw("Invalid Gene Info: $line") if $line !~ /<GENE_INFO>/o;
787 $line = $self->_readline();
789 if($line =~ /<LOCUS>\s*([\w\.]+)\s*<\/LOCUS>/o) {
790 $tu->accession_number($1);
791 $tu->add_secondary_accession($1);
792 $line = $self->_readline();
793 } elsif( $line =~ /<LOCUS>.*<\/LOCUS>/o) {
794 # We should throw an error, but TIGR doesn't alwasy play
795 # nice with adhering to their dtd
796 $line = $self->_readtag();
797 } else {
798 #$self->throw("Invalid Locus: $line");
801 if($line =~ /<ALT_LOCUS>\s*([\w\.]+)\s*<\/ALT_LOCUS>/o) {
802 $tu->accession_number($1);
803 $tu->add_secondary_accession($1);
804 $line = $self->_readline();
807 if($line =~ /<PUB_LOCUS>\s*([\w\.]+)\s*<\/PUB_LOCUS>/o) {
808 $tu->accession_number($1);
809 $tu->add_secondary_accession($1);
810 $line = $self->_readtag();
811 } elsif( $line =~ /<PUB_LOCUS>.*<\/PUB_LOCUS>/o) {
812 $line = $self->_readtag();
813 # $self->throw("Invalid Pub_Locus");
816 if($line =~ /<GENE_NAME.*>.*<\/GENE_NAME>/o) {
817 # Skip the GENE_NAME
818 $line = $self->_readtag();
821 if(my($attr, $value) = ($line =~ /<COM_NAME([^>]*)>([^>]+)<\/COM_NAME>/o)) {
822 #%attribs = ($attr =~ /(\w+)\s*=\s+"(.*?)"/og);
823 #$geneinfo->{'CURATED'} = $attribs{CURATED};
824 #$geneinfo->{IS_PRIMARY} = $attribs{IS_PRIMARY}
825 # TODO: add a tag on sources for curated
826 $tu->desc($value);
827 $line = $self->_readtag();
828 } else {
829 $self->throw("invalid com_name: $line");
832 while($line =~ /<COMMENT>([^<]+)<\/COMMENT>/o) {
833 my $comment = Bio::Annotation::Comment->new(
834 -text => $1
836 $tu->annotation()->add_Annotation('comment', $comment);
837 $line = $self->_readtag();
840 while($line =~ /<PUB_COMMENT>([^<]+)<\/PUB_COMMENT>/o) {
841 my $comment = Bio::Annotation::Comment->new(
842 -text => $1
844 $tu->annotation()->add_Annotation('comment', $comment);
845 $line = $self->_readtag();
848 if($line =~ /<EC_NUM>([\w\-\\\.]+)<\/EC_NUM>/o) {
849 #$geneinfo->{'EC_NUM'} = $1;
850 $line = $self->_readtag();
853 if($line =~ /<GENE_SYM>\s*([^<]+)\s*<\/GENE_SYM>/o) {
854 #$tu->add_secondary_accession($1);
855 $line = $self->_readtag();
858 if($line =~ /<IS_PSEUDOGENE>([^>]+)<\/IS_PSEUDOGENE>/o) {
859 #$geneinfo->{'IS_PSEUDOGENE'} = $1;
860 $line = $self->_readtag();
861 } else {
862 $self->throw("invalid is_pseudogene: $line");
865 if($line =~ /<FUNCT_ANNOT_EVIDENCE/o) {
866 $line = $self->_readtag();
869 if($line =~ /<DATE>([^>]+)<\/DATE>/o) {
870 #$geneinfo->{'DATE'} = $1;
871 $line = $self->_readtag();
874 while($line =~ /<GENE_ONTOLOGY>/o) {
875 # Get the source tag
876 my($source) = grep { $_->primary_tag() eq 'source' } $tu->get_SeqFeatures();
878 my @ids = ( $line =~ /(<GO_ID.*?<\/GO_ID>)/gso);
879 foreach my $go (@ids) {
880 my($assignment) = ($go =~ /<GO_ID\s+ASSIGNMENT\s+=\s+"GO:(\d+)">/os);
881 my($term) = ($go =~ /<GO_TERM>([^<]+)<\/GO_TERM>/os);
882 my($type) = ($go =~ /<GO_TYPE>([^<]+)<\/GO_TYPE>/os);
883 # TODO: Add GO annotation
884 if(defined $type and defined $assignment and defined $term) {
885 # Add the GO Annotation
886 $source->add_tag_value(
887 GO => "ID: $assignment; Type: $type; $term"
891 $line = $self->_readtag();
894 if($line =~ /<\/GENE_INFO/o) {
895 return;
898 $self->throw("unexpected end of gene_info");
901 sub _build_location
903 my($self, $end5, $end3, $length, $cs) = @_;
905 # Find the start and end of the location
906 # relative to the sequence.
907 my $start = abs( $end5 - $cs->{end5} ) + 1;
908 my $end = abs( $end5 - $cs->{end3} ) + 1;
910 # Do some bounds checking:
911 if( $start < 1 ) {
912 throw Bio::Root::OutOfRange(
913 -text => "locations' start( $start) must be >= 1"
915 } elsif( $end > $length ) {
916 throw Bio::Root::OutOfRange(
917 -text => "locations' end( $end ) must be <= length( $length )"
919 } elsif( $start > $end ) {
920 throw Bio::Root::OutOfRange(
921 -text => "locations' start ( $start ) must be < end ( $end ) $end5, $end3, $cs->{end5}, $cs->{end3}"
925 return Bio::Location::Simple->new( -start => $start, -end => $end, -strand => 1 );
928 sub _process_model
930 my($self, $tu, $end5, $end3) = @_;
931 my $line;
932 my( $source ) = grep { $_->primary_tag() eq 'source' } $tu->get_SeqFeatures();
933 my $model = Bio::SeqFeature::Generic->new(
934 -source_tag => 'TIGR',
935 -primary_tag => 'MODEL',
938 $line = $self->_readline();
939 if($line !~ /<MODEL ([^>]+)>/o) {
940 $self->throw("Invalid Model: $line")
942 my %attribs = ($1 =~ /(\w+)\s*=\s*"([^"]*)"/og);
943 #$model->{'CURATED'} = $attribs{'CURATED'};
944 # TODO: Add tag to model
945 $line = $self->_readline();
947 if($line =~ /<FEAT_NAME>\s*([\w\.]+)\s*<\/FEAT_NAME>/o) {
948 $model->add_tag_value( feat_name => $1 );
949 $tu->add_secondary_accession($1);
950 $line = $self->_readline();
951 } else {
952 $self->throw("Invalid Feature Name: $line");
955 if($line =~ /<PUB_LOCUS>\s*([\w\.]+)\s*<\/PUB_LOCUS>/o) {
956 $model->add_tag_value( pub_locus => $1 );
957 $tu->add_secondary_accession($1);
958 $line = $self->_readline();
959 } else {
960 # $self->throw("Invalid Pub_Locus: $line");
963 if($line =~ /<CDNA_SUPPORT>/o) {
964 $self->_pushback($line);
965 $self->_process_cdna_support( $model );
966 $line = $self->_readline();
969 while($line =~ /<CHROMO_LINK>([^>]+)<\/CHROMO_LINK>/o) {
970 $model->add_tag_value( chromo_link => $1 );
971 $line = $self->_readline();
974 if($line =~ /<DATE>([^>]+)<\/DATE>/o) {
975 $line = $self->_readline();
976 } else {
977 $self->throw("Invalid Date: $line");
980 if($line =~ /<COORDSET>/o) {
981 $self->_pushback($line);
982 my $cs = $self->_process_coordset();
983 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
985 $model->start( $loc->start() );
986 $model->end( $loc->end() );
987 $line = $self->_readline();
988 } else {
989 $self->throw("Invalid Coordset: $line");
992 my $exon = Bio::SeqFeature::Generic->new(
993 -source_tag => 'TIGR',
994 -primary_tag => 'EXON',
995 -location => Bio::Location::Split->new(),
996 -tags => [ locus => $tu->accession_number() ],
998 $exon->add_tag_value( model => $model->get_tag_values('feat_name') );
1000 my $cds = Bio::SeqFeature::Generic->new(
1001 -source_tag => 'TIGR',
1002 -primary_tag => 'CDS',
1003 -location => Bio::Location::Split->new(),
1004 -tags => [ locus => $tu->accession_number() ],
1006 $cds->add_tag_value( model => $model->get_tag_values('feat_name') );
1007 my $utr = [];
1009 if($line =~ /<EXON>/o) {
1010 do {
1011 $self->_pushback($line);
1012 $self->_process_exon( $tu, $exon, $cds, $utr, $end5, $end3 );
1013 $line = $self->_readline();
1014 } while($line =~ /<EXON>/o);
1015 } else {
1016 $self->throw("Required <EXON> missing");
1019 until($line =~ /<\/MODEL>/o) {
1020 $line = $self->_readline();
1024 $_->add_tag_value( model => $model->get_tag_values('feat_name') )
1025 foreach @$utr;
1027 # Add the model, EXONs, CDS, and UTRs
1028 $tu->add_SeqFeature($model) if $model and $model->start() >= 1;
1029 $tu->add_SeqFeature($exon) if $exon and scalar($exon->location()->each_Location()) >= 1;
1030 $tu->add_SeqFeature($cds) if $cds and scalar($cds->location()->each_Location()) >= 1;
1031 $tu->add_SeqFeature(@$utr);
1033 return;
1036 sub _process_cdna_support
1038 my($self, $model) = @_;
1039 my $line = $self->_readline();
1041 if($line !~ /<CDNA_SUPPORT>/o) {
1042 $self->throw("Bio::SeqIO::tigr::_process_cdna_support called ",
1043 "but no <CDNA_SUPPORT> in the stream");
1046 $line = $self->_readline();
1048 while( $line =~ /<ACCESSION([^>]+)>(.*)<\/ACCESSION>/o) {
1049 # Save the text
1050 my $desc = $2;
1052 # Get the element's attributes
1053 my %attribs = ($1 =~ /(\w+)\s*=\s*"([^"]*)"/og);
1055 # Add the tag to the model
1056 $model->add_tag_value(
1057 cdna_support => "DBXRef: $attribs{DBXREF}; $desc"
1060 $line = $self->_readline();
1063 if( $line =~ /<\/CDNA_SUPPORT>/o) {
1064 return;
1066 $self->throw("reached end of _process_cdna_support");
1070 sub _process_exon
1072 my($self, $tu, $exon, $cds, $utr, $end5, $end3 ) = @_;
1073 my $line = $self->_readline();
1075 if($line !~ /<EXON>/o) {
1076 $self->throw("Bio::SeqIO::tigr::_process_exon called ",
1077 "but no <EXON> in the stream");
1080 $line = $self->_readtag();
1081 if($line =~ /<FEAT_NAME>([^<]+)<\/FEAT_NAME>/o) {
1082 # Ignore
1083 $line = $self->_readtag();
1084 } else {
1085 $self->throw("Required <FEAT_NAME> missing");
1088 if($line =~ /<DATE>([^<]+)<\/DATE>/o) {
1089 # Ignore
1090 $line = $self->_readtag();
1091 } else {
1092 $self->throw("Required <DATE> missing");
1095 if($line =~ /<COORDSET>/o) {
1096 $self->_pushback($line);
1097 my $cs = $self->_process_coordset();
1098 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1099 $exon->location()->add_sub_Location($loc);
1100 $line = $self->_readline();
1101 } else {
1102 $self->throw("Required <COORDSET> missing");
1105 if($line =~ /<CDS>/o) {
1106 $self->_pushback($line);
1107 $self->_process_cds($tu, $end5, $end3, $cds);
1108 $line = $self->_readline();
1111 if($line =~ /<UTRS>/o) {
1112 $self->_pushback($line);
1113 $self->_process_utrs($tu, $end5, $end3, $utr);
1114 $line = $self->_readline();
1117 if($line =~ /<\/EXON>/o) {
1118 return;
1121 $self->throw("Reached End of Bio::SeqIO::tigr::_process_exon");
1124 sub _process_cds
1126 my($self, $tu, $end5, $end3, $cds) = @_;
1127 my $line = $self->_readline();
1129 if($line !~ /<CDS>/o) {
1130 $self->throw("Bio::SeqIO::tigr::_process_cda_support called ",
1131 "but no <CDS> in the stream");
1134 $line = $self->_readtag();
1135 if($line =~ /<FEAT_NAME>([^<]+)<\/FEAT_NAME>/o) {
1136 #$cds->{'FEAT_NAME'} = $1;
1137 $line = $self->_readtag();
1138 } else {
1139 $self->throw("Required <FEAT_NAME> missing");
1142 if($line =~ /<DATE>([^<]+)<\/DATE>/o) {
1143 #$cds->{'DATE'} = $1;
1144 $line = $self->_readtag();
1145 } else {
1146 $self->throw("Required <DATE> missing");
1149 if($line =~ /<COORDSET>/o) {
1150 $self->_pushback($line);
1151 my $cs = $self->_process_coordset();
1152 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1153 $cds->location()->add_sub_Location($loc);
1154 $line = $self->_readline();
1155 } else {
1156 $self->throw("Required <COORDSET> missing");
1159 if($line =~ /<\/CDS>/o) {
1160 return;
1163 $self->throw("Reached onf of Bio::SeqIO::tigr::_process_cds");
1166 sub _process_utrs
1168 my($self, $tu, $end5, $end3, $utrs) = @_;
1169 my $line = $self->_readline();
1171 if($line !~ /<UTRS/o) {
1172 $self->throw("Bio::SeqIO::tigr::_process_utrs called but no ",
1173 "<UTRS> found in stream");
1176 $line = $self->_readline();
1177 while($line !~ /<\/UTRS>/o) {
1178 $self->_pushback($line);
1179 if($line =~ /<LEFT_UTR>/o) {
1180 $self->_process_left_utr($tu, $end5, $end3, $utrs);
1181 } elsif ($line =~ /<RIGHT_UTR>/o) {
1182 $self->_process_right_utr($tu, $end5, $end3, $utrs);
1183 } elsif ($line =~ /<EXTENDED_UTR>/o) {
1184 $self->_process_ext_utr($tu, $end5, $end3, $utrs);
1185 } else {
1186 $self->throw("Unexpected tag");
1189 $line = $self->_readline();
1192 if($line =~ /<\/UTRS>/o) {
1193 return $utrs;
1195 $self->throw("Reached end of Bio::SeqIO::tigr::_process_utrs");
1198 sub _process_left_utr
1200 my($self, $tu, $end5, $end3, $utrs) = @_;
1201 my $line = $self->_readline();
1202 my $coordset;
1204 if($line !~ /<LEFT_UTR>/o) {
1205 $self->throw("Bio::SeqIO::tigr::_process_left_utr called but ",
1206 "no <LEFT_UTR> found in stream");
1209 $line = $self->_readtag();
1210 if($line =~ /<COORDSET>/o) {
1211 $self->_pushback($line);
1212 my $cs = $self->_process_coordset();
1213 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1215 push(@$utrs, Bio::SeqFeature::Generic->new(
1216 -source_tag => 'TIGR',
1217 -primary_tag => 'LEFT_UTR',
1218 -strand => 1,
1219 -start => $loc->start(),
1220 -end => $loc->end()
1223 $line = $self->_readline();
1224 } else {
1225 $self->throw("Required <COORDSET> missing");
1228 if($line =~ /<\/LEFT_UTR>/o) {
1229 return;
1231 $self->throw("Reached end of Bio::SeqIO::tigr::_process_left_utr");
1234 sub _process_right_utr
1236 my($self, $tu, $end5, $end3, $utrs) = @_;
1237 my $line = $self->_readline();
1238 my $coordset;
1240 if($line !~ /<RIGHT_UTR>/o) {
1241 $self->throw("Bio::SeqIO::tigr::_process_right_utr called but ",
1242 "no <RIGHT_UTR> found in stream");
1245 $line = $self->_readtag();
1246 if($line =~ /<COORDSET>/o) {
1247 $self->_pushback($line);
1248 $coordset = $self->_process_coordset();
1249 $self->_pushback($line);
1250 my $cs = $self->_process_coordset();
1251 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1253 push(@$utrs, Bio::SeqFeature::Generic->new(
1254 -source_tag => 'TIGR',
1255 -primary_tag => 'RIGHT_UTR',
1256 -strand => 1,
1257 -start => $loc->start(),
1258 -end => $loc->end()
1261 $line = $self->_readline();
1262 } else {
1263 $self->throw("Required <COORDSET> missing");
1266 if($line =~ /<\/RIGHT_UTR>/o) {
1267 return $coordset;
1269 $self->throw("Reached end of Bio::SeqIO::tigr::_process_right_utr");
1272 sub _process_ext_utr
1274 my($self, $tu, $end5, $end3, $utrs) = @_;
1275 my $line = $self->_readline();
1276 my $coordset;
1278 if($line !~ /<EXTENDED_UTR>/o) {
1279 $self->throw("Bio::SeqIO::tigr::_process_ext_utr called but ",
1280 "no <EXTENDED_UTR> found in stream");
1283 $line = $self->_readtag();
1284 if($line =~ /<COORDSET>/o) {
1285 $self->_pushback($line);
1286 my $cs = $self->_process_coordset();
1287 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1289 push(@$utrs, Bio::SeqFeature::Generic->new(
1290 -source_tag => 'TIGR',
1291 -primary_tag => 'EXTENDED_UTR',
1292 -strand => 1,
1293 -start => $loc->start(),
1294 -end => $loc->end()
1297 $line = $self->_readline();
1298 } else {
1299 $self->throw("Required <COORDSET> missing");
1302 if($line =~ /<\/EXTENDED_UTR>/o) {
1303 return $coordset;
1305 $self->throw("Reached end of Bio::SeqIO::tigr::_process_ext_utr");
1308 sub _readtag
1310 my($self) = @_;
1311 my $line = $self->_readline();
1312 chomp($line);
1314 my $tag;
1315 if(($tag) = ($line =~ /^[^<]*<\/(\w+)/o)) {
1316 $self->_pushback($1) if $line =~ /<\/$tag>(.+)$/;
1317 return "</$tag>";
1320 until(($tag) = ($line =~ /<(\w+)[^>]*>/o)) {
1321 $line = $self->_readline();
1322 chomp $line;
1325 until($line =~ /<\/$tag>/) {
1326 $line .= $self->_readline();
1329 if(my ($val) = ($line =~ /(<$tag.*>.*?<\/$tag>)/s)) {
1330 if($line =~ /<\/$tag>\s*(\w+[\s\w]*?)\s*$/s) {
1331 $self->_pushback($1)
1333 return $val;
1335 $self->throw("summerror");
1338 sub _readline
1340 my($self) = @_;
1341 my $line;
1342 do {
1343 $line = $self->SUPER::_readline();
1344 } while(defined($line) and $line =~ /^\s*$/o);
1346 return $line;
1349 sub throw
1351 my($self, @s) = @_;
1352 my $string = "[$.]" . join('', @s);
1353 $self->SUPER::throw($string);