Move HMMER related modules, tests, and programs to new distribution.
[bioperl-live.git] / Bio / SeqIO / tigr.pm
blob88330012d82b3fed73046a27a8e98aef7fa6870f
1 # BioPerl module for Bio::SeqIO::tigr
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Josh Lauricha (laurichj@bioinfo.ucr.edu)
7 # Copyright Josh Lauricha
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SeqIO::tigr - TIGR XML sequence input/output stream
17 =head1 SYNOPSIS
19 Do not use this module directly. Use it via the Bio::SeqIO class.
21 =head1 DESCRIPTION
23 This object can transform Bio::Seq objects to and from efa flat
24 file databases.
26 =head1 FEEDBACK
28 =head2 Mailing Lists
30 User feedback is an integral part of the evolution of this and other
31 Bioperl modules. Send your comments and suggestions preferably to one
32 of the Bioperl mailing lists. Your participation is much appreciated.
34 bioperl-l@bioperl.org - General discussion
35 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
37 =head2 Support
39 Please direct usage questions or support issues to the mailing list:
41 I<bioperl-l@bioperl.org>
43 rather than to the module maintainer directly. Many experienced and
44 reponsive experts will be able look at the problem and quickly
45 address it. Please include a thorough description of the problem
46 with code and data examples if at all possible.
48 =head2 Reporting Bugs
50 Report bugs to the Bioperl bug tracking system to help us keep track
51 the bugs and their resolution.
52 Bug reports can be submitted via the web:
54 https://github.com/bioperl/bioperl-live/issues
56 =head1 AUTHORS - Josh Lauricha
58 Email: laurichj@bioinfo.ucr.edu
61 =head1 APPENDIX
63 The rest of the documentation details each of the object
64 methods. Internal methods are usually preceded with a _
66 =cut
68 # TODO:
69 # - Clean up code
70 # - Find and fix bugs ;)
72 # Let the code begin...
73 package Bio::SeqIO::tigr;
74 use strict;
76 use Bio::Seq::RichSeq;
77 use Bio::Species;
78 use Bio::Annotation::Comment;
79 use Bio::SeqFeature::Generic;
80 use Bio::Seq::SeqFactory;
81 use Bio::Seq::RichSeq;
82 use Data::Dumper;
83 use Error qw/:try/;
85 use base qw(Bio::SeqIO);
87 sub _initialize
89 my($self, @args) = @_;
91 $self->SUPER::_initialize(@args);
92 $self->sequence_factory(Bio::Seq::SeqFactory->new(
93 -type => 'Bio::Seq::RichSeq')
96 # Parse the document
97 $self->_process();
100 =head2 next_seq
102 Title : next_seq
103 Usage : $seq = $stream->next_seq()
104 Function: returns the next sequence in the stream
105 Returns : Bio::Seq object
106 Args : NONE
108 =cut
110 sub next_seq()
112 my ($self) = @_;
114 # Check for any more sequences
115 return if !defined($self->{_sequences}) or scalar(@{$self->{_sequences}}) < 1;
117 # get the next sequence
118 my $seq = shift(@{ $self->{_sequences} } );
120 # Get the 5' and 3' ends
121 my ($source) = grep { $_->primary_tag() eq 'source' } $seq->get_SeqFeatures();
122 my ($end5) = $source->get_tag_values('end5');
123 my ($end3) = $source->get_tag_values('end3');
125 # Sort the 5' and 3':
126 my ($start, $end) = ( $end5 < $end3 ? ( $end5, $end3 ) : ( $end3, $end5 ) );
128 # make the start a perl index
129 $start -= 1;
131 # Figure out the length
132 my $length = $end - $start;
134 # check to make sure $start >= 0 and $end <= length(assembly_seq)
135 if($start < 0) {
136 throw Bio::Root::OutOfRange("the sequence start is $start < 0");
137 } elsif($end > length($self->{_assembly}->{seq})) {
138 throw Bio::Root::OutOfRange("the sequence end is $end < " . length($self->{_assembly}->{seq}));
139 } elsif($start >= $end) {
140 throw Bio::Root::OutOfRange("the sequence start is after end $start >= $end");
143 # Get and set the real sequence
144 $seq->seq(substr($self->{_assembly}->{seq}, $start, $length));
146 if( $end5 > $end3 ) {
147 # Reverse complement the sequence
148 $seq->seq( $seq->primary_seq()->revcom()->seq() );
151 # add the translation to each CDS
152 foreach my $feat ($seq->get_SeqFeatures()) {
153 next if $feat->primary_tag() ne "CDS";
155 # Check for an invalid protein
156 try {
157 # Get the subsq
158 my $cds = Bio::PrimarySeq->new(
159 -strand => 1,
160 -id => $seq->accession_number(),
161 -seq => $seq->subseq($feat->location())
164 # Translate it
165 my $trans = $cds->translate(undef, undef, undef, undef, 1, 1)->seq();
167 # Add the tag
168 $feat->add_tag_value(translation => $trans);
169 } catch Bio::Root::Exception with {
170 print STDERR 'TIGR strikes again, the CDS is not a valid protein: ', $seq->accession_number(), "\n"
171 if $self->verbose() > 0;
175 # Set the display id to the accession number if there
176 # is no display id
177 $seq->display_id( $seq->accession_number() ) unless $seq->display_id();
179 return $seq;
182 sub _process
184 my($self) = @_;
185 my $line;
186 my $tu = undef;
188 $line = $self->_readline();
189 do {
190 if($line =~ /<\?xml\s+version\s+=\s+"\d+\.\d+"\?>/o) {
191 # do nothing
192 } elsif ($line =~ /<!DOCTYPE (\w+) SYSTEM "[\w\.]+">/o) {
193 $self->throw("DOCTYPE of $1, not TIGR!")
194 if $1 ne "TIGR" ;
195 } elsif ($line =~ /<TIGR>/o) {
196 $self->_pushback($line);
197 $self->_process_tigr();
198 } elsif ($line =~ /<ASSEMBLY.*?>/o) {
199 $self->_pushback($line);
200 $self->_process_assembly();
201 } elsif ($line =~ /<\/TIGR>/o) {
202 $self->{'eof'} = 1;
203 return;
204 } else {
205 $self->throw("Unknown or Invalid process directive:",
206 join('', ($line =~ /^\s*(<[^>]+>)/o)));
208 $line = $self->_readline();
209 } while( defined( $line ) );
212 sub _process_tigr
214 my($self) = @_;
215 my $line;
217 $line = $self->_readline();
218 if($line !~ /<TIGR>/o) {
219 $self->throw("Bio::SeqIO::tigr::_process_tigr called but no ",
220 "<TIGR> found in stream");
223 $line = $self->_readline();
224 if($line =~ /<PSEUDOCHROMOSOME>/o) {
225 $self->_pushback($line);
226 $self->_process_pseudochromosome();
227 } elsif ($line =~ /<ASSEMBLY.*?>/o) {
228 $self->_pushback($line);
229 $self->_process_assembly();
233 sub _process_pseudochromosome
235 my($self) = @_;
236 my $line;
238 $line = $self->_readline();
239 return if $line !~ /<PSEUDOCHROMOSOME>/o;
241 $line = $self->_readline();
243 if($line =~ /<SCAFFOLD>/o) {
244 $self->_pushback($line);
245 $self->_process_scaffold();
246 $line = $self->_readline();
247 } else {
248 $self->warn( "No Scaffold found in <PSUEDOCHROMOSOME> this " .
249 "is a violation of the TIGR dtd, but we ignore " .
250 "it so we are ignoring the error\n"
254 if($line =~ /<ASSEMBLY.*>/o) {
255 $self->_pushback($line);
256 $self->_process_assembly();
257 $line = $self->_readline();
258 } else {
259 $self->throw("Missing required ASSEMBLY in <PSEUDOCHROMOSOME>");
262 if($line =~ /<\/PSEUDOCHROMOSOME>/) {
263 return;
266 $self->throw("Reached end of _process_pseudochromosome");
269 sub _process_assembly
271 my($self) = @_;
272 my $line;
274 $line = $self->_readline();
275 if($line !~ /<ASSEMBLY([^>]*)>/o) {
276 $self->throw("Bio::SeqIO::tigr::_process_assembly called ",
277 "but no <ASSEMBLY> found in stream");
280 my %attribs = ($1 =~ /(\w+)\s*=\s+"(.*?)"/og);
281 $self->{_assembly}->{date} = $attribs{CURRENT_DATE};
282 $self->{_assembly}->{db} = $attribs{DATABASE};
283 $self->{_assembly}->{chromosome} = $attribs{CHROMOSOME};
285 $line = $self->_readline();
286 my($attr, $val);
287 if(($attr, $val) = ($line =~ /<ASMBL_ID([^>]*)>([^<]*)<\/ASMBL_ID>/o)) {
288 %attribs = ($attr =~ /(\w+)\s*=\s+"(.*?)"/og);
289 $self->{_assembly}->{clone_name} = $attribs{CLONE_NAME};
290 $self->{_assembly}->{clone} = $val;
291 $line = $self->_readtag();
292 } else {
293 $self->throw("Required <ASMBL_ID> missing");
296 if($line =~ /<COORDSET>/o) {
297 $self->_pushback($line);
298 my $cs = $self->_process_coordset();
300 $self->{_assembly}->{end5} = $cs->{end5};
301 $self->{_assembly}->{end3} = $cs->{end3};
303 $line = $self->_readline();
304 } else {
305 $self->throw("Required <COORDSET> missing");
308 if($line =~ /<HEADER>/o) {
309 $self->_pushback($line);
310 $self->_process_header();
311 $line = $self->_readline();
312 } else {
313 $self->throw("Required <HEADER> missing");
316 if($line =~ /<TILING_PATH>/o) {
317 $self->_pushback($line);
318 $self->_process_tiling_path();
319 $line = $self->_readline();
322 if($line =~ /<GENE_LIST>/o) {
323 $self->_pushback($line);
324 $self->_process_gene_list();
325 $line = $self->_readline();
326 } else {
327 $self->throw("Required <GENE_LIST> missing");
330 if($line =~ /<MISC_INFO>/o) {
331 $self->_pushback($line);
332 $self->_process_misc_info();
333 $line = $self->_readline();
336 if($line =~ /<REPEAT_LIST>/o) {
337 $self->_pushback($line);
338 $self->_process_repeat_list();
339 $line = $self->_readline();
342 if($line =~ /<ASSEMBLY_SEQUENCE>/o) {
343 $self->_pushback($line);
344 $self->_process_assembly_seq();
345 $line = $self->_readline();
346 } else {
347 $self->throw("Required <ASSEMBLY_SEQUENCE> missing");
350 if($line =~ /<\/ASSEMBLY>/o) {
351 return;
353 $self->throw("Reached the end of <ASSEMBLY>");
356 sub _process_assembly_seq()
358 my ($self) = @_;
359 my $line;
361 $line = $self->_readline();
362 if($line !~ /<ASSEMBLY_SEQUENCE>/o) {
363 $self->throw("Bio::SeqIO::tigr::_process_assembly_seq called ".
364 "with no <ASSEMBLY_SEQUENCE> in the stream");
367 # Protect against lots of smaller lines
368 my @chunks;
370 do {
371 $line = $self->_readline();
372 last unless $line;
374 my $seq;
375 if (($seq) = ($line =~ /^\s*(\w+)\s*$/o)) {
376 push(@chunks, $seq);
377 } elsif( ($seq) = ( $line =~ /^\s*(\w+)<\/ASSEMBLY_SEQUENCE>\s*$/o) ) {
378 push(@chunks, $seq);
379 $self->{_assembly}->{seq} = join('', @chunks);
380 return;
382 } while( $line );
384 $self->throw("Reached end of _proces_assembly");
387 sub _process_coordset($)
389 my ($self) = @_;
390 my $line;
391 my $h;
393 $line = $self->_readline();
394 if($line =~ /<COORDSET>/o) {
395 $self->_pushback($line);
396 $line = $self->_readtag();
397 ($h->{end5}, $h->{end3}) = ($line =~ /<COORDSET>\s*<END5>\s*(\d+)\s*<\/END5>\s*<END3>\s*(\d+)\s*<\/END3>/os);
398 if(!defined($h->{end5}) or !defined($h->{end3})) {
399 $self->throw("Invalid <COORDSET>: $line");
401 return $h;
402 } else {
403 $self->throw("Bio::SeqIO::tigr::_process_coordset() called ",
404 "but no <COORDSET> found in stream");
408 sub _process_header
410 my ($self) = @_;
411 my $line = $self->_readline();
413 if($line !~ /<HEADER>/o) {
414 $self->throw("Bio::SeqIO::tigr::_process_header called ",
415 "but no <HEADER> found in stream");
418 $line = $self->_readtag();
419 if($line =~ /<CLONE_NAME>([^>]+)<\/CLONE_NAME>/o) {
420 $self->{_assembly}->{clone_name} = $1;
421 $line = $self->_readtag();
422 } else {
423 $self->throw("Required <CLONE_NAME> missing");
426 if($line =~ /<SEQ_LAST_TOUCHED>/o) {
427 # Ignored for now
428 $line = $self->_readtag();
429 } else {
430 $self->throw("Reqired <SEQ_LAST_TOUCHED> missing");
433 if($line =~ /<GB_ACCESSION>([^<]*)<\/GB_ACCESSION>/o) {
434 $self->{_assembly}->{gb} = $1;
435 $line = $self->_readtag();
436 } else {
437 $self->throw("Required <GB_ACCESSION> missing");
440 if($line =~ /<ORGANISM>\s*(.+)\s*<\/ORGANISM>/o) {
441 my( $genus, $species, @ss ) = split(/\s+/o, $1);
442 $self->{_assembly}->{species} = Bio::Species->new();
443 $self->{_assembly}->{species}->genus($genus);
444 $self->{_assembly}->{species}->species($species);
445 $self->{_assembly}->{species}->sub_species(join(' ', @ss)) if scalar(@ss) > 0;
447 $line = $self->_readtag();
448 } else {
449 $self->throw("Required <ORGANISM> missing");
452 if($line =~ /<LINEAGE>([^<]*)<\/LINEAGE>/o) {
453 $self->{_assembly}->{species}->classification(
454 $self->{_assembly}->{species}->species(),
455 reverse(split(/\s*;\s*/o, $1))
457 $line = $self->_readtag();
458 } else {
459 $self->throw("Required <LINEAGE> missing");
462 if($line =~ /<SEQ_GROUP>([^<]*)<\/SEQ_GROUP>/o) {
463 # ingnored
464 $line = $self->_readtag();
465 } else {
466 $self->throw("Required <SEQ_GROUP> missing");
469 while($line =~ /<KEYWORDS>[^<]*<\/KEYWORDS>/o) {
470 push(@{$self->{_assembly}->{keywords}}, $1);
471 $line = $self->_readtag();
474 while($line =~ /<GB_DESCRIPTION>([^<]+)<\/GB_DESCRIPTION>/o) {
475 push(@{$self->{_assembly}->{gb_desc}},$1);
476 $line = $self->_readtag();
479 while($line =~ /<GB_COMMENT>([^<]+)<\/GB_COMMENT>/o) {
480 push(@{$self->{_assembly}->{gb_comment}}, $1);
481 $line = $self->_readtag();
484 if(my %h = ($line =~ /<AUTHOR_LIST(?:\s*(\w+)\s*=\s*"([^"]+)"\s*)*>/o)) {
485 #$header->{'AUTHOR_LIST'}=$h{'CONTACT'};
486 # Ignored
487 while($line !~ /<\/AUTHOR_LIST>/o) {
488 $self->_readtag();
490 $line = $self->_readline();
491 } else {
492 $self->throw("Required <AUTHOR_LIST> missing");
495 if($line =~ /<\/HEADER>/o) {
496 return;
499 $self->throw("Reached end of header\n");
502 sub _process_gene_list
504 my($self) = @_;
505 my $line;
507 $line = $self->_readline();
508 if($line !~ /<GENE_LIST>/o) {
509 $self->throw("Bio::SeqIO::tigr::_process_gene_list called ",
510 "but no <GENE_LIST> in the stream");
513 $line = $self->_readline();
514 if($line =~ /<PROTEIN_CODING>/o) {
515 $self->_pushback($line);
516 $self->_process_protein_coding();
517 $line = $self->_readline();
518 } else {
519 $self->throw("Required <PROTEIN_CODING> missing");
522 if($line =~ /<RNA_GENES>/o) {
523 $self->_pushback($line);
524 $self->_process_rna_genes();
525 $line = $self->_readline();
526 } else {
527 $self->throw("Required <RNA_GENES> missing");
530 if($line =~ /<\/GENE_LIST>/o) {
531 return;
534 $self->throw("Reached end of _process_gene_list");
537 sub _process_protein_coding
539 my ($self) = @_;
540 my $line = $self->_readline();
542 if($line !~ /<PROTEIN_CODING>/o) {
543 $self->throw("Bio::SeqIO::tigr::_process_protein_coding called"
544 . "but no <GENE_LIST> in the stream");
547 $line = $self->_readline();
548 while($line and $line =~ /<TU>/o) {
549 $self->_pushback($line);
550 $self->_process_tu();
551 $line = $self->_readline();
554 # Sort the sequences
555 @{$self->{_sequences}} = sort {
556 my($one, $two) = ( $a, $b );
557 ($one) = grep { $_->primary_tag() eq 'source' } $one->get_SeqFeatures();
558 ($two) = grep { $_->primary_tag() eq 'source' } $two->get_SeqFeatures();
559 return 0 unless defined $one and defined $two;
560 ($one) = sort { $a <=> $b } $one->get_tagset_values(qw/end5 end3/);
561 ($two) = sort { $a <=> $b } $two->get_tagset_values(qw/end5 end3/);
562 return $one <=> $two;
563 } @{$self->{_sequences}};
565 if($line =~ /<\/PROTEIN_CODING>/o) {
566 return;
569 $self->throw("Reached end of _process_protein_coding");
573 sub _process_rna_genes
575 my ($self) = @_;
576 my $line = $self->_readline();
578 if($line =~ /<RNA_GENES>/o) {
579 while($line !~ /<\/RNA_GENES>/o) {
580 $line = $self->_readline();
582 } else {
583 $self->throw("Bio::SeqIO::tigr::_process_rna_genes called ",
584 "but no <RNA_GENES> in the stream");
588 sub _process_misc_info
590 my ($self) = @_;
591 my $line = $self->_readline();
593 if($line =~ /<MISC_INFO>/o) {
594 while($line !~ /<\/MISC_INFO>/o) {
595 $line = $self->_readline();
597 } else {
598 $self->throw("Bio::SeqIO::tigr::_process_misc_info called ",
599 "but no <MISC_INFO> in the stream");
603 sub _process_repeat_list
605 my ($self) = @_;
606 my $line = $self->_readline();
608 if($line =~ /<REPEAT_LIST>/o) {
609 while($line !~ /<\/REPEAT_LIST>/o) {
610 $line = $self->_readline();
612 } else {
613 $self->throw("Bio::SeqIO::tigr::_process_repeat_list called ",
614 "but no <MISC_INFO> in the stream");
618 sub _process_tiling_path
620 my($self) = @_;
621 my $line = $self->_readline();
624 if($line =~ /<TILING_PATH>/o) {
625 while($line !~ /<\/TILING_PATH>/o) {
626 $line = $self->_readline();
628 } else {
629 $self->throw("Bio::SeqIO::tigr::_process_repeat_list called ",
630 "but no <MISC_INFO> in the stream");
634 sub _process_scaffold
636 my ($self) = @_;
637 my $line;
639 # for now we just skip them
640 $line = $self->_readline();
641 return if $line !~ /<SCAFFOLD>/o;
642 do {
643 $line = $self->_readline();
644 } while(defined($line) && $line !~ /<\/SCAFFOLD>/o);
647 sub _process_tu
649 my($self) = @_;
650 my $line = $self->_readline();
652 try {
653 my $tu = Bio::Seq::RichSeq->new(-strand => 1);
654 $tu->species( $self->{_assembly}->{species} );
656 # Add the source tag, so we can add the GO annotations to it
657 $tu->add_SeqFeature(Bio::SeqFeature::Generic->new(-source_tag => 'TIGR', -primary_tag => 'source'));
659 if($line !~ /<TU>/o) {
660 $self->throw("Process_tu called when no <TU> tag");
663 $line = $self->_readtag();
664 if ($line =~ /<FEAT_NAME>([\w\.]+)<\/FEAT_NAME>/o) {
665 $tu->accession_number($1);
666 $tu->add_secondary_accession($1);
667 $line = $self->_readtag();
668 } else {
669 $self->throw("Invalid Feat_Name");
672 while($line =~ /<GENE_SYNONYM>/o) {
673 # ignore
674 $line = $self->_readtag();
677 while($line =~ /<CHROMO_LINK>\s*([\w\.]+)\s*<\/CHROMO_LINK>/o) {
678 $tu->add_secondary_accession($1);
679 $line = $self->_readtag();
682 if ($line =~ /<DATE>([^>]*)<\/DATE>/o) {
683 $tu->add_date($1) if $1 and $1 !~ /^\s*$/o;
684 $line = $self->_readline();
685 } else {
686 #$self->throw("Invalid Date: $line");
689 if ($line =~ /<GENE_INFO>/o) {
690 $self->_pushback($line);
691 $self->_process_gene_info($tu);
692 $line = $self->_readline();
693 } else {
694 $self->throw("Invalid Gene_Info");
697 my $source;
698 my $end5;
699 my $end3;
700 if($line =~ /<COORDSET>/o) {
701 $self->_pushback($line);
702 my $cs = $self->_process_coordset();
704 $end5 = $cs->{end5};
705 $end3 = $cs->{end3};
707 my $length = $end3 - $end5;
708 my $strand = $length <=> 0;
709 $length = $length * $strand;
710 $length++; # Correct for starting at 1, not 0
712 # Add X filler sequence
713 $tu->seq('X' x $length);
715 # Get the source tag:
716 my($source) = grep { $_->primary_tag() eq 'source' } $tu->get_SeqFeatures();
718 # Set the start and end values
719 $source->start(1);
720 $source->end($length);
721 $source->strand(1);
723 # Add a bunch of tags to it
724 $source->add_tag_value(clone => $self->{_assembly}->{clone});
725 $source->add_tag_value(clone_name => $self->{_assembly}->{clone_name});
726 $source->add_tag_value(end5 => $end5);
727 $source->add_tag_value(end3 => $end3);
728 $source->add_tag_value(chromosome => $self->{_assembly}->{chromosome});
729 $source->add_tag_value(strand => ( $strand == 1 ? 'positive' : 'negative' ));
731 $line = $self->_readline();
732 } else {
733 $self->throw("Invalid Coordset");
736 if($line =~ /<MODEL[^>]*>/o) {
737 do {
738 $self->_pushback($line);
739 $self->_process_model($tu, $end5, $end3);
740 $line = $self->_readline();
741 } while($line =~ /<MODEL[^>]*>/o);
742 $self->_pushback($line);
743 $line = $self->_readtag();
744 } else {
745 $self->throw("Expected <MODEL> not found");
748 if($line =~ /<TRANSCRIPT_SEQUENCE>/o) {
749 my @chunks;
750 $line = $self->_readline();
751 while ($line =~ /^\s*([ACGT]+)\s*$/o) {
752 push( @chunks, $1 );
753 $line = $self->_readline();
755 # $line = $self->_readline();
758 if($line =~ /<GENE_EVIDENCE>/o) {
759 $line = $self->_readtag();
762 while($line =~ /<URL[^>]*>[^<]*<\/URL>/o) {
763 $line = $self->_readtag();
766 if($line =~ /<\/TU>/o) {
767 push(@{$self->{_sequences}}, $tu);
768 return;
769 } else {
770 $self->throw("Expected </TU> not found: $line");
772 } catch Bio::Root::OutOfRange with {
773 my $E = shift;
774 $self->warn(sprintf("One sub location of a sequence is invalid near line $.\: %s", $E->text()));
775 $line = $self->_readline() until $line =~ /<\/TU>/o;
776 return;
780 sub _process_gene_info
782 my($self, $tu) = @_;
783 my $line = $self->_readline();
785 $self->throw("Invalid Gene Info: $line") if $line !~ /<GENE_INFO>/o;
786 $line = $self->_readline();
788 if($line =~ /<LOCUS>\s*([\w\.]+)\s*<\/LOCUS>/o) {
789 $tu->accession_number($1);
790 $tu->add_secondary_accession($1);
791 $line = $self->_readline();
792 } elsif( $line =~ /<LOCUS>.*<\/LOCUS>/o) {
793 # We should throw an error, but TIGR doesn't alwasy play
794 # nice with adhering to their dtd
795 $line = $self->_readtag();
796 } else {
797 #$self->throw("Invalid Locus: $line");
800 if($line =~ /<ALT_LOCUS>\s*([\w\.]+)\s*<\/ALT_LOCUS>/o) {
801 $tu->accession_number($1);
802 $tu->add_secondary_accession($1);
803 $line = $self->_readline();
806 if($line =~ /<PUB_LOCUS>\s*([\w\.]+)\s*<\/PUB_LOCUS>/o) {
807 $tu->accession_number($1);
808 $tu->add_secondary_accession($1);
809 $line = $self->_readtag();
810 } elsif( $line =~ /<PUB_LOCUS>.*<\/PUB_LOCUS>/o) {
811 $line = $self->_readtag();
812 # $self->throw("Invalid Pub_Locus");
815 if($line =~ /<GENE_NAME.*>.*<\/GENE_NAME>/o) {
816 # Skip the GENE_NAME
817 $line = $self->_readtag();
820 if(my($attr, $value) = ($line =~ /<COM_NAME([^>]*)>([^>]+)<\/COM_NAME>/o)) {
821 #%attribs = ($attr =~ /(\w+)\s*=\s+"(.*?)"/og);
822 #$geneinfo->{'CURATED'} = $attribs{CURATED};
823 #$geneinfo->{IS_PRIMARY} = $attribs{IS_PRIMARY}
824 # TODO: add a tag on sources for curated
825 $tu->desc($value);
826 $line = $self->_readtag();
827 } else {
828 $self->throw("invalid com_name: $line");
831 while($line =~ /<COMMENT>([^<]+)<\/COMMENT>/o) {
832 my $comment = Bio::Annotation::Comment->new(
833 -text => $1
835 $tu->annotation()->add_Annotation('comment', $comment);
836 $line = $self->_readtag();
839 while($line =~ /<PUB_COMMENT>([^<]+)<\/PUB_COMMENT>/o) {
840 my $comment = Bio::Annotation::Comment->new(
841 -text => $1
843 $tu->annotation()->add_Annotation('comment', $comment);
844 $line = $self->_readtag();
847 if($line =~ /<EC_NUM>([\w\-\\\.]+)<\/EC_NUM>/o) {
848 #$geneinfo->{'EC_NUM'} = $1;
849 $line = $self->_readtag();
852 if($line =~ /<GENE_SYM>\s*([^<]+)\s*<\/GENE_SYM>/o) {
853 #$tu->add_secondary_accession($1);
854 $line = $self->_readtag();
857 if($line =~ /<IS_PSEUDOGENE>([^>]+)<\/IS_PSEUDOGENE>/o) {
858 #$geneinfo->{'IS_PSEUDOGENE'} = $1;
859 $line = $self->_readtag();
860 } else {
861 $self->throw("invalid is_pseudogene: $line");
864 if($line =~ /<FUNCT_ANNOT_EVIDENCE/o) {
865 $line = $self->_readtag();
868 if($line =~ /<DATE>([^>]+)<\/DATE>/o) {
869 #$geneinfo->{'DATE'} = $1;
870 $line = $self->_readtag();
873 while($line =~ /<GENE_ONTOLOGY>/o) {
874 # Get the source tag
875 my($source) = grep { $_->primary_tag() eq 'source' } $tu->get_SeqFeatures();
877 my @ids = ( $line =~ /(<GO_ID.*?<\/GO_ID>)/gso);
878 foreach my $go (@ids) {
879 my($assignment) = ($go =~ /<GO_ID\s+ASSIGNMENT\s+=\s+"GO:(\d+)">/os);
880 my($term) = ($go =~ /<GO_TERM>([^<]+)<\/GO_TERM>/os);
881 my($type) = ($go =~ /<GO_TYPE>([^<]+)<\/GO_TYPE>/os);
882 # TODO: Add GO annotation
883 if(defined $type and defined $assignment and defined $term) {
884 # Add the GO Annotation
885 $source->add_tag_value(
886 GO => "ID: $assignment; Type: $type; $term"
890 $line = $self->_readtag();
893 if($line =~ /<\/GENE_INFO/o) {
894 return;
897 $self->throw("unexpected end of gene_info");
900 sub _build_location
902 my($self, $end5, $end3, $length, $cs) = @_;
904 # Find the start and end of the location
905 # relative to the sequence.
906 my $start = abs( $end5 - $cs->{end5} ) + 1;
907 my $end = abs( $end5 - $cs->{end3} ) + 1;
909 # Do some bounds checking:
910 if( $start < 1 ) {
911 throw Bio::Root::OutOfRange(
912 -text => "locations' start( $start) must be >= 1"
914 } elsif( $end > $length ) {
915 throw Bio::Root::OutOfRange(
916 -text => "locations' end( $end ) must be <= length( $length )"
918 } elsif( $start > $end ) {
919 throw Bio::Root::OutOfRange(
920 -text => "locations' start ( $start ) must be < end ( $end ) $end5, $end3, $cs->{end5}, $cs->{end3}"
924 return Bio::Location::Simple->new( -start => $start, -end => $end, -strand => 1 );
927 sub _process_model
929 my($self, $tu, $end5, $end3) = @_;
930 my $line;
931 my( $source ) = grep { $_->primary_tag() eq 'source' } $tu->get_SeqFeatures();
932 my $model = Bio::SeqFeature::Generic->new(
933 -source_tag => 'TIGR',
934 -primary_tag => 'MODEL',
937 $line = $self->_readline();
938 if($line !~ /<MODEL ([^>]+)>/o) {
939 $self->throw("Invalid Model: $line")
941 my %attribs = ($1 =~ /(\w+)\s*=\s*"([^"]*)"/og);
942 #$model->{'CURATED'} = $attribs{'CURATED'};
943 # TODO: Add tag to model
944 $line = $self->_readline();
946 if($line =~ /<FEAT_NAME>\s*([\w\.]+)\s*<\/FEAT_NAME>/o) {
947 $model->add_tag_value( feat_name => $1 );
948 $tu->add_secondary_accession($1);
949 $line = $self->_readline();
950 } else {
951 $self->throw("Invalid Feature Name: $line");
954 if($line =~ /<PUB_LOCUS>\s*([\w\.]+)\s*<\/PUB_LOCUS>/o) {
955 $model->add_tag_value( pub_locus => $1 );
956 $tu->add_secondary_accession($1);
957 $line = $self->_readline();
958 } else {
959 # $self->throw("Invalid Pub_Locus: $line");
962 if($line =~ /<CDNA_SUPPORT>/o) {
963 $self->_pushback($line);
964 $self->_process_cdna_support( $model );
965 $line = $self->_readline();
968 while($line =~ /<CHROMO_LINK>([^>]+)<\/CHROMO_LINK>/o) {
969 $model->add_tag_value( chromo_link => $1 );
970 $line = $self->_readline();
973 if($line =~ /<DATE>([^>]+)<\/DATE>/o) {
974 $line = $self->_readline();
975 } else {
976 $self->throw("Invalid Date: $line");
979 if($line =~ /<COORDSET>/o) {
980 $self->_pushback($line);
981 my $cs = $self->_process_coordset();
982 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
984 $model->start( $loc->start() );
985 $model->end( $loc->end() );
986 $line = $self->_readline();
987 } else {
988 $self->throw("Invalid Coordset: $line");
991 my $exon = Bio::SeqFeature::Generic->new(
992 -source_tag => 'TIGR',
993 -primary_tag => 'EXON',
994 -location => Bio::Location::Split->new(),
995 -tags => [ locus => $tu->accession_number() ],
997 $exon->add_tag_value( model => $model->get_tag_values('feat_name') );
999 my $cds = Bio::SeqFeature::Generic->new(
1000 -source_tag => 'TIGR',
1001 -primary_tag => 'CDS',
1002 -location => Bio::Location::Split->new(),
1003 -tags => [ locus => $tu->accession_number() ],
1005 $cds->add_tag_value( model => $model->get_tag_values('feat_name') );
1006 my $utr = [];
1008 if($line =~ /<EXON>/o) {
1009 do {
1010 $self->_pushback($line);
1011 $self->_process_exon( $tu, $exon, $cds, $utr, $end5, $end3 );
1012 $line = $self->_readline();
1013 } while($line =~ /<EXON>/o);
1014 } else {
1015 $self->throw("Required <EXON> missing");
1018 until($line =~ /<\/MODEL>/o) {
1019 $line = $self->_readline();
1023 $_->add_tag_value( model => $model->get_tag_values('feat_name') )
1024 foreach @$utr;
1026 # Add the model, EXONs, CDS, and UTRs
1027 $tu->add_SeqFeature($model) if $model and $model->start() >= 1;
1028 $tu->add_SeqFeature($exon) if $exon and scalar($exon->location()->each_Location()) >= 1;
1029 $tu->add_SeqFeature($cds) if $cds and scalar($cds->location()->each_Location()) >= 1;
1030 $tu->add_SeqFeature(@$utr);
1032 return;
1035 sub _process_cdna_support
1037 my($self, $model) = @_;
1038 my $line = $self->_readline();
1040 if($line !~ /<CDNA_SUPPORT>/o) {
1041 $self->throw("Bio::SeqIO::tigr::_process_cdna_support called ",
1042 "but no <CDNA_SUPPORT> in the stream");
1045 $line = $self->_readline();
1047 while( $line =~ /<ACCESSION([^>]+)>(.*)<\/ACCESSION>/o) {
1048 # Save the text
1049 my $desc = $2;
1051 # Get the element's attributes
1052 my %attribs = ($1 =~ /(\w+)\s*=\s*"([^"]*)"/og);
1054 # Add the tag to the model
1055 $model->add_tag_value(
1056 cdna_support => "DBXRef: $attribs{DBXREF}; $desc"
1059 $line = $self->_readline();
1062 if( $line =~ /<\/CDNA_SUPPORT>/o) {
1063 return;
1065 $self->throw("reached end of _process_cdna_support");
1069 sub _process_exon
1071 my($self, $tu, $exon, $cds, $utr, $end5, $end3 ) = @_;
1072 my $line = $self->_readline();
1074 if($line !~ /<EXON>/o) {
1075 $self->throw("Bio::SeqIO::tigr::_process_exon called ",
1076 "but no <EXON> in the stream");
1079 $line = $self->_readtag();
1080 if($line =~ /<FEAT_NAME>([^<]+)<\/FEAT_NAME>/o) {
1081 # Ignore
1082 $line = $self->_readtag();
1083 } else {
1084 $self->throw("Required <FEAT_NAME> missing");
1087 if($line =~ /<DATE>([^<]+)<\/DATE>/o) {
1088 # Ignore
1089 $line = $self->_readtag();
1090 } else {
1091 $self->throw("Required <DATE> missing");
1094 if($line =~ /<COORDSET>/o) {
1095 $self->_pushback($line);
1096 my $cs = $self->_process_coordset();
1097 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1098 $exon->location()->add_sub_Location($loc);
1099 $line = $self->_readline();
1100 } else {
1101 $self->throw("Required <COORDSET> missing");
1104 if($line =~ /<CDS>/o) {
1105 $self->_pushback($line);
1106 $self->_process_cds($tu, $end5, $end3, $cds);
1107 $line = $self->_readline();
1110 if($line =~ /<UTRS>/o) {
1111 $self->_pushback($line);
1112 $self->_process_utrs($tu, $end5, $end3, $utr);
1113 $line = $self->_readline();
1116 if($line =~ /<\/EXON>/o) {
1117 return;
1120 $self->throw("Reached End of Bio::SeqIO::tigr::_process_exon");
1123 sub _process_cds
1125 my($self, $tu, $end5, $end3, $cds) = @_;
1126 my $line = $self->_readline();
1128 if($line !~ /<CDS>/o) {
1129 $self->throw("Bio::SeqIO::tigr::_process_cda_support called ",
1130 "but no <CDS> in the stream");
1133 $line = $self->_readtag();
1134 if($line =~ /<FEAT_NAME>([^<]+)<\/FEAT_NAME>/o) {
1135 #$cds->{'FEAT_NAME'} = $1;
1136 $line = $self->_readtag();
1137 } else {
1138 $self->throw("Required <FEAT_NAME> missing");
1141 if($line =~ /<DATE>([^<]+)<\/DATE>/o) {
1142 #$cds->{'DATE'} = $1;
1143 $line = $self->_readtag();
1144 } else {
1145 $self->throw("Required <DATE> missing");
1148 if($line =~ /<COORDSET>/o) {
1149 $self->_pushback($line);
1150 my $cs = $self->_process_coordset();
1151 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1152 $cds->location()->add_sub_Location($loc);
1153 $line = $self->_readline();
1154 } else {
1155 $self->throw("Required <COORDSET> missing");
1158 if($line =~ /<\/CDS>/o) {
1159 return;
1162 $self->throw("Reached onf of Bio::SeqIO::tigr::_process_cds");
1165 sub _process_utrs
1167 my($self, $tu, $end5, $end3, $utrs) = @_;
1168 my $line = $self->_readline();
1170 if($line !~ /<UTRS/o) {
1171 $self->throw("Bio::SeqIO::tigr::_process_utrs called but no ",
1172 "<UTRS> found in stream");
1175 $line = $self->_readline();
1176 while($line !~ /<\/UTRS>/o) {
1177 $self->_pushback($line);
1178 if($line =~ /<LEFT_UTR>/o) {
1179 $self->_process_left_utr($tu, $end5, $end3, $utrs);
1180 } elsif ($line =~ /<RIGHT_UTR>/o) {
1181 $self->_process_right_utr($tu, $end5, $end3, $utrs);
1182 } elsif ($line =~ /<EXTENDED_UTR>/o) {
1183 $self->_process_ext_utr($tu, $end5, $end3, $utrs);
1184 } else {
1185 $self->throw("Unexpected tag");
1188 $line = $self->_readline();
1191 if($line =~ /<\/UTRS>/o) {
1192 return $utrs;
1194 $self->throw("Reached end of Bio::SeqIO::tigr::_process_utrs");
1197 sub _process_left_utr
1199 my($self, $tu, $end5, $end3, $utrs) = @_;
1200 my $line = $self->_readline();
1201 my $coordset;
1203 if($line !~ /<LEFT_UTR>/o) {
1204 $self->throw("Bio::SeqIO::tigr::_process_left_utr called but ",
1205 "no <LEFT_UTR> found in stream");
1208 $line = $self->_readtag();
1209 if($line =~ /<COORDSET>/o) {
1210 $self->_pushback($line);
1211 my $cs = $self->_process_coordset();
1212 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1214 push(@$utrs, Bio::SeqFeature::Generic->new(
1215 -source_tag => 'TIGR',
1216 -primary_tag => 'LEFT_UTR',
1217 -strand => 1,
1218 -start => $loc->start(),
1219 -end => $loc->end()
1222 $line = $self->_readline();
1223 } else {
1224 $self->throw("Required <COORDSET> missing");
1227 if($line =~ /<\/LEFT_UTR>/o) {
1228 return;
1230 $self->throw("Reached end of Bio::SeqIO::tigr::_process_left_utr");
1233 sub _process_right_utr
1235 my($self, $tu, $end5, $end3, $utrs) = @_;
1236 my $line = $self->_readline();
1237 my $coordset;
1239 if($line !~ /<RIGHT_UTR>/o) {
1240 $self->throw("Bio::SeqIO::tigr::_process_right_utr called but ",
1241 "no <RIGHT_UTR> found in stream");
1244 $line = $self->_readtag();
1245 if($line =~ /<COORDSET>/o) {
1246 $self->_pushback($line);
1247 $coordset = $self->_process_coordset();
1248 $self->_pushback($line);
1249 my $cs = $self->_process_coordset();
1250 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1252 push(@$utrs, Bio::SeqFeature::Generic->new(
1253 -source_tag => 'TIGR',
1254 -primary_tag => 'RIGHT_UTR',
1255 -strand => 1,
1256 -start => $loc->start(),
1257 -end => $loc->end()
1260 $line = $self->_readline();
1261 } else {
1262 $self->throw("Required <COORDSET> missing");
1265 if($line =~ /<\/RIGHT_UTR>/o) {
1266 return $coordset;
1268 $self->throw("Reached end of Bio::SeqIO::tigr::_process_right_utr");
1271 sub _process_ext_utr
1273 my($self, $tu, $end5, $end3, $utrs) = @_;
1274 my $line = $self->_readline();
1275 my $coordset;
1277 if($line !~ /<EXTENDED_UTR>/o) {
1278 $self->throw("Bio::SeqIO::tigr::_process_ext_utr called but ",
1279 "no <EXTENDED_UTR> found in stream");
1282 $line = $self->_readtag();
1283 if($line =~ /<COORDSET>/o) {
1284 $self->_pushback($line);
1285 my $cs = $self->_process_coordset();
1286 my $loc = $self->_build_location($end5, $end3, $tu->length(), $cs);
1288 push(@$utrs, Bio::SeqFeature::Generic->new(
1289 -source_tag => 'TIGR',
1290 -primary_tag => 'EXTENDED_UTR',
1291 -strand => 1,
1292 -start => $loc->start(),
1293 -end => $loc->end()
1296 $line = $self->_readline();
1297 } else {
1298 $self->throw("Required <COORDSET> missing");
1301 if($line =~ /<\/EXTENDED_UTR>/o) {
1302 return $coordset;
1304 $self->throw("Reached end of Bio::SeqIO::tigr::_process_ext_utr");
1307 sub _readtag
1309 my($self) = @_;
1310 my $line = $self->_readline();
1311 chomp($line);
1313 my $tag;
1314 if(($tag) = ($line =~ /^[^<]*<\/(\w+)/o)) {
1315 $self->_pushback($1) if $line =~ /<\/$tag>(.+)$/;
1316 return "</$tag>";
1319 until(($tag) = ($line =~ /<(\w+)[^>]*>/o)) {
1320 $line = $self->_readline();
1321 chomp $line;
1324 until($line =~ /<\/$tag>/) {
1325 $line .= $self->_readline();
1328 if(my ($val) = ($line =~ /(<$tag.*>.*?<\/$tag>)/s)) {
1329 if($line =~ /<\/$tag>\s*(\w+[\s\w]*?)\s*$/s) {
1330 $self->_pushback($1)
1332 return $val;
1334 $self->throw("summerror");
1337 sub _readline
1339 my($self) = @_;
1340 my $line;
1341 do {
1342 $line = $self->SUPER::_readline();
1343 } while(defined($line) and $line =~ /^\s*$/o);
1345 return $line;
1348 sub throw
1350 my($self, @s) = @_;
1351 my $string = "[$.]" . join('', @s);
1352 $self->SUPER::throw($string);