2 # BioPerl module for Bio::SeqIO::EMBL
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
8 # Copyright Ewan Birney
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::SeqIO::embl - EMBL sequence input/output stream
20 It is probably best not to use this object directly, but
21 rather go through the SeqIO handler system. Go:
23 $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
25 while ( (my $seq = $stream->next_seq()) ) {
26 # do something with $seq
31 This object can transform Bio::Seq objects to and from EMBL flat
34 There is a lot of flexibility here about how to dump things which
35 should be documented more fully.
37 There should be a common object that this and Genbank share (probably
38 with Swissprot). Too much of the magic is identical.
40 =head2 Optional functions
46 (output only) shows the dna or not
50 (output only) provides a sorting func which is applied to the FTHelpers
53 =item _id_generation_func()
55 This is function which is called as
57 print "ID ", $func($annseq), "\n";
59 To generate the ID line. If it is not there, it generates a sensible ID
60 line using a number of tools.
62 If you want to output annotations in EMBL format they need to be
63 stored in a Bio::Annotation::Collection object which is accessible
64 through the Bio::SeqI interface method L<annotation()|annotation>.
66 The following are the names of the keys which are polled from a
67 L<Bio::Annotation::Collection> object.
69 reference - Should contain Bio::Annotation::Reference objects
70 comment - Should contain Bio::Annotation::Comment objects
71 dblink - Should contain Bio::Annotation::DBLink objects
79 User feedback is an integral part of the evolution of this and other
80 Bioperl modules. Send your comments and suggestions preferably to one
81 of the Bioperl mailing lists. Your participation is much appreciated.
83 bioperl-l@bioperl.org - General discussion
84 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
88 Please direct usage questions or support issues to the mailing list:
90 I<bioperl-l@bioperl.org>
92 rather than to the module maintainer directly. Many experienced and
93 reponsive experts will be able look at the problem and quickly
94 address it. Please include a thorough description of the problem
95 with code and data examples if at all possible.
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 the bugs and their resolution. Bug reports can be submitted via
103 https://github.com/bioperl/bioperl-live/issues
105 =head1 AUTHOR - Ewan Birney
107 Email birney@ebi.ac.uk
111 The rest of the documentation details each of the object
112 methods. Internal methods are usually preceded with a _
117 # Let the code begin...
120 package Bio
::SeqIO
::embl
;
121 use vars
qw(%FTQUAL_NO_QUOTE);
123 use Bio::SeqIO::FTHelper;
124 use Bio::SeqFeature::Generic;
126 use Bio::Seq::SeqFactory;
127 use Bio::Annotation::Collection;
128 use Bio::Annotation::Comment;
129 use Bio::Annotation::Reference;
130 use Bio::Annotation::DBLink;
132 use base qw(Bio::SeqIO);
134 # Note that a qualifier that exceeds one line (i.e. a long label) will
135 # automatically be quoted regardless:
155 my($self,@args) = @_;
157 $self->SUPER::_initialize
(@args);
158 # hash for functions for decoding keys.
159 $self->{'_func_ftunit_hash'} = {};
160 # sets this to one by default. People can change it
162 if ( ! defined $self->sequence_factory ) {
163 $self->sequence_factory(Bio
::Seq
::SeqFactory
->new
164 (-verbose
=> $self->verbose(),
165 -type
=> 'Bio::Seq::RichSeq'));
172 Usage : $seq = $stream->next_seq()
173 Function: returns the next sequence in the stream
174 Returns : Bio::Seq object
180 my ($self,@args) = @_;
181 my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
182 $date, $comment, @date_arr);
184 my ($annotation, %params, @features) =
185 Bio
::Annotation
::Collection
->new();
187 $line = $self->_readline;
188 # This needs to be before the first eof() test
190 if ( !defined $line ) {
191 return; # no throws - end of file
194 if ( $line =~ /^\s+$/ ) {
195 while ( defined ($line = $self->_readline) ) {
196 $line =~/^\S/ && last;
198 # return without error if the whole next sequence was just a single
199 # blank line and then eof
203 # no ID as 1st non-blank line, need short circuit and exit routine
204 $self->throw("EMBL stream with no ID. Not embl in my book")
205 unless $line =~ /^ID\s+\S+/;
207 # At this point we are sure that $line contains an ID header line
209 if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
211 # New style header (EMBL Release >= 87, after June 2006)
215 # ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
216 # This regexp comes from the new2old.pl conversion script, from EBI
217 if ($line =~ m/^ID (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
218 ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
221 $params{'-seq_version'} = $sv;
222 $params{'-version'} = $sv;
225 if (defined $topology && $topology eq 'circular') {
226 $params{'-is_circular'} = 1;
232 } elsif ($mol =~ /RNA/) {
234 } elsif ($mol =~ /AA/) {
235 $alphabet = 'protein';
240 # Old style header (EMBL Release < 87, before June 2006)
241 if ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
242 ($name, $mol, $div) = ($1, $2, $3);
246 if ( $mol =~ /circular/ ) {
247 $params{'-is_circular'} = 1;
248 $mol =~ s
|circular
||;
253 } elsif ($mol =~ /RNA/) {
255 } elsif ($mol =~ /AA/) {
262 unless( defined $name && length($name) ) {
263 $name = "unknown_id";
266 # $self->warn("not parsing upper annotation in EMBL file yet!");
269 BEFORE_FEATURE_TABLE
:
271 until ( !defined $buffer ) {
273 # Exit at start of Feature table
274 if ( /^(F[HT]|SQ)/ ) {
275 $self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
278 # Description line(s)
279 if (/^DE\s+(\S.*\S)/) {
280 $desc .= $desc ?
" $1" : $1;
284 if ( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
285 my @accs = split(/[; ]+/, $1); # allow space in addition
286 $params{'-accession_number'} = shift @accs
287 unless defined $params{'-accession_number'};
288 push @
{$params{'-secondary_accessions'}}, @accs;
292 if ( /^SV\s+\S+\.(\d+);?/ ) {
295 $params{'-seq_version'} = $sv;
296 $params{'-version'} = $sv;
299 #date (NOTE: takes last date line)
300 if ( /^DT\s+(.+)$/ ) {
302 my ($date, $version) = split(' ', $line, 2);
303 $date =~ tr/,//d; # remove comma if new version
305 if ($version =~ /\(Rel\. (\d+), Created\)/ms ) {
306 my $release = Bio
::Annotation
::SimpleValue
->new(
307 -tagname
=> 'creation_release',
310 $annotation->add_Annotation($release);
311 } elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/ms ) {
312 my $release = Bio
::Annotation
::SimpleValue
->new(
313 -tagname
=> 'update_release',
316 $annotation->add_Annotation($release);
318 my $update = Bio
::Annotation
::SimpleValue
->new(
319 -tagname
=> 'update_version',
322 $annotation->add_Annotation($update);
325 push @
{$params{'-dates'}}, $date;
329 if ( /^KW (.*)\S*$/ ) {
330 my @kw = split(/\s*\;\s*/,$1);
331 push @
{$params{'-keywords'}}, @kw;
334 # Organism name and phylogenetic information
336 # pass the accession number so we can give an informative throw message if necessary
337 my $species = $self->_read_EMBL_Species(\
$buffer, $params{'-accession_number'});
338 $params{'-species'}= $species;
343 if (/NCBI_TaxID=(\d+)/) {
347 my @links = $self->_read_EMBL_TaxID_DBLink(\
$buffer);
348 foreach my $dblink ( @links ) {
349 $annotation->add_Annotation('dblink',$dblink);
355 my @refs = $self->_read_EMBL_References(\
$buffer);
356 foreach my $ref ( @refs ) {
357 $annotation->add_Annotation('reference',$ref);
363 my @links = $self->_read_EMBL_DBLink(\
$buffer);
364 foreach my $dblink ( @links ) {
365 $annotation->add_Annotation('dblink',$dblink);
370 elsif (/^CC\s+(.*)/) {
373 while (defined ($_ = $self->_readline) ) {
381 my $commobj = Bio
::Annotation
::Comment
->new();
382 $commobj->text($comment);
383 $annotation->add_Annotation('comment',$commobj);
388 $buffer = $self->_readline;
391 while ( defined ($_ = $self->_readline) ) {
392 /^FT\s{3}\w/ && last;
398 if (defined($buffer) && $buffer =~ /^FT /) {
399 until ( !defined ($buffer) ) {
400 my $ftunit = $self->_read_FTHelper_EMBL(\
$buffer);
404 $ftunit->_generic_seqfeature($self->location_factory(), $name);
406 # add taxon_id from source if available
407 # Notice, this will override what is found in the OX line.
408 # this is by design as this seems to be the official way
409 # of specifying a TaxID
410 if ($params{'-species'} && ($feat->primary_tag eq 'source')
411 && $feat->has_tag('db_xref')
412 && (! $params{'-species'}->ncbi_taxid())) {
413 foreach my $tagval ($feat->get_tag_values('db_xref')) {
414 if (index($tagval,"taxon:") == 0) {
415 $params{'-species'}->ncbi_taxid(substr($tagval,6));
421 # add feature to list of features
422 push(@features, $feat);
424 if ( $buffer !~ /^FT/ ) {
429 # Set taxid found in OX line
430 if ($params{'-species'} && defined $ncbi_taxid
431 && (! $params{'-species'}->ncbi_taxid())) {
432 $params{'-species'}->ncbi_taxid($ncbi_taxid);
436 while ( defined ($buffer) && $buffer =~ /^XX/ ) {
437 $buffer = $self->_readline();
440 if ( $buffer =~ /^CO/ ) {
442 # special : create contig as annotation
443 while ( defined ($buffer) ) {
444 $annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\
$buffer);
445 if ( !$buffer || $buffer !~ /^CO/ ) {
451 if ($buffer !~ /^\/\
//) { # if no SQ lines following CO (bug#2958)
452 if ( $buffer !~ /^SQ/ ) {
453 while ( defined ($_ = $self->_readline) ) {
458 while ( defined ($_ = $self->_readline) ) {
465 my $seq = $self->sequence_factory->create
466 (-verbose
=> $self->verbose(),
470 -display_id
=> $name,
471 -annotation
=> $annotation,
473 -alphabet
=> $alphabet,
474 -features
=> \
@features,
481 =head2 _write_ID_line
483 Title : _write_ID_line
484 Usage : $self->_write_ID_line($seq);
485 Function: Writes the EMBL Release 87 format ID line to the stream, unless
486 : there is a user-supplied ID line generation function in which
487 : case that is used instead.
488 : ( See Bio::SeqIO::embl::_id_generation_function(). )
490 Args : Bio::Seq object
496 my ($self, $seq) = @_;
499 # If there is a user-supplied ID generation function, use it.
500 if ( $self->_id_generation_func ) {
501 $id_line = "ID " . &{$self->_id_generation_func}($seq) . "\nXX\n";
503 # Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
506 # The sequence name is supposed to be the primary accession number,
507 my $name = $seq->accession_number();
508 if ( not(defined $name) || $name eq 'unknown') {
509 # but if it is not present, use the sequence ID or the empty string
510 $name = $seq->id() || '';
513 $self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
515 # Use the sequence version, or default to 1.
516 my $version = $seq->version() || 1;
518 my $len = $seq->length();
520 # Taxonomic division.
522 if ( $seq->can('division') && defined($seq->division) &&
523 $self->_is_valid_division($seq->division) ) {
524 $div = $seq->division();
526 $div ||= 'UNC'; # 'UNC' is the EMBL division code for 'unclassified'.
530 # If the molecule type is a valid EMBL type, use it.
531 if ( $seq->can('molecule')
532 && defined($seq->molecule)
533 && $self->_is_valid_molecule_type($seq->molecule)
535 $mol = $seq->molecule();
537 # Otherwise, choose unassigned DNA or RNA based on the alphabet.
538 elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
539 my $alphabet =$seq->primary_seq->alphabet;
540 if ($alphabet eq 'dna') {
541 $mol ='unassigned DNA';
542 } elsif ($alphabet eq 'rna') {
543 $mol='unassigned RNA';
544 } elsif ($alphabet eq 'protein') {
545 $self->warn("Protein sequence found; EMBL is a nucleotide format.");
546 $mol='AA'; # AA is not a valid EMBL molecule type.
550 my $topology = 'linear';
551 if ($seq->is_circular) {
552 $topology = 'circular';
555 $mol ||= ''; # 'unassigned'; ?
556 $id_line = "ID $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
558 $self->_print($id_line);
561 =head2 _is_valid_division
563 Title : _is_valid_division
564 Usage : $self->_is_valid_division($div)
565 Function: tests division code for validity
566 Returns : true if $div is a valid EMBL release 87 taxonomic division.
567 Args : taxonomic division code string
571 sub _is_valid_division
{
572 my ($self, $division) = @_;
574 my %EMBL_divisions = (
575 "PHG" => 1, # Bacteriophage
576 "ENV" => 1, # Environmental Sample
579 "INV" => 1, # Invertebrate
580 "MAM" => 1, # Other Mammal
581 "VRT" => 1, # Other Vertebrate
582 "MUS" => 1, # Mus musculus
584 "PRO" => 1, # Prokaryote
585 "ROD" => 1, # Other Rodent
586 "SYN" => 1, # Synthetic
587 "UNC" => 1, # Unclassified
591 return exists($EMBL_divisions{$division});
594 =head2 _is_valid_molecule_type
596 Title : _is_valid_molecule_type
597 Usage : $self->_is_valid_molecule_type($mol)
598 Function: tests molecule type for validity
599 Returns : true if $mol is a valid EMBL release 87 molecule type.
600 Args : molecule type string
604 sub _is_valid_molecule_type
{
605 my ($self, $moltype) = @_;
607 my %EMBL_molecule_types = (
619 "unassigned DNA" => 1,
620 "unassigned RNA" => 1
623 return exists($EMBL_molecule_types{$moltype});
629 Usage : $stream->write_seq($seq)
630 Function: writes the $seq object (must be seq) to the stream
631 Returns : 1 for success and undef for error
632 Args : array of 1 to n Bio::SeqI objects
638 my ($self,@seqs) = @_;
640 foreach my $seq ( @seqs ) {
641 $self->throw("Attempting to write with no seq!") unless defined $seq;
642 unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
643 $self->warn("$seq is not a SeqI compliant sequence object!")
644 if $self->verbose >= 0;
645 unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
646 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
649 my $str = $seq->seq || '';
652 $self->_write_ID_line($seq);
655 # Write the accession line if present
658 if ( my $func = $self->_ac_generation_func ) {
659 $acc = &{$func}($seq);
660 } elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
661 defined($seq->accession_number) ) {
662 $acc = $seq->accession_number;
663 $acc = join("; ", $acc, $seq->get_secondary_accessions);
664 } elsif ( $seq->can('accession_number') ) {
665 $acc = $seq->accession_number;
669 $self->_print("AC $acc;\n",
676 if ( $seq->can('get_dates') ) {
677 my @dates = $seq->get_dates();
680 my ($cr) = $seq->annotation->get_Annotations("creation_release");
681 my ($ur) = $seq->annotation->get_Annotations("update_release");
682 my ($uv) = $seq->annotation->get_Annotations("update_version");
684 unless ($cr && $ur && $ur) {
688 foreach my $dt (@dates) {
690 $self->_write_line_EMBL_regex("DT ","DT ",
691 $dt." (Rel. ".($cr->value()).", Created)",
692 '\s+|$',80) if $ct == 1;
693 $self->_write_line_EMBL_regex("DT ","DT ",
694 $dt." (Rel. ".($ur->value()).", Last updated, Version ".($uv->value()).")",
695 '\s+|$',80) if $ct == 2;
696 } else { # other formats?
697 $self->_write_line_EMBL_regex("DT ","DT ",
704 $self->_print("XX\n") || return;
709 $self->_write_line_EMBL_regex("DE ","DE ",$seq->desc(),'\s+|$',80) || return; #'
710 $self->_print( "XX\n") || return;
712 # if there, write the kw line
715 if ( my $func = $self->_kw_generation_func ) {
716 $kw = &{$func}($seq);
717 } elsif ( $seq->can('keywords') ) {
718 $kw = $seq->keywords;
721 $self->_write_line_EMBL_regex("KW ", "KW ", $kw, '\s+|$', 80) || return; #'
722 $self->_print( "XX\n") || return;
728 if ($seq->can('species') && (my $spec = $seq->species)) {
729 my @class = $spec->classification();
730 shift @class; # get rid of species name. Some embl files include
731 # the species name in the OC lines, but this seems
732 # more like an error than something we need to
734 my $OS = $spec->scientific_name;
735 if ($spec->common_name) {
736 $OS .= ' ('.$spec->common_name.')';
738 $self->_print("OS $OS\n") || return;
739 my $OC = join('; ', reverse(@class)) .'.';
740 $self->_write_line_EMBL_regex("OC ","OC ",$OC,'; |$',80) || return;
741 if ($spec->organelle) {
742 $self->_write_line_EMBL_regex("OG ","OG ",$spec->organelle,'; |$',80) || return;
744 my $ncbi_taxid = $spec->ncbi_taxid;
746 $self->_print("OX NCBI_TaxID=$ncbi_taxid\n") || return;
748 $self->_print("XX\n") || return;
752 if ( $seq->can('annotation') && defined $seq->annotation ) {
753 foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
754 $self->_print( "RN [$t]\n") || return;
756 # Having no RP line is legal, but we need both
757 # start and end for a valid location.
759 $self->_write_line_EMBL_regex("RC ", "RC ", $ref->comment, '\s+|$', 80) || return; #'
761 my $start = $ref->start;
763 if ($start and $end) {
764 $self->_print( "RP $start-$end\n") || return;
765 } elsif ($start or $end) {
766 $self->throw("Both start and end are needed for a valid RP line.".
767 " Got: start='$start' end='$end'");
770 if (my $med = $ref->medline) {
771 $self->_print( "RX MEDLINE; $med.\n") || return;
773 if (my $pm = $ref->pubmed) {
774 $self->_print( "RX PUBMED; $pm.\n") || return;
776 my $authors = $ref->authors;
777 $authors =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#'
779 $self->_write_line_EMBL_regex("RA ", "RA ",
781 '\s+|$', 80) || return; #'
783 # If there is no title to the reference, it appears
784 # as a single semi-colon. All titles must end in
786 my $ref_title = $ref->title || '';
787 $ref_title =~ s/[\s;]*$/;/;
788 $self->_write_line_EMBL_regex("RT ", "RT ", $ref_title, '\s+|$', 80) || return; #'
789 $self->_write_line_EMBL_regex("RL ", "RL ", $ref->location, '\s+|$', 80) || return; #'
790 $self->_print("XX\n") || return;
795 if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
796 for my $dr (@db_xref) {
797 my $db_name = $dr->database;
798 my $prim = $dr->primary_id;
800 my $opt = $dr->optional_id || '';
801 my $line = $opt ?
"$db_name; $prim; $opt." : "$db_name; $prim.";
802 $self->_write_line_EMBL_regex("DR ", "DR ", $line, '\s+|$', 80) || return; #'
804 $self->_print("XX\n") || return;
808 foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
809 $self->_write_line_EMBL_regex("CC ", "CC ", $comment->text, '\s+|$', 80) || return; #'
810 $self->_print("XX\n") || return;
817 $self->_print("FH Key Location/Qualifiers\n") || return;
818 $self->_print("FH\n") || return;
820 my @feats = $seq->can('top_SeqFeatures') ?
$seq->top_SeqFeatures : ();
822 if ( defined $self->_post_sort ) {
823 # we need to read things into an array.
824 # Process. Sort them. Print 'em
826 my $post_sort_func = $self->_post_sort();
829 foreach my $sf ( @feats ) {
830 push(@fth,Bio
::SeqIO
::FTHelper
::from_SeqFeature
($sf,$seq));
833 @fth = sort { &$post_sort_func($a,$b) } @fth;
835 foreach my $fth ( @fth ) {
836 $self->_print_EMBL_FTHelper($fth) || return;
839 # not post sorted. And so we can print as we get them.
840 # lower memory load...
842 foreach my $sf ( @feats ) {
843 my @fth = Bio
::SeqIO
::FTHelper
::from_SeqFeature
($sf,$seq);
844 foreach my $fth ( @fth ) {
845 if ( $fth->key eq 'CONTIG') {
848 $self->_print_EMBL_FTHelper($fth) || return;
854 if ( $self->_show_dna() == 0 ) {
855 $self->_print( "//\n") || return;
858 $self->_print( "XX\n") || return;
860 # finished printing features.
862 # print contig if present : bug#2982
863 if ( $seq->can('annotation') && defined $seq->annotation) {
864 foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
866 $self->_write_line_EMBL_regex("CO ","CO ", $ctg->value,
867 '[,]|$', 80) || return;
871 # print sequence lines only if sequence is present! bug#2982
875 # Count each nucleotide
876 my $alen = $str =~ tr/a/a/;
877 my $clen = $str =~ tr/c/c/;
878 my $glen = $str =~ tr/g/g/;
879 my $tlen = $str =~ tr/t/t/;
881 my $len = $seq->length();
882 my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
884 $self->warn("Weird. More atgc than bases. Problem!");
887 $self->_print("SQ Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
889 my $nuc = 60; # Number of nucleotides per line
890 my $whole_pat = 'a10' x
6; # Pattern for unpacking a whole line
891 my $out_pat = 'A11' x
6; # Pattern for packing a line
892 my $length = length($str);
894 # Calculate the number of nucleotides which fit on whole lines
895 my $whole = int($length / $nuc) * $nuc;
897 # Print the whole lines
899 for ($i = 0; $i < $whole; $i += $nuc) {
900 my $blocks = pack $out_pat,
902 substr($str, $i, $nuc);
903 $self->_print(sprintf(" $blocks%9d\n", $i + $nuc)) || return;
906 # Print the last line
907 if (my $last = substr($str, $i)) {
908 my $last_len = length($last);
909 my $last_pat = 'a10' x
int($last_len / 10) .'a'. $last_len % 10;
910 my $blocks = pack $out_pat,
911 unpack($last_pat, $last);
912 $self->_print(sprintf(" $blocks%9d\n", $length)) ||
913 return; # Add the length to the end
917 $self->_print( "//\n") || return;
919 $self->flush if $self->_flush_on_write && defined $self->_fh;
924 =head2 _print_EMBL_FTHelper
926 Title : _print_EMBL_FTHelper
928 Function: Internal function
929 Returns : 1 if writing succeeded, otherwise undef
935 sub _print_EMBL_FTHelper
{
936 my ($self,$fth) = @_;
938 if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
939 $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
943 #$self->_print( "FH Key Location/Qualifiers\n");
944 #$self->_print( sprintf("FT %-15s %s\n",$fth->key,$fth->loc));
946 if ( $fth->key eq 'CONTIG' ) {
947 $self->_print("XX\n") || return;
948 $self->_write_line_EMBL_regex("CO ",
950 '\,|$',80) || return; #'
953 $self->_write_line_EMBL_regex(sprintf("FT %-15s ",$fth->key),
955 '\,|$',80) || return; #'
956 foreach my $tag (sort keys %{$fth->field} ) {
957 if ( ! defined $fth->field->{$tag} ) {
960 foreach my $value ( @
{$fth->field->{$tag}} ) {
961 $value =~ s/\"/\"\"/g;
962 if ($value eq "_no_value") {
963 $self->_write_line_EMBL_regex("FT ",
965 "/$tag",'.|$',80) || return; #'
967 # there are almost 3x more quoted qualifier values and they
968 # are more common too so we take quoted ones first
970 # Long qualifiers, that will be line wrapped, are always quoted
971 elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
972 my $pat = $value =~ /\s+/ ?
'\s+|\-|$' : '.|\-|$';
973 $self->_write_line_EMBL_regex("FT ",
975 "/$tag=\"$value\"",$pat,80) || return;
977 $self->_write_line_EMBL_regex("FT ",
979 "/$tag=$value",'.|$',80) || return; #'
989 =head2 _read_EMBL_Contig()
991 Title : _read_EMBL_Contig
993 Function: convert CO lines into annotations
999 sub _read_EMBL_Contig
{
1000 my ($self, $buffer) = @_;
1002 if ( $$buffer !~ /^CO/ ) {
1003 warn("Not parsing line '$$buffer' which maybe important");
1005 $self->_pushback($$buffer);
1006 while ( defined ($_ = $self->_readline) ) {
1008 /^CO\s+(.*)/ && do {
1009 push @ret, Bio
::Annotation
::SimpleValue
->new( -tagname
=> 'contig',
1018 =head2 _read_EMBL_References
1020 Title : _read_EMBL_References
1022 Function: Reads references from EMBL format. Internal function really
1030 sub _read_EMBL_References
{
1031 my ($self,$buffer) = @_;
1034 # assume things are starting with RN
1036 if ( $$buffer !~ /^RN/ ) {
1037 warn("Not parsing line '$$buffer' which maybe important");
1048 while ( defined ($_ = $self->_readline) ) {
1050 /^RP (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
1051 /^RX MEDLINE;\s+(\d+)/ && do {$med=$1};
1052 /^RX PUBMED;\s+(\d+)/ && do {$pm=$1};
1054 $au = $self->_concatenate_lines($au,$1); next;
1057 $title = $self->_concatenate_lines($title,$1); next;
1060 $loc = $self->_concatenate_lines($loc,$1); next;
1063 $com = $self->_concatenate_lines($com,$1); next;
1067 my $ref = Bio
::Annotation
::Reference
->new();
1069 $title =~ s/;\s*$//g;
1074 $ref->title($title);
1075 $ref->location($loc);
1076 $ref->medline($med);
1077 $ref->comment($com);
1086 =head2 _read_EMBL_Species
1088 Title : _read_EMBL_Species
1090 Function: Reads the EMBL Organism species and classification
1093 Returns : A Bio::Species object
1094 Args : a reference to the current line buffer, accession number
1098 sub _read_EMBL_Species
{
1099 my( $self, $buffer, $acc ) = @_;
1103 my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104 while (defined( $_ ||= $self->_readline )) {
1106 $sci_name .= ($sci_name) ?
' '.$1 : $1;
1107 } elsif (s/^OC\s+(.+)$//) {
1109 } elsif (/^OG\s+(.*)/) {
1115 $_ = undef; # Empty $_ to trigger read of next line
1119 $self->_pushback($_);
1121 $sci_name =~ s{\.$}{};
1122 $sci_name || return;
1124 # Convert data in classification lines into classification array.
1125 # only split on ';' or '.' so that classification that is 2 or more words
1126 # will still get matched, use map() to remove trailing/leading/intervening
1128 my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?<!subgen)[;\.]+/, $class_lines;
1130 # do we have a genus?
1131 my $possible_genus = $class[-1];
1132 $possible_genus .= "|$class[-2]" if $class[-2];
1133 if ($sci_name =~ /^($possible_genus)/) {
1135 ($species) = $sci_name =~ /^$genus\s+(.+)/;
1137 $species = $sci_name;
1140 # Don't make a species object if it is "Unknown" or "None"
1142 return if $genus =~ /^(Unknown|None)$/i;
1145 # is this organism of rank species or is it lower?
1146 # (doesn't catch everything, but at least the guess isn't dangerous)
1147 if ($species =~ /subsp\.|var\./) {
1148 ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1151 # sometimes things have common name in brackets, like
1152 # Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153 # name bit. Probably dangerous if real scientific species name ends in
1155 unless ($class[-1] eq 'Viruses') {
1156 ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
1157 $sci_name =~ s/\s+\(.+\)$// if $common;
1160 # Bio::Species array needs array in Species -> Kingdom direction
1161 unless ($class[-1] eq $sci_name) {
1162 push(@class, $sci_name);
1164 @class = reverse @class;
1166 # do minimal sanity checks before we hand off to Bio::Species which won't
1167 # be able to give informative throw messages if it has to throw because
1169 $self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1171 foreach my $i (0..$#class) {
1172 my $name = $class[$i];
1174 # this code breaks examples like: Xenopus (Silurana) tropicalis
1175 # commenting out, see bug 3158
1177 #if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178 # $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1181 my $make = Bio
::Species
->new();
1182 $make->scientific_name($sci_name);
1183 $make->classification(@class);
1184 unless ($class[-1] eq 'Viruses') {
1185 $make->genus($genus) if $genus;
1186 $make->species($species) if $species;
1187 $make->sub_species($sub_species) if $sub_species;
1188 $make->common_name($common) if $common;
1190 $make->organelle($org) if $org;
1194 =head2 _read_EMBL_DBLink
1196 Title : _read_EMBL_DBLink
1198 Function: Reads the EMBL database cross reference ("DR") lines
1200 Returns : A list of Bio::Annotation::DBLink objects
1205 sub _read_EMBL_DBLink
{
1206 my( $self,$buffer ) = @_;
1210 while (defined( $_ ||= $self->_readline )) {
1211 if ( /^DR ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
1212 my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213 my $link = Bio
::Annotation
::DBLink
->new(-database
=> $databse,
1214 -primary_id
=> $prim_id,
1215 -optional_id
=> $sec_id);
1217 push(@db_link, $link);
1221 $_ = undef; # Empty $_ to trigger read of next line
1228 =head2 _read_EMBL_TaxID_DBLink
1230 Title : _read_EMBL_TaxID_DBLink
1232 Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1234 Returns : A list of Bio::Annotation::DBLink objects
1239 sub _read_EMBL_TaxID_DBLink
{
1240 my( $self,$buffer ) = @_;
1244 while (defined( $_ ||= $self->_readline )) {
1245 if ( /^OX (\S+)=(\d+);$/ ) {
1246 my ($databse, $prim_id) = ($1,$2);
1247 my $link = Bio
::Annotation
::DBLink
->new(-database
=> $databse,
1248 -primary_id
=> $prim_id,);
1249 push(@db_link, $link);
1253 $_ = undef; # Empty $_ to trigger read of next line
1263 Usage : $obj->_filehandle($newval)
1266 Returns : value of _filehandle
1267 Args : newvalue (optional)
1273 my ($obj,$value) = @_;
1274 if ( defined $value) {
1275 $obj->{'_filehandle'} = $value;
1277 return $obj->{'_filehandle'};
1281 =head2 _read_FTHelper_EMBL
1283 Title : _read_FTHelper_EMBL
1284 Usage : _read_FTHelper_EMBL($buffer)
1285 Function: reads the next FT key line
1287 Returns : Bio::SeqIO::FTHelper object
1288 Args : filehandle and reference to a scalar
1293 sub _read_FTHelper_EMBL
{
1294 my ($self,$buffer) = @_;
1296 my ($key, # The key of the feature
1297 $loc, # The location line from the feature
1298 @qual, # An arrray of lines making up the qualifiers
1301 if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
1304 # Read all the lines up to the next feature
1305 while ( defined($_ = $self->_readline) ) {
1306 if (/^FT(\s+)(.+?)\s*$/) {
1307 # Lines inside features are preceeded by 19 spaces
1308 # A new feature is preceeded by 3 spaces
1309 if (length($1) > 4) {
1310 # Add to qualifiers if we're in the qualifiers
1314 # Start the qualifier list if it's the first qualifier
1315 elsif (substr($2, 0, 1) eq '/') {
1318 # We're still in the location line, so append to location
1323 # We've reached the start of the next feature
1327 # We're at the end of the feature table
1331 } elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1334 # Read all the lines up to the next feature
1335 while ( defined($_ = $self->_readline) ) {
1336 if (/^CO\s+(\S+)\s*$/) {
1339 # We've reached the start of the next feature
1348 # Put the first line of the next feature into the buffer
1351 # Make the new FTHelper object
1352 my $out = Bio
::SeqIO
::FTHelper
->new();
1353 $out->verbose($self->verbose());
1357 # Now parse and add any qualifiers. (@qual is kept
1358 # intact to provide informative error messages.)
1359 my $last_unquoted_qualifier;
1361 for (my $i = 0; $i < @qual; $i++) {
1362 my $data = $qual[$i];
1363 my ( $qualifier, $value );
1365 unless (( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.*))?})) {
1366 if ( defined $last_unquoted_qualifier ) {
1367 # handle case of unquoted multiline - read up everything until the next qualifier
1369 # Protein sequence translations need to be joined without spaces,
1370 # other qualifiers need those.
1371 $value .= ' ' if $qualifier ne "translation";
1373 } while defined($data = $qual[++$i]) && $data !~ m
[^/];
1375 $out->field->{$last_unquoted_qualifier}->[-1] .= $value;
1376 $last_unquoted_qualifier = undef;
1379 $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1380 . join('', map "$_\n", @qual));
1383 $qualifier = '' if not defined $qualifier;
1385 $last_unquoted_qualifier = undef;
1386 if (defined $value) {
1387 # Do we have a quoted value?
1388 if (substr($value, 0, 1) eq '"') {
1389 # Keep adding to value until we find the trailing quote
1390 # and the quotes are balanced
1392 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1394 my $next = $qual[$i];
1395 if (!defined($next)) {
1396 $self->warn("Unbalanced quote in:\n".join("\n", @qual).
1397 "\nAdding quote to close...".
1398 "Check sequence quality!");
1403 # Protein sequence translations need to be joined without spaces,
1404 # other qualifiers need those.
1405 if ($qualifier eq "translation") {
1411 # Trim leading and trailing quotes
1412 $value =~ s/^"|"$//g;
1413 # Undouble internal quotes
1414 $value =~ s/""/"/g; #"
1416 $last_unquoted_qualifier = $qualifier;
1419 $value = '_no_value';
1422 # Store the qualifier
1423 $out->field->{$qualifier} ||= [];
1424 push(@
{$out->field->{$qualifier}},$value);
1430 =head2 _write_line_EMBL
1432 Title : _write_line_EMBL
1434 Function: internal function
1436 Returns : 1 if writing succeeded, else undef
1442 sub _write_line_EMBL
{
1443 my ($self,$pre1,$pre2,$line,$length) = @_;
1445 $length || $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1446 my $subl = $length - length $pre2;
1447 my $linel = length $line;
1450 my $sub = substr($line,0,$length - length $pre1);
1452 $self->_print( "$pre1$sub\n") || return;
1454 for ($i= ($length - length $pre1);$i < $linel;) {
1455 $sub = substr($line,$i,($subl));
1456 $self->_print( "$pre2$sub\n") || return;
1463 =head2 _write_line_EMBL_regex
1465 Title : _write_line_EMBL_regex
1467 Function: internal function for writing lines of specified
1468 length, with different first and the next line
1469 left hand headers and split at specific points in the
1473 Args : file handle, first header, second header, text-line, regex for line breaks, total line length
1478 sub _write_line_EMBL_regex
{
1479 my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1481 #print STDOUT "Going to print with $line!\n";
1483 $length || $self->throw("Programming error - called write_line_EMBL_regex without length.");
1485 my $subl = $length - (length $pre1) -1 ;
1488 CHUNK
: while($line) {
1489 foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1490 if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1492 $l =~ s/#/ /g # remove word wrap protection char '#'
1495 $line = substr($line,length($l));
1496 # be strict about not padding spaces according to
1499 next CHUNK
if ($l eq '');
1504 # if we get here none of the patterns matched $subl or less chars
1505 $self->warn("trouble dissecting \"$line\"\n into chunks ".
1506 "of $subl chars or less - this tag won't print right");
1507 # insert a space char to prevent infinite loops
1508 $line = substr($line,0,$subl) . " " . substr($line,$subl);
1510 my $s = shift @lines;
1511 ($self->_print("$pre1$s\n") || return) if $s;
1512 foreach my $s ( @lines ) {
1513 $self->_print("$pre2$s\n") || return;
1522 Usage : $obj->_post_sort($newval)
1524 Returns : value of _post_sort
1525 Args : newvalue (optional)
1534 $obj->{'_post_sort'} = $value;
1536 return $obj->{'_post_sort'};
1543 Usage : $obj->_show_dna($newval)
1545 Returns : value of _show_dna
1546 Args : newvalue (optional)
1555 $obj->{'_show_dna'} = $value;
1557 return $obj->{'_show_dna'};
1561 =head2 _id_generation_func
1563 Title : _id_generation_func
1564 Usage : $obj->_id_generation_func($newval)
1566 Returns : value of _id_generation_func
1567 Args : newvalue (optional)
1572 sub _id_generation_func
{
1576 $obj->{'_id_generation_func'} = $value;
1578 return $obj->{'_id_generation_func'};
1582 =head2 _ac_generation_func
1584 Title : _ac_generation_func
1585 Usage : $obj->_ac_generation_func($newval)
1587 Returns : value of _ac_generation_func
1588 Args : newvalue (optional)
1593 sub _ac_generation_func
{
1597 $obj->{'_ac_generation_func'} = $value;
1599 return $obj->{'_ac_generation_func'};
1603 =head2 _sv_generation_func
1605 Title : _sv_generation_func
1606 Usage : $obj->_sv_generation_func($newval)
1608 Returns : value of _sv_generation_func
1609 Args : newvalue (optional)
1614 sub _sv_generation_func
{
1618 $obj->{'_sv_generation_func'} = $value;
1620 return $obj->{'_sv_generation_func'};
1624 =head2 _kw_generation_func
1626 Title : _kw_generation_func
1627 Usage : $obj->_kw_generation_func($newval)
1629 Returns : value of _kw_generation_func
1630 Args : newvalue (optional)
1635 sub _kw_generation_func
{
1639 $obj->{'_kw_generation_func'} = $value;
1641 return $obj->{'_kw_generation_func'};