lib/Bio/SeqIO/embl.pm

   1 #
   2 # BioPerl module for Bio::SeqIO::EMBL
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
   7 #
   8 # Copyright Ewan Birney
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::SeqIO::embl - EMBL sequence input/output stream
  17
  18 =head1 SYNOPSIS
  19
  20 It is probably best not to use this object directly, but
  21 rather go through the SeqIO handler system. Go:
  22
  23     $stream = Bio::SeqIO->new(-file => $filename, -format => 'EMBL');
  24
  25     while ( (my $seq = $stream->next_seq()) ) {
  26         # do something with $seq
  27     }
  28
  29 =head1 DESCRIPTION
  30
  31 This object can transform Bio::Seq objects to and from EMBL flat
  32 file databases.
  33
  34 There is a lot of flexibility here about how to dump things which
  35 should be documented more fully.
  36
  37 There should be a common object that this and Genbank share (probably
  38 with Swissprot). Too much of the magic is identical.
  39
  40 =head2 Optional functions
  41
  42 =over 3
  43
  44 =item _show_dna()
  45
  46 (output only) shows the dna or not
  47
  48 =item _post_sort()
  49
  50 (output only) provides a sorting func which is applied to the FTHelpers
  51 before printing
  52
  53 =item _id_generation_func()
  54
  55 This is function which is called as
  56
  57    print "ID   ", $func($annseq), "\n";
  58
  59 To generate the ID line. If it is not there, it generates a sensible ID
  60 line using a number of tools.
  61
  62 If you want to output annotations in EMBL format they need to be
  63 stored in a Bio::Annotation::Collection object which is accessible
  64 through the Bio::SeqI interface method L<annotation()|annotation>.
  65
  66 The following are the names of the keys which are polled from a
  67 L<Bio::Annotation::Collection> object.
  68
  69  reference  - Should contain Bio::Annotation::Reference objects
  70  comment    - Should contain Bio::Annotation::Comment objects
  71  dblink     - Should contain Bio::Annotation::DBLink objects
  72
  73 =back
  74
  75 =head1 FEEDBACK
  76
  77 =head2 Mailing Lists
  78
  79 User feedback is an integral part of the evolution of this and other
  80 Bioperl modules. Send your comments and suggestions preferably to one
  81 of the Bioperl mailing lists.  Your participation is much appreciated.
  82
  83   bioperl-l@bioperl.org                  - General discussion
  84   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  85
  86 =head2 Support
  87
  88 Please direct usage questions or support issues to the mailing list:
  89
  90 I<bioperl-l@bioperl.org>
  91
  92 rather than to the module maintainer directly. Many experienced and
  93 reponsive experts will be able look at the problem and quickly
  94 address it. Please include a thorough description of the problem
  95 with code and data examples if at all possible.
  96
  97 =head2 Reporting Bugs
  98
  99 Report bugs to the Bioperl bug tracking system to help us keep track
 100 the bugs and their resolution. Bug reports can be submitted via
 101 the web:
 102
 103   https://github.com/bioperl/bioperl-live/issues
 104
 105 =head1 AUTHOR - Ewan Birney
 106
 107 Email birney@ebi.ac.uk
 108
 109 =head1 APPENDIX
 110
 111 The rest of the documentation details each of the object
 112 methods. Internal methods are usually preceded with a _
 113
 114 =cut
 115
 116
 117 # Let the code begin...
 118
 119
 120 package Bio::SeqIO::embl;
 121 use vars qw(%FTQUAL_NO_QUOTE);
 122 use strict;
 123 use Bio::SeqIO::FTHelper;
 124 use Bio::SeqFeature::Generic;
 125 use Bio::Species;
 126 use Bio::Seq::SeqFactory;
 127 use Bio::Annotation::Collection;
 128 use Bio::Annotation::Comment;
 129 use Bio::Annotation::Reference;
 130 use Bio::Annotation::DBLink;
 131
 132 use base qw(Bio::SeqIO);
 133
 134 # Note that a qualifier that exceeds one line (i.e. a long label) will
 135 # automatically be quoted regardless:
 136 %FTQUAL_NO_QUOTE=(
 137                   'anticodon'=>1,
 138                   'citation'=>1,
 139                   'codon'=>1,
 140                   'codon_start'=>1,
 141                   'cons_splice'=>1,
 142                   'direction'=>1,
 143                   'evidence'=>1,
 144                   'label'=>1,
 145                   'mod_base'=> 1,
 146                   'number'=> 1,
 147                   'rpt_type'=> 1,
 148                   'rpt_unit'=> 1,
 149                   'transl_except'=> 1,
 150                   'transl_table'=> 1,
 151                   'usedin'=> 1,
 152                  );
 153
 154 sub _initialize {
 155     my($self,@args) = @_;
 156
 157     $self->SUPER::_initialize(@args);
 158     # hash for functions for decoding keys.
 159     $self->{'_func_ftunit_hash'} = {};
 160     # sets this to one by default. People can change it
 161     $self->_show_dna(1);
 162     if ( ! defined $self->sequence_factory ) {
 163         $self->sequence_factory(Bio::Seq::SeqFactory->new
 164                                 (-verbose => $self->verbose(),
 165                                  -type => 'Bio::Seq::RichSeq'));
 166     }
 167 }
 168
 169 =head2 next_seq
 170
 171  Title   : next_seq
 172  Usage   : $seq = $stream->next_seq()
 173  Function: returns the next sequence in the stream
 174  Returns : Bio::Seq object
 175  Args    :
 176
 177 =cut
 178
 179 sub next_seq {
 180     my ($self,@args) = @_;
 181     my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div,
 182         $date, $comment, @date_arr);
 183
 184     my ($annotation, %params, @features) =
 185         Bio::Annotation::Collection->new();
 186
 187     $line = $self->_readline;
 188     # This needs to be before the first eof() test
 189
 190     if ( !defined $line ) {
 191         return;                 # no throws - end of file
 192     }
 193
 194     if ( $line =~ /^\s+$/ ) {
 195         while ( defined ($line = $self->_readline) ) {
 196             $line =~/^\S/ && last;
 197         }
 198         # return without error if the whole next sequence was just a single
 199         # blank line and then eof
 200         return unless $line;
 201     }
 202
 203     # no ID as 1st non-blank line, need short circuit and exit routine
 204     $self->throw("EMBL stream with no ID. Not embl in my book")
 205         unless $line =~ /^ID\s+\S+/;
 206
 207     # At this point we are sure that $line contains an ID header line
 208     my $alphabet;
 209     if ( $line =~ tr/;/;/ == 6) { # New style headers contain exactly six semicolons.
 210
 211         # New style header (EMBL Release >= 87, after June 2006)
 212         my $topology;
 213         my $sv;
 214
 215         # ID   DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP.
 216         # This regexp comes from the new2old.pl conversion script, from EBI
 217         if ($line =~ m/^ID   (\S+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) BP./) {
 218             ($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6);
 219         }
 220         if (defined $sv) {
 221             $params{'-seq_version'} = $sv;
 222             $params{'-version'} = $sv;
 223         }
 224
 225         if (defined $topology && $topology eq 'circular') {
 226             $params{'-is_circular'} = 1;
 227         }
 228
 229         if (defined $mol ) {
 230             if ($mol =~ /DNA/) {
 231                 $alphabet = 'dna';
 232             } elsif ($mol =~ /RNA/) {
 233                 $alphabet = 'rna';
 234             } elsif ($mol =~ /AA/) {
 235                 $alphabet = 'protein';
 236             }
 237         }
 238     } else {
 239
 240         # Old style header (EMBL Release < 87, before June 2006)
 241         if ($line =~ /^ID\s+(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;/) {
 242         ($name, $mol, $div) = ($1, $2, $3);
 243         }
 244
 245         if ($mol) {
 246             if ( $mol =~ /circular/ ) {
 247             $params{'-is_circular'} = 1;
 248             $mol =~  s|circular ||;
 249             }
 250             if (defined $mol ) {
 251             if ($mol =~ /DNA/) {
 252                 $alphabet='dna';
 253             } elsif ($mol =~ /RNA/) {
 254                 $alphabet='rna';
 255             } elsif ($mol =~ /AA/) {
 256                 $alphabet='protein';
 257             }
 258             }
 259         }
 260     }
 261
 262     unless( defined $name && length($name) ) {
 263     $name = "unknown_id";
 264     }
 265
 266     # $self->warn("not parsing upper annotation in EMBL file yet!");
 267     my $buffer = $line;
 268     local $_;
 269     BEFORE_FEATURE_TABLE :
 270           my $ncbi_taxid;
 271           until ( !defined $buffer ) {
 272               $_ = $buffer;
 273               # Exit at start of Feature table
 274               if ( /^(F[HT]|SQ)/ ) {
 275                   $self->_pushback($_) if( $1 eq 'SQ' || $1 eq 'FT');
 276                   last;
 277               }
 278               # Description line(s)
 279               if (/^DE\s+(\S.*\S)/) {
 280                   $desc .= $desc ? " $1" : $1;
 281               }
 282
 283               #accession number
 284               if ( /^AC\s+(.*)?/ || /^PA\s+(.*)?/) {
 285                   my @accs = split(/[; ]+/, $1); # allow space in addition
 286                   $params{'-accession_number'} = shift @accs
 287                       unless defined $params{'-accession_number'};
 288                   push @{$params{'-secondary_accessions'}}, @accs;
 289               }
 290
 291               #version number
 292               if ( /^SV\s+\S+\.(\d+);?/ ) {
 293                   my $sv = $1;
 294                   #$sv =~ s/\;//;
 295                   $params{'-seq_version'} = $sv;
 296                   $params{'-version'} = $sv;
 297               }
 298
 299               #date (NOTE: takes last date line)
 300               if ( /^DT\s+(.+)$/ ) {
 301                   my $line = $1;
 302                   my ($date, $version) = split(' ', $line, 2);
 303                   $date =~ tr/,//d; # remove comma if new version
 304                   if ($version) {
 305                   if ($version =~ /\(Rel\. (\d+), Created\)/ms ) {
 306                       my $release = Bio::Annotation::SimpleValue->new(
 307                                                                       -tagname    => 'creation_release',
 308                                                                       -value      => $1
 309                                                                      );
 310                       $annotation->add_Annotation($release);
 311                   } elsif ($version =~ /\(Rel\. (\d+), Last updated, Version (\d+)\)/ms ) {
 312                       my $release = Bio::Annotation::SimpleValue->new(
 313                                                                       -tagname    => 'update_release',
 314                                                                       -value      => $1
 315                                                                      );
 316                       $annotation->add_Annotation($release);
 317
 318                       my $update = Bio::Annotation::SimpleValue->new(
 319                                                                      -tagname    => 'update_version',
 320                                                                      -value      => $2
 321                                                                     );
 322                       $annotation->add_Annotation($update);
 323                   }
 324                   }
 325                   push @{$params{'-dates'}}, $date;
 326               }
 327
 328               #keywords
 329               if ( /^KW   (.*)\S*$/ ) {
 330                   my @kw = split(/\s*\;\s*/,$1);
 331                   push @{$params{'-keywords'}}, @kw;
 332               }
 333
 334               # Organism name and phylogenetic information
 335               elsif (/^O[SC]/) {
 336                   # pass the accession number so we can give an informative throw message if necessary
 337                   my $species = $self->_read_EMBL_Species(\$buffer, $params{'-accession_number'});
 338                   $params{'-species'}= $species;
 339               }
 340
 341               # NCBI TaxID Xref
 342               elsif (/^OX/) {
 343                   if (/NCBI_TaxID=(\d+)/) {
 344                       $ncbi_taxid=$1;
 345                   }
 346
 347                   my @links = $self->_read_EMBL_TaxID_DBLink(\$buffer);
 348                   foreach my $dblink ( @links ) {
 349                       $annotation->add_Annotation('dblink',$dblink);
 350                   }
 351               }
 352
 353               # References
 354               elsif (/^R/) {
 355                   my @refs = $self->_read_EMBL_References(\$buffer);
 356                   foreach my $ref ( @refs ) {
 357                       $annotation->add_Annotation('reference',$ref);
 358                   }
 359               }
 360
 361               # DB Xrefs
 362               elsif (/^DR/) {
 363                   my @links = $self->_read_EMBL_DBLink(\$buffer);
 364                   foreach my $dblink ( @links ) {
 365                       $annotation->add_Annotation('dblink',$dblink);
 366                   }
 367               }
 368
 369               # Comments
 370               elsif (/^CC\s+(.*)/) {
 371                   $comment .= $1;
 372                   $comment .= " ";
 373                   while (defined ($_ = $self->_readline) ) {
 374                       if (/^CC\s+(.*)/) {
 375                           $comment .= $1;
 376                           $comment .= " ";
 377                       } else {
 378                           last;
 379                       }
 380                   }
 381                   my $commobj = Bio::Annotation::Comment->new();
 382                   $commobj->text($comment);
 383                   $annotation->add_Annotation('comment',$commobj);
 384                   $comment = "";
 385               }
 386
 387               # Get next line.
 388               $buffer = $self->_readline;
 389           }
 390
 391     while ( defined ($_ = $self->_readline) ) {
 392         /^FT\s{3}\w/ && last;
 393         /^SQ / && last;
 394         /^CO / && last;
 395     }
 396     $buffer = $_;
 397
 398     if (defined($buffer) && $buffer =~ /^FT /) {
 399         until ( !defined ($buffer) ) {
 400             my $ftunit = $self->_read_FTHelper_EMBL(\$buffer);
 401
 402             # process ftunit
 403             my $feat =
 404                 $ftunit->_generic_seqfeature($self->location_factory(), $name);
 405
 406             # add taxon_id from source if available
 407             # Notice, this will override what is found in the OX line.
 408             # this is by design as this seems to be the official way
 409             # of specifying a TaxID
 410             if ($params{'-species'} && ($feat->primary_tag eq 'source')
 411                 && $feat->has_tag('db_xref')
 412                 && (! $params{'-species'}->ncbi_taxid())) {
 413                 foreach my $tagval ($feat->get_tag_values('db_xref')) {
 414                     if (index($tagval,"taxon:") == 0) {
 415                         $params{'-species'}->ncbi_taxid(substr($tagval,6));
 416                         last;
 417                     }
 418                 }
 419             }
 420
 421             # add feature to list of features
 422             push(@features, $feat);
 423
 424             if ( $buffer !~ /^FT/ ) {
 425                 last;
 426             }
 427         }
 428     }
 429     # Set taxid found in OX line
 430     if ($params{'-species'} && defined $ncbi_taxid
 431         && (! $params{'-species'}->ncbi_taxid())) {
 432         $params{'-species'}->ncbi_taxid($ncbi_taxid);
 433     }
 434
 435     # skip comments
 436     while ( defined ($buffer) && $buffer =~ /^XX/ ) {
 437         $buffer = $self->_readline();
 438     }
 439
 440     if ( $buffer =~ /^CO/  ) {
 441         # bug#2982
 442         # special : create contig as annotation
 443         while ( defined ($buffer) ) {
 444             $annotation->add_Annotation($_) for $self->_read_EMBL_Contig(\$buffer);
 445             if ( !$buffer || $buffer !~ /^CO/ ) {
 446                 last;
 447             }
 448         }
 449         $buffer ||= '';
 450     }
 451     if ($buffer !~ /^\/\//) { # if no SQ lines following CO (bug#2958)
 452     if ( $buffer !~ /^SQ/ ) {
 453         while ( defined ($_ = $self->_readline) ) {
 454             /^SQ/ && last;
 455         }
 456     }
 457     $seqc = "";
 458     while ( defined ($_ = $self->_readline) ) {
 459         m{^//} && last;
 460         $_ = uc($_);
 461         s/[^A-Za-z]//g;
 462         $seqc .= $_;
 463     }
 464 }
 465     my $seq = $self->sequence_factory->create
 466         (-verbose => $self->verbose(),
 467          -division => $div,
 468          -seq => $seqc,
 469          -desc => $desc,
 470          -display_id => $name,
 471          -annotation => $annotation,
 472          -molecule => $mol,
 473          -alphabet => $alphabet,
 474          -features => \@features,
 475          %params);
 476     return $seq;
 477 }
 478
 479
 480
 481 =head2 _write_ID_line
 482
 483  Title   : _write_ID_line
 484  Usage   : $self->_write_ID_line($seq);
 485  Function: Writes the EMBL Release 87 format ID line to the stream, unless
 486          : there is a user-supplied ID line generation function in which
 487          : case that is used instead.
 488          : ( See Bio::SeqIO::embl::_id_generation_function(). )
 489  Returns : nothing
 490  Args    : Bio::Seq object
 491
 492 =cut
 493
 494 sub _write_ID_line {
 495
 496     my ($self, $seq) = @_;
 497
 498     my $id_line;
 499     # If there is a user-supplied ID generation function, use it.
 500     if ( $self->_id_generation_func ) {
 501         $id_line = "ID   " . &{$self->_id_generation_func}($seq) . "\nXX\n";
 502     }
 503     # Otherwise, generate a standard EMBL release 87 (June 2006) ID line.
 504     else {
 505
 506         # The sequence name is supposed to be the primary accession number,
 507         my $name = $seq->accession_number();
 508         if ( not(defined $name) || $name eq 'unknown') {
 509             # but if it is not present, use the sequence ID or the empty string
 510             $name = $seq->id() || '';
 511         }
 512
 513         $self->warn("No whitespace allowed in EMBL id [". $name. "]") if $name =~ /\s/;
 514
 515         # Use the sequence version, or default to 1.
 516         my $version = $seq->version() || 1;
 517
 518         my $len = $seq->length();
 519
 520         # Taxonomic division.
 521         my $div;
 522         if ( $seq->can('division') && defined($seq->division) &&
 523              $self->_is_valid_division($seq->division) ) {
 524             $div = $seq->division();
 525         } else {
 526             $div ||= 'UNC';     # 'UNC' is the EMBL division code for 'unclassified'.
 527         }
 528
 529         my $mol;
 530         # If the molecule type is a valid EMBL type, use it.
 531         if (  $seq->can('molecule')
 532               && defined($seq->molecule)
 533               && $self->_is_valid_molecule_type($seq->molecule)
 534            ) {
 535             $mol = $seq->molecule();
 536         }
 537         # Otherwise, choose unassigned DNA or RNA based on the alphabet.
 538         elsif ($seq->can('primary_seq') && defined $seq->primary_seq->alphabet) {
 539             my $alphabet =$seq->primary_seq->alphabet;
 540             if ($alphabet eq 'dna') {
 541                 $mol ='unassigned DNA';
 542             } elsif ($alphabet eq 'rna') {
 543                 $mol='unassigned RNA';
 544             } elsif ($alphabet eq 'protein') {
 545                 $self->warn("Protein sequence found; EMBL is a nucleotide format.");
 546                 $mol='AA';  # AA is not a valid EMBL molecule type.
 547             }
 548         }
 549
 550         my $topology = 'linear';
 551         if ($seq->is_circular) {
 552             $topology = 'circular';
 553         }
 554
 555         $mol ||= '';            # 'unassigned'; ?
 556         $id_line = "ID   $name; SV $version; $topology; $mol; STD; $div; $len BP.\nXX\n";
 557     }
 558     $self->_print($id_line);
 559 }
 560
 561 =head2 _is_valid_division
 562
 563  Title   : _is_valid_division
 564  Usage   : $self->_is_valid_division($div)
 565  Function: tests division code for validity
 566  Returns : true if $div is a valid EMBL release 87 taxonomic division.
 567  Args    : taxonomic division code string
 568
 569 =cut
 570
 571 sub _is_valid_division {
 572     my ($self, $division) = @_;
 573
 574     my %EMBL_divisions = (
 575                           "PHG"    => 1, # Bacteriophage
 576                           "ENV"    => 1, # Environmental Sample
 577                           "FUN"    => 1, # Fungal
 578                           "HUM"    => 1, # Human
 579                           "INV"    => 1, # Invertebrate
 580                           "MAM"    => 1, # Other Mammal
 581                           "VRT"    => 1, # Other Vertebrate
 582                           "MUS"    => 1, # Mus musculus
 583                           "PLN"    => 1, # Plant
 584                           "PRO"    => 1, # Prokaryote
 585                           "ROD"    => 1, # Other Rodent
 586                           "SYN"    => 1, # Synthetic
 587                           "UNC"    => 1, # Unclassified
 588                           "VRL"    => 1 # Viral
 589                          );
 590
 591     return exists($EMBL_divisions{$division});
 592 }
 593
 594 =head2 _is_valid_molecule_type
 595
 596  Title   : _is_valid_molecule_type
 597  Usage   : $self->_is_valid_molecule_type($mol)
 598  Function: tests molecule type for validity
 599  Returns : true if $mol is a valid EMBL release 87 molecule type.
 600  Args    : molecule type string
 601
 602 =cut
 603
 604 sub _is_valid_molecule_type {
 605     my ($self, $moltype) = @_;
 606
 607     my %EMBL_molecule_types = (
 608                                "genomic DNA"    => 1,
 609                                "genomic RNA"    => 1,
 610                                "mRNA"           => 1,
 611                                "tRNA"           => 1,
 612                                "rRNA"           => 1,
 613                                "snoRNA"         => 1,
 614                                "snRNA"          => 1,
 615                                "scRNA"          => 1,
 616                                "pre-RNA"        => 1,
 617                                "other RNA"      => 1,
 618                                "other DNA"      => 1,
 619                                "unassigned DNA" => 1,
 620                                "unassigned RNA" => 1
 621                               );
 622
 623     return exists($EMBL_molecule_types{$moltype});
 624 }
 625
 626 =head2 write_seq
 627
 628  Title   : write_seq
 629  Usage   : $stream->write_seq($seq)
 630  Function: writes the $seq object (must be seq) to the stream
 631  Returns : 1 for success and undef for error
 632  Args    : array of 1 to n Bio::SeqI objects
 633
 634
 635 =cut
 636
 637 sub write_seq {
 638     my ($self,@seqs) = @_;
 639
 640     foreach my $seq ( @seqs ) {
 641         $self->throw("Attempting to write with no seq!") unless defined $seq;
 642         unless ( ref $seq && $seq->isa('Bio::SeqI' ) ) {
 643             $self->warn("$seq is not a SeqI compliant sequence object!")
 644                 if $self->verbose >= 0;
 645             unless ( ref $seq && $seq->isa('Bio::PrimarySeqI' ) ) {
 646                 $self->throw("$seq is not a PrimarySeqI compliant sequence object!");
 647             }
 648         }
 649         my $str = $seq->seq || '';
 650
 651         # Write the ID line.
 652         $self->_write_ID_line($seq);
 653
 654
 655         # Write the accession line if present
 656         my( $acc );
 657         {
 658             if ( my $func = $self->_ac_generation_func ) {
 659                 $acc = &{$func}($seq);
 660             } elsif ( $seq->isa('Bio::Seq::RichSeqI') &&
 661                       defined($seq->accession_number) ) {
 662                 $acc = $seq->accession_number;
 663                 $acc = join("; ", $acc, $seq->get_secondary_accessions);
 664             } elsif ( $seq->can('accession_number') ) {
 665                 $acc = $seq->accession_number;
 666             }
 667
 668             if (defined $acc) {
 669                 $self->_print("AC   $acc;\n",
 670                               "XX\n") || return;
 671             }
 672         }
 673
 674         # Date lines
 675         my $switch=0;
 676         if ( $seq->can('get_dates') ) {
 677             my @dates =  $seq->get_dates();
 678             my $ct = 1;
 679             my $date_flag = 0;
 680             my ($cr) = $seq->annotation->get_Annotations("creation_release");
 681             my ($ur) = $seq->annotation->get_Annotations("update_release");
 682             my ($uv) = $seq->annotation->get_Annotations("update_version");
 683
 684             unless ($cr && $ur && $ur) {
 685                 $date_flag = 1;
 686             }
 687
 688             foreach my $dt (@dates) {
 689                 if (!$date_flag) {
 690                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 691                                                   $dt." (Rel. ".($cr->value()).", Created)",
 692                                                   '\s+|$',80) if $ct == 1;
 693                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 694                                                   $dt." (Rel. ".($ur->value()).", Last updated, Version ".($uv->value()).")",
 695                                                   '\s+|$',80) if $ct == 2;
 696                 } else {        # other formats?
 697                     $self->_write_line_EMBL_regex("DT   ","DT   ",
 698                                                   $dt,'\s+|$',80);
 699                 }
 700                 $switch =1;
 701                 $ct++;
 702             }
 703             if ($switch == 1) {
 704                 $self->_print("XX\n") || return;
 705             }
 706         }
 707
 708         # Description lines
 709         $self->_write_line_EMBL_regex("DE   ","DE   ",$seq->desc(),'\s+|$',80) || return; #'
 710         $self->_print( "XX\n") || return;
 711
 712         # if there, write the kw line
 713         {
 714             my( $kw );
 715             if ( my $func = $self->_kw_generation_func ) {
 716                 $kw = &{$func}($seq);
 717             } elsif ( $seq->can('keywords') ) {
 718                 $kw = $seq->keywords;
 719             }
 720             if (defined $kw) {
 721                 $self->_write_line_EMBL_regex("KW   ", "KW   ", $kw, '\s+|$', 80) || return; #'
 722                 $self->_print( "XX\n") || return;
 723             }
 724         }
 725
 726         # Organism lines
 727
 728         if ($seq->can('species') && (my $spec = $seq->species)) {
 729             my @class = $spec->classification();
 730             shift @class;       # get rid of species name. Some embl files include
 731                                 # the species name in the OC lines, but this seems
 732                                 # more like an error than something we need to
 733                                 # emulate
 734             my $OS = $spec->scientific_name;
 735             if ($spec->common_name) {
 736                 $OS .= ' ('.$spec->common_name.')';
 737             }
 738             $self->_print("OS   $OS\n") || return;
 739             my $OC = join('; ', reverse(@class)) .'.';
 740             $self->_write_line_EMBL_regex("OC   ","OC   ",$OC,'; |$',80) || return;
 741             if ($spec->organelle) {
 742                 $self->_write_line_EMBL_regex("OG   ","OG   ",$spec->organelle,'; |$',80) || return;
 743             }
 744             my $ncbi_taxid = $spec->ncbi_taxid;
 745             if ($ncbi_taxid) {
 746                 $self->_print("OX   NCBI_TaxID=$ncbi_taxid\n") || return;
 747             }
 748             $self->_print("XX\n") || return;
 749         }
 750         # Reference lines
 751         my $t = 1;
 752         if ( $seq->can('annotation') && defined $seq->annotation ) {
 753             foreach my $ref ( $seq->annotation->get_Annotations('reference') ) {
 754                 $self->_print( "RN   [$t]\n") || return;
 755
 756                 # Having no RP line is legal, but we need both
 757                 # start and end for a valid location.
 758                 if ($ref->comment) {
 759                     $self->_write_line_EMBL_regex("RC   ", "RC   ", $ref->comment, '\s+|$', 80) || return; #'
 760                 }
 761                 my $start = $ref->start;
 762                 my $end   = $ref->end;
 763                 if ($start and $end) {
 764                     $self->_print( "RP   $start-$end\n") || return;
 765                 } elsif ($start or $end) {
 766                     $self->throw("Both start and end are needed for a valid RP line.".
 767                                  "  Got: start='$start' end='$end'");
 768                 }
 769
 770                 if (my $med = $ref->medline) {
 771                     $self->_print( "RX   MEDLINE; $med.\n") || return;
 772                 }
 773                 if (my $pm = $ref->pubmed) {
 774                     $self->_print( "RX   PUBMED; $pm.\n") || return;
 775                 }
 776                 my $authors = $ref->authors;
 777                 $authors =~ s/([\w\.]) (\w)/$1#$2/g;  # add word wrap protection char '#'
 778
 779                 $self->_write_line_EMBL_regex("RA   ", "RA   ",
 780                                               $authors . ";",
 781                                               '\s+|$', 80) || return; #'
 782
 783                 # If there is no title to the reference, it appears
 784                 # as a single semi-colon.  All titles must end in
 785                 # a semi-colon.
 786                 my $ref_title = $ref->title || '';
 787                 $ref_title =~ s/[\s;]*$/;/;
 788                 $self->_write_line_EMBL_regex("RT   ", "RT   ", $ref_title,    '\s+|$', 80) || return; #'
 789                 $self->_write_line_EMBL_regex("RL   ", "RL   ", $ref->location, '\s+|$', 80) || return; #'
 790                 $self->_print("XX\n") || return;
 791                 $t++;
 792             }
 793
 794             # DB Xref lines
 795             if (my @db_xref = $seq->annotation->get_Annotations('dblink') ) {
 796                 for my $dr (@db_xref) {
 797                     my $db_name = $dr->database;
 798                     my $prim    = $dr->primary_id;
 799
 800                     my $opt     = $dr->optional_id || '';
 801                     my $line = $opt ? "$db_name; $prim; $opt." : "$db_name; $prim.";
 802                     $self->_write_line_EMBL_regex("DR   ", "DR   ", $line, '\s+|$', 80) || return; #'
 803                 }
 804                 $self->_print("XX\n") || return;
 805             }
 806
 807             # Comment lines
 808             foreach my $comment ( $seq->annotation->get_Annotations('comment') ) {
 809                 $self->_write_line_EMBL_regex("CC   ", "CC   ", $comment->text, '\s+|$', 80) || return; #'
 810                 $self->_print("XX\n") || return;
 811             }
 812         }
 813         # "\\s\+\|\$"
 814
 815         ## FEATURE TABLE
 816
 817         $self->_print("FH   Key             Location/Qualifiers\n") || return;
 818         $self->_print("FH\n") || return;
 819
 820         my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : ();
 821         if ($feats[0]) {
 822             if ( defined $self->_post_sort ) {
 823                 # we need to read things into an array.
 824                 # Process. Sort them. Print 'em
 825
 826                 my $post_sort_func = $self->_post_sort();
 827                 my @fth;
 828
 829                 foreach my $sf ( @feats ) {
 830                     push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq));
 831                 }
 832
 833                 @fth = sort { &$post_sort_func($a,$b) } @fth;
 834
 835                 foreach my $fth ( @fth ) {
 836                     $self->_print_EMBL_FTHelper($fth) || return;
 837                 }
 838             } else {
 839                 # not post sorted. And so we can print as we get them.
 840                 # lower memory load...
 841
 842                 foreach my $sf ( @feats ) {
 843                     my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq);
 844                     foreach my $fth ( @fth ) {
 845                         if ( $fth->key eq 'CONTIG') {
 846                             $self->_show_dna(0);
 847                         }
 848                         $self->_print_EMBL_FTHelper($fth) || return;
 849                     }
 850                 }
 851             }
 852         }
 853
 854         if ( $self->_show_dna() == 0 ) {
 855             $self->_print( "//\n") || return;
 856             return;
 857         }
 858         $self->_print( "XX\n") || return;
 859
 860         # finished printing features.
 861
 862         # print contig if present : bug#2982
 863         if ( $seq->can('annotation') && defined $seq->annotation) {
 864             foreach my $ctg ( $seq->annotation->get_Annotations('contig') ) {
 865                 if ($ctg->value) {
 866                     $self->_write_line_EMBL_regex("CO   ","CO   ", $ctg->value,
 867                                                   '[,]|$', 80) || return;
 868                 }
 869             }
 870         }
 871         # print sequence lines only if sequence is present! bug#2982
 872         if (length($str)) {
 873             $str =~ tr/A-Z/a-z/;
 874
 875             # Count each nucleotide
 876             my $alen = $str =~ tr/a/a/;
 877             my $clen = $str =~ tr/c/c/;
 878             my $glen = $str =~ tr/g/g/;
 879             my $tlen = $str =~ tr/t/t/;
 880
 881             my $len = $seq->length();
 882             my $olen = $seq->length() - ($alen + $tlen + $clen + $glen);
 883             if ( $olen < 0 ) {
 884                 $self->warn("Weird. More atgc than bases. Problem!");
 885             }
 886
 887             $self->_print("SQ   Sequence $len BP; $alen A; $clen C; $glen G; $tlen T; $olen other;\n") || return;
 888
 889             my $nuc = 60;       # Number of nucleotides per line
 890             my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line
 891             my $out_pat   = 'A11' x 6; # Pattern for packing a line
 892             my $length = length($str);
 893
 894             # Calculate the number of nucleotides which fit on whole lines
 895             my $whole = int($length / $nuc) * $nuc;
 896
 897             # Print the whole lines
 898             my( $i );
 899             for ($i = 0; $i < $whole; $i += $nuc) {
 900                 my $blocks = pack $out_pat,
 901                 unpack $whole_pat,
 902                 substr($str, $i, $nuc);
 903                 $self->_print(sprintf("     $blocks%9d\n", $i + $nuc)) || return;
 904             }
 905
 906             # Print the last line
 907             if (my $last = substr($str, $i)) {
 908                 my $last_len = length($last);
 909                 my $last_pat = 'a10' x int($last_len / 10) .'a'. $last_len % 10;
 910                 my $blocks = pack $out_pat,
 911                 unpack($last_pat, $last);
 912                 $self->_print(sprintf("     $blocks%9d\n", $length)) ||
 913                     return;         # Add the length to the end
 914             }
 915         }
 916
 917         $self->_print( "//\n") || return;
 918
 919         $self->flush if $self->_flush_on_write && defined $self->_fh;
 920     }
 921     return 1;
 922 }
 923
 924 =head2 _print_EMBL_FTHelper
 925
 926  Title   : _print_EMBL_FTHelper
 927  Usage   :
 928  Function: Internal function
 929  Returns : 1 if writing succeeded, otherwise undef
 930  Args    :
 931
 932
 933 =cut
 934
 935 sub _print_EMBL_FTHelper {
 936     my ($self,$fth) = @_;
 937
 938     if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) {
 939         $fth->warn("$fth is not a FTHelper class. Attempting to print, but there could be tears!");
 940     }
 941
 942
 943     #$self->_print( "FH   Key             Location/Qualifiers\n");
 944     #$self->_print( sprintf("FT   %-15s  %s\n",$fth->key,$fth->loc));
 945     # let
 946     if ( $fth->key eq 'CONTIG' ) {
 947         $self->_print("XX\n") || return;
 948         $self->_write_line_EMBL_regex("CO   ",
 949                                       "CO   ",$fth->loc,
 950                                       '\,|$',80) || return; #'
 951         return 1;
 952     }
 953     $self->_write_line_EMBL_regex(sprintf("FT   %-15s ",$fth->key),
 954                                   "FT                   ",$fth->loc,
 955                                   '\,|$',80) || return; #'
 956     foreach my $tag (sort keys %{$fth->field} ) {
 957         if ( ! defined $fth->field->{$tag} ) {
 958             next;
 959         }
 960         foreach my $value ( @{$fth->field->{$tag}} ) {
 961             $value =~ s/\"/\"\"/g;
 962             if ($value eq "_no_value") {
 963                 $self->_write_line_EMBL_regex("FT                   ",
 964                                               "FT                   ",
 965                                               "/$tag",'.|$',80) || return; #'
 966             }
 967             # there are almost 3x more quoted qualifier values and they
 968             # are more common too so we take quoted ones first
 969             #
 970             # Long qualifiers, that will be line wrapped, are always quoted
 971             elsif (!$FTQUAL_NO_QUOTE{$tag} or length("/$tag=$value")>=60) {
 972                 my $pat = $value =~ /\s+/ ? '\s+|\-|$' : '.|\-|$';
 973                 $self->_write_line_EMBL_regex("FT                   ",
 974                                               "FT                   ",
 975                                               "/$tag=\"$value\"",$pat,80) || return;
 976             } else {
 977                 $self->_write_line_EMBL_regex("FT                   ",
 978                                               "FT                   ",
 979                                               "/$tag=$value",'.|$',80) || return; #'
 980                                           }
 981             }
 982         }
 983
 984         return 1;
 985     }
 986
 987
 988
 989 =head2 _read_EMBL_Contig()
 990
 991  Title   : _read_EMBL_Contig
 992  Usage   :
 993  Function: convert CO lines into annotations
 994  Returns :
 995  Args    :
 996
 997 =cut
 998
 999 sub _read_EMBL_Contig {
1000     my ($self, $buffer) = @_;
1001     my @ret;
1002     if ( $$buffer !~ /^CO/ ) {
1003         warn("Not parsing line '$$buffer' which maybe important");
1004     }
1005     $self->_pushback($$buffer);
1006     while ( defined ($_ = $self->_readline) ) {
1007         /^C/ || last;
1008         /^CO\s+(.*)/ && do {
1009         push @ret, Bio::Annotation::SimpleValue->new( -tagname => 'contig',
1010                                                       -value => $1);
1011         };
1012     }
1013     $$buffer = $_;
1014     return @ret;
1015
1016 }
1017 #'
1018 =head2 _read_EMBL_References
1019
1020  Title   : _read_EMBL_References
1021  Usage   :
1022  Function: Reads references from EMBL format. Internal function really
1023  Example :
1024  Returns :
1025  Args    :
1026
1027
1028 =cut
1029
1030 sub _read_EMBL_References {
1031     my ($self,$buffer) = @_;
1032     my (@refs);
1033
1034     # assume things are starting with RN
1035
1036     if ( $$buffer !~ /^RN/ ) {
1037         warn("Not parsing line '$$buffer' which maybe important");
1038     }
1039     my $b1;
1040     my $b2;
1041     my $title;
1042     my $loc;
1043     my $au;
1044     my $med;
1045     my $pm;
1046     my $com;
1047
1048     while ( defined ($_ = $self->_readline) ) {
1049         /^R/ || last;
1050         /^RP   (\d+)-(\d+)/ && do {$b1=$1;$b2=$2;};
1051         /^RX   MEDLINE;\s+(\d+)/ && do {$med=$1};
1052         /^RX   PUBMED;\s+(\d+)/ && do {$pm=$1};
1053         /^RA   (.*)/ && do {
1054             $au = $self->_concatenate_lines($au,$1); next;
1055         };
1056         /^RT   (.*)/ && do {
1057             $title = $self->_concatenate_lines($title,$1); next;
1058         };
1059         /^RL   (.*)/ && do {
1060             $loc = $self->_concatenate_lines($loc,$1); next;
1061         };
1062         /^RC   (.*)/ && do {
1063             $com = $self->_concatenate_lines($com,$1); next;
1064         };
1065     }
1066
1067     my $ref = Bio::Annotation::Reference->new();
1068     $au =~ s/;\s*$//g;
1069     $title =~ s/;\s*$//g;
1070
1071     $ref->start($b1);
1072     $ref->end($b2);
1073     $ref->authors($au);
1074     $ref->title($title);
1075     $ref->location($loc);
1076     $ref->medline($med);
1077     $ref->comment($com);
1078     $ref->pubmed($pm);
1079
1080     push(@refs,$ref);
1081     $$buffer = $_;
1082
1083     return @refs;
1084 }
1085
1086 =head2 _read_EMBL_Species
1087
1088  Title   : _read_EMBL_Species
1089  Usage   :
1090  Function: Reads the EMBL Organism species and classification
1091            lines.
1092  Example :
1093  Returns : A Bio::Species object
1094  Args    : a reference to the current line buffer, accession number
1095
1096 =cut
1097
1098 sub _read_EMBL_Species {
1099     my( $self, $buffer, $acc ) = @_;
1100     my $org;
1101
1102     $_ = $$buffer;
1103     my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
1104     while (defined( $_ ||= $self->_readline )) {
1105         if (/^OS\s+(.+)/) {
1106             $sci_name .= ($sci_name) ? ' '.$1 : $1;
1107         } elsif (s/^OC\s+(.+)$//) {
1108             $class_lines .= $1;
1109         } elsif (/^OG\s+(.*)/) {
1110             $org = $1;
1111         } else {
1112             last;
1113         }
1114
1115         $_ = undef;             # Empty $_ to trigger read of next line
1116     }
1117
1118 #    $$buffer = $_;
1119         $self->_pushback($_);
1120
1121     $sci_name =~ s{\.$}{};
1122     $sci_name || return;
1123
1124     # Convert data in classification lines into classification array.
1125     # only split on ';' or '.' so that classification that is 2 or more words
1126     # will still get matched, use map() to remove trailing/leading/intervening
1127     # spaces
1128     my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /(?<!subgen)[;\.]+/, $class_lines;
1129
1130     # do we have a genus?
1131     my $possible_genus = $class[-1];
1132     $possible_genus .= "|$class[-2]" if $class[-2];
1133     if ($sci_name =~ /^($possible_genus)/) {
1134         $genus = $1;
1135         ($species) = $sci_name =~ /^$genus\s+(.+)/;
1136     } else {
1137         $species = $sci_name;
1138     }
1139
1140     # Don't make a species object if it is "Unknown" or "None"
1141     if ($genus) {
1142         return if $genus =~ /^(Unknown|None)$/i;
1143     }
1144
1145     # is this organism of rank species or is it lower?
1146     # (doesn't catch everything, but at least the guess isn't dangerous)
1147     if ($species =~ /subsp\.|var\./) {
1148         ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
1149     }
1150
1151     # sometimes things have common name in brackets, like
1152     # Schizosaccharomyces pombe (fission yeast), so get rid of the common
1153     # name bit. Probably dangerous if real scientific species name ends in
1154     # bracketed bit.
1155     unless ($class[-1] eq 'Viruses') {
1156         ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
1157         $sci_name =~ s/\s+\(.+\)$// if $common;
1158     }
1159
1160     # Bio::Species array needs array in Species -> Kingdom direction
1161     unless ($class[-1] eq $sci_name) {
1162         push(@class, $sci_name);
1163     }
1164     @class = reverse @class;
1165
1166     # do minimal sanity checks before we hand off to Bio::Species which won't
1167     # be able to give informative throw messages if it has to throw because
1168     # of problems here
1169     $self->throw("$acc seems to be missing its OS line: invalid.") unless $sci_name;
1170     my %names;
1171     foreach my $i (0..$#class) {
1172         my $name = $class[$i];
1173         $names{$name}++;
1174         # this code breaks examples like: Xenopus (Silurana) tropicalis
1175         # commenting out, see bug 3158
1176
1177         #if ($names{$name} > 1 && ($name ne $class[$i - 1])) {
1178         #    $self->warn("$acc seems to have an invalid species classification:$name ne $class[$i - 1]");
1179         #}
1180     }
1181     my $make = Bio::Species->new();
1182     $make->scientific_name($sci_name);
1183     $make->classification(@class);
1184     unless ($class[-1] eq 'Viruses') {
1185         $make->genus($genus) if $genus;
1186         $make->species($species) if $species;
1187         $make->sub_species($sub_species) if $sub_species;
1188         $make->common_name($common) if $common;
1189     }
1190     $make->organelle($org) if $org;
1191     return $make;
1192 }
1193
1194 =head2 _read_EMBL_DBLink
1195
1196  Title   : _read_EMBL_DBLink
1197  Usage   :
1198  Function: Reads the EMBL database cross reference ("DR") lines
1199  Example :
1200  Returns : A list of Bio::Annotation::DBLink objects
1201  Args    :
1202
1203 =cut
1204
1205 sub _read_EMBL_DBLink {
1206     my( $self,$buffer ) = @_;
1207     my( @db_link );
1208
1209     $_ = $$buffer;
1210     while (defined( $_ ||= $self->_readline )) {
1211         if ( /^DR   ([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?\.$/) {
1212         my ($databse, $prim_id, $sec_id) = ($1,$2,$3);
1213         my $link = Bio::Annotation::DBLink->new(-database    => $databse,
1214                             -primary_id  => $prim_id,
1215                             -optional_id => $sec_id);
1216
1217             push(@db_link, $link);
1218     } else {
1219             last;
1220         }
1221         $_ = undef;             # Empty $_ to trigger read of next line
1222     }
1223
1224     $$buffer = $_;
1225     return @db_link;
1226 }
1227
1228 =head2 _read_EMBL_TaxID_DBLink
1229
1230  Title   : _read_EMBL_TaxID_DBLink
1231  Usage   :
1232  Function: Reads the EMBL database cross reference to NCBI TaxID ("OX") lines
1233  Example :
1234  Returns : A list of Bio::Annotation::DBLink objects
1235  Args    :
1236
1237 =cut
1238
1239 sub _read_EMBL_TaxID_DBLink {
1240     my( $self,$buffer ) = @_;
1241     my( @db_link );
1242
1243     $_ = $$buffer;
1244     while (defined( $_ ||= $self->_readline )) {
1245         if ( /^OX   (\S+)=(\d+);$/ ) {
1246             my ($databse, $prim_id) = ($1,$2);
1247             my $link = Bio::Annotation::DBLink->new(-database    => $databse,
1248                                                     -primary_id  => $prim_id,);
1249             push(@db_link, $link);
1250         } else {
1251             last;
1252         }
1253         $_ = undef;             # Empty $_ to trigger read of next line
1254     }
1255
1256     $$buffer = $_;
1257     return @db_link;
1258 }
1259
1260 =head2 _filehandle
1261
1262  Title   : _filehandle
1263  Usage   : $obj->_filehandle($newval)
1264  Function:
1265  Example :
1266  Returns : value of _filehandle
1267  Args    : newvalue (optional)
1268
1269
1270 =cut
1271
1272 sub _filehandle{
1273     my ($obj,$value) = @_;
1274     if ( defined $value) {
1275         $obj->{'_filehandle'} = $value;
1276     }
1277     return $obj->{'_filehandle'};
1278
1279 }
1280
1281 =head2 _read_FTHelper_EMBL
1282
1283  Title   : _read_FTHelper_EMBL
1284  Usage   : _read_FTHelper_EMBL($buffer)
1285  Function: reads the next FT key line
1286  Example :
1287  Returns : Bio::SeqIO::FTHelper object
1288  Args    : filehandle and reference to a scalar
1289
1290
1291 =cut
1292
1293 sub _read_FTHelper_EMBL {
1294     my ($self,$buffer) = @_;
1295
1296     my ($key,                   # The key of the feature
1297         $loc,                   # The location line from the feature
1298         @qual,                  # An arrray of lines making up the qualifiers
1299        );
1300
1301     if ($$buffer =~ /^FT\s{3}(\S+)\s+(\S+)/ ) {
1302         $key = $1;
1303         $loc = $2;
1304         # Read all the lines up to the next feature
1305         while ( defined($_ = $self->_readline) ) {
1306             if (/^FT(\s+)(.+?)\s*$/) {
1307                 # Lines inside features are preceeded by 19 spaces
1308                 # A new feature is preceeded by 3 spaces
1309                 if (length($1) > 4) {
1310                     # Add to qualifiers if we're in the qualifiers
1311                     if (@qual) {
1312                         push(@qual, $2);
1313                     }
1314                     # Start the qualifier list if it's the first qualifier
1315                     elsif (substr($2, 0, 1) eq '/') {
1316                         @qual = ($2);
1317                     }
1318                     # We're still in the location line, so append to location
1319                     else {
1320                         $loc .= $2;
1321                     }
1322                 } else {
1323                     # We've reached the start of the next feature
1324                     last;
1325                 }
1326             } else {
1327                 # We're at the end of the feature table
1328                 last;
1329             }
1330         }
1331     } elsif ( $$buffer =~ /^CO\s+(\S+)/) {
1332     $key = 'CONTIG';
1333     $loc = $1;
1334     # Read all the lines up to the next feature
1335     while ( defined($_ = $self->_readline) ) {
1336         if (/^CO\s+(\S+)\s*$/) {
1337         $loc .= $1;
1338         } else {
1339         # We've reached the start of the next feature
1340         last;
1341         }
1342     }
1343     } else {
1344         # No feature key
1345         return;
1346     }
1347
1348     # Put the first line of the next feature into the buffer
1349     $$buffer = $_;
1350
1351     # Make the new FTHelper object
1352     my $out = Bio::SeqIO::FTHelper->new();
1353     $out->verbose($self->verbose());
1354     $out->key($key);
1355     $out->loc($loc);
1356
1357     # Now parse and add any qualifiers.  (@qual is kept
1358     # intact to provide informative error messages.)
1359     my $last_unquoted_qualifier;
1360   QUAL:
1361     for (my $i = 0; $i < @qual; $i++) {
1362         my $data = $qual[$i];
1363         my ( $qualifier, $value );
1364
1365         unless (( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.*))?})) {
1366            if ( defined $last_unquoted_qualifier ) {
1367               # handle case of unquoted multiline - read up everything until the next qualifier
1368               do {
1369                  # Protein sequence translations need to be joined without spaces,
1370                  # other qualifiers need those.
1371                  $value .= ' ' if $qualifier ne "translation";
1372                  $value .= $data;
1373               } while defined($data = $qual[++$i]) && $data !~ m[^/];
1374               $i--;
1375               $out->field->{$last_unquoted_qualifier}->[-1] .= $value;
1376               $last_unquoted_qualifier = undef;
1377               next QUAL;
1378            } else {
1379               $self->throw("Can't see new qualifier in: $_\nfrom:\n"
1380                             . join('', map "$_\n", @qual));
1381            }
1382         }
1383         $qualifier = '' if not defined $qualifier;
1384
1385         $last_unquoted_qualifier = undef;
1386         if (defined $value) {
1387             # Do we have a quoted value?
1388             if (substr($value, 0, 1) eq '"') {
1389                 # Keep adding to value until we find the trailing quote
1390                 # and the quotes are balanced
1391               QUOTES:
1392                 while ($value !~ /"$/ or $value =~ tr/"/"/ % 2) { #"
1393                     $i++;
1394                     my $next = $qual[$i];
1395                     if (!defined($next)) {
1396                         $self->warn("Unbalanced quote in:\n".join("\n", @qual).
1397                                     "\nAdding quote to close...".
1398                                     "Check sequence quality!");
1399                         $value .= '"';
1400                         last QUOTES;
1401                     }
1402
1403                     # Protein sequence translations need to be joined without spaces,
1404                     # other qualifiers need those.
1405                     if ($qualifier eq "translation") {
1406                         $value .= $next;
1407                     } else {
1408                         $value .= " $next";
1409                     }
1410                 }
1411                 # Trim leading and trailing quotes
1412                 $value =~ s/^"|"$//g;
1413                 # Undouble internal quotes
1414                 $value =~ s/""/"/g; #"
1415             } else {
1416               $last_unquoted_qualifier = $qualifier;
1417             }
1418         } else {
1419             $value = '_no_value';
1420         }
1421
1422         # Store the qualifier
1423         $out->field->{$qualifier} ||= [];
1424         push(@{$out->field->{$qualifier}},$value);
1425     }
1426
1427     return $out;
1428 }
1429
1430 =head2 _write_line_EMBL
1431
1432  Title   : _write_line_EMBL
1433  Usage   :
1434  Function: internal function
1435  Example :
1436  Returns : 1 if writing succeeded, else undef
1437  Args    :
1438
1439
1440 =cut
1441
1442 sub _write_line_EMBL {
1443     my ($self,$pre1,$pre2,$line,$length) = @_;
1444
1445     $length || $self->throw("Miscalled write_line_EMBL without length. Programming error!");
1446     my $subl = $length - length $pre2;
1447     my $linel = length $line;
1448     my $i;
1449
1450     my $sub = substr($line,0,$length - length $pre1);
1451
1452     $self->_print( "$pre1$sub\n") || return;
1453
1454     for ($i= ($length - length $pre1);$i < $linel;) {
1455         $sub = substr($line,$i,($subl));
1456         $self->_print( "$pre2$sub\n") || return;
1457         $i += $subl;
1458     }
1459
1460     return 1;
1461 }
1462
1463 =head2 _write_line_EMBL_regex
1464
1465  Title   : _write_line_EMBL_regex
1466  Usage   :
1467  Function: internal function for writing lines of specified
1468            length, with different first and the next line
1469            left hand headers and split at specific points in the
1470            text
1471  Example :
1472  Returns : nothing
1473  Args    : file handle, first header, second header, text-line, regex for line breaks, total line length
1474
1475
1476 =cut
1477
1478 sub _write_line_EMBL_regex {
1479     my ($self,$pre1,$pre2,$line,$regex,$length) = @_;
1480
1481     #print STDOUT "Going to print with $line!\n";
1482
1483     $length || $self->throw("Programming error - called write_line_EMBL_regex without length.");
1484
1485     my $subl = $length - (length $pre1) -1 ;
1486     my( @lines );
1487
1488   CHUNK: while($line) {
1489         foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) {
1490             if ($line =~ m/^(.{1,$subl})($pat)(.*)/ ) {
1491                 my $l = $1.$2;
1492                 $l =~ s/#/ /g  # remove word wrap protection char '#'
1493                     if $pre1 eq "RA   ";
1494                 my $newl = $3;
1495                 $line = substr($line,length($l));
1496                 # be strict about not padding spaces according to
1497                 # genbank format
1498                 $l =~ s/\s+$//;
1499                 next CHUNK if ($l eq '');
1500                 push(@lines, $l);
1501                 next CHUNK;
1502             }
1503         }
1504         # if we get here none of the patterns matched $subl or less chars
1505         $self->warn("trouble dissecting \"$line\"\n     into chunks ".
1506                     "of $subl chars or less - this tag won't print right");
1507         # insert a space char to prevent infinite loops
1508         $line = substr($line,0,$subl) . " " . substr($line,$subl);
1509     }
1510     my $s = shift @lines;
1511     ($self->_print("$pre1$s\n") || return) if $s;
1512     foreach my $s ( @lines ) {
1513         $self->_print("$pre2$s\n") || return;
1514     }
1515
1516     return 1;
1517 }
1518
1519 =head2 _post_sort
1520
1521  Title   : _post_sort
1522  Usage   : $obj->_post_sort($newval)
1523  Function:
1524  Returns : value of _post_sort
1525  Args    : newvalue (optional)
1526
1527
1528 =cut
1529
1530 sub _post_sort{
1531     my $obj = shift;
1532     if ( @_ ) {
1533         my $value = shift;
1534         $obj->{'_post_sort'} = $value;
1535     }
1536     return $obj->{'_post_sort'};
1537
1538 }
1539
1540 =head2 _show_dna
1541
1542  Title   : _show_dna
1543  Usage   : $obj->_show_dna($newval)
1544  Function:
1545  Returns : value of _show_dna
1546  Args    : newvalue (optional)
1547
1548
1549 =cut
1550
1551 sub _show_dna{
1552     my $obj = shift;
1553     if ( @_ ) {
1554         my $value = shift;
1555         $obj->{'_show_dna'} = $value;
1556     }
1557     return $obj->{'_show_dna'};
1558
1559 }
1560
1561 =head2 _id_generation_func
1562
1563  Title   : _id_generation_func
1564  Usage   : $obj->_id_generation_func($newval)
1565  Function:
1566  Returns : value of _id_generation_func
1567  Args    : newvalue (optional)
1568
1569
1570 =cut
1571
1572 sub _id_generation_func{
1573     my $obj = shift;
1574     if ( @_ ) {
1575         my $value = shift;
1576         $obj->{'_id_generation_func'} = $value;
1577     }
1578     return $obj->{'_id_generation_func'};
1579
1580 }
1581
1582 =head2 _ac_generation_func
1583
1584  Title   : _ac_generation_func
1585  Usage   : $obj->_ac_generation_func($newval)
1586  Function:
1587  Returns : value of _ac_generation_func
1588  Args    : newvalue (optional)
1589
1590
1591 =cut
1592
1593 sub _ac_generation_func{
1594     my $obj = shift;
1595     if ( @_ ) {
1596         my $value = shift;
1597         $obj->{'_ac_generation_func'} = $value;
1598     }
1599     return $obj->{'_ac_generation_func'};
1600
1601 }
1602
1603 =head2 _sv_generation_func
1604
1605  Title   : _sv_generation_func
1606  Usage   : $obj->_sv_generation_func($newval)
1607  Function:
1608  Returns : value of _sv_generation_func
1609  Args    : newvalue (optional)
1610
1611
1612 =cut
1613
1614 sub _sv_generation_func{
1615     my $obj = shift;
1616     if ( @_ ) {
1617         my $value = shift;
1618         $obj->{'_sv_generation_func'} = $value;
1619     }
1620     return $obj->{'_sv_generation_func'};
1621
1622 }
1623
1624 =head2 _kw_generation_func
1625
1626  Title   : _kw_generation_func
1627  Usage   : $obj->_kw_generation_func($newval)
1628  Function:
1629  Returns : value of _kw_generation_func
1630  Args    : newvalue (optional)
1631
1632
1633 =cut
1634
1635 sub _kw_generation_func{
1636     my $obj = shift;
1637     if ( @_ ) {
1638         my $value = shift;
1639         $obj->{'_kw_generation_func'} = $value;
1640     }
1641     return $obj->{'_kw_generation_func'};
1642
1643 }
1644
1645 1;