t/SeqIO/genbank.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2
   3 use strict;
   4
   5 BEGIN {
   6     use Bio::Root::Test;
   7     test_begin(-tests => 301);
   8     use_ok('Bio::SeqIO::genbank');
   9 }
  10
  11 my $verbose = test_debug;
  12
  13 my $ast = Bio::SeqIO->new(-format  => 'genbank' ,
  14                           -verbose => $verbose,
  15                           -file    => test_input_file('roa1.genbank'));
  16 isa_ok($ast, 'Bio::SeqIO');
  17 $ast->verbose($verbose);
  18 my $as = $ast->next_seq;
  19 is $as->molecule, 'mRNA',$as->accession_number;
  20 is $as->alphabet, 'dna';
  21 is $as->division, 'EST';
  22 is join(',',$as->get_dates), '27-OCT-1998';
  23 is($as->primary_id, 3598416);
  24 my @class = $as->species->classification;
  25 is $class[$#class],'Eukaryota';
  26
  27 $ast = Bio::SeqIO->new(-format  => 'genbank',
  28                        -verbose => $verbose,
  29                        -file    => test_input_file('NT_021877.gbk'));
  30 $ast->verbose($verbose);
  31 $as = $ast->next_seq;
  32 is $as->molecule, 'DNA',$as->accession_number;
  33 is $as->alphabet, 'dna';
  34 is $as->division, 'CON';
  35 is join(',',$as->get_dates), '17-OCT-2003';
  36 is($as->primary_id, 37539616);
  37 is($as->accession_number, 'NT_021877');
  38
  39 my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures;
  40 is(($cds->get_tag_values('transl_except'))[1],
  41    '(pos:complement(4224..4226),aa:OTHER)');
  42
  43 # test for a DBSOURCE line
  44 $ast = Bio::SeqIO->new(-format  => 'genbank',
  45                        -verbose => $verbose,
  46                        -file    => test_input_file('BAB68554.gb'));
  47 $ast->verbose($verbose);
  48 $as = $ast->next_seq;
  49 is $as->molecule, 'PRT',$as->accession_number;
  50 is $as->alphabet, 'protein';
  51 is $as->division, 'VRT';
  52 is join(',',$as->get_dates), '11-APR-2002';
  53 # Though older GenBank releases indicate SOURCE contains only the common name,
  54 # this is no longer true.  In general, this line will contain an abbreviated
  55 # form of the full organism name (but may contain the full length name),
  56 # as well as the optional common name and organelle.  There is no get/set
  57 # for the abbreviated name but it is accessible via name()
  58 ok defined($as->species->name('abbreviated')->[0]);
  59 is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise';
  60 is($as->primary_id, 15824047);
  61 my $ac = $as->annotation;
  62 ok defined $ac;
  63 my @dblinks = $ac->get_Annotations('dblink');
  64 is(scalar @dblinks,1);
  65 is($dblinks[0]->database, 'GenBank');
  66 is($dblinks[0]->primary_id, 'AB072353');
  67 is($dblinks[0]->version, '1');
  68 is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated');
  69
  70 # test for multi-line SOURCE
  71 $ast = Bio::SeqIO->new(-format  => 'genbank',
  72                        -verbose => $verbose,
  73                        -file    => test_input_file('NC_006346.gb'));
  74 $as = $ast->next_seq;
  75 is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;;
  76 @class = $as->species->classification;
  77 is($class[$#class],'Eukaryota');
  78 is($as->species->common_name,'mushroomtongue salamander');
  79
  80 $ast = Bio::SeqIO->new(-format  => 'genbank',
  81                        -verbose => $verbose,
  82                        -file    => test_input_file('U71225.gb'));
  83 $as = $ast->next_seq;
  84 @class = $as->species->classification;
  85 is($class[$#class],'Eukaryota',$as->accession_number);
  86 is $as->species->common_name,'black-bellied salamander';
  87
  88 # test for unusual common name
  89 $ast = Bio::SeqIO->new(-format  => 'genbank',
  90                        -verbose => $verbose,
  91                        -file    => test_input_file('AB077698.gb'));
  92 $as = $ast->next_seq;
  93 # again, this is not a common name but is in name('abbreviated')
  94 ok defined($as->species->name('abbreviated')->[0]),$as->accession_number;
  95 is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA';
  96
  97 # test for common name with parentheses
  98 $ast = Bio::SeqIO->new(-format  => 'genbank',
  99                        -verbose => $verbose,
 100                        -file    => test_input_file('DQ018368.gb'));
 101 $as = $ast->next_seq;
 102 is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata',
 103 $as->accession_number;;
 104
 105 # test secondary accessions
 106 my $seqio = Bio::SeqIO->new(-format  => 'genbank',
 107                             -verbose => $verbose,
 108                             -file    => test_input_file('D10483.gbk'));
 109 my $seq = $seqio->next_seq;
 110 my @kw =  $seq->get_keywords;
 111 is(scalar @kw, 118, $seq->accession_number);
 112 is($kw[-1], 'yabO');
 113 my @sec_acc = $seq->get_secondary_accessions;
 114 is(scalar @sec_acc,14);
 115 is($sec_acc[-1], 'X56742');
 116
 117 # bug #1487
 118 my $str = Bio::SeqIO->new(-verbose => $verbose,
 119                           -file    => test_input_file('D12555.gbk'));
 120 eval {
 121     $seq = $str->next_seq;
 122 };
 123
 124 ok(! $@, 'bug 1487');
 125
 126 # bug 1647 rpt_unit sub-feature with multiple parens
 127 $str = Bio::SeqIO->new(-format  => 'genbank',
 128                        -verbose => $verbose,
 129                        -file    => test_input_file('mini-AE001405.gb'));
 130 ok($seq = $str->next_seq);
 131 my @rpts = grep { $_->primary_tag eq 'repeat_region' }
 132   $seq->get_SeqFeatures;
 133 is $#rpts, 2, 'bug 1647';
 134 my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts;
 135 is $#rpt_units, 0;
 136 is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7');
 137
 138 # test bug #1673 , RDB-II genbank files
 139 $str = Bio::SeqIO->new(-format  => 'genbank',
 140                        -verbose => $verbose,
 141                        -file    => test_input_file('Mcjanrna_rdbII.gbk')
 142               );
 143 ok($seq = $str->next_seq, 'bug 1673');
 144 my @refs = $seq->annotation->get_Annotations('reference');
 145 is(@refs, 1);
 146 is($seq->display_id,'Mc.janrrnA');
 147 is($seq->molecule ,'RNA');
 148 is $as->division, 'PLN';
 149 is join(',',$as->get_dates), '23-MAY-2005';
 150
 151 $str  = Bio::SeqIO->new(-format  => 'genbank',
 152                         -file    => test_input_file('AF165282.gb'),
 153                         -verbose => $verbose);
 154 $seq = $str->next_seq;
 155 my @features = $seq->all_SeqFeatures;
 156 is(@features, 5, $seq->accession_number);
 157 is($features[0]->start, 1);
 158 is($features[0]->end, 226);
 159 my $location = $features[1]->location;
 160 ok($location->isa('Bio::Location::SplitLocationI'));
 161 my @sublocs = $location->sub_Location;
 162 is(@sublocs, 29);
 163
 164 # version and primary ID - believe it or not, this wasn't working
 165 is ($seq->version, 1);
 166 is ($seq->seq_version, 1);
 167 is ($seq->primary_id, "5734104");
 168
 169 # streaming and Bio::RichSeq creation
 170 my $stream = Bio::SeqIO->new(-file    => test_input_file('test.genbank'),
 171                              -verbose => $verbose,
 172                              -format  => 'genbank');
 173 $stream->verbose($verbose);
 174 my $seqnum = 0;
 175 my $species;
 176 my @cl;
 177 my $lasts;
 178 my @ids = qw(DDU63596 DDU63595 HUMBDNF);
 179 my @tids = (44689, 44689, 9606);
 180 my @tnames = ("Dictyostelium discoideum",
 181               "Dictyostelium discoideum",
 182               "Homo sapiens");
 183 while($seq = $stream->next_seq) {
 184     if($seqnum < 3) {
 185         is $seq->display_id, $ids[$seqnum];
 186         $species = $seq->species;
 187         @cl = $species->classification;
 188         is( $species->binomial, $tnames[$seqnum],
 189             'species parsing incorrect for genbank');
 190         is( $cl[3] ne $species->genus, 1,
 191             'genus duplicated in genbank parsing');
 192         is( $species->ncbi_taxid, $tids[$seqnum] );
 193     }
 194     $seqnum++;
 195     $lasts = $seq;
 196 }
 197 is($seqnum, 5,'streaming');
 198 is $lasts->display_id, "HUMBETGLOA";
 199 my ($ref) = $lasts->annotation->get_Annotations('reference');
 200 is($ref->medline, 94173918);
 201 $stream->close;
 202
 203 $stream = Bio::SeqIO->new(-file    => test_input_file('test.genbank.noseq'),
 204                           -verbose => $verbose,
 205                           -format  => 'genbank' );
 206 $seqnum = 0;
 207 while($seq = $stream->next_seq) {
 208     if($seqnum < 3) {
 209         is $seq->display_id, $ids[$seqnum];
 210     }
 211     elsif( $seq->display_id eq 'M37762') {
 212         is( ($seq->get_keywords)[0], 'neurotrophic factor');
 213     }
 214     $seqnum++;
 215 }
 216 is $seqnum, 5, "Total number of sequences in test file";
 217
 218 # fuzzy
 219 $seq = Bio::SeqIO->new( -file    => test_input_file('testfuzzy.genbank'),
 220                         -format  => 'genbank',
 221                         -verbose => $verbose );
 222 ok(defined($as = $seq->next_seq));
 223
 224 @features = $as->all_SeqFeatures;
 225 is(@features,21,'Fuzzy in');
 226 my $lastfeature = pop @features;
 227 # this is a split location; the root doesn't have strand
 228 is($lastfeature->strand, undef);
 229 $location = $lastfeature->location;
 230 #$location->verbose(-1); # silence the warning of undef seq_id()
 231 # see above; splitlocs roots do not have a strand really
 232 is($location->strand, undef);
 233 is($location->start, 83202);
 234 is($location->end, 84996);
 235
 236 @sublocs = $location->sub_Location;
 237
 238 is(@sublocs, 2);
 239 my $loc = shift @sublocs;
 240 is($loc->start, 83202);
 241 is($loc->end, 83329);
 242 is($loc->strand, -1);
 243
 244 $loc = shift @sublocs;
 245 is($loc->start, 84248);
 246 is($loc->end, 84996);
 247 is($loc->strand,1);
 248
 249 $seq = Bio::SeqIO->new(-format  => 'genbank',
 250                        -verbose => $verbose,
 251                        -file    => ">" .test_output_file);
 252 $seq->verbose($verbose);
 253 ok($seq->write_seq($as),'Fuzzy out');
 254
 255 ## now genbank ##
 256 $str = Bio::SeqIO->new(-format  => 'genbank',
 257                        -verbose => $verbose,
 258                        -file    => test_input_file('BK000016-tpa.gbk'));
 259 $seq = $str->next_seq;
 260 ok(defined $seq, $seq->accession_number);
 261 ok(defined $seq->seq);
 262 is($seq->accession_number, 'BK000016',$seq->accession_number);
 263 is($seq->alphabet, 'dna');
 264 is($seq->display_id, 'BK000016');
 265 is($seq->length, 1162);
 266 is($seq->division, 'ROD');
 267 is($seq->get_dates, 1);
 268 is($seq->keywords, 'Third Party Annotation; TPA');
 269 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 270 is($seq->seq_version, 1);
 271 is($seq->feature_count, 2);
 272 my $spec_obj = $seq->species;
 273 is ($spec_obj->common_name, 'house mouse');
 274 is ($spec_obj->species, 'musculus');
 275 is ($spec_obj->genus, 'Mus');
 276 is ($spec_obj->binomial, 'Mus musculus');
 277 $ac = $seq->annotation;
 278 my $reference =  ($ac->get_Annotations('reference') )[0];
 279 is ($reference->pubmed, '11479594');
 280 is ($reference->medline, '21372465',$seq->accession_number);
 281
 282 # validate that what is written is what is read
 283 my $testfile = test_output_file;
 284 my $out = Bio::SeqIO->new(-file   => ">$testfile",
 285                           -format => 'genbank');
 286 $out->write_seq($seq);
 287 $out->close;
 288
 289 $str = Bio::SeqIO->new(-format => 'genbank',
 290                        -file   => $testfile);
 291 $seq = $str->next_seq;
 292 ok(defined $seq,'roundtrip');
 293 ok(defined $seq->seq);
 294 is($seq->accession_number, 'BK000016');
 295 is($seq->alphabet, 'dna');
 296 is($seq->display_id, 'BK000016');
 297 is($seq->length, 1162);
 298 is($seq->division, 'ROD');
 299 is($seq->get_dates, 1);
 300 is($seq->keywords, 'Third Party Annotation; TPA');
 301 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 302 is($seq->seq_version, 1);
 303 is($seq->feature_count, 2);
 304 $spec_obj = $seq->species;
 305 is ($spec_obj->common_name, 'house mouse');
 306 is ($spec_obj->species, 'musculus');
 307 is ($spec_obj->genus, 'Mus');
 308 is ($spec_obj->binomial, 'Mus musculus');
 309 $ac = $seq->annotation;
 310 $reference =  ($ac->get_Annotations('reference') )[0];
 311 is ($reference->pubmed, '11479594');
 312 is ($reference->medline, '21372465');
 313
 314 # write revcomp split location
 315 my $gb = Bio::SeqIO->new(-format => 'genbank',
 316                          # This sequence has an odd LOCUS line which sets off a warning, setting
 317                          # verbose to -1.
 318                          # The newest Ensembl seq lacks this.  Maybe update?  cjfields 6-5-07
 319                          -verbose => $verbose ? $verbose : -1,
 320                          -file   => test_input_file('revcomp_mrna.gb'));
 321 $seq = $gb->next_seq;
 322
 323 $gb = Bio::SeqIO->new(-format => 'genbank',
 324                       -file   => ">$testfile");
 325
 326 $gb->write_seq($seq);
 327 undef $gb;
 328 ok(! -z $testfile, 'revcomp split location');
 329
 330 # bug 1925, continuation of long ORGANISM line ends up in @classification:
 331 # ORGANISM  Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC
 332 #           9150
 333 #           Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
 334 #           Enterobacteriaceae; Salmonella.
 335 $gb = Bio::SeqIO->new(-format  => 'genbank',
 336                       -verbose => $verbose,
 337                       -file    => test_input_file('NC_006511-short.gbk'));
 338 $seq = $gb->next_seq;
 339 is $seq->species->common_name, undef, "Bug 1925";
 340 is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150";
 341 @class = $seq->species->classification;
 342 is $class[$#class], "Bacteria";
 343
 344 # WGS   tests
 345 $gb = Bio::SeqIO->new(-format  => 'genbank',
 346                       -verbose => $verbose,
 347                       -file    => test_input_file('O_sat.wgs'));
 348 $seq = $gb->next_seq;
 349
 350 my @tests = ('wgs'        => 'AAAA02000001-AAAA02050231',
 351              'wgs_scafld' => 'CM000126-CM000137',
 352              'wgs_scafld' => 'CH398081-CH401163');
 353
 354 my @wgs = map {$seq->annotation->get_Annotations(lc($_))} qw(WGS WGS_SCAFLD);
 355
 356 my $ct=0;
 357
 358 for my $wgs (@wgs) {
 359     my ($tagname, $value) = (shift @tests, shift @tests);
 360     is($wgs->tagname, $tagname, $tagname);
 361     is($wgs->value, $value);
 362     $ct++;
 363 }
 364
 365 is ($ct, 3);
 366
 367 # make sure we can retrieve a feature with a primary tag of 'misc_difference'
 368 $gb = Bio::SeqIO->new(-format  => 'genbank',
 369                       -verbose => $verbose,
 370                       -file    => test_input_file('BC000007.gbk'));
 371 $seq = $gb->next_seq;
 372 ($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures;
 373 my @vals = $cds->get_tag_values('gene');
 374 is $vals[0], 'PX19', $seq->accession_number;
 375
 376 # Check that the source,organism section is identical between input and output.
 377 # - test an easy one where organism is species, then two different formats of
 378 # subspecies, then a species with a format that used to be mistaken for
 379 # subspecies, then a bacteria with no genus, and finally a virus with a genus.
 380
 381 # These tests are now somewhat out-of-date since we are moving to a Bio::Taxon-
 382 # based system for verifying taxonomic information.  Right now they just verify
 383 # changes so are really useless; I will change them to verify common name,
 384 # organelle, scientific name, etc.
 385
 386 my $outfile = test_output_file;
 387
 388 # output always adds a period (GenBank std), but two of these files do not use them.
 389
 390 foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') {
 391     my $infile =  test_input_file($in);
 392
 393     $str = Bio::SeqIO->new(-format  => 'genbank',
 394                            -verbose => $verbose,
 395                            -file    => $infile);
 396     $seq = $str->next_seq;
 397
 398     $out = Bio::SeqIO->new(-file   => ">$outfile",
 399                            -format => 'genbank');
 400     $out->write_seq($seq);
 401     $out->close;
 402
 403     open my $IN, '<', $infile or die "Could not read file '$infile': $!\n";
 404     my @in = <$IN>;
 405     close $IN;
 406
 407     open my $RESULT, '<', $outfile or die "Could not read file '$outfile': $!\n";
 408     my $line = 0;
 409     my $check = 0;
 410     my $is = 1;
 411
 412   FILECHECK:
 413     while (my $result = <$RESULT>) {
 414         if ($result =~ /^KEYWORDS/) {
 415             $check = 1;
 416             next;
 417         }
 418
 419         if ($result =~ /^REFERENCE/) {
 420             last FILECHECK;
 421         }
 422
 423         if ($check) {
 424             # end periods don't count (not all input files have them)
 425             $result =~ s{\.$}{};
 426             $in[$line] =~ s{\.$}{};
 427
 428             if ($result ne $in[$line]) {
 429                 $is = 0;
 430                 last;
 431             }
 432         }
 433     }
 434     continue {
 435         $line++;
 436     }
 437     close $RESULT;
 438
 439     ok $is, $in;
 440 }
 441
 442 # NB: there should probably be full testing on all lines to ensure that output
 443 # matches input.
 444
 445 # 20061117: problem with *double* colon in some annotation-dblink values
 446 $ct = 0;
 447
 448 foreach my $in ('P35527.gb') {
 449     my $infile =  test_input_file($in);
 450     $str = Bio::SeqIO->new(-format  => 'genbank',
 451                            -verbose => $verbose,
 452                            -file    => $infile);
 453     $seq = $str->next_seq;
 454
 455     my $ac = $seq->annotation;      # Bio::AnnotationCollection
 456     foreach my $key ($ac->get_all_annotation_keys ) {
 457         my @values = $ac->get_Annotations($key);
 458         foreach my $ann (@values) {
 459             my $value = $ann->display_text;
 460             $ct++;
 461             if ($key eq 'dblink') {
 462                 ok (index($value,'::') < 0);   # this should never be true
 463                 ok ($value, $value);           # check value is not empty
 464
 465                 #  print "  ann/", sprintf('%12s  ',$key), '>>>', $value , '<<<', "\n";
 466                 #  print "        index double colon: ",index($value   ,'::'), "\n";
 467
 468                 #  check db name:
 469                 my @parts = split(/:/,$value);
 470                 if ( $parts[0] =~ /^(?:
 471                         #  not an exhaustive list of databases;
 472                         #  just the db's referenced in P35527.gb:
 473                           swissprot | GenBank | GenPept      | HSSP | IntAct   | Ensembl | KEGG
 474                         | HGNC      | MIM     | ArrayExpress | GO   | InterPro | Pfam    | PRINTS
 475                         | PROSITE    )$/x
 476                 ) {
 477                     ok 1;
 478                 }
 479                 else {
 480                     ok 0;
 481                 }
 482                     ok ( $parts[1], "$parts[0]" );
 483             }
 484         }
 485     }
 486 }
 487
 488 is($ct, 46);
 489
 490 # bug 2195
 491
 492 $str = Bio::SeqIO->new(-format  => 'genbank',
 493                        -verbose => $verbose,
 494                        -file    => test_input_file('AF305198.gb')
 495                      );
 496
 497 $species = $str->next_seq->species;
 498
 499 is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195');
 500 is(join(', ',$species->classification),
 501      'Virginia creeper phytoplasma, 16SrV (Elm yellows group), '
 502    . 'Candidatus Phytoplasma, Acholeplasmataceae, Acholeplasmatales, '
 503    . 'Mollicutes, Firmicutes, Bacteria',
 504    'Bug 2195');
 505
 506 # bug 2569, PROJECT line support, read and write, round-tripping
 507
 508 $str = Bio::SeqIO->new(-format  => 'genbank',
 509                        -verbose => $verbose,
 510                        -file    => test_input_file('NC_008536.gb'));
 511 $seq = $str->next_seq;
 512
 513 my $project = ($seq->annotation->get_Annotations('project'))[0];
 514 isa_ok($project, 'Bio::Annotation::SimpleValue');
 515
 516 if ($project) {
 517     is($project->value, 'GenomeProject:12638');
 518 }
 519 else {
 520     ok(0, "PROJECT not parsed");
 521 }
 522
 523 $outfile = test_output_file;
 524 $gb = Bio::SeqIO->new(-format  => 'genbank',
 525                       -verbose => $verbose,
 526                       -file    => ">$outfile");
 527 $gb->write_seq($seq);
 528
 529 $str = Bio::SeqIO->new(-format  => 'genbank',
 530                        -verbose => $verbose,
 531                        -file    => $outfile);
 532 $seq = $str->next_seq;
 533
 534 $project = ($seq->annotation->get_Annotations('project'))[0];
 535 isa_ok($project, 'Bio::Annotation::SimpleValue');
 536
 537 if ($project) {
 538     is($project->value, 'GenomeProject:12638');
 539 }
 540 else {
 541     ok(0, "Roundtrip test failed");
 542 }
 543
 544 # test for swissprot/UniProt/UniProtKB DBSOURCE line (Bug : RT 44536)
 545 $ast = Bio::SeqIO->new(-format  => 'genbank',
 546                        -verbose => $verbose,
 547                        -file    => test_input_file('P39765.gb'));
 548 $as = $ast->next_seq;
 549 is $as->molecule, 'PRT',$as->accession_number;
 550 is $as->division, 'BCT',$as->accession_number;
 551 is join(',',$as->get_dates), '03-MAR-2009',$as->accession_number;
 552 is $as->alphabet, 'protein';
 553 # Though older GenBank releases indicate SOURCE contains only the common name,
 554 # this is no longer true.  In general, this line will contain an abbreviated
 555 # form of the full organism name (but may contain the full length name),
 556 # as well as the optional common name and organelle.  There is no get/set
 557 # for the abbreviated name but it is accessible via name()
 558 ok defined($as->species->name('abbreviated')->[0]);
 559 is $as->species->name('abbreviated')->[0], 'Bacillus subtilis';
 560 is($as->primary_id, 20141743);
 561 $ac = $as->annotation;
 562 ok defined $ac;
 563 @dblinks = $ac->get_Annotations('dblink');
 564 is(scalar @dblinks,31);
 565 is($dblinks[0]->database, 'UniProtKB');
 566 is($dblinks[0]->primary_id, 'PYRR_BACSU');
 567 is($dblinks[0]->version, undef);
 568 is($dblinks[0]->display_text, 'UniProtKB:PYRR_BACSU','operator overloading in AnnotationI is deprecated');
 569
 570 #bug 2982 embl/genbank contig handling
 571
 572 $ast = Bio::SeqIO->new( -file   => test_input_file('bug2982.gb'),
 573                         -format => 'genbank' );
 574 $seq = $ast->next_seq;
 575
 576 ok my @ctg = $seq->annotation->get_Annotations('contig');
 577 like $ctg[0]->value, qr/join\(.*?gap.*?complement/;
 578
 579 # write_seq() and FTHelper duplicate specific tags, need to check a round-trip
 580 $ast = Bio::SeqIO->new(-format  => 'genbank' ,
 581                        -verbose => $verbose,
 582                        -file    => test_input_file('singlescore.gbk'));
 583 $as = $ast->next_seq;
 584 ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures;
 585 my @notes = $cds->get_tag_values('note');
 586 is(scalar @notes, 2);
 587 $testfile = test_output_file;
 588 $out = Bio::SeqIO->new(-file   => ">$testfile",
 589                        -format => 'genbank');
 590 $out->write_seq($as);
 591 $out->close;
 592 $ast = Bio::SeqIO->new(-format  => 'genbank' ,
 593                        -verbose => $verbose,
 594                        -file    => $testfile );
 595 $as = $ast->next_seq;
 596 ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures;
 597 @notes = $cds->get_tag_values('note');
 598 is(scalar @notes, 2);
 599
 600
 601 #bug 3375
 602 my $in = Bio::SeqIO->new(-format => 'genbank',
 603                          -file   => test_input_file('NC_002058_multDBLINK_bug3375.gb'));
 604 $seq = $in->next_seq;     # should not throw a warning now
 605 @dblinks = $seq->annotation->get_Annotations('dblink');    # contains 5 dblink references
 606 # testing DBLINK      BioProject: PRJNA15288
 607 is($dblinks[0]->database, 'BioProject', 'bug3375 database is BioProject');
 608 is($dblinks[0]->primary_id, 'PRJNA15288', 'bug3375 primary_id is PRJNA15288');
 609 # testing DBLINK      Project:100,200,300
 610 is($dblinks[3]->database, 'Project');
 611 is($dblinks[3]->primary_id, '300');
 612 # testing DBLINK      NC_002058.3
 613 is($dblinks[4]->database, 'GenBank');
 614 is($dblinks[4]->primary_id, 'NC_002058');
 615 is($dblinks[4]->version, '3');
 616
 617 # long labels handled
 618 {
 619     # Create sequence with feature with a long label qualifier
 620     my $seq=Bio::Seq->new(-seq => 'actg',
 621                           -id  => 'abacab');
 622     my $feature=Bio::SeqFeature::Generic->new(-primary=>'CDS', -start=>1, -end=>4);
 623     my $label='1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r';
 624     $feature->add_tag_value(label => $label);
 625     $seq->add_SeqFeature($feature);
 626
 627     # Write genbank
 628     my $string;
 629     open my $str_fh, '>', \$string or skip("Could not write string, skipping", 2);
 630     my $out = Bio::SeqIO->new(-format => 'genbank',
 631                               -fh     => $str_fh);
 632     $out->write_seq($seq);
 633
 634     # Read genbank
 635     my $in = Bio::SeqIO->new(-format => 'genbank',
 636                              -string => $string);
 637     my $genbank = $in->next_seq;
 638     my ($read_feature) = $genbank->get_SeqFeatures;
 639     my ($read_label) = $read_feature->get_tag_values('label');
 640     is($read_label, $label, 'Label is the same');
 641 }
 642
 643 # bug 3448
 644 $in = Bio::SeqIO->new(-format  => 'genbank',
 645                       -file    => test_input_file('YP_007988852.gp'),
 646                       -verbose => $verbose);
 647 $seq = $in->next_seq;     # should not throw a warning now
 648 is($seq->length, 205);
 649
 650 my @anns = $seq->annotation->get_Annotations('contig');
 651 is(@anns, 1);
 652 isa_ok($anns[0], 'Bio::Annotation::SimpleValue');
 653 is($anns[0]->value, 'join(WP_015639704.1:1..205)');
 654
 655 is($seq->seq, 'MENRKFGYIRVSSKDQNEGRQLEAMRKIGITERDIYLDKQSGKNFERANYQLLKRIIRKGDI'
 656             . 'LYIHSLDRFGRNKEEILQEWNDLTKNIEADIVVLDMPLLDTTQYKDSMGTFIADLVLQILSWMAEEERERIRK'
 657             . 'RQREGIDLALQNGIQFGRSPVVVSDEFKEVYRKWKAKELTAVEAMQEAGVKKTSFYKLVKAHENSIKVNS');
 658
 659 # Genbank files with CONTIG and sequence should print the sequence with write_seq()
 660 $testfile = test_output_file;
 661 $out = Bio::SeqIO->new(-file   => ">$testfile",
 662                        -format => 'genbank');
 663 $out->write_seq($seq);
 664 $out->close;
 665
 666 $in = Bio::SeqIO->new(-file    => $testfile,
 667                       -format  => 'genbank',
 668                       -verbose => $verbose);
 669 $seq = $in->next_seq;
 670 is($seq->length, 205);
 671
 672 @anns = $seq->annotation->get_Annotations('contig');
 673 is(@anns, 1);
 674 isa_ok($anns[0], 'Bio::Annotation::SimpleValue');
 675 is($anns[0]->value, 'join(WP_015639704.1:1..205)');
 676
 677 is($seq->seq, 'MENRKFGYIRVSSKDQNEGRQLEAMRKIGITERDIYLDKQSGKNFERANYQLLKRIIRKGDI'
 678             . 'LYIHSLDRFGRNKEEILQEWNDLTKNIEADIVVLDMPLLDTTQYKDSMGTFIADLVLQILSWMAEEERERIRK'
 679             . 'RQREGIDLALQNGIQFGRSPVVVSDEFKEVYRKWKAKELTAVEAMQEAGVKKTSFYKLVKAHENSIKVNS');
 680
 681 $seq = Bio::SeqIO->new(-format => 'genbank',
 682                        -file   => test_input_file('YP_007988852.gp') )->next_seq;
 683 @features = $seq->remove_SeqFeatures;
 684 is $#features, 10, 'Got 11 features';
 685
 686 $seq = Bio::SeqIO->new(-format => 'genbank',
 687                        -file   => test_input_file('YP_007988852.gp') )->next_seq;
 688 @features = $seq->remove_SeqFeatures('CDS');
 689 is $#features, 0, 'Got 1 feature';
 690 is $features[0]->primary_tag, 'CDS', 'Correct primary tag for feature';
 691 @features = $seq->remove_SeqFeatures;
 692 is $#features, 9, 'Got 10 features';
 693
 694 # Handle Structured Comments in COMMENT section
 695 $seq = Bio::SeqIO->new(-format => 'genbank',
 696                        -file   => test_input_file('KF527485.gbk') )->next_seq;
 697 my $comment = ($seq->get_Annotations('comment') )[0];
 698 is($comment->as_text, "Comment:
 699 ##Assembly-Data-START##
 700 Assembly Method :: Lasergene v. 10
 701 Sequencing Technology :: ABI37XL; Sanger dideoxy sequencing
 702 ##Assembly-Data-END##",
 703 "Got correct Structured Comment");
 704
 705 $seq = Bio::SeqIO->new(-format => 'genbank',
 706                        -file   => test_input_file('HM138502.gbk') )->next_seq;
 707 $comment = ($seq->get_Annotations('comment') )[0];
 708 ok( $comment->as_text
 709         =~ /^Comment: Swine influenza A \(H1N1\) virus isolated during human swine flu outbreak of 2009/,
 710     "Got correct Structured Comment"
 711 );
 712 ok( $comment->as_text =~ /^##GISAID_EpiFlu\(TM\)Data-START##/m,
 713     "Got correct Structured Comment" );
 714 ok( $comment->as_text =~ /^Subtype :: H1N1/m,
 715     "Got correct Structured Comment"
 716 );
 717 ok( $comment->as_text =~ /^##GISAID_EpiFlu\(TM\)Data-END##/m,
 718     "Got correct Structured Comment" );