t/SeqIO/Handler.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2 # $Id$
   3
   4 use strict;
   5
   6 BEGIN {
   7     use Bio::Root::Test;
   8
   9     test_begin(-tests           => 561,
  10                -requires_module => 'Data::Stag');
  11
  12     use_ok('Bio::SeqIO');
  13 }
  14
  15
  16 my $verbose = test_debug();
  17
  18 ################################## GenBank ##################################
  19
  20 my $ast = Bio::SeqIO->new(-format => 'gbdriver' ,
  21                         -verbose => $verbose,
  22                         -file => test_input_file("roa1.genbank"));
  23 $ast->verbose($verbose);
  24 my $as = $ast->next_seq();
  25 is $as->molecule, 'mRNA',$as->accession_number;
  26 is $as->alphabet, 'dna';
  27 is($as->primary_id, 3598416);
  28 my @class = $as->species->classification;
  29 is $class[$#class],'Eukaryota';
  30
  31 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  32                               -verbose => $verbose,
  33                        -file => test_input_file("NT_021877.gbk"));
  34 $ast->verbose($verbose);
  35 $as = $ast->next_seq();
  36 is $as->molecule, 'DNA',$as->accession_number;
  37 is $as->alphabet, 'dna';
  38 is($as->primary_id, 37539616);
  39 is($as->accession_number, 'NT_021877');
  40
  41 my ($cds) = grep { $_->primary_tag eq 'CDS' } $as->get_SeqFeatures();
  42 is(($cds->get_tag_values('transl_except'))[1],
  43    '(pos:complement(4224..4226),aa:OTHER)');
  44
  45 # test for a DBSOURCE line
  46 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  47                               -verbose => $verbose,
  48                        -file => test_input_file("BAB68554.gb"));
  49 $ast->verbose($verbose);
  50 $as = $ast->next_seq();
  51 is $as->molecule, 'linear',$as->accession_number;;
  52 is $as->alphabet, 'protein';
  53 # Though older GenBank releases indicate SOURCE contains only the common name,
  54 # this is no longer true.  In general, this line will contain an abbreviated
  55 # form of the full organism name (but may contain the full length name),
  56 # as well as the optional common name and organelle.  There is no get/set
  57 # for the abbreviated name but it is accessible via name()
  58 ok defined($as->species->name('abbreviated')->[0]);
  59 is $as->species->name('abbreviated')->[0], 'Aldabra giant tortoise';
  60 is($as->primary_id, 15824047);
  61 my $ac = $as->annotation;
  62 ok defined $ac;
  63 my @dblinks = $ac->get_Annotations('dblink');
  64 is(scalar @dblinks,1);
  65 is($dblinks[0]->database, 'GenBank');
  66 is($dblinks[0]->primary_id, 'AB072353');
  67 is($dblinks[0]->version, '1');
  68 is($dblinks[0]->display_text, 'GenBank:AB072353.1','operator overloading in AnnotationI is deprecated');
  69
  70 # test for multi-line SOURCE
  71 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  72                               -verbose => $verbose,
  73                        -file => test_input_file("NC_006346.gb"));
  74 $as = $ast->next_seq;
  75 is $as->species->binomial('FULL'), 'Bolitoglossa n. sp. RLM-2004',$as->accession_number;;
  76 @class = $as->species->classification;
  77 is($class[$#class],'Eukaryota');
  78 is($as->species->common_name,'mushroomtongue salamander');
  79
  80 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  81                               -verbose => $verbose,
  82                        -file => test_input_file("U71225.gb"));
  83 $as = $ast->next_seq;
  84 @class = $as->species->classification;
  85 is($class[$#class],'Eukaryota',$as->accession_number);
  86 is $as->species->common_name,'black-bellied salamander';
  87
  88 # test for unusual common name
  89 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  90                               -verbose => $verbose,
  91                        -file => test_input_file("AB077698.gb"));
  92 $as = $ast->next_seq;
  93 # again, this is not a common name but is in name('abbreviated')
  94 ok defined($as->species->name('abbreviated')->[0]),$as->accession_number;
  95 is $as->species->name('abbreviated')->[0],'Homo sapiens cDNA to mRNA';
  96
  97 # test for common name with parentheses
  98 $ast = Bio::SeqIO->new(-format => 'gbdriver',
  99                               -verbose => $verbose,
 100                        -file => test_input_file("DQ018368.gb"));
 101 $as = $ast->next_seq;
 102 is $as->species->scientific_name,'(Populus tomentosa x P. bolleana) x P. tomentosa var. truncata',
 103 $as->accession_number;;
 104
 105 # test secondary accessions
 106 my $seqio = Bio::SeqIO->new(-format => 'gbdriver',
 107                                     -verbose => $verbose,
 108                                     -file => test_input_file('D10483.gbk'));
 109 my $seq = $seqio->next_seq;
 110 my @kw =  $seq->get_keywords;
 111 is(scalar @kw, 118, $seq->accession_number);
 112 is($kw[-1], 'yabO');
 113 my @sec_acc = $seq->get_secondary_accessions();
 114 is(scalar @sec_acc,14);
 115 is($sec_acc[-1], 'X56742');
 116
 117 # bug #1487
 118 my $str = Bio::SeqIO->new(-verbose => $verbose,
 119                                  -file    => test_input_file('D12555.gbk'));
 120 eval {
 121     $seq = $str->next_seq;
 122 };
 123
 124 ok(! $@, 'bug 1487');
 125
 126 # bug 1647 rpt_unit sub-feature with multiple parens
 127 $str = Bio::SeqIO->new(-format => 'gbdriver',
 128                               -verbose => $verbose,
 129                        -file => test_input_file('mini-AE001405.gb'));
 130 ok($seq = $str->next_seq);
 131 my @rpts = grep { $_->primary_tag eq 'repeat_region' }
 132   $seq->get_SeqFeatures;
 133 is $#rpts, 2, 'bug 1647';
 134 my @rpt_units = grep {$_->has_tag('rpt_unit')} @rpts;
 135 is $#rpt_units, 0;
 136 is(($rpt_units[0]->get_tag_values('rpt_unit'))[0],'(TG)10;A;(TG)7');
 137
 138 # test bug #1673 , RDB-II genbank files
 139 $str = Bio::SeqIO->new(-format => 'gbdriver',
 140                               -verbose => $verbose,
 141                        -file => test_input_file('Mcjanrna_rdbII.gbk')
 142               );
 143 ok($seq = $str->next_seq, 'bug 1673');
 144 my @refs = $seq->annotation->get_Annotations('reference');
 145 is(@refs, 1);
 146 is($seq->display_id,'Mc.janrrnA');
 147 is($seq->molecule ,'RNA');
 148
 149 $str  = Bio::SeqIO->new(-format => 'gbdriver',
 150                               -file   => test_input_file("AF165282.gb"),
 151                               -verbose => $verbose);
 152 $seq = $str->next_seq;
 153 my @features = $seq->all_SeqFeatures();
 154 is(@features, 5, $seq->accession_number);
 155 is($features[0]->start, 1);
 156 is($features[0]->end, 226);
 157 my $location = $features[1]->location;
 158 ok($location->isa('Bio::Location::SplitLocationI'));
 159 my @sublocs = $location->sub_Location();
 160 is(@sublocs, 29);
 161
 162 # version and primary ID - believe it or not, this wasn't working
 163 is ($seq->version, 1);
 164 is ($seq->seq_version, 1);
 165 is ($seq->primary_id, "5734104");
 166
 167 # streaming and Bio::RichSeq creation
 168 my $stream = Bio::SeqIO->new(-file => test_input_file("test.genbank"),
 169                                       -verbose => $verbose,
 170                              -format => 'gbdriver');
 171 $stream->verbose($verbose);
 172 my $seqnum = 0;
 173 my $species;
 174 my @cl;
 175 my $lasts;
 176 my @ids = qw(DDU63596 DDU63595 HUMBDNF);
 177 my @tids = (44689, 44689, 9606);
 178 my @tnames = ("Dictyostelium discoideum","Dictyostelium discoideum",
 179                   "Homo sapiens");
 180 while($seq = $stream->next_seq()) {
 181     if($seqnum < 3) {
 182         is $seq->display_id(), $ids[$seqnum];
 183         $species = $seq->species();
 184         @cl = $species->classification();
 185         is( $species->binomial(), $tnames[$seqnum],
 186              'species parsing incorrect for genbank');
 187         is( $cl[3] ne $species->genus(), 1,
 188              'genus duplicated in genbank parsing');
 189         is( $species->ncbi_taxid, $tids[$seqnum] );
 190     }
 191     $seqnum++;
 192     $lasts = $seq;
 193 }
 194 is($seqnum, 5,'streaming');
 195 is $lasts->display_id(), "HUMBETGLOA";
 196 my ($ref) = $lasts->annotation->get_Annotations('reference');
 197 is($ref->medline, 94173918);
 198 $stream->close();
 199
 200 $stream = Bio::SeqIO->new(-file => test_input_file("test.genbank.noseq"),
 201                                   -verbose => $verbose,
 202                                   -format => 'gbdriver' );
 203 $seqnum = 0;
 204 while($seq = $stream->next_seq()) {
 205     if($seqnum < 3) {
 206         is $seq->display_id(), $ids[$seqnum];
 207     } elsif( $seq->display_id eq 'M37762') {
 208         is( ($seq->get_keywords())[0], 'neurotrophic factor');
 209     }
 210     $seqnum++;
 211 }
 212 is $seqnum, 5, "Total number of sequences in test file";
 213
 214 # fuzzy
 215 $seq = Bio::SeqIO->new( -format => 'gbdriver',
 216                                 -verbose => $verbose,
 217                         -file =>test_input_file("testfuzzy.genbank"));
 218 $seq->verbose($verbose);
 219 ok(defined($as = $seq->next_seq()));
 220
 221 @features = $as->all_SeqFeatures();
 222 is(@features,21,'Fuzzy in');
 223 my $lastfeature = pop @features;
 224 # this is a split location; the root doesn't have strand
 225 is($lastfeature->strand, undef);
 226 $location = $lastfeature->location;
 227 #$location->verbose(-1); # silence the warning of undef seq_id()
 228 # see above; splitlocs roots do not have a strand really
 229 is($location->strand, undef);
 230 is($location->start, 83202);
 231 is($location->end, 84996);
 232
 233 @sublocs = $location->sub_Location();
 234
 235 is(@sublocs, 2);
 236 my $loc = shift @sublocs;
 237 is($loc->start, 83202);
 238 is($loc->end, 83329);
 239 is($loc->strand, -1);
 240
 241 $loc = shift @sublocs;
 242 is($loc->start, 84248);
 243 is($loc->end, 84996);
 244 is($loc->strand,1);
 245
 246 my $outfile = test_output_file();
 247 $seq = Bio::SeqIO->new(-format => 'genbank',
 248                               -verbose => $verbose,
 249                        -file=> ">$outfile");
 250 $seq->verbose($verbose);
 251 ok($seq->write_seq($as),'Fuzzy out');
 252
 253 ## now genbank ##
 254 $str = Bio::SeqIO->new(-format =>'gbdriver',
 255                              -verbose => $verbose,
 256                              -file => test_input_file('BK000016-tpa.gbk'));
 257 $seq = $str->next_seq;
 258 ok(defined $seq, $seq->accession_number);
 259 ok(defined $seq->seq);
 260 is($seq->accession_number, 'BK000016',$seq->accession_number);
 261 is($seq->alphabet, 'dna');
 262 is($seq->display_id, 'BK000016');
 263 is($seq->length, 1162);
 264 is($seq->division, 'ROD');
 265 is($seq->get_dates, 1);
 266 is($seq->keywords, 'Third Party Annotation; TPA');
 267 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 268 is($seq->seq_version, 1);
 269 is($seq->feature_count, 2);
 270 my $spec_obj = $seq->species;
 271 is ($spec_obj->common_name, 'house mouse');
 272 is ($spec_obj->species, 'musculus');
 273 is ($spec_obj->genus, 'Mus');
 274 is ($spec_obj->binomial, 'Mus musculus');
 275 $ac = $seq->annotation;
 276 my $reference =  ($ac->get_Annotations('reference') )[0];
 277 is ($reference->pubmed, '11479594');
 278 is ($reference->medline, '21372465',$seq->accession_number);
 279
 280 # validate that what is written is what is read
 281 my $testfile = test_output_file();
 282 my $out = Bio::SeqIO->new(-file => ">$testfile",
 283                              -format => 'genbank');
 284 $out->write_seq($seq);
 285 $out->close();
 286
 287 $str = Bio::SeqIO->new(-format =>'gbdriver',
 288                              -file => $testfile);
 289 $seq = $str->next_seq;
 290 ok(defined $seq,'roundtrip');
 291 ok(defined $seq->seq);
 292 is($seq->accession_number, 'BK000016');
 293 is($seq->alphabet, 'dna');
 294 is($seq->display_id, 'BK000016');
 295 is($seq->length, 1162);
 296 is($seq->division, 'ROD');
 297 is($seq->get_dates, 1);
 298 is($seq->keywords, 'Third Party Annotation; TPA');
 299 is($seq->desc, 'TPA: Mus musculus pantothenate kinase 4 mRNA, partial cds.');
 300 is($seq->seq_version, 1);
 301 is($seq->feature_count, 2);
 302 $spec_obj = $seq->species;
 303 is ($spec_obj->common_name, 'house mouse');
 304 is ($spec_obj->species, 'musculus');
 305 is ($spec_obj->genus, 'Mus');
 306 is ($spec_obj->binomial, 'Mus musculus');
 307 $ac = $seq->annotation;
 308 $reference =  ($ac->get_Annotations('reference') )[0];
 309 is ($reference->pubmed, '11479594');
 310 is ($reference->medline, '21372465');
 311
 312 # write revcomp split location
 313 my $gb = Bio::SeqIO->new(-format => 'gbdriver',
 314                         -verbose => $verbose,
 315                         -file   => test_input_file('revcomp_mrna.gb'));
 316 $seq = $gb->next_seq();
 317
 318 $gb = Bio::SeqIO->new(-format => 'genbank',
 319                      -file   => ">$testfile");
 320
 321 $gb->write_seq($seq);
 322 undef $gb;
 323 ok(! -z $testfile, 'revcomp split location');
 324
 325 # bug 1925, continuation of long ORGANISM line ends up in @classification:
 326 # ORGANISM  Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC
 327 #           9150
 328 #           Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
 329 #           Enterobacteriaceae; Salmonella.
 330 $gb = Bio::SeqIO->new(-format => 'gbdriver',
 331                      -verbose => $verbose,
 332                         -file   => test_input_file('NC_006511-short.gbk'));
 333 $seq = $gb->next_seq;
 334 is $seq->species->common_name, undef, "Bug 1925";
 335 is $seq->species->scientific_name, "Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150";
 336 @class = $seq->species->classification;
 337 is $class[$#class], "Bacteria";
 338
 339 # WGS tests
 340 $gb = Bio::SeqIO->new(-format => 'gbdriver',
 341                       -verbose => $verbose,
 342                     -file   => test_input_file('O_sat.wgs'));
 343 $seq = $gb->next_seq;
 344
 345 my @tests = ('wgs'        => 'AAAA02000001-AAAA02050231',
 346             'wgs_scafld' => 'CM000126-CM000137',
 347             'wgs_scafld' => 'CH398081-CH401163');
 348
 349 my @wgs = map {$seq->annotation->get_Annotations(lc($_))} (qw(WGS WGS_SCAFLD));
 350
 351 my $ct=0;
 352
 353 for my $wgs (@wgs) {
 354     my ($tagname, $value) = (shift @tests, shift @tests);
 355     is($wgs->tagname, $tagname, $tagname);
 356     is($wgs->value, $value);
 357     $ct++;
 358 }
 359
 360 is ($ct, 3);
 361
 362 # make sure we can retrieve a feature with a primary tag of 'misc_difference'
 363 $gb = Bio::SeqIO->new(-format => 'gbdriver',
 364                      -verbose => $verbose,
 365                     -file   => test_input_file('BC000007.gbk'));
 366 $seq = $gb->next_seq;
 367 ($cds) = grep { $_->primary_tag eq 'misc_difference' } $seq->get_SeqFeatures;
 368 my @vals = $cds->get_tag_values('gene');
 369 is $vals[0], 'PX19', $seq->accession_number;
 370
 371 # Check that the source,organism section is identical between input and output.
 372 # - test an easy one where organism is species, then two different formats of
 373 # subspecies, then a species with a format that used to be mistaken for
 374 # subspecies, then a bacteria with no genus, and finally a virus with a genus.
 375
 376 # These tests are now somewhat out-of-date since we are moving to a Bio::Taxon-
 377 # based system for verifying taxonomic information.  Right now they just verify
 378 # changes so are really useless; I will change them to verify common name,
 379 # organelle, scientific name, etc.
 380
 381
 382 # output always adds a period (GenBank std), but two of these files do not use them.
 383
 384 foreach my $in ('BK000016-tpa.gbk', 'ay116458.gb', 'ay149291.gb', 'NC_006346.gb', 'ay007676.gb', 'dq519393.gb') {
 385     my $infile =  test_input_file($in);
 386         $outfile = test_output_file();
 387
 388     $str = Bio::SeqIO->new(-format =>'genbank',
 389                           -verbose => $verbose,
 390                           -file => $infile);
 391     $seq = $str->next_seq;
 392
 393     $out = Bio::SeqIO->new(-file => $outfile, -format => 'genbank');
 394     $out->write_seq($seq);
 395     $out->close();
 396
 397     open my $IN, '<', $infile or die "Could not read file '$infile': $!\n";
 398     my @in = <$IN>;
 399     close $IN;
 400     open my $RESULT, '<', $outfile or die "Could not read file '$outfile': $!\n";
 401     my $line = 0;
 402     my $check = 0;
 403     my $is = 1;
 404
 405     FILECHECK:
 406     while (my $result = <$RESULT>) {
 407         if ($result =~ /^KEYWORDS/) {
 408             $check = 1;
 409             next;
 410         }
 411
 412         if ($result =~ /^REFERENCE/) {
 413             last FILECHECK;
 414         }
 415
 416         if ($check) {
 417
 418             # end periods don't count (not all input files have them)
 419             $result =~ s{\.$}{};
 420             $in[$line] =~ s{\.$}{};
 421
 422             if ($result ne $in[$line]) {
 423                 $is = 0;
 424                 last;
 425             }
 426         }
 427     } continue { $line++ }
 428     close $RESULT;
 429
 430     ok $is, $in;
 431 }
 432
 433 # NB: there should probably be full testing on all lines to ensure that output
 434 # matches input.
 435
 436 # 20061117: problem with *double* colon in some annotation-dblink values
 437 $ct = 0;
 438
 439 foreach my $in ('P35527.gb') {
 440     my $infile =  test_input_file($in);
 441     $str = Bio::SeqIO->new(-format =>'genbank',
 442                          -verbose => $verbose,
 443                          -file => $infile);
 444     $seq = $str->next_seq;
 445     my $ac      = $seq->annotation();      # Bio::AnnotationCollection
 446     foreach my $key ($ac->get_all_annotation_keys() ) {
 447         my @values = $ac->get_Annotations($key);
 448         foreach my $ann (@values) {
 449             my $value = $ann->display_text;
 450             $ct++;
 451             if ($key eq 'dblink') {
 452
 453                 ok (index($value,'::') < 0);   # this should never be true
 454
 455                 ok ($value, $value);   # check value is not empty
 456
 457                 #  print "  ann/", sprintf('%12s  ',$key), '>>>', $value , '<<<', "\n";
 458                 #  print "        index double colon: ",index($value   ,'::'), "\n";
 459
 460                 #  check db name:
 461                 my @parts = split(/:/,$value);
 462                 if ( $parts[0] =~ /^(?:
 463                         #  not an exhaustive list of databases;
 464                         #  just the db's referenced in P35527.gb:
 465                         swissprot | GenBank | GenPept  | HSSP| IntAct | Ensembl | KEGG | HGNC | MIM | ArrayExpress
 466                                   | GO      | InterPro | Pfam| PRINTS | PROSITE
 467                                      )$/x )
 468                 {
 469                     ok 1;
 470                 }
 471                 else {
 472                     ok 0;
 473                 }
 474                     ok ( $parts[1], "$parts[0]" );
 475             }
 476                 # elsif ($key eq 'reference') { }
 477         }
 478     }
 479 }
 480
 481 is($ct, 46);
 482
 483 # bug 2195
 484
 485 $str = Bio::SeqIO->new(-format =>'gbdriver',
 486                       -verbose => $verbose,
 487                       -file => test_input_file('AF305198.gb')
 488                      );
 489
 490 $species = $str->next_seq->species;
 491
 492 is($species->scientific_name, 'Virginia creeper phytoplasma', 'Bug 2195');
 493 is(join(', ',$species->classification), 'Virginia creeper phytoplasma, '.
 494    '16SrV (Elm yellows group), Candidatus Phytoplasma, '.
 495    'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
 496    'Firmicutes, Bacteria', 'Bug 2195');
 497
 498 # bug 2569, PROJECT line support, read and write, round-tripping
 499
 500 $str = Bio::SeqIO->new(-format =>'gbdriver',
 501                       -verbose => $verbose,
 502                       -file => test_input_file('NC_008536.gb'));
 503
 504 $seq = $str->next_seq;
 505
 506 my $project = ($seq->annotation->get_Annotations('project'))[0];
 507
 508 isa_ok($project, 'Bio::Annotation::SimpleValue');
 509
 510 if ($project) {
 511         is($project->value, 'GenomeProject:12638');
 512 } else {
 513         ok(0, "PROJECT not parsed");
 514 }
 515
 516 $outfile = test_output_file();
 517
 518 $gb = Bio::SeqIO->new(-format => 'genbank',
 519                               -verbose => $verbose,
 520                        -file=> ">$outfile");
 521
 522 $gb->write_seq($seq);
 523
 524 $str = Bio::SeqIO->new(-format =>'gbdriver',
 525                       -verbose => $verbose,
 526                       -file => $outfile);
 527
 528 $seq = $str->next_seq;
 529
 530 $project = ($seq->annotation->get_Annotations('project'))[0];
 531
 532 isa_ok($project, 'Bio::Annotation::SimpleValue');
 533
 534 if ($project) {
 535         is($project->value, 'GenomeProject:12638');
 536 } else {
 537         ok(0, "Roundtrip test failed");
 538 }
 539
 540 ################################## EMBL ##################################
 541
 542 # Set to -1 for release version, so warnings aren't printed
 543
 544 $ast = Bio::SeqIO->new( -format => 'embldriver',
 545                            -verbose => $verbose,
 546                            -file => test_input_file("roa1.dat"));
 547 $ast->verbose($verbose);
 548 $as = $ast->next_seq();
 549 ok defined $as->seq;
 550 is($as->display_id, 'HSHNCPA1');
 551 is($as->accession_number, 'X79536');
 552 is($as->seq_version, 1);
 553 is($as->version, 1);
 554 is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
 555 is($as->molecule, 'RNA');
 556 is($as->alphabet, 'rna');
 557 is(scalar $as->all_SeqFeatures(), 4);
 558 is($as->length, 1198);
 559 is($as->species->binomial(), 'Homo sapiens');
 560 is($as->get_dates, 2);
 561
 562 # EMBL Release 87 changes (8-17-06)
 563
 564 $ast = Bio::SeqIO->new( -format => 'embldriver',
 565                            -verbose => $verbose,
 566                            -file => test_input_file("roa1_v2.dat"));
 567 $ast->verbose($verbose);
 568 $as = $ast->next_seq();
 569 ok defined $as->seq;
 570 # accession # same as display name now
 571 is($as->display_id, 'X79536');
 572 is($as->get_dates, 2);
 573 is($as->accession_number, 'X79536');
 574 is($as->seq_version, 1);
 575 is($as->version, 1);
 576 is($as->desc, 'H.sapiens mRNA for hnRNPcore protein A1');
 577 # mRNA instead of RNA
 578 is($as->molecule, 'mRNA');
 579 is($as->alphabet, 'rna');
 580 is(scalar $as->all_SeqFeatures(), 4);
 581 is($as->length, 1198);
 582 is($as->species->binomial(), 'Homo sapiens');
 583
 584 my $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
 585                            -format => 'embldriver');
 586 $seq = $ent->next_seq();
 587
 588 is(defined $seq->seq(), 1,
 589    'success reading Embl with ^ location and badly split double quotes');
 590 is(scalar $seq->annotation->get_Annotations('reference'), 3);
 591 is($seq->get_dates, 0);
 592
 593 $out = Bio::SeqIO->new(-file=> ">$outfile",
 594                           -format => 'embl');
 595 is($out->write_seq($seq),1,
 596    'success writing Embl format with ^ < and > locations');
 597
 598 # embl with no FT
 599 $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
 600                         -format => 'embldriver');
 601 $seq = $ent->next_seq();
 602
 603 ok($seq);
 604 is(lc($seq->subseq(1,10)),'gatcagtaga');
 605 is($seq->length, 4870);
 606
 607 # embl with no FH
 608 my $noFH = Bio::SeqIO->new(-file => test_input_file("no_FH.embl"),
 609                         -format => 'embldriver');
 610 $seq = $noFH->next_seq;
 611 is(scalar($seq->get_SeqFeatures), 4);
 612 is($seq->display_id, 'AE000001');
 613 is($seq->get_dates, 0);
 614
 615 # bug 1571
 616 $ent = Bio::SeqIO->new(-format => 'embldriver',
 617                        -file   => test_input_file('test.embl2sq'));
 618 $seq = $ent->next_seq;
 619 is($seq->length,4870);
 620 is($seq->get_dates, 0);
 621
 622 # embl repbase
 623 $ent = Bio::SeqIO->new(-file => test_input_file("BEL16-LTR_AG.embl"), -format => 'embldriver');
 624 $seq = $ent->next_seq;
 625 is($seq->display_id,'BEL16-LTR_AG');
 626 is($seq->get_dates, 2);
 627
 628 # test secondary accessions in EMBL (bug #1332)
 629 $seqio = Bio::SeqIO->new(-format => 'embldriver',
 630                            -file => test_input_file('ECAPAH02.embl'));
 631 $seq = $seqio->next_seq;
 632 is($seq->accession_number, 'D10483');
 633 is($seq->seq_version, 2);
 634 my @accs = $seq->get_secondary_accessions();
 635 is($accs[0], 'J01597');
 636 is($accs[-1], 'X56742');
 637 is($seq->get_dates, 2);
 638
 639 ### TPA TESTS - Thanks to Richard Adams ###
 640 # test Third Party Annotation entries in EMBL/Gb format
 641 # to ensure compatability with parsers.
 642 $str = Bio::SeqIO->new(-verbose => $verbose,
 643                          -format =>'embldriver',
 644                          -file => test_input_file('BN000066-tpa.embl'));
 645 $seq = $str->next_seq;
 646 ok(defined $seq);
 647 is($seq->accession_number, 'BN000066');
 648 is($seq->alphabet, 'dna');
 649 is($seq->display_id, 'AGA000066');
 650 is($seq->length, 5195);
 651 is($seq->division, 'INV');
 652 is($seq->keywords, 'acetylcholinesterase; achE1 gene; Third Party Annotation; TPA');
 653 is($seq->seq_version, 1);
 654 is($seq->feature_count, 15);
 655 is($seq->get_dates, 2);
 656
 657 $spec_obj = $seq->species;
 658 is ($spec_obj->common_name, 'African malaria mosquito');
 659 is ($spec_obj->species, 'gambiae');
 660 is ($spec_obj->genus, 'Anopheles');
 661 is ($spec_obj->binomial, 'Anopheles gambiae');
 662
 663 $ac = $seq->annotation;
 664 $reference =  ($ac->get_Annotations('reference') )[1];
 665 is ($reference->title,'"A novel acetylcholinesterase gene in mosquitoes codes for the insecticide target and is non-homologous to the ace gene in Drosophila"');
 666 is ($reference->authors,'Weill M., Fort P., Berthomi eu A., Dubois M.P., Pasteur N., Raymond M.');
 667 my $cmmnt =  ($ac->get_Annotations('comment') )[0];
 668 is($cmmnt->text, 'see also AJ488492 for achE-1 from Kisumu strain Third Party Annotation Database: This TPA record uses Anopheles gambiae trace archive data (http://trace.ensembl.org)');
 669
 670
 671 $ent = Bio::SeqIO->new( -file => test_input_file("test.embl"),
 672                         -format => 'embldriver');
 673 $ent->verbose($verbose);
 674 $seq = $ent->next_seq();
 675 $species = $seq->species();
 676 @cl = $species->classification();
 677 is( $cl[3] ne $species->genus(), 1, 'genus duplication test');
 678 $ent->close();
 679
 680 #
 681 ## read-write - test embl writing of a PrimarySeq
 682 #
 683 my $primaryseq = Bio::PrimarySeq->new( -seq => 'AGAGAGAGATA',
 684                                       -id  => 'myid',
 685                                       -desc => 'mydescr',
 686                                       -alphabet => 'DNA',
 687                                       -accession_number => 'myaccession');
 688
 689 $verbose = -1 unless $ENV{'BIOPERLDEBUG'};  # silence warnings unless we are debuggin
 690
 691 my $embl = Bio::SeqIO->new(-format => 'embl',
 692                           -verbose => $verbose,
 693                           -file => ">$outfile");
 694
 695 ok($embl->write_seq($primaryseq));
 696
 697 # this should generate a warning
 698 my $scalar = "test";
 699 eval {
 700         $embl->write_seq($scalar);
 701 };
 702 ok ($@);
 703
 704 ############################## Swiss/UniProt ##############################
 705
 706 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 707                                      -format => 'swissdriver',
 708                                      -file   => test_input_file('test.swiss'));
 709
 710 isa_ok($seqio, 'Bio::SeqIO');
 711 $seq = $seqio->next_seq;
 712 my @gns = $seq->annotation->get_Annotations('gene_name');
 713
 714 $outfile = test_output_file();
 715 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 716                                  -format => 'swiss',
 717                                  -file   => ">$outfile");
 718
 719 $seqio->write_seq($seq);
 720
 721 # reads it in once again
 722 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 723                                  -format => 'swissdriver',
 724                                  -file => $outfile);
 725
 726 $seq = $seqio->next_seq;
 727 isa_ok($seq->species, 'Bio::Species');
 728 is($seq->species->ncbi_taxid, 6239);
 729
 730 # version, seq_update, dates (5 tests)
 731 is($seq->version, 40);
 732 my ($ann) = $seq->annotation->get_Annotations('seq_update');
 733 eval {is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated')};
 734 ok(!$@);
 735
 736 my @dates = $seq->get_dates;
 737 my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
 738
 739 for my $date (@dates) {
 740     my $expdate = shift @date_check;
 741     is($date, $expdate,'dates');
 742 }
 743
 744 my @gns2 = $seq->annotation->get_Annotations('gene_name');
 745 # check gene name is preserved (was losing suffix in worm gene names)
 746 ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
 747
 748 # test swissprot multiple RP lines
 749 $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
 750 $seq = $str->next_seq;
 751 isa_ok($seq, 'Bio::Seq::RichSeqI');
 752 @refs = $seq->annotation->get_Annotations('reference');
 753 is( @refs, 23);
 754 is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
 755
 756 # version, seq_update, dates (5 tests)
 757 is($seq->version, 44);
 758 ($ann) = $seq->annotation->get_Annotations('seq_update');
 759 is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
 760 @dates = $seq->get_dates;
 761 @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
 762 for my $date (@dates) {
 763     is($date, shift @date_check);
 764 }
 765
 766 $ast = Bio::SeqIO->new(-verbose => $verbose,
 767                                   -format => 'swissdriver' ,
 768                                   -file => test_input_file("roa1.swiss"));
 769 $as = $ast->next_seq();
 770
 771 ok defined $as->seq;
 772 is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
 773 like($as->primary_id, qr(Bio::PrimarySeq));
 774 is($as->length, 371);
 775 is($as->alphabet, 'protein');
 776 is($as->division, 'HUMAN');
 777 is(scalar $as->all_SeqFeatures(), 16);
 778 is(scalar $as->annotation->get_Annotations('reference'), 11);
 779
 780 # version, seq_update, dates (5 tests)
 781 is($as->version, 35);
 782 ($ann) = $as->annotation->get_Annotations('seq_update');
 783 is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
 784 @dates = $as->get_dates;
 785 @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
 786 for my $date (@dates) {
 787     is($date, shift @date_check);
 788 }
 789
 790 ($ent,$out) = undef;
 791 ($as,$seq) = undef;
 792
 793 $seqio = Bio::SeqIO->new(-format => 'swissdriver' ,
 794                                  -verbose => $verbose,
 795                                  -file => test_input_file("swiss.dat"));
 796 $seq = $seqio->next_seq;
 797 isa_ok($seq, 'Bio::Seq::RichSeqI');
 798
 799 # more tests to verify we are actually parsing correctly
 800 like($seq->primary_id, qr(Bio::PrimarySeq));
 801 is($seq->display_id, 'MA32_HUMAN');
 802 is($seq->length, 282);
 803 is($seq->division, 'HUMAN');
 804 is($seq->alphabet, 'protein');
 805 my @f = $seq->all_SeqFeatures();
 806 is(@f, 2);
 807 is($f[1]->primary_tag, 'CHAIN');
 808 is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
 809
 810 # version, seq_update, dates (5 tests)
 811 is($seq->version, 40);
 812 ($ann) = $seq->annotation->get_Annotations('seq_update');
 813 is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
 814 @dates = $seq->get_dates;
 815 @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
 816 for my $date (@dates) {
 817     is($date, shift @date_check);
 818 }
 819
 820 my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
 821 ($ann) = $seq->annotation->get_Annotations('gene_name');
 822 # use Data::Stag findval and element name to get values/nodes
 823 foreach my $gn ( $ann->findval('Name') ) {
 824     ok ($gn, shift(@genenames));
 825 }
 826 foreach my $gn ( $ann->findval('Synonyms') ) {
 827     ok ($gn, shift(@genenames));
 828 }
 829 like($ann->value, qr/Name: GC1QBP/);
 830
 831 # test for feature locations like ?..N
 832 $seq = $seqio->next_seq();
 833 isa_ok($seq, 'Bio::Seq::RichSeqI');
 834 like($seq->primary_id, qr(Bio::PrimarySeq));
 835 is($seq->display_id, 'ACON_CAEEL');
 836 is($seq->length, 788);
 837 is($seq->division, 'CAEEL');
 838 is($seq->alphabet, 'protein');
 839 is(scalar $seq->all_SeqFeatures(), 5);
 840
 841 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
 842     ok ($gn->value, 'F54H12.1');
 843 }
 844
 845 # test species in swissprot -- this can be a n:n nightmare
 846 $seq = $seqio->next_seq();
 847 isa_ok($seq, 'Bio::Seq::RichSeqI');
 848 like($seq->primary_id, qr(Bio::PrimarySeq));
 849 @sec_acc = $seq->get_secondary_accessions();
 850 is($sec_acc[0], 'P29360');
 851 is($sec_acc[1], 'Q63631');
 852 is($seq->accession_number, 'P42655');
 853 @kw = $seq->get_keywords;
 854 is( $kw[0], 'Brain');
 855 is( $kw[1], 'Neurone');
 856 is($kw[3], 'Multigene family');
 857 is($seq->display_id, '143E_HUMAN');
 858
 859 # hybrid names from old sequences are no longer valid, these are chopped
 860 # off at the first organism
 861 is($seq->species->binomial, "Homo sapiens");
 862 is($seq->species->common_name, "Human");
 863 is($seq->species->ncbi_taxid, 9606);
 864
 865 $seq = $seqio->next_seq();
 866 isa_ok($seq, 'Bio::Seq::RichSeqI');
 867 like($seq->primary_id, qr(Bio::PrimarySeq));
 868 is($seq->species->binomial, "Bos taurus");
 869 is($seq->species->common_name, "Bovine");
 870 is($seq->species->ncbi_taxid, 9913);
 871
 872 # multiple genes in swissprot
 873 $seq = $seqio->next_seq();
 874 isa_ok($seq, 'Bio::Seq::RichSeqI');
 875 like($seq->primary_id, qr(Bio::PrimarySeq));
 876
 877 ($ann) = $seq->annotation->get_Annotations("gene_name");
 878 @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
 879 my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
 880
 881 my @names = @genenames; # copy array
 882
 883 my @ann_names = $ann->get_all_values();
 884 is(scalar(@ann_names), scalar(@names));
 885
 886 # do this in a layered way (nested tags)
 887 for my $node ($ann->findnode('gene_name')) {
 888     for my $name ($node->findval('Name')) {
 889         is($name, shift(@names));
 890     }
 891     for my $name ($node->findval('Synonyms')) {
 892         is($name, shift(@names));
 893     }
 894 }
 895
 896 is(scalar(@names),0);
 897
 898 # same entry as before, but with the new gene names format
 899 $seqio = Bio::SeqIO->new(-format => 'swissdriver',
 900                                  -verbose => $verbose,
 901                          -file => test_input_file("calm.swiss"));
 902 $seq = $seqio->next_seq();
 903 isa_ok($seq, 'Bio::Seq::RichSeqI');
 904 like($seq->primary_id, qr(Bio::PrimarySeq));
 905
 906 ($ann) = $seq->annotation->get_Annotations("gene_name");
 907 @names = @genenames; # copy array
 908
 909 my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
 910 is(scalar(@ann_names2), scalar(@names));
 911
 912 for my $node ($ann->findnode('gene_name')) {
 913     for my $name ($node->findval('Name')) {
 914         is($name, shift(@names));
 915     }
 916     for my $name ($node->findval('Synonyms')) {
 917         is($name, shift(@names));
 918     }
 919 }
 920
 921 is(scalar(@names),0);
 922
 923 # test proper parsing of references
 924 my @litrefs = $seq->annotation->get_Annotations('reference');
 925 is(scalar(@litrefs), 17);
 926
 927 my @titles = (
 928     '"Complete amino acid sequence of human brain calmodulin."',
 929     '"Multiple divergent mRNAs code for a single human calmodulin."',
 930     '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
 931     '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
 932     '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
 933     undef,
 934     '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
 935     '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
 936     '"The DNA sequence and analysis of human chromosome 14."',
 937     '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
 938     '"Alpha-helix nucleation by a calcium-binding peptide loop."',
 939     '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
 940     '"Calmodulin structure refined at 1.7 A resolution."',
 941     '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
 942     '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
 943     '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
 944     '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
 945 );
 946
 947 my @locs = (
 948     "Biochemistry 21:2565-2569(1982).",
 949     "J. Biol. Chem. 263:17055-17062(1988).",
 950     "J. Biol. Chem. 262:16663-16670(1987).",
 951     "Biochem. Int. 9:177-185(1984).",
 952     "Eur. J. Biochem. 225:71-82(1994).",
 953     "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
 954     "Cell Calcium 23:323-338(1998).",
 955     "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
 956     "Nature 421:601-607(2003).",
 957     "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
 958     "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
 959     "Nat. Struct. Biol. 8:990-997(2001).",
 960     "J. Mol. Biol. 228:1177-1192(1992).",
 961     "Biochemistry 33:15259-15265(1994).",
 962     "Nature 415:396-402(2002).",
 963     "EMBO J. 21:6721-6732(2002).",
 964     "Nat. Struct. Biol. 10:226-231(2003).",
 965 );
 966
 967 my @positions = (
 968      undef, undef,
 969     undef, undef,
 970     undef, undef,
 971     undef, undef,
 972     undef, undef,
 973     undef, undef,
 974     undef, undef,
 975     undef, undef,
 976     undef, undef,
 977     undef, undef,
 978     94, 103,
 979     1, 76,
 980     undef, undef,
 981     undef, undef,
 982     5, 148,
 983     1, 148,
 984     undef, undef,
 985 );
 986
 987 foreach my $litref (@litrefs) {
 988     is($litref->title, shift(@titles));
 989     is($litref->location, shift(@locs));
 990     is($litref->start, shift(@positions));
 991     is($litref->end, shift(@positions));
 992 }
 993
 994 # format parsing changes (pre-rel 9.0)
 995
 996 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 997                          -format => 'swissdriver',
 998                          -file   => test_input_file('pre_rel9.swiss'));
 999
1000 ok($seqio);
1001 $seq = $seqio->next_seq;
1002 isa_ok($seq->species, 'Bio::Species');
1003 is($seq->species->ncbi_taxid, "6239");
1004
1005 # version, seq_update, dates (5 tests)
1006 is($seq->version, 44);
1007 ($ann) = $seq->annotation->get_Annotations('seq_update');
1008 is($ann->display_text, 1);
1009 @dates = $seq->get_dates;
1010 @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
1011 for my $date (@dates) {
1012     is($date, shift @date_check);
1013 }
1014
1015 my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
1016                  F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
1017                  IPR006092 IPR009075 IPR009100 IPR013764 PF00441
1018                  PF02770 PF02771 PS00072 PS00073);
1019
1020 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
1021     is($dblink->primary_id, shift @idcheck);
1022 }
1023
1024 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1025                          -format => 'swissdriver',
1026                          -file   => test_input_file('pre_rel9.swiss'));
1027
1028 my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
1029
1030 while (my $seq = $seqio->next_seq) {
1031     is($seq->namespace, shift @namespaces);
1032 }
1033
1034 # format parsing changes (rel 9.0, Oct 2006)
1035
1036 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1037                          -format => 'swissdriver',
1038                          -file   => test_input_file('rel9.swiss'));
1039
1040 ok($seqio);
1041 $seq = $seqio->next_seq;
1042 isa_ok($seq->species, 'Bio::Species');
1043 is($seq->species->ncbi_taxid, 6239);
1044
1045 is($seq->version, 47);
1046 ($ann) = $seq->annotation->get_Annotations('seq_update');
1047 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
1048 @dates = $seq->get_dates;
1049 @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
1050 for my $date (@dates) {
1051     is($date, shift @date_check);
1052 }
1053
1054 @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
1055          WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
1056          IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
1057          PF02771 PS00072 PS00073 );
1058
1059 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
1060     is($dblink->primary_id, shift @idcheck);
1061 }
1062
1063 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1064                          -format => 'swissdriver',
1065                          -file   => test_input_file('rel9.swiss'));
1066
1067 @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
1068
1069 while (my $seq = $seqio->next_seq) {
1070     is($seq->namespace, shift @namespaces);
1071 }
1072
1073 # bug 2288
1074 # Q8GBD3.swiss
1075 $seqio = Bio::SeqIO->new( -verbose => $verbose,
1076                          -format => 'swiss',
1077                          -file   => test_input_file('Q8GBD3.swiss'));
1078
1079 while (my $seq = $seqio->next_seq) {
1080     my $lineage = join(';', $seq->species->classification);
1081         is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
1082                 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
1083                 'Proteobacteria;Bacteria');
1084 }
1085
1086 # test for GenBank swissprot/UniProt/UniProtKB DBSOURCE line (Bug : RT 44536)
1087 $ast = Bio::SeqIO->new(-format => 'genbank',
1088                               -verbose => $verbose,
1089                        -file => test_input_file('P39765.gb'));
1090 $ast->verbose($verbose);
1091 $as = $ast->next_seq();
1092 is $as->molecule, 'PRT',$as->accession_number;;
1093 is $as->alphabet, 'protein';
1094 # Though older GenBank releases indicate SOURCE contains only the common name,
1095 # this is no longer true.  In general, this line will contain an abbreviated
1096 # form of the full organism name (but may contain the full length name),
1097 # as well as the optional common name and organelle.  There is no get/set
1098 # for the abbreviated name but it is accessible via name()
1099 ok defined($as->species->name('abbreviated')->[0]);
1100 is $as->species->name('abbreviated')->[0], 'Bacillus subtilis';
1101 is($as->primary_id, 20141743);
1102 $ac = $as->annotation;
1103 ok defined $ac;
1104 @dblinks = $ac->get_Annotations('dblink');
1105 is(scalar @dblinks,31);
1106 is($dblinks[0]->database, 'UniProtKB');
1107 is($dblinks[0]->primary_id, 'PYRR_BACSU');
1108 is($dblinks[0]->version, undef);
1109 is($dblinks[0]->display_text, 'UniProtKB:PYRR_BACSU','operator overloading in AnnotationI is deprecated');