t/SeqIO/swiss.t

   1 # -*-Perl-*- Test Harness script for Bioperl
   2 # $Id$
   3
   4 use strict;
   5
   6 BEGIN {
   7     use Bio::Root::Test;
   8
   9     test_begin(-tests           => 247,
  10                -requires_module => 'Data::Stag');
  11
  12     use_ok('Bio::SeqIO::swiss');
  13 }
  14
  15 use Bio::Annotation::SimpleValue;
  16
  17 my $verbose = test_debug();
  18
  19 my $seqio = Bio::SeqIO->new( -verbose => $verbose,
  20                                      -format => 'swiss',
  21                                      -file   => test_input_file('test.swiss'));
  22
  23 isa_ok($seqio, 'Bio::SeqIO');
  24 my $seq = $seqio->next_seq;
  25 my @gns = $seq->annotation->get_Annotations('gene_name');
  26
  27 my $outfile = test_output_file();
  28 $seqio = Bio::SeqIO->new( -verbose => $verbose,
  29                                  -format => 'swiss',
  30                                  -file   => ">$outfile");
  31
  32 $seqio->write_seq($seq);
  33
  34 # reads it in once again
  35 $seqio = Bio::SeqIO->new( -verbose => $verbose,
  36                                  -format => 'swiss',
  37                                  -file => $outfile);
  38
  39 $seq = $seqio->next_seq;
  40 isa_ok($seq->species, 'Bio::Species');
  41 is($seq->species->ncbi_taxid, 6239);
  42
  43 # version, seq_update, dates (5 tests)
  44 is($seq->version, 40);
  45 my ($ann) = $seq->annotation->get_Annotations('seq_update');
  46 is($ann->display_text, 35,'operator overloading in AnnotationI is deprecated');
  47
  48 my @dates = $seq->get_dates;
  49 my @date_check = qw(01-NOV-1997 01-NOV-1997 16-OCT-2001);
  50
  51 for my $date (@dates) {
  52     my $expdate = shift @date_check;
  53     if ($expdate) {
  54         is($date, $expdate,'dates');
  55     } else {
  56         is($date, $expdate);
  57     }
  58 }
  59
  60 my @gns2 = $seq->annotation->get_Annotations('gene_name');
  61 # check gene name is preserved (was losing suffix in worm gene names)
  62 ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
  63
  64 # test swissprot multiple RP lines
  65 my $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
  66 $seq = $str->next_seq;
  67 isa_ok($seq, 'Bio::Seq::RichSeqI');
  68 my @refs = $seq->annotation->get_Annotations('reference');
  69 is( @refs, 23);
  70 is($refs[20]->rp, 'VARIANTS X-ALD LEU-98; ASP-99; GLU-217; GLN-518; ASP-608; ILE-633 AND PRO-660, AND VARIANT THR-13.');
  71
  72 # version, seq_update, dates (5 tests)
  73 is($seq->version, 44);
  74 ($ann) = $seq->annotation->get_Annotations('seq_update');
  75 is($ann->display_text, 28,'operator overloading in AnnotationI is deprecated');
  76 @dates = $seq->get_dates;
  77 @date_check = qw(01-FEB-1994 01-FEB-1994 15-JUN-2004);
  78 for my $date (@dates) {
  79     is($date, shift @date_check);
  80 }
  81
  82 my $ast = Bio::SeqIO->new(-verbose => $verbose,
  83                                   -format => 'swiss' ,
  84                                   -file => test_input_file('roa1.swiss'));
  85 my $as = $ast->next_seq();
  86
  87 ok defined $as->seq;
  88 is($as->id, 'ROA1_HUMAN', "id is ".$as->id);
  89 like($as->primary_id, qr(Bio::PrimarySeq));
  90 is($as->length, 371);
  91 is($as->alphabet, 'protein');
  92 is($as->division, 'HUMAN');
  93 is(scalar $as->all_SeqFeatures(), 16);
  94 is(scalar $as->annotation->get_Annotations('reference'), 11);
  95
  96 # version, seq_update, dates (6 tests)
  97 is($as->version, 35);
  98 ($ann) = $as->annotation->get_Annotations('seq_update');
  99 is($ann->display_text, 15,'operator overloading in AnnotationI is deprecated');
 100 @dates = $as->get_dates;
 101 @date_check = qw(01-MAR-1989 01-AUG-1990 01-NOV-1997);
 102 for my $date (@dates) {
 103     is($date, shift @date_check);
 104 }
 105 ($ann) = $as->annotation->get_Annotations('evidence');
 106 is($ann->value,"1: Evidence at protein level");
 107
 108
 109 my ($ent,$out) = undef;
 110 ($as,$seq) = undef;
 111
 112 $seqio = Bio::SeqIO->new(-format => 'swiss' ,
 113                                  -verbose => $verbose,
 114                                  -file => test_input_file('swiss.dat'));
 115 $seq = $seqio->next_seq;
 116 isa_ok($seq, 'Bio::Seq::RichSeqI');
 117
 118 # more tests to verify we are actually parsing correctly
 119 like($seq->primary_id, qr(Bio::PrimarySeq));
 120 is($seq->display_id, 'MA32_HUMAN');
 121 is($seq->length, 282);
 122 is($seq->division, 'HUMAN');
 123 is($seq->alphabet, 'protein');
 124 my @f = $seq->all_SeqFeatures();
 125 is(@f, 2);
 126 is($f[1]->primary_tag, 'CHAIN');
 127 is(($f[1]->get_tag_values('description'))[0], 'COMPLEMENT COMPONENT 1, Q SUBCOMPONENT BINDING PROTEIN');
 128
 129 # version, seq_update, dates (5 tests)
 130 is($seq->version, 40);
 131 ($ann) = $seq->annotation->get_Annotations('seq_update');
 132 is($ann->display_text, 31,'operator overloading in AnnotationI is deprecated');
 133 @dates = $seq->get_dates;
 134 @date_check = qw(01-FEB-1995 01-FEB-1995 01-OCT-2000);
 135 for my $date (@dates) {
 136     is($date, shift @date_check);
 137 }
 138
 139 my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
 140 ($ann) = $seq->annotation->get_Annotations('gene_name');
 141 # use Data::Stag findval and element name to get values/nodes
 142 foreach my $gn ( $ann->findval('Name') ) {
 143     ok ($gn, shift(@genenames));
 144 }
 145 foreach my $gn ( $ann->findval('Synonyms') ) {
 146     ok ($gn, shift(@genenames));
 147 }
 148 like($ann->value, qr/Name: GC1QBP/);
 149
 150 # test for feature locations like ?..N
 151 $seq = $seqio->next_seq();
 152 isa_ok($seq, 'Bio::Seq::RichSeqI');
 153 like($seq->primary_id, qr(Bio::PrimarySeq));
 154 is($seq->display_id, 'ACON_CAEEL');
 155 is($seq->length, 788);
 156 is($seq->division, 'CAEEL');
 157 is($seq->alphabet, 'protein');
 158 is(scalar $seq->all_SeqFeatures(), 5);
 159
 160 foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
 161     ok ($gn->value, 'F54H12.1');
 162 }
 163
 164 # test species in swissprot -- this can be a n:n nightmare
 165 $seq = $seqio->next_seq();
 166 isa_ok($seq, 'Bio::Seq::RichSeqI');
 167 like($seq->primary_id, qr(Bio::PrimarySeq));
 168 my @sec_acc = $seq->get_secondary_accessions();
 169 is($sec_acc[0], 'P29360');
 170 is($sec_acc[1], 'Q63631');
 171 is($seq->accession_number, 'P42655');
 172 my @kw = $seq->get_keywords;
 173 is( $kw[0], 'Brain');
 174 is( $kw[1], 'Neurone');
 175 is($kw[3], 'Multigene family');
 176 is($seq->display_id, '143E_HUMAN');
 177 is($seq->species->binomial, "Homo sapiens");
 178 is($seq->species->common_name, "Human");
 179 is($seq->species->ncbi_taxid, 9606);
 180
 181 $seq = $seqio->next_seq();
 182 isa_ok($seq, 'Bio::Seq::RichSeqI');
 183 like($seq->primary_id, qr(Bio::PrimarySeq));
 184 is($seq->species->binomial, "Bos taurus");
 185 is($seq->species->common_name, "Bovine");
 186 is($seq->species->ncbi_taxid, 9913);
 187
 188 # multiple genes in swissprot
 189 $seq = $seqio->next_seq();
 190 isa_ok($seq, 'Bio::Seq::RichSeqI');
 191 like($seq->primary_id, qr(Bio::PrimarySeq));
 192
 193 ($ann) = $seq->annotation->get_Annotations("gene_name");
 194 @genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3 CAMC);
 195 my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR CAMB) AND (CALM3 OR CAM3 OR CAMC)";
 196
 197 my @names = @genenames; # copy array
 198
 199 my @ann_names = $ann->get_all_values();
 200 is(scalar(@ann_names), scalar(@names));
 201
 202 # do this in a layered way (nested tags)
 203 for my $node ($ann->findnode('gene_name')) {
 204     for my $name ($node->findval('Name')) {
 205         is($name, shift(@names));
 206     }
 207     for my $name ($node->findval('Synonyms')) {
 208         is($name, shift(@names));
 209     }
 210 }
 211
 212 is(scalar(@names),0);
 213
 214 # same entry as before, but with the new gene names format
 215 $seqio = Bio::SeqIO->new(-format => 'swiss',
 216                                  -verbose => $verbose,
 217                          -file => test_input_file('calm.swiss'));
 218 $seq = $seqio->next_seq();
 219 isa_ok($seq, 'Bio::Seq::RichSeqI');
 220 like($seq->primary_id, qr(Bio::PrimarySeq));
 221 ($ann) = $seq->annotation->get_Annotations("gene_name");
 222 @names = @genenames; # copy array
 223
 224 my @ann_names2 = $ann->get_all_values(); #emulate StructuredValue's flattened array
 225 is(scalar(@ann_names2), scalar(@names));
 226
 227 for my $node ($ann->findnode('gene_name')) {
 228     for my $name ($node->findval('Name')) {
 229         is($name, shift(@names));
 230     }
 231     for my $name ($node->findval('Synonyms')) {
 232         is($name, shift(@names));
 233     }
 234 }
 235
 236 is(scalar(@names),0);
 237
 238 # test proper parsing of references
 239 my @litrefs = $seq->annotation->get_Annotations('reference');
 240 is(scalar(@litrefs), 17);
 241
 242 my @titles = (
 243     '"Complete amino acid sequence of human brain calmodulin."',
 244     '"Multiple divergent mRNAs code for a single human calmodulin."',
 245     '"Molecular analysis of human and rat calmodulin complementary DNA clones. Evidence for additional active genes in these species."',
 246     '"Isolation and nucleotide sequence of a cDNA encoding human calmodulin."',
 247     '"Structure of the human CALM1 calmodulin gene and identification of two CALM1-related pseudogenes CALM1P1 and CALM1P2."',
 248     undef,
 249     '"Characterization of the human CALM2 calmodulin gene and comparison of the transcriptional activity of CALM1, CALM2 and CALM3."',
 250     '"Cloning of human full-length CDSs in BD Creator(TM) system donor vector."',
 251     '"The DNA sequence and analysis of human chromosome 14."',
 252     '"Generation and initial analysis of more than 15,000 full-length human and mouse cDNA sequences."',
 253     '"Alpha-helix nucleation by a calcium-binding peptide loop."',
 254     '"Solution structure of Ca(2+)-calmodulin reveals flexible hand-like properties of its domains."',
 255     '"Calmodulin structure refined at 1.7 A resolution."',
 256     '"Drug binding by calmodulin: crystal structure of a calmodulin-trifluoperazine complex."',
 257     '"Structural basis for the activation of anthrax adenylyl cyclase exotoxin by calmodulin."',
 258     '"Physiological calcium concentrations regulate calmodulin binding and catalysis of adenylyl cyclase exotoxins."',
 259     '"Crystal structure of a MARCKS peptide containing the calmodulin-binding domain in complex with Ca2+-calmodulin."',
 260 );
 261
 262 my @locs = (
 263     "Biochemistry 21:2565-2569(1982).",
 264     "J. Biol. Chem. 263:17055-17062(1988).",
 265     "J. Biol. Chem. 262:16663-16670(1987).",
 266     "Biochem. Int. 9:177-185(1984).",
 267     "Eur. J. Biochem. 225:71-82(1994).",
 268     "Submitted (FEB-1995) to the EMBL/GenBank/DDBJ databases.",
 269     "Cell Calcium 23:323-338(1998).",
 270     "Submitted (MAY-2003) to the EMBL/GenBank/DDBJ databases.",
 271     "Nature 421:601-607(2003).",
 272     "Proc. Natl. Acad. Sci. U.S.A. 99:16899-16903(2002).",
 273     "Proc. Natl. Acad. Sci. U.S.A. 96:903-908(1999).",
 274     "Nat. Struct. Biol. 8:990-997(2001).",
 275     "J. Mol. Biol. 228:1177-1192(1992).",
 276     "Biochemistry 33:15259-15265(1994).",
 277     "Nature 415:396-402(2002).",
 278     "EMBO J. 21:6721-6732(2002).",
 279     "Nat. Struct. Biol. 10:226-231(2003).",
 280 );
 281
 282 my @positions = (
 283      undef, undef,
 284     undef, undef,
 285     undef, undef,
 286     undef, undef,
 287     undef, undef,
 288     undef, undef,
 289     undef, undef,
 290     undef, undef,
 291     undef, undef,
 292     undef, undef,
 293     94, 103,
 294     1, 76,
 295     undef, undef,
 296     undef, undef,
 297     5, 148,
 298     1, 148,
 299     undef, undef,
 300 );
 301
 302 foreach my $litref (@litrefs) {
 303     is($litref->title, shift(@titles));
 304     is($litref->location, shift(@locs));
 305     is($litref->start, shift(@positions));
 306     is($litref->end, shift(@positions));
 307 }
 308
 309 # format parsing changes (pre-rel 9.0)
 310
 311 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 312                          -format => 'swiss',
 313                          -file   => test_input_file('pre_rel9.swiss'));
 314
 315 ok($seqio);
 316 $seq = $seqio->next_seq;
 317 isa_ok($seq->species, 'Bio::Species');
 318 is($seq->species->ncbi_taxid, "6239");
 319
 320 # version, seq_update, dates (5 tests)
 321 is($seq->version, 44);
 322 ($ann) = $seq->annotation->get_Annotations('seq_update');
 323 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
 324 @dates = $seq->get_dates;
 325 @date_check = qw(01-NOV-1997 01-NOV-1996 30-MAY-2006 );
 326 for my $date (@dates) {
 327     is($date, shift @date_check);
 328 }
 329
 330 my @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 WBGene00010052
 331                  F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
 332                  IPR006092 IPR009075 IPR009100 IPR013764 PF00441
 333                  PF02770 PF02771 PS00072 PS00073);
 334
 335 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
 336     is($dblink->primary_id, shift @idcheck);
 337 }
 338
 339 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 340                          -format => 'swiss',
 341                          -file   => test_input_file('pre_rel9.swiss'));
 342
 343 my @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
 344
 345 while (my $seq = $seqio->next_seq) {
 346     is($seq->namespace, shift @namespaces);
 347 }
 348
 349 # format parsing changes (rel 9.0, Oct 2006)
 350
 351 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 352                          -format => 'swiss',
 353                          -file   => test_input_file('rel9.swiss'));
 354
 355 ok($seqio);
 356 $seq = $seqio->next_seq;
 357 isa_ok($seq->species, 'Bio::Species');
 358 is($seq->species->ncbi_taxid, 6239);
 359
 360 is($seq->version, 47);
 361 ($ann) = $seq->annotation->get_Annotations('seq_update');
 362 is($ann->display_text, 1,'operator overloading in AnnotationI is deprecated');
 363 @dates = $seq->get_dates;
 364 @date_check = qw(01-NOV-1997 01-NOV-1996 31-OCT-2006 );
 365 for my $date (@dates) {
 366     is($date, shift @date_check);
 367 }
 368
 369 @idcheck = qw(Z66513 T22647 Cel.30446 Q06319 Q20772 F54D5.7 cel:F54D5.7
 370          WBGene00010052 F54D5.7 GO:0005515 IPR006089 IPR006091 IPR006090
 371          IPR006092 IPR009075 IPR013786 IPR009100 IPR013764 PF00441 PF02770
 372          PF02771 PS00072 PS00073 );
 373
 374 for my $dblink ( $seq->annotation->get_Annotations('dblink') ) {
 375     is($dblink->primary_id, shift @idcheck);
 376 }
 377
 378 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 379                          -format => 'swiss',
 380                          -file   => test_input_file('rel9.swiss'));
 381
 382 @namespaces = qw(Swiss-Prot TrEMBL TrEMBL);
 383
 384 while (my $seq = $seqio->next_seq) {
 385     is($seq->namespace, shift @namespaces);
 386 }
 387
 388 # bug 2288
 389 # Q8GBD3.swiss
 390 $seqio = Bio::SeqIO->new( -verbose => $verbose,
 391                          -format => 'swiss',
 392                          -file   => test_input_file('Q8GBD3.swiss'));
 393
 394 while (my $seq = $seqio->next_seq) {
 395     my $lineage = join(';', $seq->species->classification);
 396         is ($lineage, 'Acetobacter aceti;Acetobacter subgen. Acetobacter;'.
 397                 'Acetobacter;Acetobacteraceae;Rhodospirillales;Alphaproteobacteria;'.
 398                 'Proteobacteria;Bacteria');
 399 }
 400
 401 # Test for roundtrippability swiss->fasta->swiss
 402 # 1. Swiss -> Fasta
 403 $seqio = Bio::SeqIO->new(
 404     -verbose => $verbose,
 405     -format  => 'swiss',
 406     -file    => test_input_file('test.swiss'),
 407 );
 408 my $fasta_output = test_output_file();
 409 my $seqio_out = Bio::SeqIO->new(
 410     -verbose => $verbose,
 411     -format  => 'fasta',
 412     -file    => ">$fasta_output",
 413 );
 414
 415 my $seq_first = $seqio->next_seq();
 416 $seqio_out->write_seq( $seq_first );
 417
 418 # 2. Fasta -> Swiss
 419 my $swiss_output = test_output_file();
 420 $seqio = Bio::SeqIO->new(
 421     -verbose => $verbose,
 422     -format  => 'fasta',
 423     -file    => $fasta_output,
 424 );
 425 $seqio_out = Bio::SeqIO->new(
 426     -verbose => $verbose,
 427     -format  => 'swiss',
 428     -file    => ">$swiss_output",
 429 );
 430 my $seq_second = $seqio->next_seq();
 431 is( $seq_second->id,  $seq_first->id,  'Converting to fasta seqids match');
 432 is( $seq_second->seq, $seq_first->seq, 'Converting to fasta sequences match');
 433 $seqio_out->write_seq( $seq_second );
 434
 435 # 3. Check that we can open and read the resulting swiss-prot file
 436
 437 $seqio = Bio::SeqIO->new(
 438     -verbose => $verbose,
 439     -format  => 'swiss',
 440     -file    => $swiss_output,
 441 );
 442 my $seq_third;
 443 SKIP: {
 444     skip "Can't parse generated swissprot file", 1
 445         unless lives_ok( sub {$seq_third = $seqio->next_seq()}, 'Can parse generated swiss' );
 446     is( $seq_third->id,  $seq_first->id,  'Roundtrip, seqids match');
 447     is( $seq_third->seq, $seq_first->seq, 'Roundtrip, sequences match');
 448 };
 449
 450 # bug 3153
 451
 452 # the default type for gene_name is Bio::Annotation::TagTree, but we need to
 453 # allow Bio::Annotation::SimpleValue as well for output (even though we will not
 454 # support parsing it)
 455
 456 $seqio = Bio::SeqIO->new(-format => 'swiss',
 457                          -file =>  test_input_file('test.swiss'));
 458
 459 $seq = $seqio->next_seq;
 460
 461 $seq->annotation->remove_Annotations('gene_name');
 462
 463 $seq->add_Annotation('gene_name',
 464         Bio::Annotation::SimpleValue->new(-name   => 'foo', -value  => 'bar'));
 465
 466 $outfile = test_output_file();
 467
 468 my $seqout = Bio::SeqIO->new(-format => 'swiss',
 469                              -file   => ">$outfile");
 470
 471 lives_ok {$seqout->write_seq($seq)};
 472
 473 $seqout->close;
 474
 475 open my $swissfh, '<', $outfile or die "Could not read file '$outfile': $!\n";
 476
 477 my $seen_gn;
 478 while (<$swissfh>) {
 479     if (/^GN\s+(\S+)/) {
 480         $seen_gn = $1;
 481         last
 482     }
 483 }
 484 close $swissfh;
 485
 486 is $seen_gn, 'bar';