4 test_begin( -tests => 109,
5 -requires_modules => [qw(Bio::DB::Fasta Bio::SeqIO)] );
11 my $DEBUG = test_debug();
14 # Test Bio::DB::Fasta, but also the underlying module, Bio::DB::IndexedBase
16 my $test_dir = setup_temp_dir('dbfa');
17 my $test_file = test_input_file('dbfa', 'mixed_alphabet.fasta');
19 test_input_file('dbfa', 'mixed_alphabet.fasta'),
20 test_input_file('dbfa', '6.fa')
25 # Test basic functionalities
26 ok my $db = Bio::DB::Fasta->new($test_dir, -reindex => 1), 'Index a directory';
27 is $db->glob, '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
28 isa_ok $db, 'Bio::DB::Fasta';
29 is $db->length('CEESC13F'), 389;
30 is $db->seq('CEESC13F:1,10'), 'cttgcttgaa';
31 is $db->seq('CEESC13F:1-10'), 'cttgcttgaa';
32 is $db->seq('CEESC13F:1..10'), 'cttgcttgaa';
33 is $db->seq('CEESC13F:1..10/1'), 'cttgcttgaa';
34 is $db->seq('CEESC13F:1..10/+1'), 'cttgcttgaa';
35 is $db->seq('CEESC13F:1..10/-1'), 'ttcaagcaag';
36 is $db->seq('CEESC13F/1'), 'cttgcttgaaaaatttatataaatatttaagagaagaaaaataaataatcgcatctaatgacgtctgtccttgtatccctggtttccattgactggtgcactttcctgtctttgaggacatggacaatattcggcatcagttcctggctctccctcctctcctggtgctccagcagaaccgttctctccattatctcccttgtctccacgtggtccacgctctcctggtgctcctggaataccttgagctccctcgtgccgaattcctgcagcccgggggatccactagttctagagcggccgccaccgcggtgggagctccagcttttgttncctttagtgagggttaatttcgagcttggcgtaatcatggtcatagctgtttcctg';
37 is $db->seq('CEESC13F/-1'), 'caggaaacagctatgaccatgattacgccaagctcgaaattaaccctcactaaaggnaacaaaagctggagctcccaccgcggtggcggccgctctagaactagtggatcccccgggctgcaggaattcggcacgagggagctcaaggtattccaggagcaccaggagagcgtggaccacgtggagacaagggagataatggagagaacggttctgctggagcaccaggagaggagggagagccaggaactgatgccgaatattgtccatgtcctcaaagacaggaaagtgcaccagtcaatggaaaccagggatacaaggacagacgtcattagatgcgattatttatttttcttctcttaaatatttatataaatttttcaagcaag';
38 is $db->seq('AW057119', 1, 10), 'tcatgttggc';
39 is $db->seq('AW057119', 1, 10, 1), 'tcatgttggc';
40 is $db->seq('AW057119', 1, 10, -1), 'gccaacatga';
41 is $db->seq('AW057119', 10, 1), 'gccaacatga';
42 is $db->seq('AW057119', 10, 1, -1), 'tcatgttggc';
43 is $db->header('AW057119'), 'AW057119 test description';
44 is $db->seq('foobarbaz'), undef;
45 is $db->get_Seq_by_id('foobarbaz'), undef;
46 is $db->file('AW057119'), '1.fa';
47 is $db->file('AW057410'), '3.fa';
48 is $db->file('CEESC13F'), '6.fa';
50 # Bio::DB::RandomAccessI and Bio::DB::SeqI methods
51 ok my $primary_seq = $db->get_Seq_by_id('AW057119');
52 ok $primary_seq = $db->get_Seq_by_acc('AW057119');
53 ok $primary_seq = $db->get_Seq_by_version('AW057119');
54 ok $primary_seq = $db->get_Seq_by_primary_id('AW057119');
55 isa_ok $primary_seq, 'Bio::PrimarySeq::Fasta';
56 isa_ok $primary_seq, 'Bio::PrimarySeqI';
58 # Bio::PrimarySeqI methods
59 is $primary_seq->id, 'AW057119';
60 is $primary_seq->display_id, 'AW057119';
61 like $primary_seq->primary_id, qr/^Bio::PrimarySeq::Fasta=HASH/;
62 is $primary_seq->alphabet, 'dna';
63 is $primary_seq->accession_number, 'unknown';
64 is $primary_seq->is_circular, undef;
65 is $primary_seq->subseq(11, 20), 'ttctcggggt';
66 is $primary_seq->description, 'test description', 'bug 3126';
67 is $primary_seq->seq, 'tcatgttggcttctcggggtttttatggattaatacattttccaaacgattctttgcgccttctgtggtgccgccttctccgaaggaactgacgaaaaatgacgtggatttgctgacaaatccaggcgaggaatatttggacggattgatgaaatggcacggcgacgagcgacccgtgttcaaaagagaggacatttatcgttggtcggatagttttccagaatatcggctaagaatgatttgtctgaaagacacgacaagggtcattgcagtcggtcaatattgttactttgatgctctgaaagaaaggagagcagccattgttcttcttaggattgggatggacggatcctgaatatcgtaatcgggcagttatggagcttcaagcttcgatggcgctggaggagagggatcggtatccgactgccaacgcggcatcgcatccaaataagttcatgaaacgattttggcacatattcaacggcctcaaagagcacgaggacaaaggtcacaaggctgccgctgtttcatacaagagcttctacgacctcanagacatgatcattcctgaaaatctggatgtcagtggtattactgtaaatgatgcacgaaaggtgccacaaagagatataatcaactacgatcaaacatttcatccatatcatcgagaaatggttataatttctcacatgtatgacaatgatgggtttggaaaagtgcgtatgatgaggatggaaatgtacttggaattgtctagcgatgtctttanaccaacaagactgcacattagtcaattatgcagatagcc';
68 ok my $trunc = $primary_seq->trunc(11,20);
69 isa_ok $trunc, 'Bio::PrimarySeq::Fasta';
70 isa_ok $trunc, 'Bio::PrimarySeqI';
71 is $trunc->length, 10;
72 is $trunc->seq, 'ttctcggggt';
73 ok my $rev = $trunc->revcom;
74 isa_ok $rev, 'Bio::PrimarySeq::Fasta';
75 isa_ok $rev, 'Bio::PrimarySeqI';
76 is $rev->seq, 'accccgagaa';
82 # Re-open an existing index.
83 # Doing this test properly involves unloading and reloading Bio::DB::Fasta.
86 test_skip(-tests => 1, -requires_modules => [qw(Class::Unload)]);
87 use_ok('Class::Unload');
88 Class::Unload->unload( 'Bio::DB::Fasta' );
89 Class::Unload->unload( 'Bio::DB::IndexedBase' );
90 require Bio::DB::Fasta;
93 ok my $db = Bio::DB::Fasta->new($test_dir), 'Re-open an existing index';
94 is $db->seq('AW057119', 1, 10), 'tcatgttggc';
99 # Test tied hash access
101 ok tie(%h, 'Bio::DB::Fasta', $test_dir), 'Tied hash access';
102 ok exists $h{'AW057146'};
103 is $h{'AW057146:1,10'} , 'aatgtgtaca'; # in file 1.fa
104 is $h{'AW057146:10,1'} , 'tgtacacatt'; # reverse complement
105 is $h{'AW057443:11,20'}, 'gaaccgtcag'; # in file 4.fa
110 # Test writing the Bio::PrimarySeq::Fasta objects with SeqIO
111 ok my $db = Bio::DB::Fasta->new($test_dir, -reindex => 1), 'Writing with SeqIO';
112 my $out = Bio::SeqIO->new(
113 -format => 'genbank',
114 -file => '>'.test_output_file()
116 my $primary_seq = Bio::Seq->new(-primary_seq => $db->get_Seq_by_acc('AW057119'));
118 $out->write_seq($primary_seq)
122 $out = Bio::SeqIO->new(-format => 'embl', -file => '>'.test_output_file());
124 $out->write_seq($primary_seq)
131 # Test alphabet and reverse-complement RNA
132 ok my $db = Bio::DB::Fasta->new( $test_file, -reindex => 1), 'Index a single file';
133 is $db->alphabet('gi|352962132|ref|NG_030353.1|'), 'dna';
134 is $db->alphabet('gi|352962148|ref|NM_001251825.1|'), 'rna';
135 is $db->alphabet('gi|194473622|ref|NP_001123975.1|'), 'protein';
136 is $db->alphabet('gi|61679760|pdb|1Y4P|B'), 'protein';
137 is $db->alphabet('123'), '';
138 is $db->seq('gi|352962148|ref|NM_001251825.1|', 20, 29, 1), 'GUCAGCGUCC';
139 is $db->seq('gi|352962148|ref|NM_001251825.1|', 20, 29, -1), 'GGACGCUGAC';
141 # Test empty sequence
142 is $db->seq('123'), '';
144 is $db->file('gi|352962132|ref|NG_030353.1|'), 'mixed_alphabet.fasta';
150 ok my $db = Bio::DB::Fasta->new( $test_file, -reindex => 1);
151 ok my $stream = $db->get_PrimarySeq_stream;
152 isa_ok $stream, 'Bio::DB::Indexed::Stream';
155 # note use of modified iterator, needed b/c of overloading
156 while (defined(my $seq = $stream->next_seq)) {
162 # retrieve seq with ID of 0
163 my $seq = $db->get_Seq_by_id(0);
164 isa_ok $seq, 'Bio::PrimarySeq::Fasta';
165 is $seq->display_id, 0;
167 # ActivePerl will not allow deletion if the tie-hash is still active
169 # Strawberry Perl temporary file
170 unlink "$test_file.index" if -e "$test_file.index";
171 # ActivePerl temporary files
172 unlink "$test_file.index.dir" if -e "$test_file.index.dir";
173 unlink "$test_file.index.pag" if -e "$test_file.index.pag";
178 # Concurrent databases (bug #3390)
179 ok my $db1 = Bio::DB::Fasta->new( test_input_file('dbfa', '1.fa') );
180 ok my $db3 = Bio::DB::Fasta->new( test_input_file('dbfa', '3.fa') );
181 ok my $db4 = Bio::DB::Fasta->new( $test_dir );
182 ok my $db2 = Bio::DB::Fasta->new( test_input_file('dbfa', '2.fa') );
183 is $db4->file('AW057231'), '1.fa';
184 is $db2->file('AW057302'), '2.fa';
185 is $db4->file('AW057119'), '1.fa';
186 is $db3->file('AW057336'), '3.fa';
187 is $db1->file('AW057231'), '1.fa';
188 is $db4->file('AW057410'), '3.fa';
190 # ActivePerl will not allow deletion if the tie-hash is still active
194 # Strawberry Perl temporary file
195 unlink $db1->index_name if -e $db1->index_name;
196 unlink $db2->index_name if -e $db2->index_name;
197 unlink $db3->index_name if -e $db3->index_name;
198 # ActivePerl temporary files
199 unlink $db1->index_name().'.dir' if -e $db1->index_name().'.dir';
200 unlink $db2->index_name().'.dir' if -e $db2->index_name().'.dir';
201 unlink $db3->index_name().'.dir' if -e $db3->index_name().'.dir';
202 unlink $db1->index_name().'.pag' if -e $db1->index_name().'.pag';
203 unlink $db2->index_name().'.pag' if -e $db2->index_name().'.pag';
204 unlink $db3->index_name().'.pag' if -e $db3->index_name().'.pag';
209 # Test an arbitrary index filename and cleaning
210 my $name = 'arbitrary.idx';
211 ok my $db = Bio::DB::Fasta->new( $test_file,
212 -reindex => 1, -index_name => $name, -clean => 1,
214 is $db->index_name, $name;
216 # Tied-hash in Strawberry Perl produce $name,
217 # while in ActivePerl produce "$name.dir" and "$name.pag"
218 if (-e "$name.pag") {
220 # ActivePerl will not allow deletion if the tie-hash is still active
222 unlink "$name.dir" if -e "$name.dir";
223 unlink "$name.pag" if -e "$name.pag";
228 # ActivePerl will not allow deletion if the tie-hash is still active
230 unlink $name if -e $name;
239 ok my $db = Bio::DB::Fasta->new( $test_file,
240 -reindex => 1, -clean => 1, -makeid => \&extract_gi,
242 is_deeply [sort $db->get_all_primary_ids], ['', 194473622, 352962132, 352962148, 61679760];
243 is $db->get_Seq_by_id('gi|352962148|ref|NM_001251825.1|'), undef;
244 isa_ok $db->get_Seq_by_id(194473622), 'Bio::PrimarySeqI';
249 # Test makeid that generates several IDs, bug #3389
250 ok my $db = Bio::DB::Fasta->new( $test_file,
251 -reindex => 1, -clean => 1, -makeid => \&extract_gi_and_ref,
252 ), 'Make multiple IDs, bug #3389';
253 is_deeply [sort $db->get_all_primary_ids], ['', 194473622, 352962132, 352962148, 61679760, 'NG_030353.1', 'NM_001251825.1', 'NP_001123975.1'];
254 is $db->get_Seq_by_id('gi|352962148|ref|NM_001251825.1|'), undef;
255 isa_ok $db->get_Seq_by_id('NG_030353.1'), 'Bio::PrimarySeqI';
260 # Test opening set of files and test IDs
261 ok my $db = Bio::DB::Fasta->new( $test_files, -reindex => 1), 'Index a set of files';
264 my @ids = sort $db->get_all_primary_ids();
265 is_deeply \@ids, [ qw(
306 gi|194473622|ref|NP_001123975.1|
307 gi|352962132|ref|NG_030353.1|
308 gi|352962148|ref|NM_001251825.1|
309 gi|61679760|pdb|1Y4P|B
311 like $db->index_name, qr/^fileset_.+\.index$/;
313 my $index = $db->index_name;
314 # ActivePerl will not allow deletion if the tie-hash is still active
316 # Strawberry Perl temporary file
317 unlink $index if -e $index;
318 # ActivePerl temporary files
319 unlink "$index.dir" if -e "$index.dir";
320 unlink "$index.pag" if -e "$index.pag";
325 # Squash warnings locally
326 local $SIG{__WARN__} = sub {};
329 my $test_dir = setup_temp_dir('bad_dbfa');
330 throws_ok {my $db = Bio::DB::Fasta->new($test_dir, -reindex => 1)}
331 qr/FASTA header doesn't match/;
334 # Empty lines within a sequence is bad...
335 throws_ok {my $db = Bio::DB::Fasta->new(test_input_file('badfasta.fa'), -reindex => 1)}
336 qr/Blank lines can only precede header lines/;
342 # but empty lines preceding headers are okay, but let's check the seqs just in case
344 lives_ok {$db = Bio::DB::Fasta->new(test_input_file('spaced_fasta.fa'), -reindex => 1)};
345 is length($db->seq('CEESC39F')), 375, 'length is correct in sequences past spaces';
346 is length($db->seq('CEESC13F')), 389;
348 is $db->subseq('CEESC39F', 51, 60) , 'acatatganc', 'subseq is correct';
349 is $db->subseq('CEESC13F', 146, 155), 'ggctctccct', 'subseq is correct';
351 # Remove temporary test file
352 my $outfile = test_input_file('spaced_fasta.fa').'.index';
354 # ActivePerl will not allow deletion if the tie-hash is still active
356 # Strawberry Perl temporary file
357 unlink $outfile if -e $outfile;
358 # ActivePerl temporary files
359 unlink "$outfile.dir" if -e "$outfile.dir";
360 unlink "$outfile.pag" if -e "$outfile.pag";
367 # Extract GI from RefSeq
369 my ($id) = ($header =~ /gi\|(\d+)/m);
374 sub extract_gi_and_ref {
375 # Extract GI and from RefSeq
377 my ($gi) = ($header =~ /gi\|(\d+)/m);
379 my ($ref) = ($header =~ /ref\|([^|]+)/m);
386 # this obfuscation is to deal with lockfiles by GDBM_File which can
387 # only be created on local filesystems apparently so will cause test
388 # to block and then fail when the testdir is on an NFS mounted system
390 my $data_dir = shift;
392 my $io = Bio::Root::IO->new();
393 my $tempdir = test_output_dir();
394 my $test_dir = $io->catfile($tempdir, $data_dir);
395 mkdir($test_dir); # make the directory
396 my $indir = test_input_file($data_dir);
397 opendir(my $INDIR,$indir) || die("cannot open dir $indir");
398 # effectively do a cp -r but only copy the files that are in there, no subdirs
399 for my $file ( map { $io->catfile($indir,$_) } readdir($INDIR) ) {
400 next unless (-f $file );
401 copy($file, $test_dir);