1 # -*-Perl-*- Test Harness script for Bioperl
9 test_begin( -tests => 149 );
11 use_ok('Bio::SeqIO::fastq');
12 use_ok('Bio::Seq::Quality');
15 my $DEBUG = test_debug();
17 # simple parsing, data conversion of fastq example files
21 'variant' => 'sanger',
22 'seq' => 'TTGGAATGTTGCAAATGGGAGGCAGTTTGAAATACTGAATAGGCCTCATC'.
23 'GAGAATGTGAAGTTTCAGTAAAGACTTGAGGAAGTTGAATGAGCTGATGA'.
25 'qual' => '31 23 32 23 31 22 27 28 32 24 25 23 30 25 2 21 33 '.
26 '29 9 17 33 27 27 27 25 33 29 9 28 32 27 7 27 21 '.
27 '26 21 27 27 17 26 23 31 23 32 24 27 27 28 27 28 '.
28 '28 27 27 31 23 23 28 27 27 32 23 27 35 30 12 28 '.
29 '27 27 25 33 29 10 27 28 28 33 25 27 27 31 23 34 '.
30 '27 27 32 24 27 30 22 24 28 24 27 28 27 26 28 27 '.
31 '28 32 24 28 33 25 23 27 27 28 27 28 26',
32 'display_id' => 'DS6BPQV01A2G0A',
37 'variant' => 'sanger',
38 'seq' => 'CCGCCATTTCTTCAAATCTTTTCTTTTCTTTAGGAGTCATCAATTTCCAT'.
39 'TTCTCTGCACATTTCTTTGAAAATTA',
40 'qual' => '31 34 34 34 34 34 34 34 34 33 34 34 34 34 34 34 '.
41 '34 34 34 34 34 32 32 34 34 34 34 34 34 34 34 34 '.
42 '30 34 34 34 34 34 34 34 34 34 34 34 34 34 32 32 '.
43 '34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 '.
44 '34 34 34 34 33 30 30 27 33 34 29 2',
45 'display_id' => 'Illumina_SRR125365.38',
46 'desc' => 's_5_1_0001_qseq_37 length=76',
50 'variant' => 'sanger',
51 'seq' => 'TATTGACAATTGTAAGACCACTAAGGATTTTTGGGCGGCAGCGACTTGGA'.
52 'GCTCTTGTAAAAGCGCACTGCGTTCCTTTTCTTTATTCTTTTGATCTTGA'.
53 'GAATCTTCTAAAAATGCCGAAAAGAAATGTTGGGAAGAGAGCGTAATCAG'.
54 'TTTAGAAATGCTCTTGATGGTAGCTTTATGTTGATCCATTCTTCTGCCTC'.
55 'CTTTACGAATAAAATAGAATAAAACTCAAATGACTAATTACCTGTATTTT'.
56 'ACCTAATTTTGTGATAAAATTCAAGAAAATATGTTCGCCTTCAATAATTA'.
58 'qual' => '37 37 37 37 37 37 37 37 37 37 37 40 38 40 40 37 '.
59 '37 37 39 39 40 39 39 39 39 39 37 33 33 33 33 33 '.
60 '39 39 34 29 28 28 38 39 39 39 39 39 39 37 37 37 '.
61 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
62 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 '.
63 '38 29 29 29 34 38 37 37 33 33 33 33 37 37 37 37 '.
64 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
65 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
66 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
67 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
68 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 '.
69 '38 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
70 '37 37 37 37 37 37 37 37 37 34 34 34 38 38 37 37 '.
71 '37 37 37 37 37 37 37 37 40 40 40 40 38 38 38 38 '.
72 '40 40 40 38 38 38 40 40 40 40 40 40 40 40 40 40 '.
73 '38 38 38 38 38 32 25 25 25 25 30 30 31 32 32 31 '.
74 '31 31 31 31 31 31 31 31 19 19 19 19 19 22 22 31 '.
75 '31 31 31 31 31 31 31 32 32 31 32 31 31 31 31 31 '.
76 '31 25 25 25 28 28 30 30 30 30 30 31 31 32',
77 'display_id' => 'SRR005406.250',
78 'desc' => 'FB9GE3J10F6I2T length=302',
82 'variant' => 'solexa',
83 'seq' => 'GTATTATTTAATGGCATACACTCAA',
84 'qual' => '25 25 25 25 25 25 25 25 25 25 23 25 25 25 25 23 '.
85 '25 23 23 21 23 23 23 17 17',
86 'display_id' => 'SLXA-B3_649_FC8437_R1_1_1_183_714',
91 'variant' => 'illumina',
92 'seq' => 'CCAAATCTTGAATTGTAGCTCCCCT',
93 'qual' => '15 19 24 15 17 24 24 24 24 24 19 24 24 21 24 24 '.
94 '20 24 24 24 24 20 18 13 19',
95 'display_id' => 'FC12044_91407_8_200_285_136',
100 'variant' => 'sanger', # TODO: guessing on the format here...
101 'seq' => 'GTTGCTTCTGGCGTGGGTGGGGGGG',
102 'qual' => '26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 '.
103 '13 22 26 18 24 18 18 18 18',
104 'display_id' => 'EAS54_6_R1_2_1_443_348',
109 'variant' => 'illumina',
110 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
111 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 '.
112 '21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
113 'display_id' => 'Test',
114 'desc' => 'PHRED qualities from 40 to 0 inclusive',
118 'variant' => 'sanger',
119 'seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAC'.
120 'TGACTGACTGACTGACTGACTGACTGACTGACTGAN',
121 'qual' => '93 92 91 90 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 '.
122 '74 73 72 71 70 69 68 67 66 65 64 63 62 61 60 59 58 57 56 '.
123 '55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 '.
124 '36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 '.
125 '17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
126 'display_id' => 'Test',
127 'desc' => 'PHRED qualities from 93 to 0 inclusive',
131 'variant' => 'sanger',
132 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
133 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 '.
134 '21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
135 'display_id' => 'Test',
136 'desc' => 'PHRED qualities from 40 to 0 inclusive',
140 'variant' => 'solexa',
141 'seq' => 'GTATTATTTAATGGCATACACTCAA',
142 'qual' => '25 25 25 25 25 25 25 25 25 25 23 25 25 25 25 23 '.
143 '25 23 23 21 23 23 23 17 17',
144 'display_id' => 'SLXA-B3_649_FC8437_R1_1_1_183_714',
149 'variant' => 'solexa',
150 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
151 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 '.
152 '24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 '.
153 '8 7 6 5 5 4 4 3 3 2 2 1 1',
154 'display_id' => 'slxa_0001_1_0001_01',
159 'variant' => 'sanger', # TODO: guessing on the format here...
160 'seq' => 'TGGGAGGTTTTATGTGGAAAGCAGCAATGTACAAGA',
161 'qual' => '40 40 40 40 40 40 40 13 40 40 40 40 40 40 16 31 '.
162 '19 19 31 12 22 13 4 27 5 10 14 3 14 4 19 7 10 10 '.
164 'display_id' => '071113_EAS56_0053:1:3:990:501',
169 'variant' => 'sanger', # TODO: guessing on the format here...
170 'seq' => 'AACCCGTCCCATCAAAGATTTTGGTTGGAACCCGAAAGGGTTTTGAATTC'.
171 'AAACCCCTTTCGGTTCCAACTATTCAATTGTTTAACTTTTTTTAAATTGA'.
172 'TGGTCTGTTGGACCATTTGTAATAATCCCCATCGGAATTTCTTT',
173 'qual' => '32 26 31 26 4 22 20 30 25 2 27 27 24 36 32 16 '.
174 '26 28 36 32 18 4 33 26 33 26 32 26 33 26 31 26 '.
175 '4 24 36 32 16 36 32 16 36 32 18 4 27 33 26 32 26 '.
176 '23 36 32 15 35 31 18 3 36 32 16 28 33 26 32 26 33 '.
177 '26 33 26 25 28 25 33 26 25 33 25 32 24 25 36 32 '.
178 '15 32 24 27 37 32 23 16 10 5 1 35 30 12 33 26 19 '.
179 '27 25 25 14 27 26 28 25 32 24 23 12 20 30 21 28 '.
180 '34 29 10 23 27 27 18 26 28 19 25 35 32 18 4 27 26 '.
181 '28 23 12 24 13 32 28 8 25 33 28 9',
182 'display_id' => 'SRR014849.203935',
183 'desc' => 'EIXKN4201B4HU6 length=144',
188 for my $example (sort keys %example_files) {
189 my $file = test_input_file('fastq', "$example.fastq");
190 my $variant = $example_files{$example}->{variant};
191 my $in = Bio::SeqIO->new(-format => "fastq-$variant",
193 -verbose => 2); #strictest level
197 while (my $seq = $in->next_seq) {
199 $sample_seq = $seq; # always grab the last seq
202 ok(!$@, "$example parses");
203 is($ct, $example_files{$example}->{count}, "correct num. seqs in $example");
204 ok(defined($sample_seq), 'sample sequence obtained');
206 isa_ok($sample_seq, 'Bio::Seq::Quality');
207 for my $method (qw(seq desc display_id)) {
208 is($sample_seq->$method,
209 $example_files{$example}->{$method},
210 "$method() matches $example");
212 is(join(' ', map {sprintf("%.0f", $_)} @{$sample_seq->qual}),
213 $example_files{$example}->{qual},
214 "qual() matches $example");
215 my $truncated = $sample_seq->trunc(1,10);
216 is(scalar(@{$truncated->meta}), $truncated->length);
220 # test round-trip and conversions (single file of each type)
222 my @variants = qw(sanger illumina solexa);
224 my %conversion = ( # check conversions, particularly solexa
226 'variant' => 'sanger',
228 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
229 '-qual' => [ (map {62} 0..31),(reverse(1..61)),1 ],
230 '-raw_quality' => '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;',
232 '-desc' => 'PHRED qualities from 93 to 0 inclusive',
233 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
236 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
237 '-qual' => [ (map {62} 0..31),(reverse(0..61)) ],
238 '-raw_quality' => '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@',
240 '-desc' => 'PHRED qualities from 93 to 0 inclusive',
241 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
244 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
245 '-qual' => [reverse(0..93)],
246 '-raw_quality' => '~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)(\'&%$#"!',
248 '-desc' => 'PHRED qualities from 93 to 0 inclusive',
249 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
253 'variant' => 'solexa',
254 'to_solexa' => {'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
255 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
256 '-raw_quality' => 'hgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;',
257 '-id' => 'slxa_0001_1_0001_01',
259 '-descriptor' => 'slxa_0001_1_0001_01'
262 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
263 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
264 '-namespace' => 'solexa',
265 '-raw_quality' => 'hgfedcba`_^]\\[ZYXWVUTSRQPONMLKJJIHGFEEDDCCBBAA',
266 '-id' => 'slxa_0001_1_0001_01',
268 '-descriptor' => 'slxa_0001_1_0001_01'
271 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
272 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
273 '-namespace' => 'solexa',
274 '-raw_quality' => 'IHGFEDCBA@?>=<;:9876543210/.-,++*)(\'&&%%$$##""',
275 '-id' => 'slxa_0001_1_0001_01',
277 '-descriptor' => 'slxa_0001_1_0001_01'
281 'variant' => 'illumina',
283 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
284 '-qual' => [reverse(1..40), 1], # round trip from solexa is lossy
285 '-namespace' => 'illumina',
286 '-raw_quality' => 'hgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;',
288 '-desc' => 'PHRED qualities from 40 to 0 inclusive',
289 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
292 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
293 '-qual' => [reverse(0..40)],
294 '-raw_quality' => 'hgfedcba`_^]\\[ZYXWVUTSRQPONMLKJIHGFEDCBA@',
296 '-desc' => 'PHRED qualities from 40 to 0 inclusive',
297 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
300 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
301 '-qual' => [reverse(0..40)],
302 '-raw_quality' => 'IHGFEDCBA@?>=<;:9876543210/.-,+*)(\'&%$#"!',
304 '-desc' => 'PHRED qualities from 40 to 0 inclusive',
305 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
310 for my $example (sort keys %conversion) {
311 my $file = test_input_file('fastq', "$example.fastq");
312 my $variant = $conversion{$example}->{variant};
313 my $in = Bio::SeqIO->new(-format => "fastq-$variant",
315 -verbose => 2); #strictest level
316 # this both tests the next_dataset method and helps check roundtripping
317 my $seq = $in->next_seq;
318 for my $newvar (@variants) {
319 next unless exists $conversion{$example}->{"to_$newvar"};
320 my $outfile = test_output_file();
321 Bio::SeqIO->new(-format => "fastq-$newvar",
322 -file => ">$outfile",
323 -verbose => -1)->write_seq($seq);
324 my $newdata = Bio::SeqIO->new(-format => "fastq-$newvar",
325 -file => $outfile)->next_dataset;
326 # round for simple comparison, get around floating pt comparison probs
328 if ($newvar eq 'solexa') {
329 $newdata->{-qual} = [map {sprintf("%.0f",$_)} @{$newdata->{-qual}}];
332 #print Dumper($newdata) if $variant eq 'sanger' && $newvar eq 'illumina';
334 $conversion{$example}->{"to_$newvar"}->{'-namespace'} = $newvar;
335 is_deeply($newdata, $conversion{$example}->{"to_$newvar"}, "Conversion from $variant to $newvar");
339 # test fastq exception handling
345 exception => qr/doesn't\smatch\sseq\sdescriptor/xms,
349 exception => qr/doesn't\smatch\slength\sof\ssequence/xms,
353 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
357 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
359 error_qual_escape => {
361 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
365 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
367 error_qual_space => {
369 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
373 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
375 error_qual_unit_sep => {
377 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
381 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
383 error_short_qual => {
385 exception => qr/doesn't\smatch\slength\sof\ssequence/,
389 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
393 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
395 error_trunc_at_plus => {
397 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
399 error_trunc_at_qual => {
401 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
403 error_trunc_at_seq => {
405 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
407 error_trunc_in_title => {
409 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
411 error_trunc_in_seq => {
413 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
415 error_trunc_in_plus => {
417 exception => qr/doesn't\smatch\sseq\s descriptor/xms,
419 error_trunc_in_qual => {
421 exception => qr/doesn't\smatch\slength\sof\ssequence/xms,
425 for my $example (sort keys %error) {
426 my $file = test_input_file('fastq', "$example.fastq");
427 my $variant = $error{$example}->{variant};
428 my $in = Bio::SeqIO->new(-format => "fastq-$variant",
430 -verbose => 2); #strictest level
432 throws_ok { while (my $seq = $in->next_seq) {
434 } } $error{$example}->{exception}, "Exception caught for $example";
439 my $in = Bio::SeqIO->new(-format => 'fastq',
440 -file => test_input_file('fastq', 'zero_qual.fastq'),
441 -verbose => 2); # strictest level
443 lives_and {my $seq = $in->next_seq;
444 is($seq->seq, 'G');} 'edge case; single 0 in quality fails';