1 # -*-Perl-*- Test Harness script for Bioperl
10 test_begin( -tests => 149 );
12 use_ok('Bio::SeqIO::fastq');
13 use_ok('Bio::Seq::Quality');
16 my $DEBUG = test_debug();
18 # simple parsing, data conversion of fastq example files
22 'variant' => 'sanger',
23 'seq' => 'TTGGAATGTTGCAAATGGGAGGCAGTTTGAAATACTGAATAGGCCTCATC'.
24 'GAGAATGTGAAGTTTCAGTAAAGACTTGAGGAAGTTGAATGAGCTGATGA'.
26 'qual' => '31 23 32 23 31 22 27 28 32 24 25 23 30 25 2 21 33 '.
27 '29 9 17 33 27 27 27 25 33 29 9 28 32 27 7 27 21 '.
28 '26 21 27 27 17 26 23 31 23 32 24 27 27 28 27 28 '.
29 '28 27 27 31 23 23 28 27 27 32 23 27 35 30 12 28 '.
30 '27 27 25 33 29 10 27 28 28 33 25 27 27 31 23 34 '.
31 '27 27 32 24 27 30 22 24 28 24 27 28 27 26 28 27 '.
32 '28 32 24 28 33 25 23 27 27 28 27 28 26',
33 'display_id' => 'DS6BPQV01A2G0A',
38 'variant' => 'sanger',
39 'seq' => 'CCGCCATTTCTTCAAATCTTTTCTTTTCTTTAGGAGTCATCAATTTCCAT'.
40 'TTCTCTGCACATTTCTTTGAAAATTA',
41 'qual' => '31 34 34 34 34 34 34 34 34 33 34 34 34 34 34 34 '.
42 '34 34 34 34 34 32 32 34 34 34 34 34 34 34 34 34 '.
43 '30 34 34 34 34 34 34 34 34 34 34 34 34 34 32 32 '.
44 '34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 '.
45 '34 34 34 34 33 30 30 27 33 34 29 2',
46 'display_id' => 'Illumina_SRR125365.38',
47 'desc' => 's_5_1_0001_qseq_37 length=76',
51 'variant' => 'sanger',
52 'seq' => 'TATTGACAATTGTAAGACCACTAAGGATTTTTGGGCGGCAGCGACTTGGA'.
53 'GCTCTTGTAAAAGCGCACTGCGTTCCTTTTCTTTATTCTTTTGATCTTGA'.
54 'GAATCTTCTAAAAATGCCGAAAAGAAATGTTGGGAAGAGAGCGTAATCAG'.
55 'TTTAGAAATGCTCTTGATGGTAGCTTTATGTTGATCCATTCTTCTGCCTC'.
56 'CTTTACGAATAAAATAGAATAAAACTCAAATGACTAATTACCTGTATTTT'.
57 'ACCTAATTTTGTGATAAAATTCAAGAAAATATGTTCGCCTTCAATAATTA'.
59 'qual' => '37 37 37 37 37 37 37 37 37 37 37 40 38 40 40 37 '.
60 '37 37 39 39 40 39 39 39 39 39 37 33 33 33 33 33 '.
61 '39 39 34 29 28 28 38 39 39 39 39 39 39 37 37 37 '.
62 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
63 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 '.
64 '38 29 29 29 34 38 37 37 33 33 33 33 37 37 37 37 '.
65 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
66 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
67 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
68 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
69 '37 37 37 37 37 37 37 37 37 37 37 37 37 37 38 38 '.
70 '38 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37 '.
71 '37 37 37 37 37 37 37 37 37 34 34 34 38 38 37 37 '.
72 '37 37 37 37 37 37 37 37 40 40 40 40 38 38 38 38 '.
73 '40 40 40 38 38 38 40 40 40 40 40 40 40 40 40 40 '.
74 '38 38 38 38 38 32 25 25 25 25 30 30 31 32 32 31 '.
75 '31 31 31 31 31 31 31 31 19 19 19 19 19 22 22 31 '.
76 '31 31 31 31 31 31 31 32 32 31 32 31 31 31 31 31 '.
77 '31 25 25 25 28 28 30 30 30 30 30 31 31 32',
78 'display_id' => 'SRR005406.250',
79 'desc' => 'FB9GE3J10F6I2T length=302',
83 'variant' => 'solexa',
84 'seq' => 'GTATTATTTAATGGCATACACTCAA',
85 'qual' => '25 25 25 25 25 25 25 25 25 25 23 25 25 25 25 23 '.
86 '25 23 23 21 23 23 23 17 17',
87 'display_id' => 'SLXA-B3_649_FC8437_R1_1_1_183_714',
92 'variant' => 'illumina',
93 'seq' => 'CCAAATCTTGAATTGTAGCTCCCCT',
94 'qual' => '15 19 24 15 17 24 24 24 24 24 19 24 24 21 24 24 '.
95 '20 24 24 24 24 20 18 13 19',
96 'display_id' => 'FC12044_91407_8_200_285_136',
101 'variant' => 'sanger', # TODO: guessing on the format here...
102 'seq' => 'GTTGCTTCTGGCGTGGGTGGGGGGG',
103 'qual' => '26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 '.
104 '13 22 26 18 24 18 18 18 18',
105 'display_id' => 'EAS54_6_R1_2_1_443_348',
110 'variant' => 'illumina',
111 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
112 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 '.
113 '21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
114 'display_id' => 'Test',
115 'desc' => 'PHRED qualities from 40 to 0 inclusive',
119 'variant' => 'sanger',
120 'seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAC'.
121 'TGACTGACTGACTGACTGACTGACTGACTGACTGAN',
122 'qual' => '93 92 91 90 89 88 87 86 85 84 83 82 81 80 79 78 77 76 75 '.
123 '74 73 72 71 70 69 68 67 66 65 64 63 62 61 60 59 58 57 56 '.
124 '55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 '.
125 '36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 '.
126 '17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
127 'display_id' => 'Test',
128 'desc' => 'PHRED qualities from 93 to 0 inclusive',
132 'variant' => 'sanger',
133 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
134 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 '.
135 '21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0',
136 'display_id' => 'Test',
137 'desc' => 'PHRED qualities from 40 to 0 inclusive',
141 'variant' => 'solexa',
142 'seq' => 'GTATTATTTAATGGCATACACTCAA',
143 'qual' => '25 25 25 25 25 25 25 25 25 25 23 25 25 25 25 23 '.
144 '25 23 23 21 23 23 23 17 17',
145 'display_id' => 'SLXA-B3_649_FC8437_R1_1_1_183_714',
150 'variant' => 'solexa',
151 'seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
152 'qual' => '40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 '.
153 '24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 '.
154 '8 7 6 5 5 4 4 3 3 2 2 1 1',
155 'display_id' => 'slxa_0001_1_0001_01',
160 'variant' => 'sanger', # TODO: guessing on the format here...
161 'seq' => 'TGGGAGGTTTTATGTGGAAAGCAGCAATGTACAAGA',
162 'qual' => '40 40 40 40 40 40 40 13 40 40 40 40 40 40 16 31 '.
163 '19 19 31 12 22 13 4 27 5 10 14 3 14 4 19 7 10 10 '.
165 'display_id' => '071113_EAS56_0053:1:3:990:501',
170 'variant' => 'sanger', # TODO: guessing on the format here...
171 'seq' => 'AACCCGTCCCATCAAAGATTTTGGTTGGAACCCGAAAGGGTTTTGAATTC'.
172 'AAACCCCTTTCGGTTCCAACTATTCAATTGTTTAACTTTTTTTAAATTGA'.
173 'TGGTCTGTTGGACCATTTGTAATAATCCCCATCGGAATTTCTTT',
174 'qual' => '32 26 31 26 4 22 20 30 25 2 27 27 24 36 32 16 '.
175 '26 28 36 32 18 4 33 26 33 26 32 26 33 26 31 26 '.
176 '4 24 36 32 16 36 32 16 36 32 18 4 27 33 26 32 26 '.
177 '23 36 32 15 35 31 18 3 36 32 16 28 33 26 32 26 33 '.
178 '26 33 26 25 28 25 33 26 25 33 25 32 24 25 36 32 '.
179 '15 32 24 27 37 32 23 16 10 5 1 35 30 12 33 26 19 '.
180 '27 25 25 14 27 26 28 25 32 24 23 12 20 30 21 28 '.
181 '34 29 10 23 27 27 18 26 28 19 25 35 32 18 4 27 26 '.
182 '28 23 12 24 13 32 28 8 25 33 28 9',
183 'display_id' => 'SRR014849.203935',
184 'desc' => 'EIXKN4201B4HU6 length=144',
189 for my $example (sort keys %example_files) {
190 my $file = test_input_file('fastq', "$example.fastq");
191 my $variant = $example_files{$example}->{variant};
192 my $in = Bio::SeqIO->new(-format => "fastq-$variant",
194 -verbose => 2); #strictest level
198 while (my $seq = $in->next_seq) {
200 $sample_seq = $seq; # always grab the last seq
203 ok(!$@, "$example parses");
204 is($ct, $example_files{$example}->{count}, "correct num. seqs in $example");
205 ok(defined($sample_seq), 'sample sequence obtained');
207 isa_ok($sample_seq, 'Bio::Seq::Quality');
208 for my $method (qw(seq desc display_id)) {
209 is($sample_seq->$method,
210 $example_files{$example}->{$method},
211 "$method() matches $example");
213 is(join(' ', map {sprintf("%.0f", $_)} @{$sample_seq->qual}),
214 $example_files{$example}->{qual},
215 "qual() matches $example");
216 my $truncated = $sample_seq->trunc(1,10);
217 is(scalar(@{$truncated->meta}), $truncated->length);
221 # test round-trip and conversions (single file of each type)
223 my @variants = qw(sanger illumina solexa);
225 my %conversion = ( # check conversions, particularly solexa
227 'variant' => 'sanger',
229 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
230 '-qual' => [ (map {62} 0..31),(reverse(1..61)),1 ],
231 '-raw_quality' => '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;',
233 '-desc' => 'PHRED qualities from 93 to 0 inclusive',
234 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
237 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
238 '-qual' => [ (map {62} 0..31),(reverse(0..61)) ],
239 '-raw_quality' => '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@',
241 '-desc' => 'PHRED qualities from 93 to 0 inclusive',
242 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
245 '-seq' => 'ACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGACTGAN',
246 '-qual' => [reverse(0..93)],
247 '-raw_quality' => '~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)(\'&%$#"!',
249 '-desc' => 'PHRED qualities from 93 to 0 inclusive',
250 '-descriptor' => 'Test PHRED qualities from 93 to 0 inclusive'
254 'variant' => 'solexa',
255 'to_solexa' => {'-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
256 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
257 '-raw_quality' => 'hgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;',
258 '-id' => 'slxa_0001_1_0001_01',
260 '-descriptor' => 'slxa_0001_1_0001_01'
263 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
264 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
265 '-namespace' => 'solexa',
266 '-raw_quality' => 'hgfedcba`_^]\\[ZYXWVUTSRQPONMLKJJIHGFEEDDCCBBAA',
267 '-id' => 'slxa_0001_1_0001_01',
269 '-descriptor' => 'slxa_0001_1_0001_01'
272 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN',
273 '-qual' => [qw(40 39 38 37 36 35 34 33 32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 10 9 8 7 6 5 5 4 4 3 3 2 2 1 1)],
274 '-namespace' => 'solexa',
275 '-raw_quality' => 'IHGFEDCBA@?>=<;:9876543210/.-,++*)(\'&&%%$$##""',
276 '-id' => 'slxa_0001_1_0001_01',
278 '-descriptor' => 'slxa_0001_1_0001_01'
282 'variant' => 'illumina',
284 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
285 '-qual' => [reverse(1..40), 1], # round trip from solexa is lossy
286 '-namespace' => 'illumina',
287 '-raw_quality' => 'hgfedcba`_^]\[ZYXWVUTSRQPONMLKJHGFECB@>;;',
289 '-desc' => 'PHRED qualities from 40 to 0 inclusive',
290 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
293 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
294 '-qual' => [reverse(0..40)],
295 '-raw_quality' => 'hgfedcba`_^]\\[ZYXWVUTSRQPONMLKJIHGFEDCBA@',
297 '-desc' => 'PHRED qualities from 40 to 0 inclusive',
298 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
301 '-seq' => 'ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTN',
302 '-qual' => [reverse(0..40)],
303 '-raw_quality' => 'IHGFEDCBA@?>=<;:9876543210/.-,+*)(\'&%$#"!',
305 '-desc' => 'PHRED qualities from 40 to 0 inclusive',
306 '-descriptor' => 'Test PHRED qualities from 40 to 0 inclusive'
311 for my $example (sort keys %conversion) {
312 my $file = test_input_file('fastq', "$example.fastq");
313 my $variant = $conversion{$example}->{variant};
314 my $in = Bio::SeqIO->new(-format => "fastq-$variant",
316 -verbose => 2); #strictest level
317 # this both tests the next_dataset method and helps check roundtripping
318 my $seq = $in->next_seq;
319 for my $newvar (@variants) {
320 next unless exists $conversion{$example}->{"to_$newvar"};
321 my $outfile = test_output_file();
322 Bio::SeqIO->new(-format => "fastq-$newvar",
323 -file => ">$outfile",
324 -verbose => -1)->write_seq($seq);
325 my $newdata = Bio::SeqIO->new(-format => "fastq-$newvar",
326 -file => $outfile)->next_dataset;
327 # round for simple comparison, get around floating pt comparison probs
329 if ($newvar eq 'solexa') {
330 $newdata->{-qual} = [map {sprintf("%.0f",$_)} @{$newdata->{-qual}}];
333 #print Dumper($newdata) if $variant eq 'sanger' && $newvar eq 'illumina';
335 $conversion{$example}->{"to_$newvar"}->{'-namespace'} = $newvar;
336 is_deeply($newdata, $conversion{$example}->{"to_$newvar"}, "Conversion from $variant to $newvar");
340 # test fastq exception handling
346 exception => qr/doesn't\smatch\sseq\sdescriptor/xms,
350 exception => qr/doesn't\smatch\slength\sof\ssequence/xms,
354 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
358 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
360 error_qual_escape => {
362 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
366 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
368 error_qual_space => {
370 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
374 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
376 error_qual_unit_sep => {
378 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
382 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
384 error_short_qual => {
386 exception => qr/doesn't\smatch\slength\sof\ssequence/,
390 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
394 exception => qr/Unknown\ssymbol\swith\sASCII\svalue/xms,
396 error_trunc_at_plus => {
398 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
400 error_trunc_at_qual => {
402 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
404 error_trunc_at_seq => {
406 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
408 error_trunc_in_title => {
410 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
412 error_trunc_in_seq => {
414 exception => qr/Missing\ssequence\sand\/or\squality\sdata/xms,
416 error_trunc_in_plus => {
418 exception => qr/doesn't\smatch\sseq\s descriptor/xms,
420 error_trunc_in_qual => {
422 exception => qr/doesn't\smatch\slength\sof\ssequence/xms,
426 for my $example (sort keys %error) {
427 my $file = test_input_file('fastq', "$example.fastq");
428 my $variant = $error{$example}->{variant};
429 my $in = Bio::SeqIO->new(-format => "fastq-$variant",
431 -verbose => 2); #strictest level
433 throws_ok { while (my $seq = $in->next_seq) {
435 } } $error{$example}->{exception}, "Exception caught for $example";
440 my $in = Bio::SeqIO->new(-format => 'fastq',
441 -file => test_input_file('fastq', 'zero_qual.fastq'),
442 -verbose => 2); # strictest level
444 lives_and {my $seq = $in->next_seq;
445 is($seq->seq, 'G');} 'edge case; single 0 in quality fails';