8 from subprocess import call
10 def find_accuracy(fn_truth, fn_assemby, name, method = "panda", printnice = True):
14 ftruth = open(fn_truth)
16 line = ftruth.readline()
18 ss = line.strip().split()
19 seq_len_map[ss[0]] = int(ss[1])
21 line = ftruth.readline()
24 fin = open(fn_assemby)
27 seqname = line.strip()
29 seqname = seqname[1:].split("_")[0]
30 seq = fin.readline().strip()
32 true_seq_len = int(seq_len_map[seqname])
33 numassebled = numassebled + 1
34 if true_seq_len == as_seq_len:
35 numcorrect = numcorrect + 1
38 seqname = seqname[1:].split(":")[0]
40 elif method == "flash":
42 elif method == "cope":
43 #@cp0 EU861894-140/1_EU861894-140/2 28 172
44 seqname = seqname[1:].split()[1].split("/")[0]
46 seqname = seqname[1:].split("/")[0]
47 seq = fin.readline().strip()
49 qual = fin.readline().strip()
51 true_seq_len = int(seq_len_map[seqname])
52 numassebled = numassebled + 1
53 if true_seq_len == as_seq_len:
54 numcorrect = numcorrect + 1
64 print("Assebly method:" + method)
65 print("Total seqs:" + repr(numseqs))
66 print("Assembled seqs:" + repr(numassebled))
67 print("Corrected seqs:" + repr(numcorrect))
68 print("Cor / Total:" + repr(float(numcorrect)/float(numseqs)))
69 print("Cor / Assemled:" + repr(float(numcorrect)/float(numassebled)))
72 numcorrect = numseqs - numassebled
73 print(name + " " +repr(numseqs)+" "+repr(numassebled)+" "+repr(numcorrect)+" "+repr(float(numcorrect)/float(numseqs))+" " + repr(float(numcorrect)/float(numseqs)) + " " + repr(1- float(numcorrect)/float(numseqs)))
75 print(name + " " +repr(numseqs)+" "+repr(numassebled)+" "+repr(numcorrect)+" "+repr(float(numcorrect)/float(numseqs))+" " + repr(float(numcorrect)/float(numassebled)) + " " + repr(1- float(numcorrect)/float(numassebled)))
79 def test_pear(forward, reverse, output, pvalue, minoverlap, maxlen, minlen, mintrimlen, minquality, maxuncalled, scoremethod, empirical_freqs, truelenfile, testname, testw):
80 if empirical_freqs == "yes":
81 #print "../src/pear" + " " + "-f" + " " + forward + " " + "-r" + " " + reverse + " " + "-o" + " " + output + " " + "-p" + " " + pvalue + " " + "-v" + " " + minoverlap + " " + "-m" + " " + maxlen + " " + "-n" + " " + minlen + " " + "-t" + " " + mintrimlen + " " + "-q" + " " + minquality + " " + "-u" + " " + maxuncalled + " " + "-s" + " " + scoremethod + " " + "-j" + " " + "2" + " " + "-g" + " " + testw + " " + "-y" + " " + "200000000"
82 call(["../src/pear","-f",forward,"-r",reverse,"-o", output,"-p", pvalue, "-v", minoverlap, "-m", maxlen, "-n", minlen, "-t", mintrimlen, "-q", minquality, "-u", maxuncalled, "-s", scoremethod, "-j", "2", "-g", testw, "-y", "200000000" ], stdout=open(os.devnull, "w"), stderr=subprocess.STDOUT)
84 #print "../src/pear" + "-f" + forward + "-r" + reverse + "-o" + output + "-p" + pvalue + "-v" + minoverlap + "-m" + maxlen + "-n" + minlen + "-t" + mintrimlen + "-q", minquality + "-u", maxuncalled + "-s" + scoremethod + "-j" + "2" + "-g" + testw + "-y" + "200000000 -e"
85 call(["../src/pear","-f",forward,"-r",reverse,"-o", output,"-p", pvalue, "-v", minoverlap, "-m", maxlen, "-n", minlen, "-t", mintrimlen, "-q", minquality, "-u", maxuncalled, "-s", scoremethod, "-j", "2", "-g", testw, "-y", "200000000", "-e"], stdout=open(os.devnull, "w"), stderr=subprocess.STDOUT)
86 return find_accuracy(fn_truth = truelenfile, fn_assemby = output+".assembled.fastq", name = testname, method = "pear", printnice = False)
88 def publication_data():
89 print("Testname" + " " +"Reads"+" "+"Assembled"+" "+"Correct"+" "+"Correct/Reads"+" " + "Correct/Assembled" + " fp_rate")
90 num101bpt_nt = test_pear(forward = "16S_101bp1.fq.through.fastq",
91 reverse = "16S_101bp2.fq.through.fastq",
101 empirical_freqs = "yes",
102 truelenfile = "16S_101bp_len.txt",
103 testname = "101bpt_nt",
105 num101bpt_t01 = test_pear(forward = "16S_101bp1.fq.through.fastq",
106 reverse = "16S_101bp2.fq.through.fastq",
116 empirical_freqs = "yes",
117 truelenfile = "16S_101bp_len.txt",
118 testname = "101bpt_t01",
120 num101bpt_t01 = test_pear(forward = "16S_101bp1.fq.through.fastq",
121 reverse = "16S_101bp2.fq.through.fastq",
131 empirical_freqs = "yes",
132 truelenfile = "16S_101bp_len.txt",
133 testname = "101bpt_t2_01",
135 num101bp_nt = test_pear(forward = "16S_101bp1.fq",
136 reverse = "16S_101bp2.fq",
146 empirical_freqs = "yes",
147 truelenfile = "16S_101bp_len.txt",
148 testname = "101bp_nt",
150 num101bp_t01 = test_pear(forward = "16S_101bp1.fq",
151 reverse = "16S_101bp2.fq",
161 empirical_freqs = "yes",
162 truelenfile = "16S_101bp_len.txt",
163 testname = "101bp_t01",
165 num101bp_t01 = test_pear(forward = "16S_101bp1.fq",
166 reverse = "16S_101bp2.fq",
176 empirical_freqs = "yes",
177 truelenfile = "16S_101bp_len.txt",
178 testname = "101bp_t2_01",
180 num101bp_nt = test_pear(forward = "16S_150bp1.fq",
181 reverse = "16S_150bp2.fq",
191 empirical_freqs = "yes",
192 truelenfile = "16S_150bp_len.txt",
193 testname = "150bp_nt",
195 num101bp_t01 = test_pear(forward = "16S_150bp1.fq",
196 reverse = "16S_150bp2.fq",
206 empirical_freqs = "yes",
207 truelenfile = "16S_150bp_len.txt",
208 testname = "150bp_t01",
210 num101bp_t01 = test_pear(forward = "16S_150bp1.fq",
211 reverse = "16S_150bp2.fq",
221 empirical_freqs = "yes",
222 truelenfile = "16S_150bp_len.txt",
223 testname = "150bp_t2_01",
225 num101bp_nt = test_pear(forward = "16S_165bp1.fq",
226 reverse = "16S_165bp2.fq",
236 empirical_freqs = "yes",
237 truelenfile = "16S_165bp_len.txt",
238 testname = "165bp_nt",
240 num101bp_t01 = test_pear(forward = "16S_165bp1.fq",
241 reverse = "16S_165bp2.fq",
251 empirical_freqs = "yes",
252 truelenfile = "16S_165bp_len.txt",
253 testname = "165bp_t01",
255 num101bp_t01 = test_pear(forward = "16S_165bp1.fq",
256 reverse = "16S_165bp2.fq",
266 empirical_freqs = "yes",
267 truelenfile = "16S_165bp_len.txt",
268 testname = "165bp_t2_01",
270 num101bp_nt = test_pear(forward = "16S_180bp1.fq",
271 reverse = "16S_180bp2.fq",
281 empirical_freqs = "yes",
282 truelenfile = "16S_180bp_len.txt",
283 testname = "180bp_nt",
285 num101bp_t01 = test_pear(forward = "16S_180bp1.fq",
286 reverse = "16S_180bp2.fq",
296 empirical_freqs = "yes",
297 truelenfile = "16S_180bp_len.txt",
298 testname = "180bp_t01",
300 num101bp_t01 = test_pear(forward = "16S_180bp1.fq",
301 reverse = "16S_180bp2.fq",
311 empirical_freqs = "yes",
312 truelenfile = "16S_180bp_len.txt",
313 testname = "180bp_t2_01",
315 num101bp_nt = test_pear(forward = "16S_190bp1.fq",
316 reverse = "16S_190bp2.fq",
326 empirical_freqs = "yes",
327 truelenfile = "16S_190bp_len.txt",
328 testname = "190bp_nt",
330 num101bp_t01 = test_pear(forward = "16S_190bp1.fq",
331 reverse = "16S_190bp2.fq",
341 empirical_freqs = "yes",
342 truelenfile = "16S_190bp_len.txt",
343 testname = "190bp_t01",
345 num101bp_t01 = test_pear(forward = "16S_190bp1.fq",
346 reverse = "16S_190bp2.fq",
356 empirical_freqs = "yes",
357 truelenfile = "16S_190bp_len.txt",
358 testname = "190bp_t2_01",
360 num101bp_nt = test_pear(forward = "16S_250bp1.fq",
361 reverse = "16S_250bp2.fq",
371 empirical_freqs = "yes",
372 truelenfile = "16S_250bp_len.txt",
373 testname = "250bp_nt",
375 num101bp_t01 = test_pear(forward = "16S_250bp1.fq",
376 reverse = "16S_250bp2.fq",
386 empirical_freqs = "yes",
387 truelenfile = "16S_250bp_len.txt",
388 testname = "250bp_t01",
390 num101bp_t01 = test_pear(forward = "16S_250bp1.fq",
391 reverse = "16S_250bp2.fq",
401 empirical_freqs = "yes",
402 truelenfile = "16S_250bp_len.txt",
403 testname = "250bp_t2_01",
405 os.remove("to1.assembled.fastq")
406 os.remove("to1.discarded.fastq")
407 os.remove("to1.unassembled.forward.fastq")
408 os.remove("to1.unassembled.reverse.fastq")
411 find_accuracy(fn_truth = "16S_101bp_len.txt", fn_assemby = "", name= "101bpt", method = "cope", printnice = True)
414 #101bp through, should have corrected assembled sequences =
415 print("Testname" + " " +"Reads"+" "+"Assembled"+" "+"Correct"+" "+"Correct/Reads"+" " + "Correct/Assembled" + " fp_rate")
416 num101bpt_t1 = test_pear(forward = "16S_101bp1.fq.through.fastq",
417 reverse = "16S_101bp2.fq.through.fastq",
425 maxuncalled = "0.05",
427 empirical_freqs = "yes",
428 truelenfile = "16S_101bp_len.txt",
429 testname = "101bpt_t1",
431 num150bp_t2 = test_pear(forward = "16S_150bp1.fq",
432 reverse = "16S_150bp2.fq",
440 maxuncalled = "0.05",
442 empirical_freqs = "yes",
443 truelenfile = "16S_150bp_len.txt",
444 testname = "150bp_t2",
446 num165bp_t1_o10 = test_pear(forward = "16S_165bp1.fq",
447 reverse = "16S_165bp2.fq",
455 maxuncalled = "0.05",
457 empirical_freqs = "yes",
458 truelenfile = "16S_165bp_len.txt",
459 testname = "165bp_t1_o10",
463 num180bp_t1 = test_pear(forward = "16S_180bp1.fq",
464 reverse = "16S_180bp2.fq",
472 maxuncalled = "0.05",
474 empirical_freqs = "yes",
475 truelenfile = "16S_180bp_len.txt",
476 testname = "180bp_t1",
479 num180bp_t1_001 = test_pear(forward = "16S_180bp1.fq",
480 reverse = "16S_180bp2.fq",
488 maxuncalled = "0.05",
490 empirical_freqs = "yes",
491 truelenfile = "16S_180bp_len.txt",
492 testname = "180bp_t1_001",
495 num180bp_t2_001 = test_pear(forward = "16S_180bp1.fq",
496 reverse = "16S_180bp2.fq",
504 maxuncalled = "0.05",
506 empirical_freqs = "yes",
507 truelenfile = "16S_180bp_len.txt",
508 testname = "180bp_t2_001",
511 num190bp_nt = test_pear(forward = "16S_190bp1.fq",
512 reverse = "16S_190bp2.fq",
520 maxuncalled = "0.05",
522 empirical_freqs = "yes",
523 truelenfile = "16S_190bp_len.txt",
524 testname = "190bp_nt",
526 num250bp_t2 = test_pear(forward = "16S_250bp1.fq",
527 reverse = "16S_250bp2.fq",
535 maxuncalled = "0.05",
537 empirical_freqs = "yes",
538 truelenfile = "16S_250bp_len.txt",
539 testname = "250bp_t2",
542 os.remove("to1.assembled.fastq")
543 os.remove("to1.discarded.fastq")
544 os.remove("to1.unassembled.forward.fastq")
545 os.remove("to1.unassembled.reverse.fastq")
548 if (num101bpt_t1 == 33022 and
549 num150bp_t2 == 28228 and
550 num165bp_t1_o10 == 25817 and
551 num180bp_t1 == 18115 and
552 num180bp_t1_001 == 14679 and
553 num180bp_t2_001 == 15532 and
554 num190bp_nt == 17112 and
555 num250bp_t2 == 23063):
556 print("Tests passed!")
558 print("Warnning: tests failed!")