modified: src1/input.c
[GalaxyCodeBases.git] / python / etc / samdepthstat.py
blob6096b9f0e2e61340b9c1b18b85bad5468aee1a2a
1 #!/usr/bin/env pypy3
2 import sys
4 SamplesList = ('D3B_Crick', 'D3B_Watson', 'Normal_Crick', 'Normal_Watson', 'D3B_WGS', 'Normal_WGS')
6 #from collections import defaultdict
7 #DepthCnt = {key:defaultdict(int) for key in SamplesList}
8 #yDepthCnt = defaultdict(lambda: defaultdict(int))
10 def main():
11 import math
13 if len(sys.argv) < 3 :
14 print('Usage:',sys.argv[0],'<samtools.depth.gz> <out.tsv> [verbose=0]',file=sys.stderr,flush=True)
15 exit(0)
16 try:
17 verbose = int(sys.argv[3])
18 except: # `except IndexError:` and `except ValueError:`
19 verbose = 0
21 inDepthFile = sys.argv[1]
22 outFile = sys.argv[2]
23 print('From:[{}], To:[{}].\nVerbose: [{}].'.format(inDepthFile,outFile,verbose),file=sys.stderr,flush=True)
24 RecordCnt,MaxDepth,cDepthCnt,cDepthStat = inStat(inDepthFile,verbose)
25 for k in SamplesList:
26 cDepthStat[k][2] = cDepthStat[k][0] / RecordCnt # E(X)
27 cDepthStat[k][3] = cDepthStat[k][1] / RecordCnt # E(X^2)
28 cDepthStat[k][4] = math.sqrt(cDepthStat[k][3] - cDepthStat[k][2]*cDepthStat[k][2]) # E(X^2)-E(X)^2
29 tsvout = open(outFile, 'wt')
30 print('#{}\t{}'.format('Depth','\t'.join(SamplesList)),file=tsvout)
31 #RecordCntLength = len(str(RecordCnt))
32 print( '#N={},SD:\t{}'.format(RecordCnt,'\t'.join(str(round(cDepthStat[col][4],1)) for col in SamplesList)),file=tsvout)
33 for depth in range(0,MaxDepth+1):
34 #print( '{}\t{}'.format(depth,'\t'.join(str(DepthCnt[col][depth]) for col in SamplesList)) )
35 #print( '{}\t{}'.format(depth,'\t'.join(str(yDepthCnt[depth][col]) for col in SamplesList)) )
36 print( '{}\t{}'.format(depth,'\t'.join(str(cDepthCnt[col][depth]) for col in SamplesList)),file=tsvout)
37 #pass
38 #print('#MaxDepth={}'.format(MaxDepth),file=tsvout)
39 tsvout.close()
40 pass
42 def inStat(inDepthFile,verbose):
43 import gzip
44 import csv
45 from collections import Counter
46 # Looking up things in global scope takes longer then looking up stuff in local scope. <https://stackoverflow.com/a/54645851/159695>
47 cDepthCnt = {key:Counter() for key in SamplesList}
48 cDepthStat = {key:[0,0,0,0,0] for key in SamplesList} # x and x^2
49 RecordCnt = 0
50 MaxDepth = 0
51 with gzip.open(inDepthFile, 'rt') as tsvin:
52 tsvin = csv.DictReader(tsvin, delimiter='\t', fieldnames=('ChrID','Pos')+SamplesList )
53 #headers = tsvin.fieldnames
54 #print(headers)
55 try:
56 for row in tsvin:
57 #print(', '.join(row[col] for col in SamplesList))
58 RecordCnt += 1
59 for k in SamplesList:
60 theValue = int(row[k])
61 if theValue > MaxDepth:
62 MaxDepth = theValue
63 #DepthCnt[k][theValue] += 1 # PyPy3:30.54 ns, Python3:22.23 ns
64 #yDepthCnt[theValue][k] += 1 # PyPy3:30.47 ns, Python3:21.50 ns
65 cDepthCnt[k][theValue] += 1 # PyPy3:29.82 ns, Python3:30.61 ns
66 cDepthStat[k][0] += theValue
67 cDepthStat[k][1] += theValue * theValue
68 #print(MaxDepth,DepthCnt)
69 except KeyboardInterrupt:
70 print('\n[!]Ctrl+C pressed.',file=sys.stderr,flush=True)
71 pass
72 print('[!]Lines Read:[{}], MaxDepth is [{}].'.format(RecordCnt,MaxDepth),file=sys.stderr,flush=True)
73 return RecordCnt,MaxDepth,cDepthCnt,cDepthStat
75 if __name__ == "__main__":
76 main() # time python3 ./samdepthplot.py t.tsv.gz 1
78 import platform
79 get_implementation_name = platform.python_implementation()
80 if get_implementation_name == 'CPython':
81 try:
82 # get_size only works under Python, not PyPy.
83 mylibfile = "../lib/pysize/pysize.py"
84 with open(mylibfile) as f:
85 code = compile(f.read(), mylibfile, 'exec')
86 exec(code, globals(), locals())
87 print('[!]cDepthCnt:',get_size(cDepthCnt),file=sys.stderr,flush=True)
88 print('[!]DepthCnt: ',get_size(DepthCnt),file=sys.stderr,flush=True) # smaller
89 print('[!]yDepthCnt:',get_size(yDepthCnt),file=sys.stderr,flush=True)
90 except FileNotFoundError:
91 pass
92 except NameError:
93 pass
95 '''
96 [!]DepthCnt: 993487
97 [!]yDepthCnt: 1953307
98 [!]cDepthCnt: 994207
100 time pypy3 -m cProfile -o t2.cprof ./samdepthstat.py t.tsv.gz t.out
101 pyprof2calltree -i t2.cprof -k
103 time ./samdepthstat.py t.tsv.gz t.out