limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / python / etc / microb.py
blob21739ceccaf8bbc73d93f82282efd2187a7f46a0
1 #!/usr/bin/env python3
3 import vcf
4 import gzip,csv
5 import pprint
7 SNPdata = {}
8 SNPchrID = set()
10 ChrIDs ={
11 "DS362249":"DS362249.1", "DS362248":"DS362248.1", "DS362247":"DS362247.1", "DS362246":"DS362246.1", "DS362245":"DS362245.1", "DS362244":"DS362244.1", "DS362243":"DS362243.1", "DS362242":"DS362242.1", "DS362241":"DS362241.1", "DS362240":"DS362240.1", "DS362239":"DS362239.1", "DS362238":"DS362238.1", "DS362237":"DS362237.1", "DS362236":"DS362236.1", "DS362235":"DS362235.1", "DS362234":"DS362234.1", "DS362233":"DS362233.1", "DS362232":"DS362232.1", "DS362231":"DS362231.1", "DS362230":"DS362230.1", "DS362229":"DS362229.1", "DS362228":"DS362228.1", "DS362227":"DS362227.1", "DS362226":"DS362226.1", "DS362225":"DS362225.1", "DS362224":"DS362224.1", "DS362223":"DS362223.1", "DS362222":"DS362222.1", "DS362221":"DS362221.1", "DS362220":"DS362220.1", "DS362219":"DS362219.1", "DS362218":"DS362218.1", "DS362217":"DS362217.1", "DS499581":"DS499581.1", "DS499580":"DS499580.1", "DS499579":"DS499579.1", "DS499578":"DS499578.1", "DS499577":"DS499577.1", "DS499576":"DS499576.1", "DS499575":"DS499575.1", "DS499574":"DS499574.1", "DS499573":"DS499573.1", "DS499572":"DS499572.1", "DS499571":"DS499571.1", "DS499570":"DS499570.1", "CP001107":"CP001107.1", "CP001104":"CP001104.1", "CP001105":"CP001105.1", "CP001106":"CP001106.1"
14 SNPcnt = 0
15 f = gzip.open('snpcandidatforpcr.out.gz', 'rt')
16 tsvin = csv.reader(f, delimiter='\t')
17 for row in tsvin:
18 if len(row)==0: continue
19 #print(row)
20 ChrID = row[0].split('.')[1]
21 thePos = row[2]
22 #theRef = row[3]
23 Alleles = row[4].split(',')
24 Allele1 = Alleles[0].split('|')
25 Allele2 = Alleles[1].split('|')
26 AlleleP = { Allele1[0]:sum(map(int,Allele1[2:])), Allele2[0]:sum(map(int,Allele2[2:])) }
27 if ChrID in ChrIDs:
28 theKey = '\t'.join((ChrIDs[ChrID],thePos))
29 SNPdata[theKey] = AlleleP
30 SNPchrID.add(ChrIDs[ChrID])
31 #print((ChrID,thePos,theRef,theKey,AlleleP))
32 #print(Allele1)
33 #print(Allele2)
34 SNPcnt += 1
35 f.close()
36 #pprint.pprint(SNPchrID)
37 print('[!]SNP read:%d.' % (SNPcnt))
39 SNPcnt = 0
40 DepthStat = {}
41 GTStat = {}
42 vcf_reader = vcf.Reader(filename='microb.vcf.gz',compressed=True)
43 for theChrid in sorted(SNPchrID):
44 for record in vcf_reader.fetch(theChrid):
45 theKey = '\t'.join((record.CHROM,str(record.POS)))
46 #print((theKey))
47 if theKey in SNPdata:
48 SNPcnt += 1
49 #print(record)
50 theBases = [record.REF]
51 for abp in record.ALT:
52 theBases.append(str(abp))
53 SNPdatastr = []
54 for abp in SNPdata[theKey]:
55 SNPdatastr.append( ''.join([abp,':',str(SNPdata[theKey][abp]) ]) )
56 print('%4d %s\t%s\t%s'%(SNPcnt, theKey.replace('\t',':'), ','.join(SNPdatastr),','.join(theBases)))
57 #print(SNPdata[theKey])
58 #print(record.genotype('ERR589860'))
59 SampleGTs = set()
60 Depths = []
61 for sample in record.samples:
62 #print(sample)
63 theADstr = ','.join(map(str, sample['AD'] if isinstance(sample['AD'], list) else list(str(sample['AD']))))
64 """
65 if type(sample['AD']) is list:
66 theADstr = ','.join(map(str,sample['AD']))
67 elif type(sample['AD']) is int:
68 theADstr = str(sample['AD'])
69 else:
70 theADstr = 'ERROR'
71 """
72 print('%s:%s\t%4d %s=%s'%(sample.sample,sample.gt_bases,sample['DP'],sample['GT'],theADstr))
73 Depths.append(sample['DP'])
74 if sample.gt_bases != None:
75 SampleGTs.add(sample.gt_bases)
76 if len(Depths):
77 AvgDepth = sum(Depths)/len(Depths)
78 theDepth = int(round(AvgDepth,-1))
79 DepthStat[theDepth] = 1 + DepthStat.setdefault(theDepth,0)
80 GTStat[len(SampleGTs)] = 1 + GTStat.setdefault(len(SampleGTs),0)
81 print((len(SampleGTs),AvgDepth,theDepth))
82 print((GTStat,DepthStat))
84 '''
85 gzcat snpcandidatforpcr.out.gz|awk 'NF > 0 {split($0,a,".");print a[1]}'|uniq
86 411477
87 411479
88 445970
89 515619
90 515620
91 563193
92 657321
93 702446
94 缺几个物种没基因组?
95 '''