10 CoreKeys
= ["{}{}".format(x
,y
) for x
,y
in itertools
.product(['fwd','rev'], ['A','C','G','T','N'])]
11 CounterKeys
= CoreKeys
+ ['cntIns','cntDel'];
12 theSamples
= ['B7','D3','Normal'];
15 if len(sys
.argv
) < 3 :
16 print('Usage:',sys
.argv
[0],'<samtools.mpileup.lz4> <BED file>',file=sys
.stderr
,flush
=True);
18 inDepthFile
= sys
.argv
[1]
19 inBEDFile
= sys
.argv
[2]
21 theZone
= collections
.defaultdict(dict)
22 with
open(inBEDFile
, newline
='') as csvfile
:
23 tsvin
= csv
.DictReader(csvfile
, delimiter
='\t', fieldnames
=['ChrID','Start','End'])
25 theZone
[row
['ChrID']][row
['Start']] = 1
27 #print("# Samples: [{}]".format(','.join(theSamples)))
28 print('# Format: {}'.format(','.join(CoreKeys
)))
29 print("\t".join(['ChrID','Pos']+theSamples
))
30 with lz4
.frame
.open(inDepthFile
, newline
='', mode
='rt') as fp
:
31 tsvin
= csv
.DictReader(fp
, delimiter
='\t', fieldnames
=['ChrID','Pos','Ref'] +
32 ["{}_{}".format(x
,y
) for x
,y
in itertools
.product(theSamples
, ['Depth','Bases','Qs'])]
34 sbc
= {key
:collections
.Counter() for key
in theSamples
}
35 bc
= {key
:{} for key
in theSamples
}
39 if row
['ChrID'] in theZone
:
40 if row
['Pos'] in theZone
[row
['ChrID']]:
42 print("\t".join((row
['ChrID'],row
['Pos'])),end
='\t')
44 bc
[k
] = Base_Counter(row
['{}_Depth'.format(k
)],row
['{}_Bases'.format(k
)])
45 sbc
[k
] += {x
: bc
[k
][x
] for x
in CounterKeys
}
47 print(",".join( [str(bc
[k
][x
]) for x
in CoreKeys
] ),end
='\t')
49 except KeyboardInterrupt:
50 print('\n#[!]Ctrl+C pressed.',file=sys
.stderr
,flush
=True)
55 # Base Counting Subroutine *[Completed]
56 # https://github.com/photonchang/allelecount/blob/master/allelecount.py
57 def Base_Counter(inDepth
,inBases
):
58 # Cleaning up Base String + Indel Counting
66 for currentIndex
, Strholder
in enumerate(inBases
):
67 # Skipping of '^' Signage
77 if IndelDeterminant
> 0:
84 # Determining Length of Indel
85 # Since Illumina NGS has an upper limit of less than 999bp read length
88 if (currentIndex
+ 4) <= len(inBases
):
89 if inBases
[currentIndex
+ 1: currentIndex
+ 4].isnumeric() == True:
90 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 4]) + 3
91 elif inBases
[currentIndex
+ 1: currentIndex
+ 3]. isnumeric() == True:
92 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 3]) + 2
93 elif inBases
[currentIndex
+ 1: currentIndex
+ 2].isnumeric() == True:
94 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 2]) + 1
95 elif (currentIndex
+ 3) <= len(inBases
):
96 if inBases
[currentIndex
+ 1: currentIndex
+ 3]. isnumeric() == True:
97 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 3]) + 2
98 elif inBases
[currentIndex
+ 1: currentIndex
+ 2].isnumeric() == True:
99 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 2]) + 1
101 IndelHolder
.append(inBases
[currentIndex
:currentIndex
+ IndelDeterminant
+ 1])
107 # Determining Length of Indel
108 # Since Illumina NGS has an upper limit of less than 999bp read length
111 if (currentIndex
+ 4) <= len(inBases
):
112 if inBases
[currentIndex
+ 1: currentIndex
+ 4].isnumeric() == True:
113 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 4]) + 3
114 elif inBases
[currentIndex
+ 1: currentIndex
+ 3]. isnumeric() == True:
115 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 3]) + 2
116 elif inBases
[currentIndex
+ 1: currentIndex
+ 2].isnumeric() == True:
117 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 2]) + 1
118 elif (currentIndex
+ 3) <= len(inBases
):
119 if inBases
[currentIndex
+ 1: currentIndex
+ 3]. isnumeric() == True:
120 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 3]) + 2
121 elif inBases
[currentIndex
+ 1: currentIndex
+ 2].isnumeric() == True:
122 IndelDeterminant
= int(inBases
[currentIndex
+ 1: currentIndex
+ 2]) + 1
124 IndelHolder
.append(inBases
[currentIndex
: currentIndex
+ len(str(IndelDeterminant
)) + 1])
127 CleanString
+= Strholder
130 # Transferring Back Cleaned String
131 inBases
= CleanString
133 # '$' Signage Stripping
134 inBases
= inBases
.replace('$', '')
136 # Base Count Var Initialization
148 bigA
= inBases
.count('A')
149 bigC
= inBases
.count('C')
150 bigG
= inBases
.count('G')
151 bigT
= inBases
.count('T')
152 bigN
= inBases
.count('N')
154 smallA
= inBases
.count('a')
155 smallC
= inBases
.count('c')
156 smallG
= inBases
.count('g')
157 smallT
= inBases
.count('t')
158 smallN
= inBases
.count('n')
160 delBase
= inBases
.count('*')
162 # Internal Check - Throws Out Error (NOT STD-IN/OUT Compatible: Should break pipeline)
164 InternalCounter
= bigA
+ bigC
+ bigG
+ bigT
+ bigN
+ smallA
+ smallC
+ smallG
+ smallT
+ smallN
+ delBase
165 if InternalCounter
!= int(inDepth
):
167 print('Reported count: ' + inDepth
)
168 print('Internal counter: ' + str(InternalCounter
))
169 print('Internal sum: ' + str(len(inBases
)))
170 print('Number of insertions: ' + str(countIn
))
171 print('Number of deleions: ' + str(countDel
))
172 print('Post-processed bases: ' + inBases
)
176 IndelSetDict
= set(IndelHolder
)
178 FinalIndelHolder
= []
180 for EveryIndel
in IndelSetDict
:
182 tmpIndelString
= str(IndelHolder
.count(EveryIndel
)) + ":" + EveryIndel
183 FinalIndelHolder
.append(tmpIndelString
)
186 FinalOutput
= collections
.OrderedDict
.fromkeys( CounterKeys
+ ['Indels'] )
187 FinalOutput
['fwdA'] = bigA
; FinalOutput
['fwdC'] = bigC
; FinalOutput
['fwdG'] = bigG
; FinalOutput
['fwdT'] = bigT
; FinalOutput
['fwdN'] = bigN
;
188 FinalOutput
['revA'] = smallA
; FinalOutput
['revC'] = smallC
; FinalOutput
['revG'] = smallG
; FinalOutput
['revT'] = smallT
; FinalOutput
['revN'] = smallN
;
189 FinalOutput
['cntIns'] = countIn
; FinalOutput
['cntDel'] = countDel
; FinalOutput
['Indels'] = ';'.join(FinalIndelHolder
);
192 if __name__
== "__main__":
193 main() # time python3 ./samdepthplot.py t.tsv.gz 1
196 ./cntMpileup.py z.mp9k.lz4 z.bed