3 #InFile = 'out.kmers.1k'
10 from pprint
import pprint
13 from collections
import Counter
16 myEntropyKmer
= myKmerLen
- 3
17 myMinEntropy
= myKmerLen
* 3
19 def entropy(seq
, max_k
=5):
20 #p, lns = Counter(s), float(len(s))
21 #return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
23 Kmers
= [ [] for x
in range(max_k
) ]
24 for i
in range(SeqLength
):
25 for thisK
in range(1,max_k
+1):
26 if SeqLength
-i
+1 > thisK
:
27 this_kmer
= seq
[i
:(i
+thisK
)]
28 Kmers
[thisK
-1].append(this_kmer
)
31 for thisKsub1
in range(max_k
): # https://rosettacode.org/wiki/Entropy
32 p
= Counter(Kmers
[thisKsub1
])
33 lns
= float(len(Kmers
[thisKsub1
]))
35 thisSNa
= -sum( count
* math
.log(count
/lns
, 2) for count
in p
.values())
36 thisSNe
= -sum( count
* math
.log(count
/lns
, n
) for count
in p
.values())
37 sumSNa
= sumSNa
+ thisSNa
38 sumSNe
= sumSNe
+ thisSNe
39 return [sumSNa
,sumSNe
]
40 #ttt = entropy('GATTACATATC',11)
44 reTri
= re
.compile(r
"(\w)\1{2,}")
46 for i
in itertools
.product('ACTG', repeat
=myKmerLen
):
50 reTriFound
= reTri
.search(oneKmer
)
55 thisE
= entropy(oneKmer
,myEntropyKmer
)
56 if thisE
[1] < myMinEntropy
:
58 hp
=primer3
.calcHairpin(oneKmer
)
59 hd
=primer3
.calcHomodimer(oneKmer
)
60 primer3Flag
= ['pF','dF']
61 if hp
.structure_found
:
63 if hd
.structure_found
:
65 print(' '.join([oneKmer
,str(thisE
[0]),str(thisE
[1]),primer3Flag
[0],primer3Flag
[1],str(round(hp
.tm
,2)),str(round(hd
.tm
,2))]))
69 hp
=primer3
.calcHairpin('CCCCCATCCGATCAGGGGG')
70 hd
=primer3
.calcHomodimer('CCCCCATCCGATCAGGGGG')
73 grep OK out.kmers.f1 | awk -v size=2.5 '{ b=int($5/size); a[b]++; bmax=b>bmax?b:bmax; bmin=b<bmin?b:bmin } END { for(i=bmin;i<=bmax;++i) print i*size,(i+1)*size,a[i] }'
75 grep OK out.kmers.f1 | awk -v size=0.5 '{ b=int($6/size); a[b]++; bmax=b>bmax?b:bmax; bmin=b<bmin?b:bmin } END { for(i=bmin;i<=bmax;++i) print i*size,(i+1)*size,a[i] }'