3 InFile
= 'out.kmers.1k'
10 from collections
import Counter
12 def entropy(seq
, max_k
=5):
13 #p, lns = Counter(s), float(len(s))
14 #return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
16 Kmers
= [ [] for x
in range(max_k
) ]
17 for i
in range(SeqLength
):
18 for thisK
in range(1,max_k
+1):
19 if SeqLength
-i
+1 > thisK
:
20 this_kmer
= seq
[i
:(i
+thisK
)]
21 Kmers
[thisK
-1].append(this_kmer
)
24 for thisKsub1
in range(max_k
): # https://rosettacode.org/wiki/Entropy
25 p
= Counter(Kmers
[thisKsub1
])
26 lns
= float(len(Kmers
[thisKsub1
]))
28 thisSNa
= -sum( count
* math
.log(count
/lns
, 2) for count
in p
.values())
29 thisSNe
= -sum( count
* math
.log(count
/lns
, n
) for count
in p
.values())
30 sumSNa
= sumSNa
+ thisSNa
31 sumSNe
= sumSNe
+ thisSNe
32 return [sumSNa
,sumSNe
]
33 #ttt = entropy('GATTACATATC',11)
38 with
open(InFile
) as tsvfile
:
39 tsvreader
= csv
.reader(tsvfile
, delimiter
=' ')
40 skip
= next(tsvreader
, [None])[0]
41 for i
in itertools
.product('ACTG', repeat
=18):
44 skip
= next(tsvreader
, [None])[0]
47 thisE
= entropy(oneKmer
,15)
50 hp
=primer3
.calcHairpin(oneKmer
)
51 hd
=primer3
.calcHomodimer(oneKmer
)
52 if hp
.structure_found
or hd
.structure_found
:
53 print(' '.join(['ST',str(round(hp
.tm
,2)),str(round(hd
.tm
,2)),oneKmer
,str(thisE
[0]),str(thisE
[1])]))
55 print(' '.join(['OK','0','0',oneKmer
,str(thisE
[0]),str(thisE
[1])]))
59 hp
=primer3
.calcHairpin('CCCCCATCCGATCAGGGGG')
60 hd
=primer3
.calcHomodimer('CCCCCATCCGATCAGGGGG')
63 grep OK out.kmers.f1 | awk -v size=2.5 '{ b=int($5/size); a[b]++; bmax=b>bmax?b:bmax; bmin=b<bmin?b:bmin } END { for(i=bmin;i<=bmax;++i) print i*size,(i+1)*size,a[i] }'
65 grep OK out.kmers.f1 | awk -v size=0.5 '{ b=int($6/size); a[b]++; bmax=b>bmax?b:bmax; bmin=b<bmin?b:bmin } END { for(i=bmin;i<=bmax;++i) print i*size,(i+1)*size,a[i] }'