new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / justonce / out.kmers.py
blobd4a951e71aff849a8d24feffc0523904a43527ef
1 #!/usr/bin/env python3
3 InFile = 'out.kmers.1k'
5 import csv
6 import primer3
7 import itertools
9 import math
10 from collections import Counter
12 def entropy(seq, max_k=5):
13 #p, lns = Counter(s), float(len(s))
14 #return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
15 SeqLength = len(seq)
16 Kmers = [ [] for x in range(max_k) ]
17 for i in range(SeqLength):
18 for thisK in range(1,max_k+1):
19 if SeqLength-i+1 > thisK:
20 this_kmer = seq[i:(i+thisK)]
21 Kmers[thisK-1].append(this_kmer)
22 sumSNa = 0
23 sumSNe = 0
24 for thisKsub1 in range(max_k): # https://rosettacode.org/wiki/Entropy
25 p = Counter(Kmers[thisKsub1])
26 lns = float(len(Kmers[thisKsub1]))
27 n = 4**(thisKsub1+1)
28 thisSNa = -sum( count * math.log(count/lns, 2) for count in p.values())
29 thisSNe = -sum( count * math.log(count/lns, n) for count in p.values())
30 sumSNa = sumSNa + thisSNa
31 sumSNe = sumSNe + thisSNe
32 return [sumSNa,sumSNe]
33 #ttt = entropy('GATTACATATC',11)
34 #print(str(ttt))
38 with open(InFile) as tsvfile:
39 tsvreader = csv.reader(tsvfile, delimiter=' ')
40 skip = next(tsvreader, [None])[0]
41 for i in itertools.product('ACTG', repeat=18):
42 oneKmer = ''.join(i)
43 if oneKmer == skip:
44 skip = next(tsvreader, [None])[0]
45 else:
46 #print(oneKmer)
47 thisE = entropy(oneKmer,15)
48 if thisE[1] < 53:
49 continue
50 hp=primer3.calcHairpin(oneKmer)
51 hd=primer3.calcHomodimer(oneKmer)
52 if hp.structure_found or hd.structure_found:
53 print(' '.join(['ST',str(round(hp.tm,2)),str(round(hd.tm,2)),oneKmer,str(thisE[0]),str(thisE[1])]))
54 else:
55 print(' '.join(['OK','0','0',oneKmer,str(thisE[0]),str(thisE[1])]))
57 exit()
59 hp=primer3.calcHairpin('CCCCCATCCGATCAGGGGG')
60 hd=primer3.calcHomodimer('CCCCCATCCGATCAGGGGG')
62 """
63 grep OK out.kmers.f1 | awk -v size=2.5 '{ b=int($5/size); a[b]++; bmax=b>bmax?b:bmax; bmin=b<bmin?b:bmin } END { for(i=bmin;i<=bmax;++i) print i*size,(i+1)*size,a[i] }'
65 grep OK out.kmers.f1 | awk -v size=0.5 '{ b=int($6/size); a[b]++; bmax=b>bmax?b:bmax; bmin=b<bmin?b:bmin } END { for(i=bmin;i<=bmax;++i) print i*size,(i+1)*size,a[i] }'
66 """