modified: src1/input.c
[GalaxyCodeBases.git] / python / etc / out.kmers.py
blob42ce3fc0eaa667b8efb0ebbdd4e63cbc0f4e9a9f
1 #!/usr/bin/env python3
3 InFile = 'out.kmers.1k'
5 import csv
6 import primer3
7 import itertools
9 import math
10 from collections import Counter
12 def entropy(seq, max_k=5):
13 #p, lns = Counter(s), float(len(s))
14 #return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
15 SeqLength = len(seq)
16 Kmers = [ [] for x in range(max_k) ]
17 for i in range(SeqLength):
18 for thisK in range(1,max_k+1):
19 if SeqLength-i+1 > thisK:
20 this_kmer = seq[i:(i+thisK)]
21 Kmers[thisK-1].append(this_kmer)
22 sumSNa = 0
23 sumSNe = 0
24 for thisKsub1 in range(max_k): # https://rosettacode.org/wiki/Entropy
25 p = Counter(Kmers[thisKsub1])
26 lns = float(len(Kmers[thisKsub1]))
27 n = 4**(thisKsub1+1)
28 thisSNa = -sum( count * math.log(count/lns, 2) for count in p.values())
29 thisSNe = -sum( count * math.log(count/lns, n) for count in p.values())
30 print(p)
31 print(n)
32 print(lns)
33 print(thisSNa)
34 print(thisSNe)
35 sumSNa = sumSNa + thisSNa
36 sumSNe = sumSNe + thisSNe
37 return [sumSNa,sumSNe]
38 ttt = entropy('GGGGGCCCCCCCCC',11)
39 print(ttt)
41 exit()
43 import math
44 import numpy
45 import scipy.special as spc
47 def approximate_entropy(bin_data: str, pattern_length=10):
48 """
49 Note that this description is taken from the NIST documentation [1]
50 [1] http://csrc.nist.gov/publications/nistpubs/800-22-rev1a/SP800-22rev1a.pdf
51 As with the Serial test of Section 2.11, the focus of this test is the frequency of all possible overlapping
52 m-bit patterns across the entire sequence. The purpose of the test is to compare the frequency of overlapping
53 blocks of two consecutive/adjacent lengths (m and m+1) against the expected result for a random sequence.
54 :param bin_data: a binary string
55 :param pattern_length: the length of the pattern (m)
56 :return: the P value
57 """
58 n = len(bin_data)
59 # Add first m+1 bits to the end
60 # NOTE: documentation says m-1 bits but that doesnt make sense, or work.
61 bin_data += bin_data[:pattern_length + 1:]
63 # Get max length one patterns for m, m-1, m-2
64 max_pattern = ''
65 for i in range(pattern_length + 2):
66 max_pattern += '1'
68 # Keep track of each pattern's frequency (how often it appears)
69 vobs_one = numpy.zeros(int(max_pattern[0:pattern_length:], 2) + 1)
70 vobs_two = numpy.zeros(int(max_pattern[0:pattern_length + 1:], 2) + 1)
72 for i in range(n):
73 # Work out what pattern is observed
74 vobs_one[int(bin_data[i:i + pattern_length:], 2)] += 1
75 vobs_two[int(bin_data[i:i + pattern_length + 1:], 2)] += 1
77 # Calculate the test statistics and p values
78 vobs = [vobs_one, vobs_two]
79 sums = numpy.zeros(2)
80 for i in range(2):
81 for j in range(len(vobs[i])):
82 if vobs[i][j] > 0:
83 sums[i] += vobs[i][j] * math.log(vobs[i][j] / n)
84 sums /= n
85 ape = sums[0] - sums[1]
86 chi_squared = 2.0 * n * (math.log(2) - ape)
87 p_val = spc.gammaincc(pow(2, pattern_length-1), chi_squared/2.0)
88 return p_val
90 test_str='GATCTA'
91 bin_str = ''.join(format(ord(i), 'b') for i in test_str)
92 print(approximate_entropy(bin_str,6))
94 exit()
95 #import csv
96 #import itertools
98 with open(InFile) as tsvfile:
99 tsvreader = csv.reader(tsvfile, delimiter=' ')
100 skip = next(tsvreader, [None])[0]
101 for i in itertools.product('ACTG', repeat=18):
102 oneKmer = ''.join(i)
103 if oneKmer == skip:
104 skip = next(tsvreader, [None])[0]
105 else:
106 #print(oneKmer)
107 hp=primer3.calcHairpin(oneKmer)
108 hd=primer3.calcHomodimer(oneKmer)
109 if hp.structure_found or hd.structure_found:
110 print(' '.join([str(round(hp.tm,2)),str(round(hd.tm,2)),oneKmer]))
111 else:
112 print(' '.join(['OK',oneKmer]))
114 exit()
116 hp=primer3.calcHairpin('CCCCCATCCGATCAGGGGG')
117 hd=primer3.calcHomodimer('CCCCCATCCGATCAGGGGG')
119 print(t)