3 InFile
= 'out.kmers.1k'
10 from collections
import Counter
12 def entropy(seq
, max_k
=5):
13 #p, lns = Counter(s), float(len(s))
14 #return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
16 Kmers
= [ [] for x
in range(max_k
) ]
17 for i
in range(SeqLength
):
18 for thisK
in range(1,max_k
+1):
19 if SeqLength
-i
+1 > thisK
:
20 this_kmer
= seq
[i
:(i
+thisK
)]
21 Kmers
[thisK
-1].append(this_kmer
)
24 for thisKsub1
in range(max_k
): # https://rosettacode.org/wiki/Entropy
25 p
= Counter(Kmers
[thisKsub1
])
26 lns
= float(len(Kmers
[thisKsub1
]))
28 thisSNa
= -sum( count
* math
.log(count
/lns
, 2) for count
in p
.values())
29 thisSNe
= -sum( count
* math
.log(count
/lns
, n
) for count
in p
.values())
35 sumSNa
= sumSNa
+ thisSNa
36 sumSNe
= sumSNe
+ thisSNe
37 return [sumSNa
,sumSNe
]
38 ttt
= entropy('GGGGGCCCCCCCCC',11)
45 import scipy
.special
as spc
47 def approximate_entropy(bin_data
: str, pattern_length
=10):
49 Note that this description is taken from the NIST documentation [1]
50 [1] http://csrc.nist.gov/publications/nistpubs/800-22-rev1a/SP800-22rev1a.pdf
51 As with the Serial test of Section 2.11, the focus of this test is the frequency of all possible overlapping
52 m-bit patterns across the entire sequence. The purpose of the test is to compare the frequency of overlapping
53 blocks of two consecutive/adjacent lengths (m and m+1) against the expected result for a random sequence.
54 :param bin_data: a binary string
55 :param pattern_length: the length of the pattern (m)
59 # Add first m+1 bits to the end
60 # NOTE: documentation says m-1 bits but that doesnt make sense, or work.
61 bin_data
+= bin_data
[:pattern_length
+ 1:]
63 # Get max length one patterns for m, m-1, m-2
65 for i
in range(pattern_length
+ 2):
68 # Keep track of each pattern's frequency (how often it appears)
69 vobs_one
= numpy
.zeros(int(max_pattern
[0:pattern_length
:], 2) + 1)
70 vobs_two
= numpy
.zeros(int(max_pattern
[0:pattern_length
+ 1:], 2) + 1)
73 # Work out what pattern is observed
74 vobs_one
[int(bin_data
[i
:i
+ pattern_length
:], 2)] += 1
75 vobs_two
[int(bin_data
[i
:i
+ pattern_length
+ 1:], 2)] += 1
77 # Calculate the test statistics and p values
78 vobs
= [vobs_one
, vobs_two
]
81 for j
in range(len(vobs
[i
])):
83 sums
[i
] += vobs
[i
][j
] * math
.log(vobs
[i
][j
] / n
)
85 ape
= sums
[0] - sums
[1]
86 chi_squared
= 2.0 * n
* (math
.log(2) - ape
)
87 p_val
= spc
.gammaincc(pow(2, pattern_length
-1), chi_squared
/2.0)
91 bin_str
= ''.join(format(ord(i
), 'b') for i
in test_str
)
92 print(approximate_entropy(bin_str
,6))
98 with
open(InFile
) as tsvfile
:
99 tsvreader
= csv
.reader(tsvfile
, delimiter
=' ')
100 skip
= next(tsvreader
, [None])[0]
101 for i
in itertools
.product('ACTG', repeat
=18):
104 skip
= next(tsvreader
, [None])[0]
107 hp
=primer3
.calcHairpin(oneKmer
)
108 hd
=primer3
.calcHomodimer(oneKmer
)
109 if hp
.structure_found
or hd
.structure_found
:
110 print(' '.join([str(round(hp
.tm
,2)),str(round(hd
.tm
,2)),oneKmer
]))
112 print(' '.join(['OK',oneKmer
]))
116 hp
=primer3
.calcHairpin('CCCCCATCCGATCAGGGGG')
117 hd
=primer3
.calcHomodimer('CCCCCATCCGATCAGGGGG')