python/etc/out.kmers.py

   1 #!/usr/bin/env python3
   2
   3 InFile = 'out.kmers.1k'
   4
   5 import csv
   6 import primer3
   7 import itertools
   8
   9 import math
  10 from collections import Counter
  11
  12 def entropy(seq, max_k=5):
  13     #p, lns = Counter(s), float(len(s))
  14     #return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
  15     SeqLength = len(seq)
  16     Kmers = [ [] for x in range(max_k) ]
  17     for i in range(SeqLength):
  18         for thisK in range(1,max_k+1):
  19             if SeqLength-i+1 > thisK:
  20                 this_kmer = seq[i:(i+thisK)]
  21                 Kmers[thisK-1].append(this_kmer)
  22     sumSNa = 0
  23     sumSNe = 0
  24     for thisKsub1 in range(max_k):  # https://rosettacode.org/wiki/Entropy
  25         p = Counter(Kmers[thisKsub1])
  26         lns = float(len(Kmers[thisKsub1]))
  27         n = 4**(thisKsub1+1)
  28         thisSNa = -sum( count * math.log(count/lns, 2) for count in p.values())
  29         thisSNe = -sum( count * math.log(count/lns, n) for count in p.values())
  30         print(p)
  31         print(n)
  32         print(lns)
  33         print(thisSNa)
  34         print(thisSNe)
  35         sumSNa = sumSNa + thisSNa
  36         sumSNe = sumSNe + thisSNe
  37     return [sumSNa,sumSNe]
  38 ttt = entropy('GGGGGCCCCCCCCC',11)
  39 print(ttt)
  40
  41 exit()
  42
  43 import math
  44 import numpy
  45 import scipy.special as spc
  46
  47 def approximate_entropy(bin_data: str, pattern_length=10):
  48     """
  49     Note that this description is taken from the NIST documentation [1]
  50     [1] http://csrc.nist.gov/publications/nistpubs/800-22-rev1a/SP800-22rev1a.pdf
  51     As with the Serial test of Section 2.11, the focus of this test is the frequency of all possible overlapping
  52     m-bit patterns across the entire sequence. The purpose of the test is to compare the frequency of overlapping
  53     blocks of two consecutive/adjacent lengths (m and m+1) against the expected result for a random sequence.
  54     :param bin_data: a binary string
  55     :param pattern_length: the length of the pattern (m)
  56     :return: the P value
  57     """
  58     n = len(bin_data)
  59     # Add first m+1 bits to the end
  60     # NOTE: documentation says m-1 bits but that doesnt make sense, or work.
  61     bin_data += bin_data[:pattern_length + 1:]
  62
  63     # Get max length one patterns for m, m-1, m-2
  64     max_pattern = ''
  65     for i in range(pattern_length + 2):
  66         max_pattern += '1'
  67
  68     # Keep track of each pattern's frequency (how often it appears)
  69     vobs_one = numpy.zeros(int(max_pattern[0:pattern_length:], 2) + 1)
  70     vobs_two = numpy.zeros(int(max_pattern[0:pattern_length + 1:], 2) + 1)
  71
  72     for i in range(n):
  73         # Work out what pattern is observed
  74         vobs_one[int(bin_data[i:i + pattern_length:], 2)] += 1
  75         vobs_two[int(bin_data[i:i + pattern_length + 1:], 2)] += 1
  76
  77     # Calculate the test statistics and p values
  78     vobs = [vobs_one, vobs_two]
  79     sums = numpy.zeros(2)
  80     for i in range(2):
  81         for j in range(len(vobs[i])):
  82             if vobs[i][j] > 0:
  83                 sums[i] += vobs[i][j] * math.log(vobs[i][j] / n)
  84     sums /= n
  85     ape = sums[0] - sums[1]
  86     chi_squared = 2.0 * n * (math.log(2) - ape)
  87     p_val = spc.gammaincc(pow(2, pattern_length-1), chi_squared/2.0)
  88     return p_val
  89
  90 test_str='GATCTA'
  91 bin_str = ''.join(format(ord(i), 'b') for i in test_str)
  92 print(approximate_entropy(bin_str,6))
  93
  94 exit()
  95 #import csv
  96 #import itertools
  97
  98 with open(InFile) as tsvfile:
  99     tsvreader = csv.reader(tsvfile, delimiter=' ')
 100     skip = next(tsvreader, [None])[0]
 101     for i in itertools.product('ACTG', repeat=18):
 102         oneKmer = ''.join(i)
 103         if oneKmer == skip:
 104             skip = next(tsvreader, [None])[0]
 105         else:
 106             #print(oneKmer)
 107             hp=primer3.calcHairpin(oneKmer)
 108             hd=primer3.calcHomodimer(oneKmer)
 109             if hp.structure_found or hd.structure_found:
 110                 print(' '.join([str(round(hp.tm,2)),str(round(hd.tm,2)),oneKmer]))
 111             else:
 112                 print(' '.join(['OK',oneKmer]))
 113
 114 exit()
 115
 116 hp=primer3.calcHairpin('CCCCCATCCGATCAGGGGG')
 117 hd=primer3.calcHomodimer('CCCCCATCCGATCAGGGGG')
 118
 119 print(t)
 120