modified: Makefile
[GalaxyCodeBases.git] / python / salus / alignkmer.py
blob60d4f00b5db27741e7fdddcc4b3f5e77ba786328
1 #!/usr/bin/env python3
3 import sys
5 def eprint(*args, **kwargs):
6 print(*args, file=sys.stderr, **kwargs)
8 def buildQuery():
9 from skbio import DNA
10 from skbio.alignment import StripedSmithWaterman
11 Adapters = {
12 'adapter1': 'CAAGCAGAAGACGGCATACGAGATTCTTTCCCTACACGACGCTCTTCCGATCT',
13 'adapter2': 'CCCGTTCGCAACATGTCTGGCGTCATATCTTGTGACTACAGCACCCTCGACTCTCGCAGACTTTCACCAGTCCATGAT',
14 'adapter3': 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGTAGATCTCGGTGGTCGCCGTATCATT'
16 Querys={}
17 for qid in Adapters.keys():
18 Seq = Adapters[qid]
19 rcSeq = str(DNA(Seq).reverse_complement())
20 Query = StripedSmithWaterman(Seq)
21 rcQuery = StripedSmithWaterman(rcSeq)
22 Querys[qid] = (Query,rcQuery)
23 return Querys
25 def aliOneSeq(QuerySSWs,targetSeq):
26 Alignments = {}
27 Scores = {}
28 for qid in QuerySSWs.keys():
29 (Query,rcQuery) = QuerySSWs[qid]
30 Alignment = Query(targetSeq)
31 rcAlignment = rcQuery(targetSeq)
32 strand = '*'
33 if rcAlignment == None or Alignment.optimal_alignment_score > rcAlignment.optimal_alignment_score:
34 resAli = Alignment
35 strand = '+'
36 else:
37 resAli = rcAlignment
38 strand = '-'
39 Alignments[qid] = (resAli,strand)
40 Scores[qid] = resAli.optimal_alignment_score
41 maxScores = [key for key, value in Scores.items() if value == max(Scores.values())]
42 maxAlignments = {k: Alignments[k] for k in maxScores}
43 return maxAlignments
45 def printAli(maxAlignments,kmer,kCount):
46 #targetSeq = 'ACATAGTCTGGCGTCATTCTTGTGTACA'
47 #maxAlignments = aliOneSeq(QuerySSWs,targetSeq)
48 for tid in maxAlignments.keys():
49 ali = maxAlignments[tid]
50 prnStr = ' '.join((
51 ':'.join((kmer,str(kCount))),
52 tid, ali[1], ali[0].cigar,
53 str(ali[0].optimal_alignment_score),
54 str(ali[0].suboptimal_alignment_score)
56 print(prnStr)
57 prnStr = '\n'.join((
58 ali[0].aligned_target_sequence,
59 ali[0].aligned_query_sequence
61 print(prnStr)
62 print()
63 return
65 def main():
66 if len(sys.argv) < 3 :
67 print('Usage:',sys.argv[0],'<kmc_dump file> <min count> >alignments.out',file=sys.stderr,flush=True);
68 exit(0);
69 kmcDumpFile = sys.argv[1]
70 minCount = int(sys.argv[2])
72 QuerySSWs = buildQuery()
74 with open(kmcDumpFile) as kmcfile:
75 for line in kmcfile:
76 kmer,kCount = line.split() # rstrip('\n') only needed when split("\t"), since whitespace include "\n".
77 #print(':'.join(('[',kmer,kCount,']')))
78 if int(kCount) < minCount:
79 eprint(':'.join(('[!]Stop reading before',kmer,kCount)))
80 break
81 maxAlignments = aliOneSeq(QuerySSWs,kmer)
82 printAli(maxAlignments,kmer,kCount)
84 if __name__ == "__main__":
85 main() # time ./alignkmer.py deumi31-9.out.rnk2 1000000 |head