new file: pixi.toml
[GalaxyCodeBases.git] / projects / recombineMap / sperm_by_fw / crossover_prediction / HMM_method / HMMCO.py
blob74fc282474f39abf987d95ee9b5e05f9b7fee9f7
1 '''
2 Created on 2012-7-11
4 @author: LiJinsen
5 '''
6 import sys, os
7 from mpmath import mpf
9 ##Baum-Welch Part
10 def Baum_Welch(obs, states, start_p, trans_p, emit_p):
11 T = {}
12 for state in states:
13 T[state] = start_p[state]
16 pass
18 ##viterbi part
19 def forward_viterbi(obs, states, start_p, trans_p, emit_p):
20 T = {}
21 for state in states:
22 ## prob. V.path V.prob.
23 T[state] = (start_p[state], [state], start_p[state])
24 for output in obs:
25 U = {}
26 for next_state in states:
27 total = mpf(0)
28 argmax = None
29 valmax = mpf(0)
30 for source_state in states:
31 (prob, v_path, v_prob) = T[source_state]
32 prob = mpf(prob)
33 v_prob = mpf(v_prob)
34 tmp1 = emit_p[source_state][output]
35 tmp2 = trans_p[source_state][next_state]
36 p = tmp1 * tmp2
37 prob *= p
38 v_prob *= p
39 total += prob
40 if v_prob > valmax:
41 argmax = v_path + [next_state]
42 valmax = v_prob
43 U[next_state] = (total, argmax, valmax)
44 T = U
45 ## apply sum/max to the final states:
46 total = 0
47 argmax = None
48 valmax = 0
49 for state in states:
50 (prob, v_path, v_prob) = T[state]
51 total += prob
52 if v_prob > valmax:
53 argmax = v_path
54 valmax = v_prob
55 return (total, argmax, valmax)
57 #read data file
58 filename1 = []
59 def mydir(arg, dirname, names):
60 files = [os.path.normpath(os.path.join(dirname, file)) for file in names]
61 for filename in files:
62 if filename.find("FaMo")!=-1:
63 filename1.append(filename)
65 if len(sys.argv)==1:
66 path=os.getcwd()
67 else:
68 path = sys.argv[1]
69 os.path.walk(path, mydir, 0)
70 data = {}
71 #diffFile = open('difference.txt','r').read().split('\n')
72 #needReRun = []
73 #for line in diffFile:
74 # tmp = line.split('\t')
75 # needReRun.append(tmp)
76 for filename in filename1:
77 print "Loading File:", filename
78 content = open(filename,'r').read()
79 tempdata = {}
80 posData = {}
81 '''
82 #Chr Pos FaHap MoHap SpNum S01 S02 S03 S04 S05 S06 S07 S08 S09 S10 S11 S12 S13 S14 S15 S16 S17 S18 S19 S20 S21 S22 S23 S24 S25 S26 S27 S28 S29 S30 S31 S32 S33 S34 S35 S36 S37 S38 S39 S40 S41 S42 S43 S44 S45 S46 S47 S48 S49 S50 S51 S52 S53 S54 S55 S56 S57 S58 S59 S60 S61 S62 S63 S64 S65 S66 S67 S68 S69 S70 S71 S72 S73 S74 S75 S76 S77 S78 S79 S80 S81 S82 S83 S84 S85 S86 S87 S88 S89 S90 S91 S92 S93 S94 S95 S96 S97 S98 S99
83 #chr13 19020145 T G 9 N T N N N N N N N N N N N G N N N N N N N G N G N N N N N T N N N N N N N N N N N N N N G N N N N T N N N G N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N N G N N N N N N
84 #chr13 19020627 G A 39 N G N A G N N N G N A G A A A N N G N N N A N A G G N G N G G G N N N G N G N G N N A N N N N N N G A A N A N N G N N A N N N G N N A G G N N A N N N G N N N N N N A N N N N A N N N N A G N G N N N
85 #chr13 19020776 G T 24 N G N T G N N N N N T G N T T T N N N G N T N T G N N N G G N N N N N G N N N N N N N N N N T N N N N N N N N N G N N T N N N N N N T G N T N N T N N N N N N N N G N N N N N T N N N N N N N N N N N
86 '''
87 itemkey = []
88 for line in content.split('\n'):
89 if line=='':
90 continue
91 tmp = line.split('\t')
92 if tmp[0]=='#Chr':
93 itemkey = tmp
94 for item in itemkey:
95 tempdata.update({item:[]})
96 posData.update({item:[]})
97 continue
98 for i in range(len(itemkey)):
99 if i>=5:
100 if tmp[i]==tmp[2]:
101 tmp[i]='1'#father is 1
102 if tmp[i]==tmp[3]:
103 tmp[i]='-1'#mother is -1
104 if tmp[i]=='N':
105 continue
106 tempdata[itemkey[i]].append(tmp[i])
107 posData[itemkey[i]].append(tmp[1])
108 chrName = tempdata['#Chr'][0]
109 transition_probability = {'F' : {'F': 0.9999998,'U': 0.00000009,'M': 0.00000002},
110 'M' : {'M': 0.9999998,'U': 0.00000009,'F': 0.00000002},
111 'U' : {'U': 0.9, 'F': 0.05, 'M': 0.05}}
112 emission_probability = {'F' : {'1': 0.9, '-1': 0.1},
113 'M' : {'-1': 0.9, '1': 0.1},
114 'U' : {'1':0.5, '-1':0.5}}
115 start_probability = {'F' : 0.3,
116 'M' : 0.3,
117 'U' : 0.4}
118 for item in itemkey:
119 if item[0]=='S' and item[1]!='p':
120 #if [item, chrName, 'More'] not in needReRun:
121 # continue
122 states = ('F','U','M')
123 observations = tempdata[item]
124 print 'Calculating',item
125 tmpdt = forward_viterbi(observations, states, start_probability, transition_probability, emission_probability)
126 print 'Calculating',item, 'Finished!'
127 print 'Writing File',"result."+item+'.'+chrName
128 w = open("result."+item+'.'+chrName,'w')
129 w.write('#Sperm\tChr\tPos\tObservation\tGuess\n')
130 posTemp = posData[item]
131 for i in range(len(tmpdt[1])-1):
132 w.write(item)
133 w.write('\t')
134 w.write(chrName)
135 w.write('\t')
136 w.write(posTemp[i])
137 w.write('\t')
138 w.write(observations[i])
139 w.write('\t')
140 w.write(tmpdt[1][i])
141 w.write('\n')
142 w.close()
143 print 'Writing Completed.'