modified: myjupyterlab.sh
[GalaxyCodeBases.git] / BGI / SOAPsnp / matrix.cc
blobb043130781e374089c595db861f1042d9bbdd6d8
1 #include "soap_snp.h"
3 /**
4 * DATE:
5 * FUNCTION: initialize array will be used in the prob matrix
6 * PARAMETER: flag : whether need the rank sum , if flag =1 means do the rank sum ,else don't
7 */
8 Prob_matrix::Prob_matrix(bool flag){
9 int i;
10 //update 11-11
11 bool ran_sum_mode = flag;
12 p_matrix = new rate_t [256*256*4*4]; // 8bit: q_max, 8bit: read_len, 4bit: number of types of all mismatch/match 4x4
13 p_prior = new rate_t [8*4*4]; // 8(ref ACTGNNNN) * diploid(4x4)
14 base_freq = new rate_t [4]; // 4 base
15 type_likely = new rate_t [16+1]; //The 17th element rate_t[16] will be used in comparison
16 type_prob = new rate_t [16+1];
17 //update 11-11
18 if (ran_sum_mode == true)
20 p_rank = new rate_t [64*64*2048]; // 6bit: N; 5bit: n1; 11bit; T1
21 p_binom = new rate_t [256*256]; // Total * case
23 else // if -u argument is not exit , don't need to allocate memory for the rank sum process.
25 p_rank = NULL;
26 p_binom = NULL;
29 //Initialize the array .
30 for(i=0;i!=256*256*4*4;i++)
32 p_matrix[i] = 1.0;
34 for(i=0;i!=8*4*4;i++)
36 p_prior[i] = 1.0;
38 for(i=0;i!=4;i++)
40 base_freq[i] = 1.0;
42 for(i=0;i!=16+1;i++)
44 type_likely[i] = 0.0; // LOG10 Scale
45 type_prob[i] = 0.0; // LOG10 Scale
48 //update 11-11
49 //if ran_sum_mode == false , p_rank and p_binom is NULL ; else initialize .
50 if (ran_sum_mode == true)
52 for(i=0;i!=64*64*2048;i++)
54 p_rank[i] = 1.0;
56 for(i=0;i!=256*256;i++)
58 p_binom[i] = 1.0;
63 /**
64 * DATE:
65 * FUNCTION: delete the allocate memory .
66 * PARAMETER: flag : whether need the rank sum , if flag =1 means do the rank sum ,else don't
68 Prob_matrix::~Prob_matrix(){
69 delete [] p_matrix; // 8bit: q_max, 8bit: read_len, 4bit: number of types of all mismatch/match 4x4
70 delete [] p_prior; // 8(ref ACTGNNNN) * diploid(4x4)
71 delete [] base_freq; // 4 base
72 delete [] type_likely; //The 17th element rate_t[16] will be used in comparison
73 delete [] type_prob;
74 if (ran_sum_mode == true)
76 delete [] p_rank; // 6bit: N; 5bit: n1; 11bit; T1
77 delete [] p_binom; // Total * case;
81 /**
82 * DATE:
83 * FUNCTION: soap file generate correction matrix
84 * PARAMETER: alignment :soap file ; para ; genome :reference and dbsnp inforamtion
85 * RETURN :
87 int Prob_matrix::matrix_gen(igzstream & alignment, Parameter * para, Genome * genome) {
88 // Read Alignment files
89 ubit64_t * count_matrix = new ubit64_t [256*256*4*4];
90 memset(count_matrix, 0, sizeof(ubit64_t)*256*256*4*4);
91 Soap_format soap;
92 map<Chr_name, Chr_info*>::iterator current_chr;
93 current_chr = genome->chromosomes.end();
94 std::string::size_type coord;
95 for(std::string line; getline(alignment, line);) {
96 std::istringstream ss(line);
97 if (ss >> soap)
98 deal_reads(count_matrix, genome, soap, current_chr); //get count matrix
100 count_qual(count_matrix, para);
101 delete [] count_matrix;
103 //memset(&p_matrix[((ubit64_t)para->q_max-para->q_min+1)<<12], 0, 256*256*4*4 - (para->q_max-para->q_min+1)*256*4*4);
104 // Note: from now on, the first 8 bit of p_matrix is its quality score, not the FASTQ char
105 return 1;
108 int Prob_matrix::matrix_read(std::fstream &mat_in, Parameter * para)
110 int q_char, type;
111 std::string::size_type coord;
112 for(std::string line; getline(mat_in, line);)
114 std::istringstream s(line);
115 s>>q_char>>coord;
116 for(type=0;type!=16;++type)
118 s>>p_matrix [ ((ubit64_t)q_char<<12) | (coord <<4) | type];
119 //cerr<<q_char<<"|"<<coord<<"|"<<p_matrix [ ((ubit64_t)q_char<<12) | (coord <<4) | type]<<endl;
120 //exit(0);
123 return 1;
126 int Prob_matrix::matrix_write(std::fstream &mat_out, Parameter * para)
128 for( char q_char = para->q_min; q_char <= para->q_max; q_char++ )
130 for( std::string::size_type coord=0; coord != para->read_length; coord++)
132 mat_out<<((ubit64_t)q_char-para->q_min)<<'\t'<<coord;
133 for(char type=0;type!=16;type++)
135 mat_out<<'\t'<<scientific<<showpoint<<setprecision(16)<<p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | type];
137 mat_out<<endl;
140 return 1;
143 /* program by Bill */
144 int Prob_matrix::matrix_gen(SamCtrl &alignment, Parameter * para, Genome * genome) {
145 // Read Alignment files
146 Soap_format soap;
147 ubit64_t * count_matrix = new ubit64_t [256*256*4*4];
148 memset(count_matrix, 0, sizeof(ubit64_t)*256*256*4*4);
149 map<Chr_name, Chr_info*>::iterator current_chr;
150 current_chr = genome->chromosomes.end();
151 int r;
152 std::string line;
154 while((r = alignment.readline(line)) >=0) {
155 line = alignment_format(line);
156 if (line == NOUSE_ALIGNMENT)
157 continue;
158 std::istringstream ss(line);
159 if (ss >> soap)
160 deal_reads(count_matrix, genome, soap, current_chr);
163 count_qual(count_matrix, para);
164 delete [] count_matrix;
166 //memset(&p_matrix[((ubit64_t)para->q_max-para->q_min+1)<<12], 0, 256*256*4*4 - (para->q_max-para->q_min+1)*256*4*4);
167 // Note: from now on, the first 8 bit of p_matrix is its quality score, not the FASTQ char
168 return 1;
172 * DATE: 2010-8-5
173 * FUNCTION: deal with the reads, get count matrix
174 * PARAMETER: count_matrix: is the point to the matrix. genome: is the point to the Genome.
175 soap: the soapformat to be deal with. current_chr: the current chr.
176 * RETURN: void
178 void Prob_matrix::deal_reads(ubit64_t *count_matrix, Genome *genome, Soap_format &soap, map<Chr_name, Chr_info*>::iterator &current_chr) {
179 std::string::size_type coord;
180 ubit64_t ref(0);
181 if(soap.get_pos() < 0) {
182 return;
184 //cerr<<soap<<endl;
185 // In the overloaded "+" above, soap.position will be substracted by 1 so that coordiates start from 0
186 //current _chr is a new chromose
187 cerr << __FUNCTION__ << __LINE__ << "\tpos\t" << soap.get_pos() << "\tsoap.get_read_len()\t" << soap.get_read_len() << endl;
188 if (current_chr == genome->chromosomes.end() || current_chr->first != soap.get_chr_name()) {
189 current_chr = genome->chromosomes.find(soap.get_chr_name()); // get the current chromose name
190 if(current_chr == genome->chromosomes.end()) {
191 for(map<Chr_name, Chr_info*>::iterator test = genome->chromosomes.begin();test != genome->chromosomes.end();test++) {
192 cerr<<'!'<<(test->first)<<'!'<<endl;
194 cerr<<"Assertion Failed: Chromosome: !"<<soap.get_chr_name()<<"! NOT found"<<endl;
195 exit(255);
198 else {
201 //the end of this soap position is out of this chromose length
202 if (soap.get_pos()+soap.get_read_len() >= current_chr->second->length()) {
203 return;
205 //only process the soap which best hit==1 .
206 if (soap.is_unique()) {
207 for(coord = 0; coord != soap.get_read_len(); coord++) {
208 if (soap.is_N(coord)) {
209 ; //the corrd soap is N ,do nothing
211 else {
212 if(! (soap.get_pos()+coord<current_chr->second->length())) { //the position out of chromose length,it's error
213 cerr<<soap<<endl;
214 cerr<<"The program found the above read has exceed the reference length:\n";
215 cerr<<"The read is aligned to postion: "<<soap.get_pos()<<" with read length: "<<soap.get_read_len()<<endl;
216 cerr<<"Reference: "<<current_chr->first<<" FASTA Length: "<<current_chr->second->length()<<endl;
217 exit(255);
219 ref = current_chr->second->get_bin_base(soap.get_pos()+coord); //get this position in the referece
220 if ( (ref&12) !=0 ) {
221 // This is an N on reference or a dbSNP which should be excluded from calibration
224 else {
225 if(soap.is_fwd()) { //"+"
226 // forward strand, soap.get_qual : quality , coord :position in the reads,ref : reference genotype, soap.get_base : genotype in the reads
227 //allele type, quality score, coordinates on the read, obeserve genotype.
228 cerr << __FUNCTION__ << __LINE__ << "\tcoord\t" << coord << endl;
229 count_matrix[(((ubit64_t)soap.get_qual(coord))<<12) | (coord<<4) | ((ref&0x3)<<2) | (soap.get_base(coord)>>1)&3] += 1;
230 cerr << __FUNCTION__ << __LINE__ << endl;
232 else {//"-"
233 // reverse strand
234 count_matrix[(((ubit64_t)soap.get_qual(coord))<<12) | ((soap.get_read_len()-1-coord)<<4) | ((ref&0x3)<<2) | (soap.get_base(coord)>>1)&3] += 1;
243 * DATE: 2010-8-5
244 * FUNCTION: count the quality.
245 * PARAMETER: count_matrix: is the point to the matrix. para: is the point to the Parameter.
246 * RETURN: void
248 void Prob_matrix::count_qual(ubit64_t *count_matrix, Parameter *para) {
249 std::string::size_type coord;
250 ubit64_t o_base/*o_based base*/, t_base/*theorecical(supposed) base*/, type, sum[4], same_qual_count_by_type[16], same_qual_count_by_t_base[4], same_qual_count_total, same_qual_count_mismatch;
251 char q_char/*fastq quality char*/;
253 const ubit64_t sta_pow=10; // minimum number to say statistically powerful
255 //the q_min default is 64, q_max is 64+40
256 for(q_char=para->q_min; q_char<=para->q_max ;q_char++)
258 memset(same_qual_count_by_type, 0, sizeof(ubit64_t)*16);
259 memset(same_qual_count_by_t_base, 0, sizeof(ubit64_t)*4);
260 same_qual_count_total = 0;
261 same_qual_count_mismatch = 0;
262 for(coord=0; coord != para->read_length ; coord++) { //para->read_length is get from "-L"
263 for(type=0;type!=16;type++) {//type is the 2 genotype from ACGT
264 // If the sample is small, then we will not consider the effect of read cycle.
265 //quality == q_char , coordinates on the read is the same,
266 same_qual_count_by_type[type] += count_matrix[ ((ubit64_t)q_char<<12) | coord <<4 | type];
267 //same_qual_count_by_t_base[0]~[3],(type>>2)&3 is referce genotype
268 same_qual_count_by_t_base[(type>>2)&3] += count_matrix[ ((ubit64_t)q_char<<12) | coord <<4 | type];
269 same_qual_count_total += count_matrix[ ((ubit64_t)q_char<<12) | coord <<4 | type];
270 //cerr<<(int)type<<'\t'<<same_qual_count_by_type[type]<<'\t'<<same_qual_count_by_t_base[0]<<endl;
271 if(type % 5 != 0) {
272 // Mismatches
273 same_qual_count_mismatch += count_matrix[ ((ubit64_t)q_char<<12) | coord <<4 | type];
277 for(coord=0; coord != para->read_length ; coord++)
279 //cerr<<(q_char)<<'\t'<<coord;
280 memset(sum, (ubit64_t)0, sizeof(ubit64_t)*4);
281 // Count of all ref base at certain coord and quality
282 for(type=0;type!=16;type++) {
283 sum[(type>>2)&3] += count_matrix[ ((ubit64_t)q_char<<12) | (coord <<4) | type]; // (type>>2)&3: the ref base
284 //cerr<<sum[type&3]<<endl;
286 for(t_base=0; t_base!=4; t_base++) {
287 for(o_base=0; o_base!=4; o_base++) {
288 //cerr<<'\t'<<count_matrix[ ((ubit64_t)q_char<<12) | (coord <<4) | (t_base<<2) | o_base];
289 if (count_matrix[ ((ubit64_t)q_char<<12) | (coord <<4) | (t_base<<2) | o_base] > sta_pow) {
290 // Statistically powerful
291 p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = ((double)count_matrix[ ((ubit64_t)q_char<<12) | (coord <<4) | (t_base<<2) | o_base]) / sum[t_base];
293 else if (same_qual_count_by_type[t_base<<2|o_base] > sta_pow) {
294 // Smaller sample, given up effect from read cycle
295 p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = ((double)same_qual_count_by_type[t_base<<2|o_base]) / same_qual_count_by_t_base[t_base];
297 else if (same_qual_count_total > 0){
298 // Too small sample, given up effect of mismatch types
299 if (o_base == t_base) {
300 p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = ((double)(same_qual_count_total-same_qual_count_mismatch))/same_qual_count_total;
302 else {
303 p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = ((double)same_qual_count_mismatch)/same_qual_count_total;
306 else {
310 // For these cases like:
311 // Ref: G o_base: G x10 Ax5. When calculate the probability of this allele to be A,
312 // If there's no A in reference gives observation of G, then the probability will be zero,
313 // And therefore exclude the possibility of this pos to have an A
314 // These cases should be avoid when the dataset is large enough
315 // If no base with certain quality is o_based, it also doesn't matter
316 if( (p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base]==0) || p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] ==1) {
317 if (o_base == t_base) {
318 p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = (1-pow(10, -((q_char-para->q_min)/10.0)));
319 if(p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base]<0.25) {
320 p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = 0.25;
323 else {
324 p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = (pow(10, -((q_char-para->q_min)/10.0))/3);
325 if(p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base]>0.25) {
326 p_matrix [ ((ubit64_t)(q_char-para->q_min)<<12) | (coord <<4) | (t_base<<2) | o_base] = 0.25;
333 //cerr<<'\t'<<same_qual_count_by_type[0]<<'\t'<<same_qual_count_by_t_base[0]<<endl;