5 * FUNCTION: initialize array will be used in the prob matrix
6 * PARAMETER: flag : whether need the rank sum , if flag =1 means do the rank sum ,else don't
8 Prob_matrix::Prob_matrix(bool flag
){
11 bool ran_sum_mode
= flag
;
12 p_matrix
= new rate_t
[256*256*4*4]; // 8bit: q_max, 8bit: read_len, 4bit: number of types of all mismatch/match 4x4
13 p_prior
= new rate_t
[8*4*4]; // 8(ref ACTGNNNN) * diploid(4x4)
14 base_freq
= new rate_t
[4]; // 4 base
15 type_likely
= new rate_t
[16+1]; //The 17th element rate_t[16] will be used in comparison
16 type_prob
= new rate_t
[16+1];
18 if (ran_sum_mode
== true)
20 p_rank
= new rate_t
[64*64*2048]; // 6bit: N; 5bit: n1; 11bit; T1
21 p_binom
= new rate_t
[256*256]; // Total * case
23 else // if -u argument is not exit , don't need to allocate memory for the rank sum process.
29 //Initialize the array .
30 for(i
=0;i
!=256*256*4*4;i
++)
44 type_likely
[i
] = 0.0; // LOG10 Scale
45 type_prob
[i
] = 0.0; // LOG10 Scale
49 //if ran_sum_mode == false , p_rank and p_binom is NULL ; else initialize .
50 if (ran_sum_mode
== true)
52 for(i
=0;i
!=64*64*2048;i
++)
56 for(i
=0;i
!=256*256;i
++)
65 * FUNCTION: delete the allocate memory .
66 * PARAMETER: flag : whether need the rank sum , if flag =1 means do the rank sum ,else don't
68 Prob_matrix::~Prob_matrix(){
69 delete [] p_matrix
; // 8bit: q_max, 8bit: read_len, 4bit: number of types of all mismatch/match 4x4
70 delete [] p_prior
; // 8(ref ACTGNNNN) * diploid(4x4)
71 delete [] base_freq
; // 4 base
72 delete [] type_likely
; //The 17th element rate_t[16] will be used in comparison
74 if (ran_sum_mode
== true)
76 delete [] p_rank
; // 6bit: N; 5bit: n1; 11bit; T1
77 delete [] p_binom
; // Total * case;
83 * FUNCTION: soap file generate correction matrix
84 * PARAMETER: alignment :soap file ; para ; genome :reference and dbsnp inforamtion
87 int Prob_matrix::matrix_gen(igzstream
& alignment
, Parameter
* para
, Genome
* genome
) {
88 // Read Alignment files
89 ubit64_t
* count_matrix
= new ubit64_t
[256*256*4*4];
90 memset(count_matrix
, 0, sizeof(ubit64_t
)*256*256*4*4);
92 map
<Chr_name
, Chr_info
*>::iterator current_chr
;
93 current_chr
= genome
->chromosomes
.end();
94 std::string::size_type coord
;
95 for(std::string line
; getline(alignment
, line
);) {
96 std::istringstream
ss(line
);
98 deal_reads(count_matrix
, genome
, soap
, current_chr
); //get count matrix
100 count_qual(count_matrix
, para
);
101 delete [] count_matrix
;
103 //memset(&p_matrix[((ubit64_t)para->q_max-para->q_min+1)<<12], 0, 256*256*4*4 - (para->q_max-para->q_min+1)*256*4*4);
104 // Note: from now on, the first 8 bit of p_matrix is its quality score, not the FASTQ char
108 int Prob_matrix::matrix_read(std::fstream
&mat_in
, Parameter
* para
)
111 std::string::size_type coord
;
112 for(std::string line
; getline(mat_in
, line
);)
114 std::istringstream
s(line
);
116 for(type
=0;type
!=16;++type
)
118 s
>>p_matrix
[ ((ubit64_t
)q_char
<<12) | (coord
<<4) | type
];
119 //cerr<<q_char<<"|"<<coord<<"|"<<p_matrix [ ((ubit64_t)q_char<<12) | (coord <<4) | type]<<endl;
126 int Prob_matrix::matrix_write(std::fstream
&mat_out
, Parameter
* para
)
128 for( char q_char
= para
->q_min
; q_char
<= para
->q_max
; q_char
++ )
130 for( std::string::size_type coord
=0; coord
!= para
->read_length
; coord
++)
132 mat_out
<<((ubit64_t
)q_char
-para
->q_min
)<<'\t'<<coord
;
133 for(char type
=0;type
!=16;type
++)
135 mat_out
<<'\t'<<scientific
<<showpoint
<<setprecision(16)<<p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | type
];
143 /* program by Bill */
144 int Prob_matrix::matrix_gen(SamCtrl
&alignment
, Parameter
* para
, Genome
* genome
) {
145 // Read Alignment files
147 ubit64_t
* count_matrix
= new ubit64_t
[256*256*4*4];
148 memset(count_matrix
, 0, sizeof(ubit64_t
)*256*256*4*4);
149 map
<Chr_name
, Chr_info
*>::iterator current_chr
;
150 current_chr
= genome
->chromosomes
.end();
154 while((r
= alignment
.readline(line
)) >=0) {
155 line
= alignment_format(line
);
156 if (line
== NOUSE_ALIGNMENT
)
158 std::istringstream
ss(line
);
160 deal_reads(count_matrix
, genome
, soap
, current_chr
);
163 count_qual(count_matrix
, para
);
164 delete [] count_matrix
;
166 //memset(&p_matrix[((ubit64_t)para->q_max-para->q_min+1)<<12], 0, 256*256*4*4 - (para->q_max-para->q_min+1)*256*4*4);
167 // Note: from now on, the first 8 bit of p_matrix is its quality score, not the FASTQ char
173 * FUNCTION: deal with the reads, get count matrix
174 * PARAMETER: count_matrix: is the point to the matrix. genome: is the point to the Genome.
175 soap: the soapformat to be deal with. current_chr: the current chr.
178 void Prob_matrix::deal_reads(ubit64_t
*count_matrix
, Genome
*genome
, Soap_format
&soap
, map
<Chr_name
, Chr_info
*>::iterator
¤t_chr
) {
179 std::string::size_type coord
;
181 if(soap
.get_pos() < 0) {
185 // In the overloaded "+" above, soap.position will be substracted by 1 so that coordiates start from 0
186 //current _chr is a new chromose
187 cerr
<< __FUNCTION__
<< __LINE__
<< "\tpos\t" << soap
.get_pos() << "\tsoap.get_read_len()\t" << soap
.get_read_len() << endl
;
188 if (current_chr
== genome
->chromosomes
.end() || current_chr
->first
!= soap
.get_chr_name()) {
189 current_chr
= genome
->chromosomes
.find(soap
.get_chr_name()); // get the current chromose name
190 if(current_chr
== genome
->chromosomes
.end()) {
191 for(map
<Chr_name
, Chr_info
*>::iterator test
= genome
->chromosomes
.begin();test
!= genome
->chromosomes
.end();test
++) {
192 cerr
<<'!'<<(test
->first
)<<'!'<<endl
;
194 cerr
<<"Assertion Failed: Chromosome: !"<<soap
.get_chr_name()<<"! NOT found"<<endl
;
201 //the end of this soap position is out of this chromose length
202 if (soap
.get_pos()+soap
.get_read_len() >= current_chr
->second
->length()) {
205 //only process the soap which best hit==1 .
206 if (soap
.is_unique()) {
207 for(coord
= 0; coord
!= soap
.get_read_len(); coord
++) {
208 if (soap
.is_N(coord
)) {
209 ; //the corrd soap is N ,do nothing
212 if(! (soap
.get_pos()+coord
<current_chr
->second
->length())) { //the position out of chromose length,it's error
214 cerr
<<"The program found the above read has exceed the reference length:\n";
215 cerr
<<"The read is aligned to postion: "<<soap
.get_pos()<<" with read length: "<<soap
.get_read_len()<<endl
;
216 cerr
<<"Reference: "<<current_chr
->first
<<" FASTA Length: "<<current_chr
->second
->length()<<endl
;
219 ref
= current_chr
->second
->get_bin_base(soap
.get_pos()+coord
); //get this position in the referece
220 if ( (ref
&12) !=0 ) {
221 // This is an N on reference or a dbSNP which should be excluded from calibration
225 if(soap
.is_fwd()) { //"+"
226 // forward strand, soap.get_qual : quality , coord :position in the reads,ref : reference genotype, soap.get_base : genotype in the reads
227 //allele type, quality score, coordinates on the read, obeserve genotype.
228 cerr
<< __FUNCTION__
<< __LINE__
<< "\tcoord\t" << coord
<< endl
;
229 count_matrix
[(((ubit64_t
)soap
.get_qual(coord
))<<12) | (coord
<<4) | ((ref
&0x3)<<2) | (soap
.get_base(coord
)>>1)&3] += 1;
230 cerr
<< __FUNCTION__
<< __LINE__
<< endl
;
234 count_matrix
[(((ubit64_t
)soap
.get_qual(coord
))<<12) | ((soap
.get_read_len()-1-coord
)<<4) | ((ref
&0x3)<<2) | (soap
.get_base(coord
)>>1)&3] += 1;
244 * FUNCTION: count the quality.
245 * PARAMETER: count_matrix: is the point to the matrix. para: is the point to the Parameter.
248 void Prob_matrix::count_qual(ubit64_t
*count_matrix
, Parameter
*para
) {
249 std::string::size_type coord
;
250 ubit64_t o_base
/*o_based base*/, t_base
/*theorecical(supposed) base*/, type
, sum
[4], same_qual_count_by_type
[16], same_qual_count_by_t_base
[4], same_qual_count_total
, same_qual_count_mismatch
;
251 char q_char
/*fastq quality char*/;
253 const ubit64_t sta_pow
=10; // minimum number to say statistically powerful
255 //the q_min default is 64, q_max is 64+40
256 for(q_char
=para
->q_min
; q_char
<=para
->q_max
;q_char
++)
258 memset(same_qual_count_by_type
, 0, sizeof(ubit64_t
)*16);
259 memset(same_qual_count_by_t_base
, 0, sizeof(ubit64_t
)*4);
260 same_qual_count_total
= 0;
261 same_qual_count_mismatch
= 0;
262 for(coord
=0; coord
!= para
->read_length
; coord
++) { //para->read_length is get from "-L"
263 for(type
=0;type
!=16;type
++) {//type is the 2 genotype from ACGT
264 // If the sample is small, then we will not consider the effect of read cycle.
265 //quality == q_char , coordinates on the read is the same,
266 same_qual_count_by_type
[type
] += count_matrix
[ ((ubit64_t
)q_char
<<12) | coord
<<4 | type
];
267 //same_qual_count_by_t_base[0]~[3],(type>>2)&3 is referce genotype
268 same_qual_count_by_t_base
[(type
>>2)&3] += count_matrix
[ ((ubit64_t
)q_char
<<12) | coord
<<4 | type
];
269 same_qual_count_total
+= count_matrix
[ ((ubit64_t
)q_char
<<12) | coord
<<4 | type
];
270 //cerr<<(int)type<<'\t'<<same_qual_count_by_type[type]<<'\t'<<same_qual_count_by_t_base[0]<<endl;
273 same_qual_count_mismatch
+= count_matrix
[ ((ubit64_t
)q_char
<<12) | coord
<<4 | type
];
277 for(coord
=0; coord
!= para
->read_length
; coord
++)
279 //cerr<<(q_char)<<'\t'<<coord;
280 memset(sum
, (ubit64_t
)0, sizeof(ubit64_t
)*4);
281 // Count of all ref base at certain coord and quality
282 for(type
=0;type
!=16;type
++) {
283 sum
[(type
>>2)&3] += count_matrix
[ ((ubit64_t
)q_char
<<12) | (coord
<<4) | type
]; // (type>>2)&3: the ref base
284 //cerr<<sum[type&3]<<endl;
286 for(t_base
=0; t_base
!=4; t_base
++) {
287 for(o_base
=0; o_base
!=4; o_base
++) {
288 //cerr<<'\t'<<count_matrix[ ((ubit64_t)q_char<<12) | (coord <<4) | (t_base<<2) | o_base];
289 if (count_matrix
[ ((ubit64_t
)q_char
<<12) | (coord
<<4) | (t_base
<<2) | o_base
] > sta_pow
) {
290 // Statistically powerful
291 p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] = ((double)count_matrix
[ ((ubit64_t
)q_char
<<12) | (coord
<<4) | (t_base
<<2) | o_base
]) / sum
[t_base
];
293 else if (same_qual_count_by_type
[t_base
<<2|o_base
] > sta_pow
) {
294 // Smaller sample, given up effect from read cycle
295 p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] = ((double)same_qual_count_by_type
[t_base
<<2|o_base
]) / same_qual_count_by_t_base
[t_base
];
297 else if (same_qual_count_total
> 0){
298 // Too small sample, given up effect of mismatch types
299 if (o_base
== t_base
) {
300 p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] = ((double)(same_qual_count_total
-same_qual_count_mismatch
))/same_qual_count_total
;
303 p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] = ((double)same_qual_count_mismatch
)/same_qual_count_total
;
310 // For these cases like:
311 // Ref: G o_base: G x10 Ax5. When calculate the probability of this allele to be A,
312 // If there's no A in reference gives observation of G, then the probability will be zero,
313 // And therefore exclude the possibility of this pos to have an A
314 // These cases should be avoid when the dataset is large enough
315 // If no base with certain quality is o_based, it also doesn't matter
316 if( (p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
]==0) || p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] ==1) {
317 if (o_base
== t_base
) {
318 p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] = (1-pow(10, -((q_char
-para
->q_min
)/10.0)));
319 if(p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
]<0.25) {
320 p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] = 0.25;
324 p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] = (pow(10, -((q_char
-para
->q_min
)/10.0))/3);
325 if(p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
]>0.25) {
326 p_matrix
[ ((ubit64_t
)(q_char
-para
->q_min
)<<12) | (coord
<<4) | (t_base
<<2) | o_base
] = 0.25;
333 //cerr<<'\t'<<same_qual_count_by_type[0]<<'\t'<<same_qual_count_by_t_base[0]<<endl;