2 ******************************************************************************
\r
7 *CREATE DATE : 2010-7-14
\r
9 *FUNCTION : definition of the useful functions
\r
10 *FILE NAME : tool.cpp
\r
11 *UPDATE DATE : 2010-8-3
\r
12 *UPDATE BY : Bill Tang
\r
13 UPDATE DATE : 2010-9-9
\r
14 *UPDATE BY : Bill Tang
\r
15 UPDATE DATE : 2010-9-15
\r
16 *UPDATE BY : Zhou Guyue
\r
17 *******************************************************************************
\r
25 using namespace std;
\r
27 /* integer to string*/
\r
28 std::string myitoa(int num) {
\r
29 char num_str[256] = {0};
\r
30 sprintf(num_str, "%d", num);
\r
34 // A function to spilt string s into vector vec according to char splitchar
\r
35 void StringSplit(std::string s, char splitchar, std::vector<std::string>& vec) {
\r
36 // promise the vec is empty
\r
40 while(s.size() > 0 && s[0] == splitchar)
\r
41 s = s.substr(1, s.length() - 1);
\r
42 while(s.size() > 0 && s[s.length() - 1] == splitchar)
\r
43 s = s.substr(0, s.length() - 1);
\r
44 int length = s.length();
\r
46 for(int i=0;i<length;i++)
\r
48 if(s[i] == splitchar)
\r
50 vec.push_back(s.substr(start,i - start));
\r
51 while(s[i + 1] == splitchar)
\r
55 else if(i == length-1)// attach last
\r
57 vec.push_back(s.substr(start,i+1 - start));
\r
62 /* count the indel length from the cigar string. return the indel length and the position*/
\r
63 int count_indel_len(const std::string cigar, int &pos) {
\r
64 const char *tmp = cigar.c_str();
\r
69 for (end = 0; end < cigar.size(); end ++) {
\r
73 pos += atoi(cigar.substr(begin, end - begin).c_str());
\r
79 //len = 200 + atoi(cigar.substr(begin, end - begin).c_str());
\r
80 len = 100 + atoi(cigar.substr(begin, end - begin).c_str());
\r
86 //len = 100 + atoi(cigar.substr(begin, end - begin).c_str());
\r
87 len = 200 + atoi(cigar.substr(begin, end - begin).c_str());
\r
93 pos += atoi(cigar.substr(begin, end - begin).c_str());
\r
99 pos += atoi(cigar.substr(begin, end - begin).c_str());
\r
105 pos += atoi(cigar.substr(begin, end - begin).c_str());
\r
111 pos += atoi(cigar.substr(begin, end - begin).c_str());
\r
124 /* format the sam text to the soap text*/
\r
125 std::string alignment_format(const std::string &sam_ali) {
\r
126 if(sam_ali.empty()) {
\r
127 return NOUSE_ALIGNMENT;
\r
129 std::vector<std::string> vec;
\r
130 StringSplit(sam_ali, '\t', vec);
\r
131 if ((vec.size() < 11) || (vec[5] == "*"))
\r
132 return NOUSE_ALIGNMENT;
\r
134 std::string format;
\r
137 int flag = atoi(vec[1].c_str());
\r
139 int clip_num = count_soft_clip(vec[5], last_clip_num);
\r
141 if (clip_num + last_clip_num > vec[9].size())
\r
142 return NOUSE_ALIGNMENT;
\r
144 if (flag & (0x1 << 6)) {
\r
145 format = vec[0] + "/1" + "\t"; // add query name
\r
146 } else if (flag & (0x1 << 7)) {
\r
147 format = vec[0] + "/2" + "\t"; // add query name
\r
149 format = vec[0] + "\t"; // add query name
\r
152 format += vec[9].substr(clip_num, vec[9].size() - last_clip_num - clip_num) + "\t"; // add sequence
\r
153 format += vec[10].substr(clip_num, vec[10].size() - last_clip_num - clip_num) + "\t"; // add quality
\r
155 for (int i = 11; i < vec.size(); i++) {
\r
156 if (vec[i].find(HIT_COUNT_H0) != std::string::npos) {
\r
157 best_hit = atoi(vec[i].substr(5).c_str());
\r
159 } else if (vec[i].find(HIT_COUNT_X0) != std::string::npos) {
\r
160 best_hit = atoi(vec[i].substr(5).c_str());
\r
164 format += myitoa(best_hit) + "\t"; // add number of best hit.
\r
166 format += ((flag & (0x1 << 6)) ? "a" : "b"); // add a/b
\r
168 format += myitoa(vec[9].size() - clip_num - last_clip_num); // add length
\r
170 format += ((flag & (0x1 << 4)) ? "-" : "+"); // add strand
\r
172 format += vec[2] + "\t"; // add chr
\r
173 format += vec[3] + "\t"; // add location
\r
174 format += myitoa(count_indel_len(vec[5], pos)); // add number of mismatch
\r
176 format += myitoa(pos - clip_num); // add indel position
\r
177 //format += myitoa(pos) + "\t"; // add indel position. changed at 2010-11-22, to compate to the new version soap.
\r
178 //format += vec[5];
\r
184 * FUNCTION: count the soft clip number at the read's beginning.
\r
185 * PARAMETER: cigar: the cigar string. last_S: last soft clip's length.
\r
186 * RETURN: soft clip number.
\r
188 int count_soft_clip(const std::string cigar, int &last_S)
\r
190 const char *tmp = cigar.c_str();
\r
192 // count the last soft clip number.
\r
193 if (tmp[cigar.size() - 1] == 'S')
\r
195 int j = cigar.size() - 2;
\r
196 while (tmp[j] >= '0' && tmp[j] <= '9')
\r
200 last_S = atoi(cigar.substr(j + 1, cigar.size() - j - 1).c_str());
\r
204 for (; i < cigar.size(); ++i)
\r
206 if (tmp[i] >= '0' && tmp[i] <= '9')
\r
208 // jump to the next position.
\r
211 else if (tmp[i] == 'S')
\r
213 // return the soft clip number.
\r
214 int num = atoi(cigar.substr(0, i).c_str());
\r
219 // no soft clip infront of the reads.
\r
228 * FUNCTION: change string to bool
\r
229 * PARAMETER: str: a string which will change to bool. len: the str length.
\r
230 * RETURN: bool type of str
\r
232 int str_bool(const char* str)
\r
246 * FUNCTION: intialize sfs parament
\r
247 * PARAMETER: sfs_path: sfs configuration file path. sfs: a sfs_para struct.
\r
248 * RETURN: SFS_PARA_ERROR:if sfs parament is wrong ,SFS_PARA_SUCC : if sfs parament is right.
\r
249 * UPDATE: 2010-10-15
\r
251 int init_sfs_para(const string sfs_path, SFS_PARA *sfs )
\r
255 /*set default vaule for sfs_par struct*/
\r
256 sfs->allow_PathZ = 0;
\r
262 sfs->outfiles = "";
\r
263 sfs->pThres = 0.01;
\r
267 sfs->start_pos = 0;
\r
271 sfs->alternative = 0;
\r
272 sfs->jointFold = 1;
\r
274 //update 11-29 for control the column
\r
275 sfs->sfs_first_last = 0 ;
\r
277 my_ifstream sfsfile(sfs_path.c_str()); //open sfs configuration file
\r
279 if (!sfsfile.is_open()) //can not open sfsfile
\r
281 cerr << "Cannot open file:" << sfs_path << endl;
\r
282 return SFS_PARA_ERROR;
\r
285 stringstream temp_ss;
\r
286 while (sfsfile >> in)
\r
295 sfs->doBay = str_bool(in.c_str());
\r
298 else if (in == "doJoint")
\r
301 sfs->doJoint = str_bool(in.c_str());
\r
304 else if (in == "writeFr")
\r
307 sfs->writeFr = str_bool(in.c_str());
\r
310 else if (strcmp(in.c_str() , "outfiles") == 0)
\r
312 sfsfile >> sfs->outfiles;
\r
315 else if (in == "start")
\r
319 temp_ss >> sfs->start_pos;
\r
323 else if (in == "stop")
\r
327 temp_ss >> sfs->stop_pos;
\r
331 else if (in == "underFlowProtect")
\r
334 sfs->under_FP = str_bool(in.c_str());
\r
337 else if (in == "allowPhatZero")
\r
340 sfs->allow_PathZ = str_bool(in.c_str());
\r
343 else if ( in == "bias")
\r
347 temp_ss >> sfs->bias;
\r
351 else if (in == "pvar")
\r
355 temp_ss >> sfs->pvar;
\r
359 else if (in == "eps")
\r
363 temp_ss >> sfs->eps;
\r
367 else if (in == "pThres")
\r
371 temp_ss >> sfs->pThres;
\r
375 else if (in == "sigleMinorJoint")
\r
378 sfs->sigle_MJ = str_bool(in.c_str());
\r
381 else if (in == "sigleMinorBay" )
\r
384 sfs->sigle_MB = str_bool(in.c_str());
\r
387 else if (in == "alt" )
\r
390 sfs->alternative = atoi(in.c_str());
\r
393 else if (in == "qs" )
\r
395 sfsfile >> sfs->qs;
\r
398 //else if (in == "sfsfirstlast")
\r
399 else if ( in == "lightoutput")
\r
402 sfs->sfs_first_last = str_bool(in.c_str());
\r
408 return SFS_PARA_ERROR;
\r
413 return SFS_PARA_SUCC;
\r
416 /*sfs help information*/
\r
419 * FUNCTION: return help information
\r
425 cerr << "\t-> (outfiles) -outfiles \n" << endl;
\r
426 cerr << "\t-> (which analysis) -doBay -doJoint\n" << endl;
\r
427 cerr << "\t-> (optional variables) -eps -bias -pThres -underFlowProtect\n" << endl;
\r
428 cerr << "\t-> (optional variables) -start -stop \t(format is chromo:position)\n" << endl;
\r
429 cerr << "\t-> (optional variables) -singleMinor[Joint,Bay] [0 or 1] should we only use a singleminor?\n" << endl;
\r
430 cerr << "\t-> (optional variables) -sfsfirstlast [0 or 1] should we only output sfs the first column and the last column \n" <<endl;
\r