modified: SpatialOmicsCoord.py
[GalaxyCodeBases.git] / BGI / SOAPsnp / FileListManager.cpp
blob1f5d3750f33ab6b796922e1fd7271c799f48a65f
1 /*
2 ******************************************************************************
3 *Copyright 2010
4 * BGI-SHENZHEN
5 *All Rights Reserved
6 *AUTHOR : Bill Tang
7 *CREATE DATE : 2010-8-9
8 *CLASS NAME: FileListManager
9 *FUNCTION : a class that can control file list. support open, readline, close etc
10 * file functions.
11 *FILE NAME : FileListManager.cpp
12 *UPDATE DATE : 2010-8-19
13 *UPDATE BY : Bill Tang
14 *UPDATE DATE : 2010-9-3
15 *UPDATE BY : Bill Tang
17 *UPDATE: 2010-9-3 add class member m_endfile_count, change function readWin,
18 * readWinFromSoap and readWinFromSam.
19 * Make it can judge the files number that reach tail.
20 *UPDATE: 2010-10-4 change function closeAliFiles so that it also closes the matrix files and deletes all objects.
21 *UPDATE: 2010-10-8 change function openCnsFile: add the function that if don't need output consensus, create empty objects.
22 *UPDATE: 2010-11-2 change the m_soap_file_vec's type, to vector<*igzstream>.
23 change the interface which is related to soap file.
24 *UPDATE BY : Bill Tang
25 *UPDATE: 2010-11-15 add the sem's control function.
26 *******************************************************************************
29 #include "FileListManager.h"
30 #include "SamCtrl.h"
31 #include <sys/resource.h>
34 FileListManager::FileListManager()
35 : m_stop_pos(0)
36 , m_endfile_count(0)
38 m_soap_file_vec.clear();
39 m_sam_file_vec.clear();
40 m_cns_file_vec.clear();
41 m_file_number = 0;
42 m_base_file.clear();
47 FileListManager::~FileListManager()
49 /*delete f_output stream*/
50 for (vector<gzoutstream*>::iterator iter = m_cns_file_vec.begin();
51 iter != m_cns_file_vec.end();
52 ++iter)
54 delete *iter;
57 /*delete f_input stream*/
58 for (vector<igzstream*>::iterator iter = m_soap_file_vec.begin();
59 iter != m_soap_file_vec.end();
60 ++iter)
62 delete *iter;
66 for (vector<SamCtrl*>::iterator iter = m_sam_file_vec.begin();
67 iter != m_sam_file_vec.end();
68 ++iter)
70 delete *iter;
73 for (vector<fstream*>::iterator iter = m_matrix_file_vec.begin();
74 iter != m_matrix_file_vec.end();
75 ++iter)
77 delete *iter;
82 /**
83 * DATE: 2010-8-9
84 * FUNCTION: open output file function.
85 * PARAMETER: path: the output consensus files path. mode: the open mode.
86 * RETURN: the number of consensus files that be opened.
88 int FileListManager::openCnsFile(const string& path, const string& outdir, const char* mode, Parameter * para)
90 string cns_Path, cns_name; //the one consensus file name
91 gzoutstream* f_output;
93 // if don't need to out put consensus, then create empty objects.
94 if (outdir == "")
96 for (int i = 0; i < m_file_number; ++i)
98 // create new ofstream object.update by zhukai on 2010-12-09
99 if(para->is_cns_gz)
101 f_output = new myogzstream();
103 else
105 f_output = new myofstream();
107 m_cns_file_vec.push_back(f_output);
109 return m_cns_file_vec.size();
112 m_base_file.open(path.c_str()); //open filelist
113 if (!m_base_file) //can not open filelist
115 cerr << "Cannot open file:" << path << endl;
116 return BASEFILE_ERROR;
119 string cmd = "mkdir -p " + outdir;
120 system(cmd.c_str());
122 /*open each cnsfile and push in cns vector*/
123 while (getline(m_base_file, cns_Path))
125 //f_output = new gzoutstream();
127 /*find cons name from cns path*/
128 int pos = cns_Path.rfind('/');
130 //update by zhukai on 2010-12-09
131 if(para->is_cns_gz)
133 f_output = new myogzstream();
134 cns_name = outdir + cns_Path.substr(pos, string::npos) + ".consus.gz";
136 else
138 f_output = new myofstream();
139 cns_name = outdir + cns_Path.substr(pos, string::npos) + ".consus";
144 /*put cns in to cns vector*/
145 f_output->open(cns_name.c_str(),std::ios::out);
147 if (!f_output->is_open())
149 cerr << "Cannot open file:" << cns_name << endl;
150 return CNSFILE_ERROR;
153 // set the output format. 2010-12-15 Bill
154 f_output->set_f(std::ios::showpoint);
155 m_cns_file_vec.push_back(f_output);
158 m_base_file.close(); //close consensus files list
160 return m_cns_file_vec.size();
165 * DATE: 2010-8-9
166 * FUNCTION: open all the file in the file list
167 * PARAMETER: listfilename: the path of file that contain list of infile. mode: the open mode.
168 * RETURN:
170 int FileListManager::openAli(const string& listfilename,const char* mode)
172 if (mode[0] == 'r') //is sam/bam file
174 return openSamFile(listfilename,mode); //open sam/bam file ,return file number
176 else
178 //cerr << "open soap file" <<endl;
179 return openSoapFile(listfilename); //open soap file , return file number
185 * DATE: 2010-8-30
186 * FUNCTION: open sam/bam file
187 * PARAMETER: mode: the open mode.
188 * RETURN: BASEFILE_ERROR when cannot open file list. SAMFILE_ERROR when cannot open sam file.
190 int FileListManager::openSamFile(const string& listfilename, const char* mode)
192 string filepath;
193 SamCtrl *sctrl;
195 m_base_file.open(listfilename.c_str()); //open sam/bam files list
196 if (!m_base_file)
198 cerr << "Cannot open file:" << listfilename << endl;
199 return BASEFILE_ERROR;
202 /*open each sam/bam file in the filelist and input sam vector*/
203 while (m_base_file >> filepath)
205 /*set max limit */
206 // if(setlimit(m_sam_file_vec.size()))
209 sctrl = new SamCtrl();
210 if (!(sctrl->open(filepath.c_str(), mode)))
212 cerr << "Cannot open file:" << filepath << endl;
213 return SAMFILE_ERROR;
215 m_sam_file_vec.push_back(sctrl);
217 // else
219 // return SETLIMIT_ERROR;
222 m_file_number = m_sam_file_vec.size(); // bam/sam file number in file list
223 m_base_file.close(); //close sam/bam files list
225 return m_file_number;
230 * DATE: 2010-8-30
231 * FUNCTION: open soap format file.
232 * PARAMETER: void.
233 * RETURN: BASEFILE_ERROR when cannot open file list. SAMFILE_ERROR when cannot open soap file.
235 int FileListManager::openSoapFile(const string& listfilename)
237 string filepath;
238 igzstream* f_input;
240 m_base_file.open(listfilename.c_str()); //open soap filelist
241 if (!m_base_file)
243 cerr<<"Cannot open file:" <<listfilename<<endl;
244 return BASEFILE_ERROR;
246 /*open each soap file in the filelist and input soap vector*/
247 while (m_base_file >> filepath)
249 /*set max limit */
250 //if ( setlimit( m_soap_file_vec.size() ) )
252 f_input = new igzstream();
253 f_input->open(filepath.c_str()) ;
254 if (!(*f_input))
256 cerr << "Cannot open file:" << filepath << endl;
257 return SOAPFILE_ERROR;
259 m_soap_file_vec.push_back(f_input);
260 f_input->clear();
262 //else
264 // return SETLIMIT_ERROR;
268 m_file_number = m_soap_file_vec.size(); // soap file number in file list
269 m_base_file.close(); //close soap listfile
271 return m_file_number;
276 * DATE: 2010-8-9
277 * FUNCTION: read a line from the *vec[index]
278 * PARAMETER: line: the read buffer. index: the index of file handles vec.
279 * RETURN: the length of string line.
281 int FileListManager::readLine(string& line, const int index)
283 return 0;
288 * DATE: 2010-8-9
289 * FUNCTION: close all the files
290 * PARAMETER: void.
291 * RETURN: void
292 * UPDATE ON 2010-10-4
293 * delete the vector object and add delete m_matrix_file_vec.
295 void FileListManager::closeAliFiles(void)
297 int vec_size = 0;
299 /*close soap files*/
300 while (vec_size < m_soap_file_vec.size())
302 m_soap_file_vec[vec_size]->close();
303 delete m_soap_file_vec[vec_size];
304 vec_size++;
307 vec_size = 0;
308 /*close sam/bam files*/
309 while(vec_size < m_sam_file_vec.size())
311 m_sam_file_vec[vec_size]->close();
312 delete m_sam_file_vec[vec_size];
313 vec_size++;
316 vec_size = 0;
317 /*close sam/bam files*/
318 while(vec_size < m_matrix_file_vec.size())
320 m_matrix_file_vec[vec_size]->close();
321 delete m_matrix_file_vec[vec_size];
322 vec_size++;
325 m_soap_file_vec.clear();
326 m_sam_file_vec.clear();
327 m_matrix_file_vec.clear();
331 * DATE: 2010-8-9
332 * FUNCTION: close all the files
333 * PARAMETER: void.
334 * RETURN: void
336 void FileListManager::closeCnsFiles(void)
338 int vec_size=0;
339 /*close cns file*/
340 while (vec_size < m_cns_file_vec.size())
342 m_cns_file_vec[vec_size]->close();
343 vec_size++;
350 * DATE: 2010-8-9
351 * FUNCTION: read lines to the readwin_vec. The lines' start position must inside a win size.
352 * When the lines' end position exceed the win,put it to the next win.
353 * PARAMETER: readwin_vec: the vector of Readwin objs. storing the info of every readwin.
354 * RETURN: COUNT_ERROR when readwin_vec.size() not equal to the files' number.
356 int FileListManager::readWin(vector<Readwin>& readwin_vec, Parameter * para)
358 //CThreadPool threadpool(para->CPU);
359 if (readwin_vec.size() != m_cns_file_vec.size())
361 // file size not right.
362 return COUNT_ERROR;
365 // get signal pointers.
366 sem_t * sem_read_p = &(para->sem_read);
367 sem_t * sem_call_cns_p = &(para->sem_call_cns);
368 sem_t * sem_readwin_return_p = &(para->sem_readwin_return);
370 ////CThreadPool threadpool(cpu);
371 //READ_WIN_ARGS * read_win_args = NULL;
372 //Read_win_Task * rw_Task = NULL;
373 //// two vector used to release resource.
374 //vector<READ_WIN_ARGS*> read_win_args_vec ;
375 //vector<Read_win_Task*> rw_Task_vec ;
376 int ret = 0;
377 while(1)
379 // waiting for the signal.
380 sem_wait(sem_read_p);
381 sem_wait(sem_readwin_return_p);
383 if (m_soap_file_vec.size() == readwin_vec.size())
385 for (int i = 0; i < readwin_vec.size(); ++i)
387 // read records from soap file to the win.
388 ret = readWinFromSoap(readwin_vec[i], *m_soap_file_vec[i]);
390 if (COME_NEW_CHR == ret)
392 para->ret = COME_NEW_CHR;
393 goto next_cycle;
394 //return COME_NEW_CHR;
396 else if (FILE_END == ret)
398 m_endfile_count ++;
402 // int *ret = new int[m_file_number];
403 // memset(ret, 0, sizeof(int) * m_file_number);
404 // for (int i = 0; i < readwin_vec.size(); ++i)
405 // {
406 // // read records from soap file to the win.
407 // read_win_args = new READ_WIN_ARGS(&readwin_vec[i], m_soap_file_vec[i],NULL, ret, i);
408 // read_win_args_vec.push_back(read_win_args);
409 // rw_Task = new Read_win_Task();
410 // rw_Task->SetData(read_win_args);
411 // rw_Task_vec.push_back(rw_Task);
412 // threadpool.AddTask(rw_Task);
413 // }
414 // while (threadpool.getTaskSize() != 0)
415 // {
416 // usleep(1);
417 // }
418 // //clear vector
419 // for (int j = 0; j < rw_Task_vec.size(); ++j)
420 // {
421 // delete read_win_args_vec[j];
422 // delete rw_Task_vec[j];
423 // }
424 // read_win_args_vec.clear();
425 // rw_Task_vec.clear();
426 ////cerr << __FUNCTION__ << __LINE__ << endl;
428 // for (int i = 0; i < readwin_vec.size(); ++i)
429 // {
430 // if (COME_NEW_CHR == ret[i])
431 // {
432 // para->ret = COME_NEW_CHR;
433 // //return COME_NEW_CHR;
434 // }
435 // else if (FILE_END == ret[i])
436 // {
437 // m_endfile_count ++;
438 // }
439 // }
440 ////cerr << __FUNCTION__ << __LINE__ << endl;
441 // delete [] ret;
443 else if (m_sam_file_vec.size() == readwin_vec.size())
445 for (int i = 0; i < readwin_vec.size(); ++i)
447 // read records from soap file to the win.
448 ret = readWinFromSam(readwin_vec[i], *m_sam_file_vec[i]);
449 if (COME_NEW_CHR == ret)
451 para->ret = COME_NEW_CHR;
452 goto next_cycle;
453 //return COME_NEW_CHR;
455 else if (FILE_END == ret)
457 m_endfile_count ++;
460 //int *ret = new int[m_file_number];
461 //memset(ret, 0, sizeof(int) * m_file_number);
462 //for (int i = 0; i < readwin_vec.size(); ++i)
464 // read_win_args = new READ_WIN_ARGS(&readwin_vec[i], NULL, m_sam_file_vec[i], ret, i);
465 // read_win_args_vec.push_back(read_win_args);
466 // rw_Task = new Read_win_Task();
467 // rw_Task->SetData(read_win_args);
468 // rw_Task_vec.push_back(rw_Task);
469 // threadpool.AddTask(rw_Task);
471 //while (threadpool.getTaskSize() != 0)
473 // usleep(10);
475 ////clear vector
476 //for (int j = 0; j < rw_Task_vec.size(); ++j)
478 // delete read_win_args_vec[j];
479 // delete rw_Task_vec[j];
481 //read_win_args_vec.clear();
482 //rw_Task_vec.clear();
484 //for (int i = 0; i < readwin_vec.size(); ++i)
486 // if (COME_NEW_CHR == ret[i])
487 // {
488 // para->ret = COME_NEW_CHR;
489 // // return COME_NEW_CHR;
490 // }
491 // else if (FILE_END == ret[i])
492 // {
493 // m_endfile_count ++;
494 // }
496 //delete [] ret;
498 else
500 // something wrong with input files.
501 para->ret = INPUT_ERROR;
502 //return INPUT_ERROR;
505 if (m_endfile_count != m_file_number)
507 para->ret = 0;
508 //return 0;
510 else
512 // all file reach tail.
513 para->ret = FILE_END;
514 sem_post(sem_call_cns_p);
515 break;
516 //return FILE_END;
519 next_cycle:
520 sem_post(sem_call_cns_p);
522 return 1;
526 // get the soap file handle pointer m_soap_file_vec[index]
528 * DATE: 2010-8-9
529 * FUNCTION: get the soap file handle pointer m_soap_file_vec[index]
530 * PARAMETER: index: the vector index.
531 * RETURN: return soap file pointer m_soap_file_vec[index].failed return INDEX_OVER_FLOW
533 igzstream * FileListManager::getSoapFile(const int index)
535 if (index < 0 || index >= m_soap_file_vec.size())
537 // index overflow
538 return INDEX_OVER_FLOW;
541 return m_soap_file_vec[index];
546 * DATE: 2010-8-9
547 * FUNCTION: get the soap file handle pointer m_soap_file_vec[index]
548 * PARAMETER: index: the vector index.
549 * RETURN: return sam/bam file pointer m_soap_file_vec[index].failed return INDEX_OVER_FLOW
551 SamCtrl* FileListManager::getSamFile(const int index)
553 if (index < 0 || index >= m_sam_file_vec.size())
555 // index overflow
556 return INDEX_OVER_FLOW;
559 return m_sam_file_vec[index];
564 * DATE: 2010-8-9
565 * FUNCTION: read record from soapfile and send to the readwin.
566 * PARAMETER: readwin: the Readwin object that would be processed.
567 * soapfile: the input file handls.
568 * RETURN: FILE_END when file is over else 0.
570 int FileListManager::readWinFromSoap(Readwin& readwin, igzstream& soapfile)
572 readwin.winChange(); // start from next win.
573 std::string line; // a temp string object.
574 int ret; // store the flag from function Readwin::addRead.
577 // judge if the win can be added.
578 if (readwin.isAbleToAdd())
580 // read record form file.
581 while (getline(soapfile,line))
583 ret = readwin.addRead(line);
585 if (ADD_SUCCESSFUL == ret)
587 // successful.
588 continue;
590 else if (READ_EXCEED_WIN == ret)
592 // record exceed.
593 break;
595 else if (READ_POS_ERROR == ret)
597 // something error with the record's position.
598 cerr << "ERROR : Something wrong with the read's position with line:" << endl;
599 cerr << line << endl;
600 //break;
601 continue;
603 else if (COME_NEW_CHR == ret)
605 if (readwin.getLastCount() == m_file_number)
607 // all the samples are reach a new chromosome.
608 return COME_NEW_CHR;
610 break;
613 if (soapfile.eof())
615 // file is over.
616 readwin.setLastPos(FILE_END);
617 return FILE_END;
620 return 0;
625 * DATE: 2010-8-9
626 * FUNCTION: read record from samCtrl and send to the readwin.
627 * PARAMETER: readwin: the Readwin object that would be processed.
628 * samCtrl: the input file handls.
629 * RETURN: FILE_END when file is over else 0.
631 int FileListManager::readWinFromSam(Readwin& readwin, SamCtrl& samCtrl)
633 readwin.winChange(); // start from next win.
634 std::string line; // a temp string object.
635 int ret; // store the flag from function Readwin::addRead.
636 int size = 0; // store the readline flag.
638 // judge if the win can be added.
639 if (readwin.isAbleToAdd())
641 // read record form file.
642 while ((size = samCtrl.readline(line)) != -1)
644 line = alignment_format(line);
645 // add record to the Readwin object.
646 ret = readwin.addRead(line);
648 if (ADD_SUCCESSFUL == ret)
650 // successful.
651 continue;
653 else if (READ_EXCEED_WIN == ret)
655 // record exceed.
656 break;
658 else if (READ_POS_ERROR == ret)
660 // something error with the record's position.
661 cerr << "ERROR : Something wrong with the read's position with line:" << endl;
662 cerr << line << endl;
663 //break;
664 continue;
666 else if (COME_NEW_CHR == ret)
668 if (readwin.getLastCount() == m_file_number)
670 // all the samples are reach a new chromosome.
671 return COME_NEW_CHR;
673 break;
677 if (size == -1)
679 // file is over.
680 readwin.setLastPos(FILE_END);
681 return FILE_END;
686 return 0;
690 // get the file number
692 * DATE: 2010-8-9
693 * FUNCTION: get the file number
694 * PARAMETER:
695 * RETURN: m_file_number
697 int FileListManager::getFileNum(void)
699 return m_file_number;
704 * DATE: 2010-8-9
705 * FUNCTION: get the consensus file handle pointer m_cns_file_vec[index]
706 * PARAMETER: index: the vector index.
707 * RETURN: return consensus file pointer m_cns_file_vec[index].failed return INDEX_OVER_FLOW
709 gzoutstream* FileListManager::getCnsFile(const int index)
711 if (index < 0 || index >= m_cns_file_vec.size())
713 // index overflow
714 return INDEX_OVER_FLOW;
717 return m_cns_file_vec[index];
722 * DATE: 2010-8-25
723 * FUNCTION: open matrix files with open_mode mode
724 * PARAMETER: matrix_list: the output or input matrix files path. mode: the open mode.
725 * alignment_list: alignment file list.
726 * RETURN: the number of matrix files that be opened.
728 int FileListManager::openMatrixFile(const std::string matrix_list, std::ios_base::open_mode mode, const std::string alignment_list)
730 fstream *fsp;
731 if (mode == std::ios::in) //matrix file is existed , read it
733 string filepath;
734 m_base_file.open(matrix_list.c_str()); //open matrix filelist
735 if (!m_base_file)
737 cerr<<"Cannot open file:" << matrix_list << endl;
738 return BASEFILE_ERROR;
740 //cerr <<"open matrix file 642"<<endl;
741 /*open each matrix file in the filelist and input soap vector*/
742 while (m_base_file >> filepath)
744 //cerr <<"open matrix file 646"<<endl;
745 fsp = new fstream();
746 //cerr <<"open matrix file 647"<<endl;
747 fsp->open(filepath.c_str(), fstream::in) ;
748 //cerr <<"open matrix file 649"<<endl;
749 if (!fsp->is_open())
751 cerr << "Cannot open file:" << filepath << endl;
752 return SOAPFILE_ERROR;
754 m_matrix_file_vec.push_back(fsp);
755 fsp->clear();
758 m_base_file.close();
760 else if (alignment_list != "")
762 string mat_Path, mat_name; //the one matrix file name
764 m_base_file.open(alignment_list.c_str()); //open matrix file path
766 if (!m_base_file)
768 cerr << "Cannot open file:" << alignment_list << endl;
769 return BASEFILE_ERROR;
772 string cmd = "mkdir -p " + matrix_list;
773 system(cmd.c_str());
774 string list_path = matrix_list + "/matrix_list";
775 ofstream list_out(list_path.c_str());
777 /*open each matrix file and push in cns vector*/
778 while (getline(m_base_file, mat_Path))
780 fsp = new fstream();
781 /*find cons name from cns path*/
782 int pos = mat_Path.rfind('/');
783 mat_name = matrix_list + mat_Path.substr(pos, string::npos) + ".matrix";
784 /*put cns in to cns vector*/
785 fsp->open(mat_name.c_str(), fstream::out);
787 if (!fsp->is_open())
789 cerr << "Cannot open file:" << mat_name << endl;
790 return CNSFILE_ERROR;
793 list_out << mat_name << endl;
794 m_matrix_file_vec.push_back(fsp);
797 list_out.close();
798 m_base_file.close();
800 return m_matrix_file_vec.size();
805 * DATE: 2010-8-25
806 * FUNCTION: get the matrix file handle pointer m_matrix_file_vec[index]
807 * PARAMETER: index: the vector index.
808 * RETURN: return matrix file pointer m_matrix_file_vec[index].failed return INDEX_OVER_FLOW
810 fstream* FileListManager::getMatrixFile(const int index)
812 if (index >= m_matrix_file_vec.size())
814 // index overflow
815 return INDEX_OVER_FLOW;
818 return m_matrix_file_vec[index];
822 * DATE: 2010-8-30
823 * FUNCTION: set max limit file number
824 * PARAMETER: filenumber : open filenumber now
825 * RETURN: return SETLIMIT_ERROR or return SETLIMIT_OK
827 //int FileListManager::setlimit(int filenumber)
829 // struct rlimit r;
831 // /*if open file number up to limit */
832 // if (filenumber >= r.rlim_max)
833 // {
834 // /*set limit*/
835 // r.rlim_cur = 2000;
836 // r.rlim_max = 2000;
838 // /*if set limit error*/
839 // if (setrlimit(RLIMIT_NOFILE, &r) < 0)
840 // {
841 // cerr << "setrlimit error\n";
842 // return SETLIMIT_ERROR;
843 // }
844 // }
845 // return SETLIMIT_OK;
851 //2010-11-08
852 int Read_win_Task::Run()
854 READ_WIN_ARGS *read_win_args = (READ_WIN_ARGS*)this->m_ptrData;
856 Readwin & readwin = *(read_win_args->readwin_p);
857 igzstream * m_soap_file = read_win_args->m_soap_file_p ;
858 SamCtrl * m_sam_file = read_win_args->m_sam_file_p;
859 int * ret = read_win_args->ret_p;
860 int index = read_win_args->index;
861 if (read_win_args->m_sam_file_p == NULL)
863 ret[index] = fileListManager.readWinFromSoap(readwin, *m_soap_file);
865 else
867 ret[index] = fileListManager.readWinFromSam(readwin,* m_sam_file);
869 return 1;
873 * DATE: 2010-11-12
874 * FUNCTION: a thread function used to run FlieListManager::readWin function.
875 * PARAMETER: __Args the parameter structure.
876 * RETURN:
878 void *_flieListManager_readWin(void * __Args)
880 // get args.
881 BIG_READ_WIN_ARGS * args = (BIG_READ_WIN_ARGS*)__Args;
882 // run the function.
883 args->fileListManager->readWin((*args->readwin_vec), args->para);
884 return NULL;