modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / etc / mVicuna / src / MergeReadPair.cpp
blob072c9daced715fab905f62efab6f3e0c04583b84
1 //========================================================================
2 // Project : M-Vicuna
3 // Name : MergeReadPair.cpp
4 // Author : Xiao Yang
5 // Created on : Jun 3, 2013
6 // Version : 1.0
7 // Copyright : The Broad Institute
8 // SOFTWARE COPYRIGHT NOTICE AGREEMENT
9 // This software and its documentation are copyright (2013)
10 // by the Broad Institute. All rights are reserved.
12 // This software is supplied without any warranty or
13 // guaranteed support whatsoever. The Broad Institute cannot
14 // be responsible for its use, misuse, or functionality.
15 // Description :
16 //========================================================================
18 #include "MergeReadPair.h"
19 #include "ReadBioFile.h"
21 void debug_output_to_file (std::ofstream& ofh, std::vector<fqtuple_t>& seq){
22 for (int i = 0; i < seq.size(); ++ i) {
23 ofh << "@" << std::get<0> (seq[i]) << "\n"
24 << std::get<1> (seq[i]) << "\n+\n"
25 << std::get<2> (seq[i]) << "\n";
29 void merge_paired_read (const strvec_t& ifq, const strpair_t& ofq,
30 const std::string& os, int batch) {
32 // sanity check
33 if (os.empty()) abording ("merge_paired_read: ofa is empty");
35 std::ofstream ofhfq, ofhfq2, ofhs;
36 xny::openfile<std::ofstream> (ofhfq, ofq.first);
37 xny::openfile<std::ofstream> (ofhfq2, ofq.second);
38 xny::openfile<std::ofstream> (ofhs, os);
40 int num_merged_pairs = 0, total_read_pairs = 0;
41 std::vector<fqtuple_t> seq;
42 int cnt = batch;
43 for (unsigned int i = 0; i < ifq.size(); i += 2) {
45 std::cout << "\tprocess files: " << ifq[i] << "\t" << ifq[i + 1] << "\n\n";
47 std::ifstream fh, fh2;
48 xny::openfile<std::ifstream>(fh, ifq[i]);
49 xny::openfile<std::ifstream>(fh2, ifq[i+1]);
51 /*{
52 std::ofstream debug_output;
53 xny::openfile<std::ofstream>(debug_output, "output/test.fq");
54 }*/
55 bio::fastq_input_iterator<> fq(fh), end;
56 bio::fastq_input_iterator<> fq2(fh2);
58 //int debug = 1;
59 while ((fq != end) && (fq2 != end)) {
61 add_fq_reads (seq, cnt/2, fq, end);
63 /*{ // check if input is properly read
64 //debug_output_to_file (debug_output, seq);
65 }*/
67 add_fq_reads (seq, cnt/2, fq2, end);
69 /*{
70 std::cout << debug << "\t" << seq.size() << "\t" << cnt << "\n";
71 ++ debug;
72 }*/
74 if ((int) seq.size() >= batch) {
75 total_read_pairs += seq.size()/2;
76 num_merged_pairs += apply_merging (ofhs, ofhfq, ofhfq2, seq);
77 cnt = batch;
78 seq.clear();
79 } else { // not enough reads to fill in batch for current file pair
80 cnt = batch - seq.size();
81 xny::closefile(fh);
82 xny::closefile(fh2);
84 } // while
87 total_read_pairs += seq.size()/2;
88 num_merged_pairs += apply_merging (ofhs, ofhfq, ofhfq2, seq);
90 std::cout << "\tnumber of merged pairs vs total: " << num_merged_pairs
91 << " vs " << total_read_pairs << " ("
92 << 100.0*num_merged_pairs /total_read_pairs << "%) \n";
94 xny::closefile (ofhfq);
95 xny::closefile (ofhfq2);
96 xny::closefile (ofhs);
98 } // merge_paired_read
102 /* @brief Given n read-pairs (1, ...n, n+1, ..., 2n) stored in [seq],
103 * where the reads (i, n+i) for 1 <= i <= n form a pair. Every pair is
104 * attempted for merging, the ones failed merging are added to [fq] [fq2]
105 * and the merged ones are added to [fhs] in single end fastq format
107 * NOTE: assume the paired reads are from different strands either:
108 * ----->
109 * <-----
110 * OR abnormal case
111 * <-----
112 * ----->
114 int apply_merging (std::ofstream& fhs, std::ofstream& fhfq,
115 std::ofstream& fhfq2, std::vector<fqtuple_t>& seq) {
117 bool debug = false;
118 int debug_cnter = 0;
119 // reverse complementary of second pair
120 int num = seq.size();
122 #pragma omp parallel for
123 for (int i = num/2; i < num; ++ i) {
124 xny::rvc_str(std::get<1> (seq[i]));
127 xny::suffix_prefix_gap_free_aln aligner (7, 90, 1);
129 std::vector<strpair_t> list_merged_fragments (num/2);
131 if (debug){ // debug case for merging
132 omp_set_num_threads(1);
135 #pragma omp parallel for
136 for (int i = num/2; i < num; ++ i) {
138 coord_t coord = aligner (std::get<1> (seq[i-num/2]),
139 std::get<1> (seq[i]));
141 if (std::get<0> (coord) != -1) {
142 int start0 = std::get<0> (coord), end0 = std::get<1> (coord),
143 start1 = std::get<2> (coord), end1 = std::get<3> (coord),
144 l0 = (std::get<1> (seq[i-num/2])).length(),
145 l1 = (std::get<1> (seq[i])).length();
147 /*if (debug) { // debug print
148 std::cout << start0 << "\t" << end0 << "\t"
149 << start1 << "\t" << end1 << "\n";
150 int num_dash = start0 - start1;
151 std::cout << std::get<1> (seq[i-num/2]) << "\n";
152 for (int g = 0; g < num_dash; ++ g) std::cout << "-";
153 std::cout << std::get<1> (seq[i]) << "\n\n";
156 // to merge
157 int overlap_sz = end0 - start0 + 1;
158 std::string merged_seq, merged_qual;
159 for (int idx = 0; idx < overlap_sz; ++ idx) {
160 char c0 = (std::get<1> (seq[i-num/2])).at(start0 + idx),
161 c1 = (std::get<1> (seq[i])).at(start1 + idx),
162 q0 = (std::get<2> (seq[i-num/2])).at(start0 + idx),
163 q1 = (std::get<2> (seq[i])).at(start1 + idx);
165 if (c0 == c1) {
166 merged_seq += c0;
167 if (q0 >= q1) merged_qual += q0;
168 else merged_qual += q1;
169 } else {
170 if (q0 >= q1) {
171 merged_seq += c0;
172 merged_qual += q0;
173 } else {
174 merged_seq += c1;
175 merged_qual += q1;
178 } // for (int idx = 0;
180 //if (debug) std::cout << "Merged part: \n" << merged_seq << "\n";
182 // generate the full string
183 if (start0 + 1 >= l0 - end0 - 1) { // normal direction
184 merged_seq = (std::get<1> (seq[i-num/2])).substr(0, start0) + merged_seq;
185 merged_qual = (std::get<2> (seq[i-num/2])).substr(0, start0) + merged_qual;
186 if (end1 < l1 - 1) {
187 merged_seq += (std::get<1> (seq[i])).substr(end1 + 1, l1 - end1);
188 merged_qual += (std::get<2> (seq[i])).substr(end1 + 1, l1 - end1);
190 } else { // abnormal direction
191 merged_seq = (std::get<1> (seq[i])).substr(0, start1) + merged_seq;
192 merged_qual = (std::get<2> (seq[i])).substr(0, start1) + merged_qual;
193 if (end0 < l0 - 1) {
194 merged_seq += (std::get<1> (seq[i-num/2])).substr(end0 + 1, l0 - end0);
195 merged_qual += (std::get<2> (seq[i-num/2])).substr(end0 + 1, l0 - end0);
198 if (debug) {
199 std::cout << "> " << debug_cnter << "\n";
200 std::cout << merged_seq << "\n\n";
202 std::cout << std::get<0> (seq[i-num/2]) << "\n" << std::get<1> (seq[i-num/2]) << "\n";
203 std::cout << std::get<0> (seq[i]) << "\n" << std::get<1> (seq[i]) << "\n\n";
205 ++ debug_cnter;
210 if (debug){
211 std::cout << "post-merging, we have\n" << merged_seq
212 << "\n" << merged_qual << "\n";
215 list_merged_fragments[i - num/2] = strpair_t (merged_seq,
216 merged_qual);
219 } // if (std::get<0> (coord) != -1)
221 } // for (int i = num/2; i < num; ++ i) {
223 /* write to file */
224 int merged_cnt = 0;
225 for (int i = 0; i < num/2; ++ i) {
226 if (!list_merged_fragments[i].first.empty()) {
227 fhs << "@" << std::get<0> (seq[i + num/2]) << "\n"
228 << list_merged_fragments[i].first << "\n+\n"
229 << list_merged_fragments[i].second << "\n";
230 ++ merged_cnt;
231 } else {
232 fhfq << "@" << std::get<0> (seq[i]) << "\n"
233 << std::get<1> (seq[i]) << "\n+\n"
234 << std::get<2> (seq[i]) << "\n";
235 fhfq2 << "@" << std::get<0> (seq[i + num/2]) << "\n"
236 << std::get<1> (seq[i + num/2])
237 << "\n+\n" << std::get<2> (seq[i + num/2]) << "\n";
241 return merged_cnt;
242 //std::cout << "Number of merged read pairs = " << merged << "\n";
244 // suffix_prefix_gap_free_aln ();
246 } // apply_merging