1 //========================================================================
3 // Name : MergeReadPair.cpp
5 // Created on : Jun 3, 2013
7 // Copyright : The Broad Institute
8 // SOFTWARE COPYRIGHT NOTICE AGREEMENT
9 // This software and its documentation are copyright (2013)
10 // by the Broad Institute. All rights are reserved.
12 // This software is supplied without any warranty or
13 // guaranteed support whatsoever. The Broad Institute cannot
14 // be responsible for its use, misuse, or functionality.
16 //========================================================================
18 #include "MergeReadPair.h"
19 #include "ReadBioFile.h"
21 void debug_output_to_file (std::ofstream
& ofh
, std::vector
<fqtuple_t
>& seq
){
22 for (int i
= 0; i
< seq
.size(); ++ i
) {
23 ofh
<< "@" << std::get
<0> (seq
[i
]) << "\n"
24 << std::get
<1> (seq
[i
]) << "\n+\n"
25 << std::get
<2> (seq
[i
]) << "\n";
29 void merge_paired_read (const strvec_t
& ifq
, const strpair_t
& ofq
,
30 const std::string
& os
, int batch
) {
33 if (os
.empty()) abording ("merge_paired_read: ofa is empty");
35 std::ofstream ofhfq
, ofhfq2
, ofhs
;
36 xny::openfile
<std::ofstream
> (ofhfq
, ofq
.first
);
37 xny::openfile
<std::ofstream
> (ofhfq2
, ofq
.second
);
38 xny::openfile
<std::ofstream
> (ofhs
, os
);
40 int num_merged_pairs
= 0, total_read_pairs
= 0;
41 std::vector
<fqtuple_t
> seq
;
43 for (unsigned int i
= 0; i
< ifq
.size(); i
+= 2) {
45 std::cout
<< "\tprocess files: " << ifq
[i
] << "\t" << ifq
[i
+ 1] << "\n\n";
47 std::ifstream fh
, fh2
;
48 xny::openfile
<std::ifstream
>(fh
, ifq
[i
]);
49 xny::openfile
<std::ifstream
>(fh2
, ifq
[i
+1]);
52 std::ofstream debug_output;
53 xny::openfile<std::ofstream>(debug_output, "output/test.fq");
55 bio::fastq_input_iterator
<> fq(fh
), end
;
56 bio::fastq_input_iterator
<> fq2(fh2
);
59 while ((fq
!= end
) && (fq2
!= end
)) {
61 add_fq_reads (seq
, cnt
/2, fq
, end
);
63 /*{ // check if input is properly read
64 //debug_output_to_file (debug_output, seq);
67 add_fq_reads (seq
, cnt
/2, fq2
, end
);
70 std::cout << debug << "\t" << seq.size() << "\t" << cnt << "\n";
74 if ((int) seq
.size() >= batch
) {
75 total_read_pairs
+= seq
.size()/2;
76 num_merged_pairs
+= apply_merging (ofhs
, ofhfq
, ofhfq2
, seq
);
79 } else { // not enough reads to fill in batch for current file pair
80 cnt
= batch
- seq
.size();
87 total_read_pairs
+= seq
.size()/2;
88 num_merged_pairs
+= apply_merging (ofhs
, ofhfq
, ofhfq2
, seq
);
90 std::cout
<< "\tnumber of merged pairs vs total: " << num_merged_pairs
91 << " vs " << total_read_pairs
<< " ("
92 << 100.0*num_merged_pairs
/total_read_pairs
<< "%) \n";
94 xny::closefile (ofhfq
);
95 xny::closefile (ofhfq2
);
96 xny::closefile (ofhs
);
98 } // merge_paired_read
102 /* @brief Given n read-pairs (1, ...n, n+1, ..., 2n) stored in [seq],
103 * where the reads (i, n+i) for 1 <= i <= n form a pair. Every pair is
104 * attempted for merging, the ones failed merging are added to [fq] [fq2]
105 * and the merged ones are added to [fhs] in single end fastq format
107 * NOTE: assume the paired reads are from different strands either:
114 int apply_merging (std::ofstream
& fhs
, std::ofstream
& fhfq
,
115 std::ofstream
& fhfq2
, std::vector
<fqtuple_t
>& seq
) {
119 // reverse complementary of second pair
120 int num
= seq
.size();
122 #pragma omp parallel for
123 for (int i
= num
/2; i
< num
; ++ i
) {
124 xny::rvc_str(std::get
<1> (seq
[i
]));
127 xny::suffix_prefix_gap_free_aln
aligner (7, 90, 1);
129 std::vector
<strpair_t
> list_merged_fragments (num
/2);
131 if (debug
){ // debug case for merging
132 omp_set_num_threads(1);
135 #pragma omp parallel for
136 for (int i
= num
/2; i
< num
; ++ i
) {
138 coord_t coord
= aligner (std::get
<1> (seq
[i
-num
/2]),
139 std::get
<1> (seq
[i
]));
141 if (std::get
<0> (coord
) != -1) {
142 int start0
= std::get
<0> (coord
), end0
= std::get
<1> (coord
),
143 start1
= std::get
<2> (coord
), end1
= std::get
<3> (coord
),
144 l0
= (std::get
<1> (seq
[i
-num
/2])).length(),
145 l1
= (std::get
<1> (seq
[i
])).length();
147 /*if (debug) { // debug print
148 std::cout << start0 << "\t" << end0 << "\t"
149 << start1 << "\t" << end1 << "\n";
150 int num_dash = start0 - start1;
151 std::cout << std::get<1> (seq[i-num/2]) << "\n";
152 for (int g = 0; g < num_dash; ++ g) std::cout << "-";
153 std::cout << std::get<1> (seq[i]) << "\n\n";
157 int overlap_sz
= end0
- start0
+ 1;
158 std::string merged_seq
, merged_qual
;
159 for (int idx
= 0; idx
< overlap_sz
; ++ idx
) {
160 char c0
= (std::get
<1> (seq
[i
-num
/2])).at(start0
+ idx
),
161 c1
= (std::get
<1> (seq
[i
])).at(start1
+ idx
),
162 q0
= (std::get
<2> (seq
[i
-num
/2])).at(start0
+ idx
),
163 q1
= (std::get
<2> (seq
[i
])).at(start1
+ idx
);
167 if (q0
>= q1
) merged_qual
+= q0
;
168 else merged_qual
+= q1
;
178 } // for (int idx = 0;
180 //if (debug) std::cout << "Merged part: \n" << merged_seq << "\n";
182 // generate the full string
183 if (start0
+ 1 >= l0
- end0
- 1) { // normal direction
184 merged_seq
= (std::get
<1> (seq
[i
-num
/2])).substr(0, start0
) + merged_seq
;
185 merged_qual
= (std::get
<2> (seq
[i
-num
/2])).substr(0, start0
) + merged_qual
;
187 merged_seq
+= (std::get
<1> (seq
[i
])).substr(end1
+ 1, l1
- end1
);
188 merged_qual
+= (std::get
<2> (seq
[i
])).substr(end1
+ 1, l1
- end1
);
190 } else { // abnormal direction
191 merged_seq
= (std::get
<1> (seq
[i
])).substr(0, start1
) + merged_seq
;
192 merged_qual
= (std::get
<2> (seq
[i
])).substr(0, start1
) + merged_qual
;
194 merged_seq
+= (std::get
<1> (seq
[i
-num
/2])).substr(end0
+ 1, l0
- end0
);
195 merged_qual
+= (std::get
<2> (seq
[i
-num
/2])).substr(end0
+ 1, l0
- end0
);
199 std::cout
<< "> " << debug_cnter
<< "\n";
200 std::cout
<< merged_seq
<< "\n\n";
202 std::cout
<< std::get
<0> (seq
[i
-num
/2]) << "\n" << std::get
<1> (seq
[i
-num
/2]) << "\n";
203 std::cout
<< std::get
<0> (seq
[i
]) << "\n" << std::get
<1> (seq
[i
]) << "\n\n";
211 std::cout << "post-merging, we have\n" << merged_seq
212 << "\n" << merged_qual << "\n";
215 list_merged_fragments
[i
- num
/2] = strpair_t (merged_seq
,
219 } // if (std::get<0> (coord) != -1)
221 } // for (int i = num/2; i < num; ++ i) {
225 for (int i
= 0; i
< num
/2; ++ i
) {
226 if (!list_merged_fragments
[i
].first
.empty()) {
227 fhs
<< "@" << std::get
<0> (seq
[i
+ num
/2]) << "\n"
228 << list_merged_fragments
[i
].first
<< "\n+\n"
229 << list_merged_fragments
[i
].second
<< "\n";
232 fhfq
<< "@" << std::get
<0> (seq
[i
]) << "\n"
233 << std::get
<1> (seq
[i
]) << "\n+\n"
234 << std::get
<2> (seq
[i
]) << "\n";
235 fhfq2
<< "@" << std::get
<0> (seq
[i
+ num
/2]) << "\n"
236 << std::get
<1> (seq
[i
+ num
/2])
237 << "\n+\n" << std::get
<2> (seq
[i
+ num
/2]) << "\n";
242 //std::cout << "Number of merged read pairs = " << merged << "\n";
244 // suffix_prefix_gap_free_aln ();