modified: src1/worker.c
[GalaxyCodeBases.git] / c_cpp / etc / mVicuna / src / Trim.cpp
blob6f95d7006fe05e5f89a37ae8f2468ce332dec143
1 //========================================================================
2 // Project : M-Vicuna
3 // Name : Trim.cpp
4 // Author : Xiao Yang
5 // Created on : Jul 10, 2013
6 // Version : 1.0
7 // Copyright : The Broad Institute
8 // SOFTWARE COPYRIGHT NOTICE AGREEMENT
9 // This software and its documentation are copyright (2013)
10 // by the Broad Institute. All rights are reserved.
12 // This software is supplied without any warranty or
13 // guaranteed support whatsoever. The Broad Institute cannot
14 // be responsible for its use, misuse, or functionality.
15 // Description :
16 //========================================================================
18 #include "Trim.h"
19 /** Function trimming ()
22 void trimming (const strvec_t& ipfq, const strvec_t& isfq,
23 const trm_t& trm, xny::low_complexity& lc, int batch, bool silent){
25 // sanity check
26 if ((trm.op.size() == ipfq.size() && trm.os.size() == ipfq.size()/2) ||
27 (trm.op.size() == 2 && trm.os.size() == 1)) { }
28 else abording ("In trimming() SC failed");
30 // --------------- read input vector --------------------------
31 std::cout << "\tRead in vectors ...";
32 strvec_t vectors;
33 kindex_t kindex; //[kmerID -> list (vecID, vecPos, dir)]
34 process_vector_file (vectors, kindex, trm.vecfa, std::min (trm.min_match, 16));
36 if (vectors.size() == 0 || kindex.empty()) {
37 std::cout << "\t\tno vector trimming applied\n\n";
38 //return;
40 // --------------- prepare output when applicable -----------------
41 std::ofstream ofhfq, ofhfq2, ofhs;
42 if (trm.op.size() == 2 && trm.os.size() == 1) {
43 xny::openfile<std::ofstream> (ofhfq, trm.op[0]);
44 xny::openfile<std::ofstream> (ofhfq2, trm.op[1]);
45 xny::openfile<std::ofstream> (ofhs, trm.os[0]);
48 // process every pair of fastq files
49 int num_file_pairs = ipfq.size()/2;
51 for (int i = 0; i < num_file_pairs; ++ i) {
53 int fID = 2*i;
55 if (! silent) {
56 std::cout << "\tprocess files: " << ipfq[fID] << " and "
57 << ipfq[fID + 1] << "\n\n";
60 // ----- output non-redundant read-pairs ---------
61 if (trm.op.size() > 2) {
62 xny::openfile<std::ofstream> (ofhfq, trm.op[fID]);
63 xny::openfile<std::ofstream> (ofhfq2, trm.op[fID + 1]);
64 xny::openfile<std::ofstream> (ofhs, trm.os[i]);
67 trim_pfq (ipfq[fID], ipfq[fID+1], vectors, kindex, ofhfq,
68 ofhfq2, ofhs, trm, lc, batch);
70 if (trm.op.size() > 2) {
71 xny::closefile(ofhfq);
72 xny::closefile(ofhfq2);
73 xny::closefile (ofhs);
75 } // for (int i = 0
77 // process singleton fastq files, the result is stored in the
78 // last specified output fastq file
79 int num_sfiles = isfq.size();
80 std::ofstream ofhsfq;
81 xny::openfile<std::ofstream> (ofhsfq, trm.os.back(), std::ios::app);
82 for (int fID = 0; fID < num_sfiles; ++ fID) {
83 if (! silent) std::cout << "\tprocess files: " << isfq[fID] << "\n\n";
84 trim_sfq (ofhsfq, isfq[fID], trm, lc, batch);
86 xny::closefile(ofhsfq);
87 } // trimming
90 /** Function process_vector_file ()
92 * Given an input fasta file [vecfa], record all vectors in [vectors] and
93 * process each vector kmer then store information in
94 * [kindex]: [kmerID -> list (vecID, vecPos, dir)]
96 void process_vector_file (strvec_t& vectors, kindex_t& kindex,
97 const std::string& vecfa, int k) {
99 /* ------------ read input vectors ----------- */
100 std::ifstream fh;
101 xny::openfile<std::ifstream>(fh, vecfa);
102 bio::fasta_input_iterator<> iter_fa(fh), end;
103 add_fa_reads_only (vectors, INT_MAX, iter_fa, end);
104 std::cout << "\t\t" << vectors.size() << " vector sequences read\n";
105 if (vectors.size() == 0) return;
106 xny::closefile (fh);
108 /* generate kindex information [kmerID -> list (vecID, vecPos, dir)] */
109 int vectorID = 0;
110 for (auto& v: vectors) {
112 // make sure vector doesn't contain 'n' or 'N'
113 if (std::isupper(v.at(0))) std::replace (v.begin(), v.end(), 'N', 'A');
114 else std::replace (v.begin(), v.end(), 'n', 'a');
116 // generate (k)-spectrum for fwd & rvc strands of vector[i]
117 uvec_t kIDs;
118 xny::get_bitkmer<std::back_insert_iterator<uvec_t>, uint32_t>
119 (v, std::back_inserter(kIDs), k, 2);
121 for (unsigned int j = 0; j < kIDs.size(); ++ j) {
122 kindex_t::iterator it = kindex.find(kIDs[j]);
123 if (it != kindex.end()) {
124 if (j % 2 == 0) { // forward strand
125 it->second.push_back(kloc_t(vectorID, j/2, true));
126 } else it->second.push_back(kloc_t(vectorID, j/2, false));
127 } else {
128 if (j % 2 == 0) kindex[kIDs[j]] = { kloc_t(vectorID, j/2, true) };
129 else kindex[kIDs[j]] = { kloc_t (vectorID, j/2, false) };
131 } // for (unsigned int j
133 vectorID ++;
134 } // for (auto & v: vectors )
136 }// process_vector_file
139 /* @brief Function trim_sfq ()
141 * Apply trimming to single end fastq files
142 * -- low quality score criterion
143 * -- low complexity criterion
145 void trim_sfq (std::ofstream& ofhsfq, const std::string& ifq,
146 const trm_t& trm, xny::low_complexity& lc, int batch) {
148 std::ifstream ifhfq;
149 xny::openfile<std::ifstream>(ifhfq, ifq);
150 bio::fastq_input_iterator<> iter_fq (ifhfq), iter_end;
152 int total_reads = 0;
153 int num_trimmed = 0;
154 std::vector<fqtuple_t> reads;
156 while (iter_fq != iter_end) {
157 add_fq_reads (reads, batch, iter_fq, iter_end);
159 int read_cnt = reads.size();
160 #pragma omp parallel for
161 for (int i = 0; i < read_cnt; ++ i) {
162 if (trim_lc_lq (std::get<1>(reads[i]), std::get<2>(reads[i]),
163 trm.min_qual, trm.min_rlen, lc)) {
164 ++ num_trimmed;
167 // write out
168 for (int i = 0; i < read_cnt; ++ i) {
169 if(! (std::get<1>(reads[i])).empty()) {
170 ofhsfq << "@" << std::get<0>(reads[i]) << "\n";
171 ofhsfq << std::get<1>(reads[i]) << "\n";
172 ofhsfq << "+\n";
173 ofhsfq << std::get<2>(reads[i]) << "\n";
176 total_reads += read_cnt;
178 reads.clear();
180 } // while
182 xny::closefile(ifhfq);
184 std::cout << "\t\ttotal reads: " << total_reads << ", " << num_trimmed << " trimmed\n";
185 } // trim_sfq
187 /** Function trim_paired_fqfiles ()
189 * Apply trimming to two paired input fastq files
190 * -- primer trimming
191 * -- low quality score criterion
192 * -- low complexity criterion
194 void trim_pfq (const std::string& ifq, const std::string& ifq2,
195 const strvec_t& vectors, const kindex_t& kindex, std::ofstream& ofhfq,
196 std::ofstream& ofhfq2, std::ofstream& ofhs, const trm_t& trm,
197 xny::low_complexity& lc, int batch){
199 std::ifstream ifhfq, ifhfq2;
200 xny::openfile<std::ifstream>(ifhfq, ifq);
201 xny::openfile<std::ifstream>(ifhfq2, ifq2);
202 bio::fastq_input_iterator<> iter_fq (ifhfq), iter_end, iter_fq2 (ifhfq2);
204 int total_read_pairs = 0;
205 int num_trimmed = 0;
206 std::vector<fqtuple_t> pairs;
208 while (iter_fq != iter_end && iter_fq2 != iter_end) {
210 add_fq_reads (pairs, batch/2, iter_fq, iter_end);
211 add_fq_reads (pairs, batch/2, iter_fq2, iter_end);
213 num_trimmed += apply_trimming (pairs, vectors, kindex, lc, trm);
215 // output to file
216 int fragnum = pairs.size()/2;
217 for (int i = 0; i < fragnum; ++ i) {
218 if ((std::get<1>(pairs[i])).empty()) { // first pair is empty
219 if (! std::get<1>(pairs[i + fragnum]).empty()) { // 2nd not empty
220 ofhs << "@" << std::get<0>(pairs[i + fragnum]) << "\n";
221 ofhs << std::get<1>(pairs[i + fragnum]) << "\n";
222 ofhs << "+\n";
223 ofhs << std::get<2>(pairs[i + fragnum]) << "\n";
225 } else if (std::get<1>(pairs[i + fragnum]).empty()) { // 2nd empty
226 ofhs << "@" << std::get<0>(pairs[i]) << "\n";
227 ofhs << std::get<1>(pairs[i]) << "\n";
228 ofhs << "+\n";
229 ofhs << std::get<2>(pairs[i]) << "\n";
230 } else {
231 ofhfq << "@" << std::get<0>(pairs[i]) << "\n";
232 ofhfq << std::get<1>(pairs[i]) << "\n";
233 ofhfq << "+\n";
234 ofhfq << std::get<2>(pairs[i]) << "\n";
236 ofhfq2 << "@" << std::get<0>(pairs[i + fragnum]) << "\n";
237 ofhfq2 << std::get<1>(pairs[i + fragnum]) << "\n";
238 ofhfq2 << "+\n";
239 ofhfq2 << std::get<2>(pairs[i + fragnum]) << "\n";
243 total_read_pairs += pairs.size()/2;
245 pairs.clear();
247 } // while
249 xny::closefile(ifhfq);
250 xny::closefile(ifhfq2);
252 std::cout << "\t\ttotal reads: " << total_read_pairs * 2 << ", "
253 << num_trimmed << " trimmed\n";
254 } //trim_pfq
256 /** Function apply_trimming ()
258 * Apply trimming to each read in [seq] return number of trimmed reads,
259 * no paired information is used
261 int apply_trimming (std::vector<fqtuple_t>& seq, const strvec_t& vectors,
262 const kindex_t& kindex, xny::low_complexity& lc, const trm_t& trm) {
264 int sz = seq.size();
265 int k = std::min (trm.min_match, 16);
267 bvec_t trimmed (sz, false); // record which reads are trimmed
268 #pragma omp parallel for
269 for (int i = 0; i < sz; ++ i) {
270 std::string rseq = std::get<1>(seq[i]);
271 std::string qual = std::get<2>(seq[i]);
272 // make sure vector doesn't contain 'n' or 'N'
273 if (std::isupper(rseq.at(0))) {
274 std::replace (rseq.begin(), rseq.end(), 'N', 'A');
275 } else std::replace (rseq.begin(), rseq.end(), 'n', 'a');
277 // Primer Trimming: checking each read against all primers
278 while (true) {
279 bool trim_applied = false;
281 uvec_t rkmers;
282 xny::get_bitkmer<std::back_insert_iterator<uvec_t>, uint32_t>
283 (rseq, std::back_inserter(rkmers), k, 3);
285 for (unsigned int rPos = 0; rPos < rkmers.size(); ++ rPos) {
287 kindex_t::const_iterator it = kindex.find(rkmers[rPos]);
289 // check if we can do trimming using current kmer: kmers[j]
290 if (it != kindex.end()) {
291 if (trim_primer (rseq, qual, it->second,
292 vectors, rPos, trm)) {
293 trimmed[i] = true;
294 trim_applied = true;
295 break;
298 } // for rPos
300 // break when no more trimming can be applied to the read
301 if (!trim_applied) break;
303 // Low quality score and low complexity trimming
304 if (trim_lc_lq (rseq, qual, trm.min_qual, trm.min_rlen, lc)) {
305 trimmed[i] = true;
308 std::get<1>(seq[i]) = rseq;
309 std::get<2>(seq[i]) = qual;
310 } // for i
312 // counting the trimmed reads
313 int num_trimmed = 0;
314 for (int i = 0; i < sz; ++ i) { if (trimmed[i]) ++ num_trimmed; }
315 return num_trimmed;
317 } // apply_trimming
319 /** Function trim_primer ()
321 * Apply primer trimming to a read sequence [rSeq] and quality string [qual]
322 * For every kmer shared between [rSeq] and an input vector sequence
323 * return once the first trimming event happens when
324 * the minimum match length is identified; trim is either applied to prefix,
325 * suffix, or the complete read by choosing whichever the remaining
326 * fragment is longer post-trimming.
328 bool trim_primer (std::string& rSeq, std::string& qual,
329 const std::vector<kloc_t>& kloclist,
330 const strvec_t& vectors, int rPos, const trm_t& trm) {
332 int k = std::min (16, trm.min_match);
334 int rBegin = rPos, rEnd = rPos + k - 1;
335 int rLen = rSeq.length();
337 for (auto& kloc: kloclist) { // for each matched kmer locus
339 std::string myVec = vectors[std::get<0> (kloc)];// vector sequence
340 int vBegin = std::get<1> (kloc), vEnd = vBegin + k - 1;
341 bool vDir = std::get<2> (kloc); // fwd or rv direction
343 std::string rPrf = rSeq.substr(0, rBegin),
344 rSuf = rSeq.substr(rEnd+1, rLen - rEnd),
345 vPrf = myVec.substr(0, vBegin),
346 vSuf = myVec.substr(vEnd+1, myVec.length() - vEnd);
348 if (vDir) { // fwd
349 rEnd += xny::common_pref_len (rSuf, vSuf);
350 rBegin -= xny::common_suf_len (rPrf, vPrf);
351 } else { // rv
352 rEnd += xny::common_pref_len (rSuf, xny::get_rvc_str (vPrf));
353 rBegin -= xny::common_suf_len (rPrf, xny::get_rvc_str(vSuf));
356 if (rEnd - rBegin + 1 >= trm.min_match) { // apply trimming
357 int remain_suffix_len = rLen - rEnd - 1,
358 remain_prefix_len = rBegin + 1;
359 if (remain_suffix_len >= remain_prefix_len &&
360 remain_suffix_len >= trm.min_rlen) { //trim prefix
361 rSeq = rSeq.substr(rEnd + 1, remain_suffix_len);
362 qual = qual.substr(rEnd + 1, remain_suffix_len);
363 } else if (remain_prefix_len >= remain_suffix_len &&
364 remain_prefix_len >= trm.min_rlen) { // trim suffix
365 rSeq = rSeq.substr(0, remain_prefix_len);
366 qual = qual.substr(0, remain_prefix_len);
367 } else {
368 rSeq = ""; // rm whole read
369 qual = "";
372 return true;
374 } // for
376 return false;
377 } // trim_primer
379 /* @brief Trim read and quality string by min quality score and
380 * low complexity criteria
382 bool trim_lc_lq (std::string& rSeq, std::string& qual, int min_qual,
383 int min_len, xny::low_complexity& lc) {
384 /* Trim max prefix and suffix where quality score of each base <= min_qual
386 int rlen = qual.length();
387 int first_hq_idx = 0, last_hq_idx = rlen - 1;
388 for (auto& c : qual) { // prefix
389 if ((int) c > min_qual) break;
390 ++ first_hq_idx;
392 bool last_pos_found = false;
393 for (; last_hq_idx >= min_len + first_hq_idx - 1; -- last_hq_idx) {
394 if ((int) qual.at(last_hq_idx) - 33 > min_qual) {
395 last_pos_found = true;
396 break;
399 if (last_pos_found) {
400 rSeq = rSeq.substr(first_hq_idx, last_hq_idx - first_hq_idx + 1);
401 if (! lc (rSeq)) {
402 qual = qual.substr(first_hq_idx, last_hq_idx - first_hq_idx + 1);
403 if (rSeq.length() != rlen) return true;
404 else return false;
407 rSeq.clear();
408 qual.clear();
409 return true;
410 } // trim_lc_lq