1 //========================================================================
5 // Created on : Jul 10, 2013
7 // Copyright : The Broad Institute
8 // SOFTWARE COPYRIGHT NOTICE AGREEMENT
9 // This software and its documentation are copyright (2013)
10 // by the Broad Institute. All rights are reserved.
12 // This software is supplied without any warranty or
13 // guaranteed support whatsoever. The Broad Institute cannot
14 // be responsible for its use, misuse, or functionality.
16 //========================================================================
19 /** Function trimming ()
22 void trimming (const strvec_t
& ipfq
, const strvec_t
& isfq
,
23 const trm_t
& trm
, xny::low_complexity
& lc
, int batch
, bool silent
){
26 if ((trm
.op
.size() == ipfq
.size() && trm
.os
.size() == ipfq
.size()/2) ||
27 (trm
.op
.size() == 2 && trm
.os
.size() == 1)) { }
28 else abording ("In trimming() SC failed");
30 // --------------- read input vector --------------------------
31 std::cout
<< "\tRead in vectors ...";
33 kindex_t kindex
; //[kmerID -> list (vecID, vecPos, dir)]
34 process_vector_file (vectors
, kindex
, trm
.vecfa
, std::min (trm
.min_match
, 16));
36 if (vectors
.size() == 0 || kindex
.empty()) {
37 std::cout
<< "\t\tno vector trimming applied\n\n";
40 // --------------- prepare output when applicable -----------------
41 std::ofstream ofhfq
, ofhfq2
, ofhs
;
42 if (trm
.op
.size() == 2 && trm
.os
.size() == 1) {
43 xny::openfile
<std::ofstream
> (ofhfq
, trm
.op
[0]);
44 xny::openfile
<std::ofstream
> (ofhfq2
, trm
.op
[1]);
45 xny::openfile
<std::ofstream
> (ofhs
, trm
.os
[0]);
48 // process every pair of fastq files
49 int num_file_pairs
= ipfq
.size()/2;
51 for (int i
= 0; i
< num_file_pairs
; ++ i
) {
56 std::cout
<< "\tprocess files: " << ipfq
[fID
] << " and "
57 << ipfq
[fID
+ 1] << "\n\n";
60 // ----- output non-redundant read-pairs ---------
61 if (trm
.op
.size() > 2) {
62 xny::openfile
<std::ofstream
> (ofhfq
, trm
.op
[fID
]);
63 xny::openfile
<std::ofstream
> (ofhfq2
, trm
.op
[fID
+ 1]);
64 xny::openfile
<std::ofstream
> (ofhs
, trm
.os
[i
]);
67 trim_pfq (ipfq
[fID
], ipfq
[fID
+1], vectors
, kindex
, ofhfq
,
68 ofhfq2
, ofhs
, trm
, lc
, batch
);
70 if (trm
.op
.size() > 2) {
71 xny::closefile(ofhfq
);
72 xny::closefile(ofhfq2
);
73 xny::closefile (ofhs
);
77 // process singleton fastq files, the result is stored in the
78 // last specified output fastq file
79 int num_sfiles
= isfq
.size();
81 xny::openfile
<std::ofstream
> (ofhsfq
, trm
.os
.back(), std::ios::app
);
82 for (int fID
= 0; fID
< num_sfiles
; ++ fID
) {
83 if (! silent
) std::cout
<< "\tprocess files: " << isfq
[fID
] << "\n\n";
84 trim_sfq (ofhsfq
, isfq
[fID
], trm
, lc
, batch
);
86 xny::closefile(ofhsfq
);
90 /** Function process_vector_file ()
92 * Given an input fasta file [vecfa], record all vectors in [vectors] and
93 * process each vector kmer then store information in
94 * [kindex]: [kmerID -> list (vecID, vecPos, dir)]
96 void process_vector_file (strvec_t
& vectors
, kindex_t
& kindex
,
97 const std::string
& vecfa
, int k
) {
99 /* ------------ read input vectors ----------- */
101 xny::openfile
<std::ifstream
>(fh
, vecfa
);
102 bio::fasta_input_iterator
<> iter_fa(fh
), end
;
103 add_fa_reads_only (vectors
, INT_MAX
, iter_fa
, end
);
104 std::cout
<< "\t\t" << vectors
.size() << " vector sequences read\n";
105 if (vectors
.size() == 0) return;
108 /* generate kindex information [kmerID -> list (vecID, vecPos, dir)] */
110 for (auto& v
: vectors
) {
112 // make sure vector doesn't contain 'n' or 'N'
113 if (std::isupper(v
.at(0))) std::replace (v
.begin(), v
.end(), 'N', 'A');
114 else std::replace (v
.begin(), v
.end(), 'n', 'a');
116 // generate (k)-spectrum for fwd & rvc strands of vector[i]
118 xny::get_bitkmer
<std::back_insert_iterator
<uvec_t
>, uint32_t>
119 (v
, std::back_inserter(kIDs
), k
, 2);
121 for (unsigned int j
= 0; j
< kIDs
.size(); ++ j
) {
122 kindex_t::iterator it
= kindex
.find(kIDs
[j
]);
123 if (it
!= kindex
.end()) {
124 if (j
% 2 == 0) { // forward strand
125 it
->second
.push_back(kloc_t(vectorID
, j
/2, true));
126 } else it
->second
.push_back(kloc_t(vectorID
, j
/2, false));
128 if (j
% 2 == 0) kindex
[kIDs
[j
]] = { kloc_t(vectorID
, j
/2, true) };
129 else kindex
[kIDs
[j
]] = { kloc_t (vectorID
, j
/2, false) };
131 } // for (unsigned int j
134 } // for (auto & v: vectors )
136 }// process_vector_file
139 /* @brief Function trim_sfq ()
141 * Apply trimming to single end fastq files
142 * -- low quality score criterion
143 * -- low complexity criterion
145 void trim_sfq (std::ofstream
& ofhsfq
, const std::string
& ifq
,
146 const trm_t
& trm
, xny::low_complexity
& lc
, int batch
) {
149 xny::openfile
<std::ifstream
>(ifhfq
, ifq
);
150 bio::fastq_input_iterator
<> iter_fq (ifhfq
), iter_end
;
154 std::vector
<fqtuple_t
> reads
;
156 while (iter_fq
!= iter_end
) {
157 add_fq_reads (reads
, batch
, iter_fq
, iter_end
);
159 int read_cnt
= reads
.size();
160 #pragma omp parallel for
161 for (int i
= 0; i
< read_cnt
; ++ i
) {
162 if (trim_lc_lq (std::get
<1>(reads
[i
]), std::get
<2>(reads
[i
]),
163 trm
.min_qual
, trm
.min_rlen
, lc
)) {
168 for (int i
= 0; i
< read_cnt
; ++ i
) {
169 if(! (std::get
<1>(reads
[i
])).empty()) {
170 ofhsfq
<< "@" << std::get
<0>(reads
[i
]) << "\n";
171 ofhsfq
<< std::get
<1>(reads
[i
]) << "\n";
173 ofhsfq
<< std::get
<2>(reads
[i
]) << "\n";
176 total_reads
+= read_cnt
;
182 xny::closefile(ifhfq
);
184 std::cout
<< "\t\ttotal reads: " << total_reads
<< ", " << num_trimmed
<< " trimmed\n";
187 /** Function trim_paired_fqfiles ()
189 * Apply trimming to two paired input fastq files
191 * -- low quality score criterion
192 * -- low complexity criterion
194 void trim_pfq (const std::string
& ifq
, const std::string
& ifq2
,
195 const strvec_t
& vectors
, const kindex_t
& kindex
, std::ofstream
& ofhfq
,
196 std::ofstream
& ofhfq2
, std::ofstream
& ofhs
, const trm_t
& trm
,
197 xny::low_complexity
& lc
, int batch
){
199 std::ifstream ifhfq
, ifhfq2
;
200 xny::openfile
<std::ifstream
>(ifhfq
, ifq
);
201 xny::openfile
<std::ifstream
>(ifhfq2
, ifq2
);
202 bio::fastq_input_iterator
<> iter_fq (ifhfq
), iter_end
, iter_fq2 (ifhfq2
);
204 int total_read_pairs
= 0;
206 std::vector
<fqtuple_t
> pairs
;
208 while (iter_fq
!= iter_end
&& iter_fq2
!= iter_end
) {
210 add_fq_reads (pairs
, batch
/2, iter_fq
, iter_end
);
211 add_fq_reads (pairs
, batch
/2, iter_fq2
, iter_end
);
213 num_trimmed
+= apply_trimming (pairs
, vectors
, kindex
, lc
, trm
);
216 int fragnum
= pairs
.size()/2;
217 for (int i
= 0; i
< fragnum
; ++ i
) {
218 if ((std::get
<1>(pairs
[i
])).empty()) { // first pair is empty
219 if (! std::get
<1>(pairs
[i
+ fragnum
]).empty()) { // 2nd not empty
220 ofhs
<< "@" << std::get
<0>(pairs
[i
+ fragnum
]) << "\n";
221 ofhs
<< std::get
<1>(pairs
[i
+ fragnum
]) << "\n";
223 ofhs
<< std::get
<2>(pairs
[i
+ fragnum
]) << "\n";
225 } else if (std::get
<1>(pairs
[i
+ fragnum
]).empty()) { // 2nd empty
226 ofhs
<< "@" << std::get
<0>(pairs
[i
]) << "\n";
227 ofhs
<< std::get
<1>(pairs
[i
]) << "\n";
229 ofhs
<< std::get
<2>(pairs
[i
]) << "\n";
231 ofhfq
<< "@" << std::get
<0>(pairs
[i
]) << "\n";
232 ofhfq
<< std::get
<1>(pairs
[i
]) << "\n";
234 ofhfq
<< std::get
<2>(pairs
[i
]) << "\n";
236 ofhfq2
<< "@" << std::get
<0>(pairs
[i
+ fragnum
]) << "\n";
237 ofhfq2
<< std::get
<1>(pairs
[i
+ fragnum
]) << "\n";
239 ofhfq2
<< std::get
<2>(pairs
[i
+ fragnum
]) << "\n";
243 total_read_pairs
+= pairs
.size()/2;
249 xny::closefile(ifhfq
);
250 xny::closefile(ifhfq2
);
252 std::cout
<< "\t\ttotal reads: " << total_read_pairs
* 2 << ", "
253 << num_trimmed
<< " trimmed\n";
256 /** Function apply_trimming ()
258 * Apply trimming to each read in [seq] return number of trimmed reads,
259 * no paired information is used
261 int apply_trimming (std::vector
<fqtuple_t
>& seq
, const strvec_t
& vectors
,
262 const kindex_t
& kindex
, xny::low_complexity
& lc
, const trm_t
& trm
) {
265 int k
= std::min (trm
.min_match
, 16);
267 bvec_t
trimmed (sz
, false); // record which reads are trimmed
268 #pragma omp parallel for
269 for (int i
= 0; i
< sz
; ++ i
) {
270 std::string rseq
= std::get
<1>(seq
[i
]);
271 std::string qual
= std::get
<2>(seq
[i
]);
272 // make sure vector doesn't contain 'n' or 'N'
273 if (std::isupper(rseq
.at(0))) {
274 std::replace (rseq
.begin(), rseq
.end(), 'N', 'A');
275 } else std::replace (rseq
.begin(), rseq
.end(), 'n', 'a');
277 // Primer Trimming: checking each read against all primers
279 bool trim_applied
= false;
282 xny::get_bitkmer
<std::back_insert_iterator
<uvec_t
>, uint32_t>
283 (rseq
, std::back_inserter(rkmers
), k
, 3);
285 for (unsigned int rPos
= 0; rPos
< rkmers
.size(); ++ rPos
) {
287 kindex_t::const_iterator it
= kindex
.find(rkmers
[rPos
]);
289 // check if we can do trimming using current kmer: kmers[j]
290 if (it
!= kindex
.end()) {
291 if (trim_primer (rseq
, qual
, it
->second
,
292 vectors
, rPos
, trm
)) {
300 // break when no more trimming can be applied to the read
301 if (!trim_applied
) break;
303 // Low quality score and low complexity trimming
304 if (trim_lc_lq (rseq
, qual
, trm
.min_qual
, trm
.min_rlen
, lc
)) {
308 std::get
<1>(seq
[i
]) = rseq
;
309 std::get
<2>(seq
[i
]) = qual
;
312 // counting the trimmed reads
314 for (int i
= 0; i
< sz
; ++ i
) { if (trimmed
[i
]) ++ num_trimmed
; }
319 /** Function trim_primer ()
321 * Apply primer trimming to a read sequence [rSeq] and quality string [qual]
322 * For every kmer shared between [rSeq] and an input vector sequence
323 * return once the first trimming event happens when
324 * the minimum match length is identified; trim is either applied to prefix,
325 * suffix, or the complete read by choosing whichever the remaining
326 * fragment is longer post-trimming.
328 bool trim_primer (std::string
& rSeq
, std::string
& qual
,
329 const std::vector
<kloc_t
>& kloclist
,
330 const strvec_t
& vectors
, int rPos
, const trm_t
& trm
) {
332 int k
= std::min (16, trm
.min_match
);
334 int rBegin
= rPos
, rEnd
= rPos
+ k
- 1;
335 int rLen
= rSeq
.length();
337 for (auto& kloc
: kloclist
) { // for each matched kmer locus
339 std::string myVec
= vectors
[std::get
<0> (kloc
)];// vector sequence
340 int vBegin
= std::get
<1> (kloc
), vEnd
= vBegin
+ k
- 1;
341 bool vDir
= std::get
<2> (kloc
); // fwd or rv direction
343 std::string rPrf
= rSeq
.substr(0, rBegin
),
344 rSuf
= rSeq
.substr(rEnd
+1, rLen
- rEnd
),
345 vPrf
= myVec
.substr(0, vBegin
),
346 vSuf
= myVec
.substr(vEnd
+1, myVec
.length() - vEnd
);
349 rEnd
+= xny::common_pref_len (rSuf
, vSuf
);
350 rBegin
-= xny::common_suf_len (rPrf
, vPrf
);
352 rEnd
+= xny::common_pref_len (rSuf
, xny::get_rvc_str (vPrf
));
353 rBegin
-= xny::common_suf_len (rPrf
, xny::get_rvc_str(vSuf
));
356 if (rEnd
- rBegin
+ 1 >= trm
.min_match
) { // apply trimming
357 int remain_suffix_len
= rLen
- rEnd
- 1,
358 remain_prefix_len
= rBegin
+ 1;
359 if (remain_suffix_len
>= remain_prefix_len
&&
360 remain_suffix_len
>= trm
.min_rlen
) { //trim prefix
361 rSeq
= rSeq
.substr(rEnd
+ 1, remain_suffix_len
);
362 qual
= qual
.substr(rEnd
+ 1, remain_suffix_len
);
363 } else if (remain_prefix_len
>= remain_suffix_len
&&
364 remain_prefix_len
>= trm
.min_rlen
) { // trim suffix
365 rSeq
= rSeq
.substr(0, remain_prefix_len
);
366 qual
= qual
.substr(0, remain_prefix_len
);
368 rSeq
= ""; // rm whole read
379 /* @brief Trim read and quality string by min quality score and
380 * low complexity criteria
382 bool trim_lc_lq (std::string
& rSeq
, std::string
& qual
, int min_qual
,
383 int min_len
, xny::low_complexity
& lc
) {
384 /* Trim max prefix and suffix where quality score of each base <= min_qual
386 int rlen
= qual
.length();
387 int first_hq_idx
= 0, last_hq_idx
= rlen
- 1;
388 for (auto& c
: qual
) { // prefix
389 if ((int) c
> min_qual
) break;
392 bool last_pos_found
= false;
393 for (; last_hq_idx
>= min_len
+ first_hq_idx
- 1; -- last_hq_idx
) {
394 if ((int) qual
.at(last_hq_idx
) - 33 > min_qual
) {
395 last_pos_found
= true;
399 if (last_pos_found
) {
400 rSeq
= rSeq
.substr(first_hq_idx
, last_hq_idx
- first_hq_idx
+ 1);
402 qual
= qual
.substr(first_hq_idx
, last_hq_idx
- first_hq_idx
+ 1);
403 if (rSeq
.length() != rlen
) return true;