1 //========================================================================
3 // Name : SeqFrqEst.cpp
5 // Created on : Aug 6, 2013
7 // Copyright : The Broad Institute
8 // SOFTWARE COPYRIGHT NOTICE AGREEMENT
9 // This software and its documentation are copyright (2013)
10 // by the Broad Institute. All rights are reserved.
12 // This software is supplied without any warranty or
13 // guaranteed support whatsoever. The Broad Institute cannot
14 // be responsible for its use, misuse, or functionality.
16 //========================================================================
19 #include "SeqFrqEst.h"
21 /* @brief Given a list of paired fastq files [ipfq] and unpaired fastq
22 * files [isfq], the kmer size [k] estimate the frequency of each input
25 void estSeqFrq (const strvec_t
& ipfq
, const strvec_t
& isfq
,
26 int k
, int batch
, bool silent
){
28 strvec_t myfiles
= ipfq
;
29 myfiles
.insert(myfiles
.end(), isfq
.begin(), isfq
.end());
31 if (! silent
) std::cout
<< "\tobtain " << k
<< "-mer frequency\n";
33 obtain_kfrq (kcnt
, k
, batch
, myfiles
, silent
);
35 if (! silent
) std::cout
<< "\tnumer of kmers: " << kcnt
.size() << "\n";
38 obtain_seqfrq (frq
, kcnt
, k
, batch
, myfiles
, silent
);
42 std::cout
<< kcnt
.size() << " kmers observed\n";
44 for (auto & x
: kcnt
) { kfrq
.push_back(x
.second
); }
45 std::sort (kfrq
.begin(), kfrq
.end());
46 for (uvec_t::reverse_iterator it
= kfrq
.rbegin(); it
!= kfrq
.rend(); ++ it
) {
47 std::cout
<< *it
<< "\t";
50 std::cout
<< "# reads " << frq
.size() << "\n";
51 for (auto & x
: frq
) std::cout
<< x
<< "\n";
55 /* @brief Estimate sequence frequency for all reads in fastq files
57 void obtain_seqfrq (ivec_t
& frq
, const umap_t
& kcnt
,
58 int k
, int batch
, const strvec_t
& files
, bool silent
) {
60 int cnt
= batch
, total_reads
= 0;
61 for (int i
= 0; i
< (int) files
.size(); ++ i
) {
63 if (!silent
) std::cout
<< "\t\tprocess file: " << files
[i
] << "\n";
66 xny::openfile
<std::ifstream
>(fh
, files
[i
]);
68 bio::fastq_input_iterator
<> fq(fh
), end
;
72 add_fq_reads_only (seqs
, cnt
, fq
, end
);
74 if ((int) seqs
.size() >= batch
) {
75 total_reads
+= seqs
.size();
76 seq_freq (frq
, kcnt
, k
, seqs
);
79 } else { // not enough reads to fill in batch for current file pair
80 cnt
= batch
- seqs
.size();
85 total_reads
+= seqs
.size();
86 seq_freq (frq
, kcnt
, k
, seqs
);
87 if (!silent
) std::cout
<< "\t\t" << total_reads
<< " reads analyzed\n";
91 /* @brief Adding frequency of sequences [seqs] to vector [frq]
93 void seq_freq (ivec_t
& frq
, const umap_t
& kcnt
,
94 int k
, const strvec_t
& seqs
) {
97 ivec_t
local_frq (sz
, 0);
98 // int total_debug = 0;
99 #pragma omp parallel for
100 for (int i
= 0; i
< sz
; ++ i
) {
103 xny::get_bitkmer
<std::back_insert_iterator
<uvec_t
>, uint32_t>
104 (seqs
[i
], std::back_inserter(kIDs
), k
, 0);
105 uset_t
kID_set (kIDs
.begin(), kIDs
.end());
107 for (auto& id
: kID_set
) {
108 umap_t::const_iterator it
= kcnt
.find(id
);
109 if (it
!= kcnt
.end()) frq_sum
+= it
->second
;
111 if (kID_set
.size() != 0) {
112 local_frq
[i
] = frq_sum
/ kID_set
.size();
115 if (local_frq
[i
] > 1000) {
116 std::cout
<< ">" << i
<< "\n" << seqs
[i
] << "\n";
119 for (auto& id: kIDs) {
120 int cnt = kcnt.find(id)->second;
121 std::cout << cnt << ", ";
125 std::sort (cnts.begin(), cnts.end());
126 for (auto& x : cnts) std::cout << x << ", ";
133 frq
.insert(frq
.end(), local_frq
.begin(), local_frq
.end());
134 // std::cout << "total debug = " << total_debug << "\n";
137 /* @brief Obtain kmer frequency in a given set of fastq files
139 void obtain_kfrq (umap_t
& kcnt
, int k
, int batch
,
140 const strvec_t
& files
, bool silent
) {
143 int cnt
= batch
, total_reads
= 0;
144 for (int i
= 0; i
< (int) files
.size(); ++ i
) {
146 if (!silent
) std::cout
<< "\t\tprocess file: " << files
[i
] << "\n";
149 xny::openfile
<std::ifstream
>(fh
, files
[i
]);
151 bio::fastq_input_iterator
<> fq(fh
), end
;
155 add_fq_reads_only (seqs
, cnt
, fq
, end
);
157 if ((int) seqs
.size() >= batch
) {
158 total_reads
+= seqs
.size();
159 kmer_cnt_in_seqs (kcnt
, seqs
, k
);
162 } else { // not enough reads to fill in batch for current file
163 cnt
= batch
- seqs
.size();
168 total_reads
+= seqs
.size();
169 kmer_cnt_in_seqs (kcnt
, seqs
, k
);
171 if (!silent
) std::cout
<< "\t\t" << total_reads
<< " reads analyzed\n";
174 /* @brief Count kmers in a given set of sequences [seqs]
176 void kmer_cnt_in_seqs (umap_t
& kcnt
, const strvec_t
& seqs
, int k
){
177 int sz
= seqs
.size();
180 umap_t local_kcnt
; // private to each thread
183 for (int i
= 0; i
< sz
; ++ i
) {
185 xny::get_bitkmer
<std::back_insert_iterator
<uvec_t
>, uint32_t>
186 (seqs
[i
], std::back_inserter(kIDs
), k
, 0);
187 uset_t
kID_set (kIDs
.begin(), kIDs
.end());
189 for (auto& id
: kID_set
) {
190 umap_t::iterator it
= local_kcnt
.find(id
);
191 if (it
!= local_kcnt
.end()) ++ it
->second
;
192 else local_kcnt
[id
] = 1;
197 for (auto& kc
: local_kcnt
) {
198 umap_t::iterator it
= kcnt
.find(kc
.first
);
199 if (it
!= kcnt
.end()) it
->second
+= kc
.second
;
200 else kcnt
.insert(kc
);
203 } // #pragma omp parallel
205 } // kmer_cnt_in_seqs