modified: src1/worker.c
[GalaxyCodeBases.git] / c_cpp / etc / mVicuna / src / SeqFrqEst.cpp
blobf7eec6738c8b02a98798a89a954da3c22815a2e1
1 //========================================================================
2 // Project : M-Vicuna
3 // Name : SeqFrqEst.cpp
4 // Author : Xiao Yang
5 // Created on : Aug 6, 2013
6 // Version : 1.0
7 // Copyright : The Broad Institute
8 // SOFTWARE COPYRIGHT NOTICE AGREEMENT
9 // This software and its documentation are copyright (2013)
10 // by the Broad Institute. All rights are reserved.
12 // This software is supplied without any warranty or
13 // guaranteed support whatsoever. The Broad Institute cannot
14 // be responsible for its use, misuse, or functionality.
15 // Description :
16 //========================================================================
19 #include "SeqFrqEst.h"
21 /* @brief Given a list of paired fastq files [ipfq] and unpaired fastq
22 * files [isfq], the kmer size [k] estimate the frequency of each input
23 * read
25 void estSeqFrq (const strvec_t& ipfq, const strvec_t& isfq,
26 int k, int batch, bool silent){
28 strvec_t myfiles = ipfq;
29 myfiles.insert(myfiles.end(), isfq.begin(), isfq.end());
31 if (! silent) std::cout << "\tobtain " << k << "-mer frequency\n";
32 umap_t kcnt;
33 obtain_kfrq (kcnt, k, batch, myfiles, silent);
35 if (! silent) std::cout << "\tnumer of kmers: " << kcnt.size() << "\n";
37 ivec_t frq;
38 obtain_seqfrq (frq, kcnt, k, batch, myfiles, silent);
41 { // debug
42 std::cout << kcnt.size() << " kmers observed\n";
43 uvec_t kfrq;
44 for (auto & x: kcnt) { kfrq.push_back(x.second); }
45 std::sort (kfrq.begin(), kfrq.end());
46 for (uvec_t::reverse_iterator it = kfrq.rbegin(); it != kfrq.rend(); ++ it) {
47 std::cout << *it << "\t";
49 std::cout<< "\n\n";
50 std::cout << "# reads " << frq.size() << "\n";
51 for (auto & x: frq) std::cout << x << "\n";
53 } // estSeqFrq
55 /* @brief Estimate sequence frequency for all reads in fastq files
57 void obtain_seqfrq (ivec_t& frq, const umap_t& kcnt,
58 int k, int batch, const strvec_t& files, bool silent) {
59 strvec_t seqs;
60 int cnt = batch, total_reads = 0;
61 for (int i = 0; i < (int) files.size(); ++ i) {
63 if (!silent) std::cout << "\t\tprocess file: " << files[i] << "\n";
65 std::ifstream fh;
66 xny::openfile<std::ifstream>(fh, files[i]);
68 bio::fastq_input_iterator<> fq(fh), end;
70 while (fq != end) {
72 add_fq_reads_only (seqs, cnt, fq, end);
74 if ((int) seqs.size() >= batch) {
75 total_reads += seqs.size();
76 seq_freq (frq, kcnt, k, seqs);
77 cnt = batch;
78 seqs.clear();
79 } else { // not enough reads to fill in batch for current file pair
80 cnt = batch - seqs.size();
81 xny::closefile(fh);
83 } // while
84 } // for
85 total_reads += seqs.size();
86 seq_freq (frq, kcnt, k, seqs);
87 if (!silent) std::cout << "\t\t" << total_reads << " reads analyzed\n";
89 } // obtain_seqfrq
91 /* @brief Adding frequency of sequences [seqs] to vector [frq]
93 void seq_freq (ivec_t& frq, const umap_t& kcnt,
94 int k, const strvec_t& seqs) {
96 int sz = seqs.size();
97 ivec_t local_frq (sz, 0);
98 // int total_debug = 0;
99 #pragma omp parallel for
100 for (int i = 0; i < sz; ++ i) {
101 int frq_sum = 0;
102 uvec_t kIDs;
103 xny::get_bitkmer<std::back_insert_iterator<uvec_t>, uint32_t>
104 (seqs[i], std::back_inserter(kIDs), k, 0);
105 uset_t kID_set (kIDs.begin(), kIDs.end());
107 for (auto& id: kID_set) {
108 umap_t::const_iterator it = kcnt.find(id);
109 if (it != kcnt.end()) frq_sum += it->second;
111 if (kID_set.size() != 0) {
112 local_frq[i] = frq_sum / kID_set.size();
114 { // debug
115 if (local_frq[i] > 1000) {
116 std::cout << ">" << i << "\n" << seqs[i] << "\n";
118 uvec_t cnts;
119 for (auto& id: kIDs) {
120 int cnt = kcnt.find(id)->second;
121 std::cout << cnt << ", ";
122 cnts.push_back(cnt);
124 std::cout << "\n";
125 std::sort (cnts.begin(), cnts.end());
126 for (auto& x : cnts) std::cout << x << ", ";
127 std::cout << "\n\n";
133 frq.insert(frq.end(), local_frq.begin(), local_frq.end());
134 // std::cout << "total debug = " << total_debug << "\n";
135 } // seq_freq
137 /* @brief Obtain kmer frequency in a given set of fastq files
139 void obtain_kfrq (umap_t& kcnt, int k, int batch,
140 const strvec_t& files, bool silent) {
142 strvec_t seqs;
143 int cnt = batch, total_reads = 0;
144 for (int i = 0; i < (int) files.size(); ++ i) {
146 if (!silent) std::cout << "\t\tprocess file: " << files[i] << "\n";
148 std::ifstream fh;
149 xny::openfile<std::ifstream>(fh, files[i]);
151 bio::fastq_input_iterator<> fq(fh), end;
153 while (fq != end) {
155 add_fq_reads_only (seqs, cnt, fq, end);
157 if ((int) seqs.size() >= batch) {
158 total_reads += seqs.size();
159 kmer_cnt_in_seqs (kcnt, seqs, k);
160 cnt = batch;
161 seqs.clear();
162 } else { // not enough reads to fill in batch for current file
163 cnt = batch - seqs.size();
164 xny::closefile(fh);
166 } // while
167 } // for
168 total_reads += seqs.size();
169 kmer_cnt_in_seqs (kcnt, seqs, k);
171 if (!silent) std::cout << "\t\t" << total_reads << " reads analyzed\n";
172 } // obtain_kfrq
174 /* @brief Count kmers in a given set of sequences [seqs]
176 void kmer_cnt_in_seqs (umap_t& kcnt, const strvec_t& seqs, int k){
177 int sz = seqs.size();
178 #pragma omp parallel
180 umap_t local_kcnt; // private to each thread
182 #pragma omp for
183 for (int i = 0; i < sz; ++ i) {
184 uvec_t kIDs;
185 xny::get_bitkmer<std::back_insert_iterator<uvec_t>, uint32_t>
186 (seqs[i], std::back_inserter(kIDs), k, 0);
187 uset_t kID_set (kIDs.begin(), kIDs.end());
189 for (auto& id: kID_set) {
190 umap_t::iterator it = local_kcnt.find(id);
191 if (it != local_kcnt.end()) ++ it->second;
192 else local_kcnt[id] = 1;
195 #pragma omp critical
197 for (auto& kc: local_kcnt) {
198 umap_t::iterator it = kcnt.find(kc.first);
199 if (it != kcnt.end()) it->second += kc.second;
200 else kcnt.insert(kc);
203 } // #pragma omp parallel
205 } // kmer_cnt_in_seqs