modified: src1/worker.c
[GalaxyCodeBases.git] / c_cpp / etc / mVicuna / src / jaz / sequence_compare.hpp
blobf37fc5dde0442fcae3c4a9a5ba2a4e0647ec4cce
1 /***
2 * $Id$
3 **
4 * File: sequence_compare.hpp
5 * Created: May 03, 2012
7 * Author: Jaroslaw Zola <jaroslaw.zola@gmail.com>
8 * Xiao Yang <isuyang@gmail.com>
9 * Copyright (c) 2012-2013 Jaroslaw Zola
10 * Distributed under the Boost Software License.
12 * Boost Software License - Version 1.0 - August 17th, 2003
14 * Permission is hereby granted, free of charge, to any person or organization
15 * obtaining a copy of the software and accompanying documentation covered by
16 * this license (the "Software") to use, reproduce, display, distribute,
17 * execute, and transmit the Software, and to prepare derivative works of the
18 * Software, and to permit third-parties to whom the Software is furnished to
19 * do so, all subject to the following:
21 * The copyright notices in the Software and this entire statement, including
22 * the above license grant, this restriction and the following disclaimer,
23 * must be included in all copies of the Software, in whole or in part, and
24 * all derivative works of the Software, unless such copies or derivative
25 * works are solely in the form of machine-executable object code generated by
26 * a source language processor.
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
31 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
32 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
33 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
34 * DEALINGS IN THE SOFTWARE.
37 #ifndef SEQUENCE_COMPARE_HPP
38 #define SEQUENCE_COMPARE_HPP
40 #include <algorithm>
41 #include <cmath>
42 #include <cstring>
43 #include <fstream>
44 #include <functional>
45 #include <limits>
46 #include <climits>
47 #include <map>
48 #include <string>
49 #include <vector>
50 #include <tuple>
53 namespace bio {
55 namespace detail {
57 // this code comes from jaz
58 template <typename Iter1, typename Iter2, typename Pred>
59 std::size_t intersection_size(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2, Pred pred) {
60 std::size_t S = 0;
62 while ((first1 != last1) && (first2 != last2)) {
63 if (pred(*first1, *first2)) ++first1;
64 else if (pred(*first2, *first1)) ++first2;
65 else {
66 first1++;
67 first2++;
68 S++;
70 } // while
72 return S;
73 } // intersection_size
76 template <typename Iter1, typename Iter2>
77 int count_distance(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2) {
78 int S = 0;
80 while ((first1 != last1) && (first2 != last2)) {
81 if (first1->first < first2->first) {
82 S += (first1->second * first1->second);
83 ++first1;
85 else if (first2->first < first1->first) {
86 S += (first2->second * first2->second);
87 ++first2;
89 else {
90 int d = (first1->second - first2->second);
91 S += d * d;
92 first1++;
93 first2++;
95 } // while
97 return S;
98 } // count_distance
101 template <typename Sequence> void general_kmer_index(const std::string& s, unsigned int k, Sequence& S) {
102 unsigned int l = s.size();
103 unsigned int end = l - k + 1;
104 S.resize(end);
105 for (unsigned int i = 0; i < end; ++i) {
106 S[i] = std::string(s.begin() + i, s.begin() + i + k);
108 std::sort(S.begin(), S.end());
109 } // general_kmer_index
112 template <typename Map> void general_kmer_count(const std::string& s, unsigned int k, Map& S) {
113 S.clear();
114 unsigned int l = s.size();
115 unsigned int end = l - k + 1;
116 for (unsigned int i = 0; i < end; ++i) {
117 S[std::string(s.begin() + i, s.begin() + i + k)]++;
119 } // general_kmer_count
122 class dna_digit {
123 public:
124 dna_digit() {
125 std::memset(digit_, 0, 256);
126 digit_['c'] = digit_['C'] = 1;
127 digit_['g'] = digit_['G'] = 2;
128 digit_['t'] = digit_['T'] = 3;
129 digit_['u'] = digit_['U'] = 3;
130 } // dna_digit
132 protected:
133 char digit_[256];
135 }; // dna_digit
138 class dna_kmer_index : public dna_digit {
139 public:
140 dna_kmer_index() : dna_digit() { }
142 template <typename Sequence>
143 void operator()(const std::string& s, unsigned int k, Sequence& S) {
144 unsigned int l = s.size();
145 unsigned int end = l - k + 1;
147 S.resize(end);
149 // first kmer
150 unsigned long long int v = digit_[s[k - 1]];
151 for (unsigned int i = 0; i < k - 1; ++i) {
152 v += digit_[s[i]] * (1ULL << ((k - i - 1) << 1));
155 S[0] = v;
157 // and then all other
158 unsigned long long int b = 1ULL << ((k - 1) << 1);
160 for (unsigned int i = 1; i < end; ++i) {
161 v = (v - b * digit_[s[i - 1]]) * 4 + digit_[s[i + k - 1]];
162 S[i] = v;
165 std::sort(S.begin(), S.end());
166 } // operator()
168 }; // class dna_kmer_index
171 class dna_kmer_count : public dna_digit {
172 public:
173 dna_kmer_count() : dna_digit() { }
175 template <typename Map>
176 void operator()(const std::string& s, unsigned int k, Map& S) {
177 unsigned int l = s.size();
178 unsigned int end = l - k + 1;
180 // first kmer
181 unsigned long long int v = digit_[s[k - 1]];
183 for (unsigned int i = 0; i < k - 1; ++i) {
184 v += digit_[s[i]] * (1ULL << ((k - i - 1) << 1));
187 S[v] = 1;
189 // and then all other
190 unsigned long long int b = 1ULL << ((k - 1) << 1);
192 for (unsigned int i = 1; i < end; ++i) {
193 v = (v - b * digit_[s[i - 1]]) * 4 + digit_[s[i + k - 1]];
194 S[v]++;
196 } // operator()
198 }; // class dna_kmer_count
200 } // namespace detail
204 template <typename Derived> struct sequence_compare {
205 std::tuple<int, int, int> operator()(const std::string& s0, const std::string& s1) {
206 return static_cast<Derived*>(this)->operator()(s0, s1);
208 }; // struct sequence_compare
211 /** Class: scoring_matrix
213 * Functor encapsulating a scoring_matrix functionality.
215 class scoring_matrix {
216 public:
217 scoring_matrix() : sz_(0), matrix_(0) { std::memset(sigma_, 0, 256); }
219 /** Constructor: scoring_matrix
221 * Parameter:
222 * sigma - Map of the alphabet used by the matrix.
223 * matrix - Raw-wise substitution matrix.
225 scoring_matrix(unsigned char sigma[256], const std::vector<char>& matrix)
226 : matrix_(matrix), sz_(std::sqrt(matrix.size())) { std::memcpy(sigma_, sigma, 256); }
228 /** Function: operator()
230 * Returns:
231 * Substitution score between a and b.
233 int operator()(char a, char b) const { return matrix_[sigma_[a] * sz_ + sigma_[b]]; }
236 private:
237 unsigned int sz_;
238 unsigned char sigma_[256];
239 std::vector<char> matrix_;
241 }; // scoring_matrix
244 inline scoring_matrix make_dummy_sm(int m, int s) {
245 unsigned char sigma[256];
246 for (unsigned int i = 0; i < 256; ++i) sigma[i] = i;
247 std::vector<char> matrix(256 * 256, s);
248 for (unsigned int i = 0; i < 256; ++i) matrix[(i << 8) + i] = m;
249 return scoring_matrix(sigma, matrix);
250 } // make_dummy_sm
252 inline scoring_matrix make_dna_sm(int m, int s) {
253 unsigned char sigma[256];
254 for (unsigned int i = 0; i < 256; ++i) sigma[i] = 4;
256 sigma['a'] = sigma['A'] = 0;
257 sigma['c'] = sigma['C'] = 1;
258 sigma['g'] = sigma['G'] = 2;
259 sigma['t'] = sigma['T'] = 3;
261 std::vector<char> matrix(5 * 5, s);
262 for (unsigned int i = 0; i < 5; ++i) matrix[(5 * i) + i] = m;
264 return scoring_matrix(sigma, matrix);
265 }; // make_dna_sm
267 inline bool read_file_sm(const std::string& name, scoring_matrix& sub) {
268 std::ifstream f(name.c_str());
269 if (!f) return false;
271 unsigned char sigma[256];
272 std::string buf;
274 // read comments
275 while (!f.eof()) {
276 buf = "";
277 std::getline(f, buf);
278 if ((buf.empty() == true) || (buf[0] != '#')) break;
279 } // while
281 if (buf.empty() == true) return false;
283 // parse column header
284 std::string head = buf;
285 head.erase(std::remove(head.begin(), head.end(), ' '), head.end());
287 int len = head.size();
288 if (head.at (head.length() - 1) != '*') return false;
290 std::vector<char> matrix(len * len, 0);
292 for (int i = 0; i < 256; ++i) sigma[i] = len - 1;
293 for (int i = 0; i < len - 1; ++i) sigma[head[i]] = i;
295 // read matrix
296 for (unsigned int i = 0; i < len; ++i) {
297 char id = 0;
298 int val;
300 f >> id;
301 if (!f || (id != head[i])) return false;
303 for (unsigned int j = 0; j < len; ++j) {
304 f >> val;
305 if (!f) return false;
306 unsigned int pos0 = sigma[head[i]];
307 unsigned int pos1 = sigma[head[j]];
308 matrix[pos0 * len + pos1] = val;
309 } // for j
310 } // for i
312 f.close();
314 sub = scoring_matrix(sigma, matrix);
315 return true;
316 } // read_scoring_matrix
319 /** Class: global_alignment
320 * Functor implementing global pairwise sequence alignment
321 * with affine gap penalty. The implementation is alphabet oblivious.
322 * No thread safety guarantees.
324 * set_alignment_type() function modifies the global alignment type:
326 * type 0: default normal affine gap global alignment
327 * type 1: suffix-prefix or containment alignment, where end gaps
328 * are penalty free (differing from type 0: matrix initialization
329 * and traceback starts from the max value of last row/col)
330 * type 2: prefix alignment with penalty free gaps in the suffix
331 * (differing from type 0, traceback starts from the max value of the
332 * last row/col)
333 * e.g. ACTGTTCCC
334 * -CTGTT---
335 * the first indel is penalized whereas the last 3 indels are not
337 class global_alignment : public sequence_compare<global_alignment> {
338 public:
339 /** Constructor: global_alignment
341 * Parameter:
342 * m - Match score (some positive number).
343 * s - Substitution penalty (usually negative number).
344 * g - Gap opening penalty (negative number).
345 * h - Gap extension penalty (negative number).
347 explicit global_alignment(int m = 0, int s = 0, int g = 0, int h = 0)
348 : sub_(make_dummy_sm(m, s)), g_(g), h_(h) {
349 alignment_type_ = 0;
350 path_prefix_ = path_suffix_ = "";
354 /** Constructor: global_alignment
356 * Parameter:
357 * sm - Substitution matrix.
358 * g - Gap opening penalty (negative number).
359 * h - Gap extension penalty (negative number).
361 global_alignment(const scoring_matrix& sm, int g, int h)
362 : sub_(sm), g_(g), h_(h) {
363 alignment_type_ = 0;
364 path_prefix_ = path_suffix_ = "";
367 void set_alignment_type (int flag) {
368 alignment_type_ = flag;
371 /** Function: operator()
372 * Compute alignment score between s0 and s1.
374 * Returns:
375 * 3-tuple (alignment score, alignment length without terminal gaps,
376 * number of matches).
378 std::tuple<int, int, int> operator()(const std::string& s0, const std::string& s1) {
379 int n = s0.size();
380 int m = s1.size();
382 S_.clear(); I_.clear(); D_.clear(); track_.clear();
383 // S(i, j) = max{ I(i, j), D(i, j), S(i - 1, j - 1) + d(i,j) }
384 // D(i, j) = max{ D(i, j - 1), S(i, j - 1) + g } + h
385 // I(i, j) = max{ I(i - 1, j), S(i - 1, j) + g } + h
386 S_.resize ((n + 1) * (m + 1));
387 I_.resize ((n + 1) * (m + 1));
388 D_.resize ((n + 1) * (m + 1));
389 track_.resize((n + 1) * (m + 1));
391 // ------ matrix initialization for first row and column ------
392 S_[0] = 0;
393 if (alignment_type_ == 1) I_[0] = D_[0] = 0;
394 else I_[0] = D_[0] = g_;
396 for (int j = 1; j <= m; ++ j) { // first row
397 track_[j] = LEFT;
398 I_[j] = -100;
399 if (alignment_type_ == 1) D_[j] = S_[j] = 0;
400 else S_[j] = D_[j] = g_ + j * h_;
403 for (int i = 1; i <= n; ++ i) { // first col
404 int idx = (m + 1) * i;
405 track_[idx] = TOP;
406 D_[idx] = -100;
407 if (alignment_type_ == 1) I_[idx] = S_[idx] = 0;
408 else S_[idx] = I_[idx] = g_ + i * h_;
411 // ----------------- matrix filling --------------------
412 for (int i = 1; i <= n; ++ i) { // row
413 for (int j = 1; j <= m; ++ j) { // col
414 int idx = i * (m + 1) + j;
416 D_[idx] = std::max (D_[idx - 1], S_[idx - 1] + g_) + h_;
417 I_[idx] = std::max (I_[idx - m - 1], S_[idx - m - 1] + g_) + h_;
418 S_[idx] = S_[idx - m - 2] + sub_(s0[i - 1], s1[j - 1]);
420 track_[idx] = DIAG;
421 if (D_[idx] >= S_[idx]) {
422 S_[idx] = D_[idx];
423 track_[idx] = LEFT;
425 if (I_[idx] >= S_[idx]) {
426 S_[idx] = I_[idx];
427 track_[idx] = TOP;
432 {// debug: print out content of S_ I_ D_
433 for (int i = 0; i <= n ; ++ i) {
434 for (int j = 0; j <= m; ++ j) {
435 int idx = i * (m + 1) + j;
436 std::cout << std::setw(6) << S_[idx];
438 std::cout << "\n";
439 for (int j = 0; j <= m; ++ j) {
440 int idx = i * (m + 1) + j;
441 std::cout << std::setw(6) << I_[idx];
443 std::cout << "\n";
444 for (int j = 0; j <= m; ++ j) {
445 int idx = i * (m + 1) + j;
446 std::cout << std::setw(6) << D_[idx];
448 std::cout << "\n";
449 std::cout << "\n";
451 // trace matrix
452 for (int i = 0; i <= n; ++ i) {
453 for (int j = 0; j <= m; ++ j) {
454 int idx = i * (m + 1) + j;
455 int track = track_[idx];
456 switch (track){
457 case 0: std::cout << "Top\t";
458 break;
459 case 1: std::cout << "lef\t";
460 break;
461 case 2: std::cout << "dia\t";
462 break;
465 std::cout << "\n";
469 // ----------------- backtracking --------------------
470 int i = n;
471 int j = m;
472 int max_val = S_.back();
473 if (alignment_type_ != 0) {
474 // identify the idx of the max value of last row
475 max_val = INT_MIN;
476 int row = n, col = 1; // row col wrt to (n+1)x(m+1) matrix
477 for (; col <= m; ++ col) { // last row
478 int idx = row * (m + 1) + col;
479 if (S_[idx] > max_val) {
480 max_val = S_[idx];
481 i = row;
482 j = col;
485 // identify the idx of the max value of last col
486 col = m; row = 1;
487 for (; row <= n; ++ row) {
488 int idx = row * (m + 1) + col;
489 if (S_[idx] > max_val) {
490 max_val = S_[idx];
491 i = row;
492 j = col;
496 if (i < n) path_prefix_ = std::string (n - i, 'D');
497 else if (j < m) path_prefix_ = std::string (m - j, 'I');
501 /*{ // debug print
502 std::cout << "path_prefix = " << path_prefix_ << "\n";
503 std::cout << "i, j: " << i << ", " << j << "\n";
506 int match = 0;
507 int length = 0;
509 bool has_gap = false;
510 int sgap = 0;
511 int egap = 0;
513 has_path_ = false;
514 path_.clear();
516 while ((i > 0) || (j > 0)) {
518 std::cout << "i, j = " << i << ", " << j << "\n";
520 switch (track_[i * (m + 1) + j]) {
521 case TOP:
522 --i;
523 sgap++;
524 path_.push_back('D');
525 break;
527 case LEFT:
528 --j;
529 sgap++;
530 path_.push_back('I');
531 break;
533 case DIAG:
535 if (s0[i - 1] == s1[j - 1]) {
536 match++;
537 path_.push_back('M');
538 } else path_.push_back('R');
540 --i;
541 --j;
543 if (has_gap == false) {
544 has_gap = true;
545 egap = sgap;
548 sgap = 0;
549 break;
550 } // switch
552 length ++;
553 } // while
556 std::cout << "i, j = " << i << "," << j << "\n";
559 //if (i > 0) path_suffix_ = std::string (i, 'D');
560 //else if (j > 0) path_suffix_ = std::string (j, 'I');
562 path_ = path_prefix_ + path_;
563 //path_ += path_suffix_;
565 return std::make_tuple(max_val, length - sgap - egap, match);
566 } // operator()
568 /** Function: path
569 * Backtrack the last computed alignment to obtain its edit path.
571 * Returns:
572 * Edit path where 'I' means insert wrt s0, 'D' is deletion, 'R' is substitution,
573 * and 'M' is match.
575 std::string path() {
576 if (has_path_ == false) {
577 std::reverse(path_.begin(), path_.end());
578 has_path_ = true;
580 return path_;
581 } // path
584 private:
585 enum { TOP, LEFT, DIAG };
587 int alignment_type_; // default 0: standard global alignment
588 // 1: end gap free alignment,
589 // suffix-prefix/containment etc
590 // 2: suffix gap free alignment,
591 // penalize gaps in the prefix
592 bool has_path_;
593 std::string path_;
594 std::string path_prefix_, // if alignment end within one string,
595 path_suffix_; // these store what needs to be attached,
596 // in prefix or suffix e.g. DDDD or III
597 std::vector<unsigned char> track_;
599 std::vector<int> S_;
600 std::vector<int> I_;
601 std::vector<int> D_;
602 scoring_matrix sub_;
604 int g_;
605 int h_;
607 }; // class global_alignment
610 } // namespace bio
612 #endif // SEQUENCE_COMPARE_HPP