4 * File: sequence_compare.hpp
5 * Created: May 03, 2012
7 * Author: Jaroslaw Zola <jaroslaw.zola@gmail.com>
8 * Xiao Yang <isuyang@gmail.com>
9 * Copyright (c) 2012-2013 Jaroslaw Zola
10 * Distributed under the Boost Software License.
12 * Boost Software License - Version 1.0 - August 17th, 2003
14 * Permission is hereby granted, free of charge, to any person or organization
15 * obtaining a copy of the software and accompanying documentation covered by
16 * this license (the "Software") to use, reproduce, display, distribute,
17 * execute, and transmit the Software, and to prepare derivative works of the
18 * Software, and to permit third-parties to whom the Software is furnished to
19 * do so, all subject to the following:
21 * The copyright notices in the Software and this entire statement, including
22 * the above license grant, this restriction and the following disclaimer,
23 * must be included in all copies of the Software, in whole or in part, and
24 * all derivative works of the Software, unless such copies or derivative
25 * works are solely in the form of machine-executable object code generated by
26 * a source language processor.
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
31 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
32 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
33 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
34 * DEALINGS IN THE SOFTWARE.
37 #ifndef SEQUENCE_COMPARE_HPP
38 #define SEQUENCE_COMPARE_HPP
57 // this code comes from jaz
58 template <typename Iter1
, typename Iter2
, typename Pred
>
59 std::size_t intersection_size(Iter1 first1
, Iter1 last1
, Iter2 first2
, Iter2 last2
, Pred pred
) {
62 while ((first1
!= last1
) && (first2
!= last2
)) {
63 if (pred(*first1
, *first2
)) ++first1
;
64 else if (pred(*first2
, *first1
)) ++first2
;
73 } // intersection_size
76 template <typename Iter1
, typename Iter2
>
77 int count_distance(Iter1 first1
, Iter1 last1
, Iter2 first2
, Iter2 last2
) {
80 while ((first1
!= last1
) && (first2
!= last2
)) {
81 if (first1
->first
< first2
->first
) {
82 S
+= (first1
->second
* first1
->second
);
85 else if (first2
->first
< first1
->first
) {
86 S
+= (first2
->second
* first2
->second
);
90 int d
= (first1
->second
- first2
->second
);
101 template <typename Sequence
> void general_kmer_index(const std::string
& s
, unsigned int k
, Sequence
& S
) {
102 unsigned int l
= s
.size();
103 unsigned int end
= l
- k
+ 1;
105 for (unsigned int i
= 0; i
< end
; ++i
) {
106 S
[i
] = std::string(s
.begin() + i
, s
.begin() + i
+ k
);
108 std::sort(S
.begin(), S
.end());
109 } // general_kmer_index
112 template <typename Map
> void general_kmer_count(const std::string
& s
, unsigned int k
, Map
& S
) {
114 unsigned int l
= s
.size();
115 unsigned int end
= l
- k
+ 1;
116 for (unsigned int i
= 0; i
< end
; ++i
) {
117 S
[std::string(s
.begin() + i
, s
.begin() + i
+ k
)]++;
119 } // general_kmer_count
125 std::memset(digit_
, 0, 256);
126 digit_
['c'] = digit_
['C'] = 1;
127 digit_
['g'] = digit_
['G'] = 2;
128 digit_
['t'] = digit_
['T'] = 3;
129 digit_
['u'] = digit_
['U'] = 3;
138 class dna_kmer_index
: public dna_digit
{
140 dna_kmer_index() : dna_digit() { }
142 template <typename Sequence
>
143 void operator()(const std::string
& s
, unsigned int k
, Sequence
& S
) {
144 unsigned int l
= s
.size();
145 unsigned int end
= l
- k
+ 1;
150 unsigned long long int v
= digit_
[s
[k
- 1]];
151 for (unsigned int i
= 0; i
< k
- 1; ++i
) {
152 v
+= digit_
[s
[i
]] * (1ULL << ((k
- i
- 1) << 1));
157 // and then all other
158 unsigned long long int b
= 1ULL << ((k
- 1) << 1);
160 for (unsigned int i
= 1; i
< end
; ++i
) {
161 v
= (v
- b
* digit_
[s
[i
- 1]]) * 4 + digit_
[s
[i
+ k
- 1]];
165 std::sort(S
.begin(), S
.end());
168 }; // class dna_kmer_index
171 class dna_kmer_count
: public dna_digit
{
173 dna_kmer_count() : dna_digit() { }
175 template <typename Map
>
176 void operator()(const std::string
& s
, unsigned int k
, Map
& S
) {
177 unsigned int l
= s
.size();
178 unsigned int end
= l
- k
+ 1;
181 unsigned long long int v
= digit_
[s
[k
- 1]];
183 for (unsigned int i
= 0; i
< k
- 1; ++i
) {
184 v
+= digit_
[s
[i
]] * (1ULL << ((k
- i
- 1) << 1));
189 // and then all other
190 unsigned long long int b
= 1ULL << ((k
- 1) << 1);
192 for (unsigned int i
= 1; i
< end
; ++i
) {
193 v
= (v
- b
* digit_
[s
[i
- 1]]) * 4 + digit_
[s
[i
+ k
- 1]];
198 }; // class dna_kmer_count
200 } // namespace detail
204 template <typename Derived
> struct sequence_compare
{
205 std::tuple
<int, int, int> operator()(const std::string
& s0
, const std::string
& s1
) {
206 return static_cast<Derived
*>(this)->operator()(s0
, s1
);
208 }; // struct sequence_compare
211 /** Class: scoring_matrix
213 * Functor encapsulating a scoring_matrix functionality.
215 class scoring_matrix
{
217 scoring_matrix() : sz_(0), matrix_(0) { std::memset(sigma_
, 0, 256); }
219 /** Constructor: scoring_matrix
222 * sigma - Map of the alphabet used by the matrix.
223 * matrix - Raw-wise substitution matrix.
225 scoring_matrix(unsigned char sigma
[256], const std::vector
<char>& matrix
)
226 : matrix_(matrix
), sz_(std::sqrt(matrix
.size())) { std::memcpy(sigma_
, sigma
, 256); }
228 /** Function: operator()
231 * Substitution score between a and b.
233 int operator()(char a
, char b
) const { return matrix_
[sigma_
[a
] * sz_
+ sigma_
[b
]]; }
238 unsigned char sigma_
[256];
239 std::vector
<char> matrix_
;
244 inline scoring_matrix
make_dummy_sm(int m
, int s
) {
245 unsigned char sigma
[256];
246 for (unsigned int i
= 0; i
< 256; ++i
) sigma
[i
] = i
;
247 std::vector
<char> matrix(256 * 256, s
);
248 for (unsigned int i
= 0; i
< 256; ++i
) matrix
[(i
<< 8) + i
] = m
;
249 return scoring_matrix(sigma
, matrix
);
252 inline scoring_matrix
make_dna_sm(int m
, int s
) {
253 unsigned char sigma
[256];
254 for (unsigned int i
= 0; i
< 256; ++i
) sigma
[i
] = 4;
256 sigma
['a'] = sigma
['A'] = 0;
257 sigma
['c'] = sigma
['C'] = 1;
258 sigma
['g'] = sigma
['G'] = 2;
259 sigma
['t'] = sigma
['T'] = 3;
261 std::vector
<char> matrix(5 * 5, s
);
262 for (unsigned int i
= 0; i
< 5; ++i
) matrix
[(5 * i
) + i
] = m
;
264 return scoring_matrix(sigma
, matrix
);
267 inline bool read_file_sm(const std::string
& name
, scoring_matrix
& sub
) {
268 std::ifstream
f(name
.c_str());
269 if (!f
) return false;
271 unsigned char sigma
[256];
277 std::getline(f
, buf
);
278 if ((buf
.empty() == true) || (buf
[0] != '#')) break;
281 if (buf
.empty() == true) return false;
283 // parse column header
284 std::string head
= buf
;
285 head
.erase(std::remove(head
.begin(), head
.end(), ' '), head
.end());
287 int len
= head
.size();
288 if (head
.at (head
.length() - 1) != '*') return false;
290 std::vector
<char> matrix(len
* len
, 0);
292 for (int i
= 0; i
< 256; ++i
) sigma
[i
] = len
- 1;
293 for (int i
= 0; i
< len
- 1; ++i
) sigma
[head
[i
]] = i
;
296 for (unsigned int i
= 0; i
< len
; ++i
) {
301 if (!f
|| (id
!= head
[i
])) return false;
303 for (unsigned int j
= 0; j
< len
; ++j
) {
305 if (!f
) return false;
306 unsigned int pos0
= sigma
[head
[i
]];
307 unsigned int pos1
= sigma
[head
[j
]];
308 matrix
[pos0
* len
+ pos1
] = val
;
314 sub
= scoring_matrix(sigma
, matrix
);
316 } // read_scoring_matrix
319 /** Class: global_alignment
320 * Functor implementing global pairwise sequence alignment
321 * with affine gap penalty. The implementation is alphabet oblivious.
322 * No thread safety guarantees.
324 * set_alignment_type() function modifies the global alignment type:
326 * type 0: default normal affine gap global alignment
327 * type 1: suffix-prefix or containment alignment, where end gaps
328 * are penalty free (differing from type 0: matrix initialization
329 * and traceback starts from the max value of last row/col)
330 * type 2: prefix alignment with penalty free gaps in the suffix
331 * (differing from type 0, traceback starts from the max value of the
335 * the first indel is penalized whereas the last 3 indels are not
337 class global_alignment
: public sequence_compare
<global_alignment
> {
339 /** Constructor: global_alignment
342 * m - Match score (some positive number).
343 * s - Substitution penalty (usually negative number).
344 * g - Gap opening penalty (negative number).
345 * h - Gap extension penalty (negative number).
347 explicit global_alignment(int m
= 0, int s
= 0, int g
= 0, int h
= 0)
348 : sub_(make_dummy_sm(m
, s
)), g_(g
), h_(h
) {
350 path_prefix_
= path_suffix_
= "";
354 /** Constructor: global_alignment
357 * sm - Substitution matrix.
358 * g - Gap opening penalty (negative number).
359 * h - Gap extension penalty (negative number).
361 global_alignment(const scoring_matrix
& sm
, int g
, int h
)
362 : sub_(sm
), g_(g
), h_(h
) {
364 path_prefix_
= path_suffix_
= "";
367 void set_alignment_type (int flag
) {
368 alignment_type_
= flag
;
371 /** Function: operator()
372 * Compute alignment score between s0 and s1.
375 * 3-tuple (alignment score, alignment length without terminal gaps,
376 * number of matches).
378 std::tuple
<int, int, int> operator()(const std::string
& s0
, const std::string
& s1
) {
382 S_
.clear(); I_
.clear(); D_
.clear(); track_
.clear();
383 // S(i, j) = max{ I(i, j), D(i, j), S(i - 1, j - 1) + d(i,j) }
384 // D(i, j) = max{ D(i, j - 1), S(i, j - 1) + g } + h
385 // I(i, j) = max{ I(i - 1, j), S(i - 1, j) + g } + h
386 S_
.resize ((n
+ 1) * (m
+ 1));
387 I_
.resize ((n
+ 1) * (m
+ 1));
388 D_
.resize ((n
+ 1) * (m
+ 1));
389 track_
.resize((n
+ 1) * (m
+ 1));
391 // ------ matrix initialization for first row and column ------
393 if (alignment_type_
== 1) I_
[0] = D_
[0] = 0;
394 else I_
[0] = D_
[0] = g_
;
396 for (int j
= 1; j
<= m
; ++ j
) { // first row
399 if (alignment_type_
== 1) D_
[j
] = S_
[j
] = 0;
400 else S_
[j
] = D_
[j
] = g_
+ j
* h_
;
403 for (int i
= 1; i
<= n
; ++ i
) { // first col
404 int idx
= (m
+ 1) * i
;
407 if (alignment_type_
== 1) I_
[idx
] = S_
[idx
] = 0;
408 else S_
[idx
] = I_
[idx
] = g_
+ i
* h_
;
411 // ----------------- matrix filling --------------------
412 for (int i
= 1; i
<= n
; ++ i
) { // row
413 for (int j
= 1; j
<= m
; ++ j
) { // col
414 int idx
= i
* (m
+ 1) + j
;
416 D_
[idx
] = std::max (D_
[idx
- 1], S_
[idx
- 1] + g_
) + h_
;
417 I_
[idx
] = std::max (I_
[idx
- m
- 1], S_
[idx
- m
- 1] + g_
) + h_
;
418 S_
[idx
] = S_
[idx
- m
- 2] + sub_(s0
[i
- 1], s1
[j
- 1]);
421 if (D_
[idx
] >= S_
[idx
]) {
425 if (I_
[idx
] >= S_
[idx
]) {
432 {// debug: print out content of S_ I_ D_
433 for (int i
= 0; i
<= n
; ++ i
) {
434 for (int j
= 0; j
<= m
; ++ j
) {
435 int idx
= i
* (m
+ 1) + j
;
436 std::cout
<< std::setw(6) << S_
[idx
];
439 for (int j
= 0; j
<= m
; ++ j
) {
440 int idx
= i
* (m
+ 1) + j
;
441 std::cout
<< std::setw(6) << I_
[idx
];
444 for (int j
= 0; j
<= m
; ++ j
) {
445 int idx
= i
* (m
+ 1) + j
;
446 std::cout
<< std::setw(6) << D_
[idx
];
452 for (int i
= 0; i
<= n
; ++ i
) {
453 for (int j
= 0; j
<= m
; ++ j
) {
454 int idx
= i
* (m
+ 1) + j
;
455 int track
= track_
[idx
];
457 case 0: std::cout
<< "Top\t";
459 case 1: std::cout
<< "lef\t";
461 case 2: std::cout
<< "dia\t";
469 // ----------------- backtracking --------------------
472 int max_val
= S_
.back();
473 if (alignment_type_
!= 0) {
474 // identify the idx of the max value of last row
476 int row
= n
, col
= 1; // row col wrt to (n+1)x(m+1) matrix
477 for (; col
<= m
; ++ col
) { // last row
478 int idx
= row
* (m
+ 1) + col
;
479 if (S_
[idx
] > max_val
) {
485 // identify the idx of the max value of last col
487 for (; row
<= n
; ++ row
) {
488 int idx
= row
* (m
+ 1) + col
;
489 if (S_
[idx
] > max_val
) {
496 if (i
< n
) path_prefix_
= std::string (n
- i
, 'D');
497 else if (j
< m
) path_prefix_
= std::string (m
- j
, 'I');
502 std::cout << "path_prefix = " << path_prefix_ << "\n";
503 std::cout << "i, j: " << i << ", " << j << "\n";
509 bool has_gap
= false;
516 while ((i
> 0) || (j
> 0)) {
518 std::cout
<< "i, j = " << i
<< ", " << j
<< "\n";
520 switch (track_
[i
* (m
+ 1) + j
]) {
524 path_
.push_back('D');
530 path_
.push_back('I');
535 if (s0
[i
- 1] == s1
[j
- 1]) {
537 path_
.push_back('M');
538 } else path_
.push_back('R');
543 if (has_gap
== false) {
556 std::cout
<< "i, j = " << i
<< "," << j
<< "\n";
559 //if (i > 0) path_suffix_ = std::string (i, 'D');
560 //else if (j > 0) path_suffix_ = std::string (j, 'I');
562 path_
= path_prefix_
+ path_
;
563 //path_ += path_suffix_;
565 return std::make_tuple(max_val
, length
- sgap
- egap
, match
);
569 * Backtrack the last computed alignment to obtain its edit path.
572 * Edit path where 'I' means insert wrt s0, 'D' is deletion, 'R' is substitution,
576 if (has_path_
== false) {
577 std::reverse(path_
.begin(), path_
.end());
585 enum { TOP
, LEFT
, DIAG
};
587 int alignment_type_
; // default 0: standard global alignment
588 // 1: end gap free alignment,
589 // suffix-prefix/containment etc
590 // 2: suffix gap free alignment,
591 // penalize gaps in the prefix
594 std::string path_prefix_
, // if alignment end within one string,
595 path_suffix_
; // these store what needs to be attached,
596 // in prefix or suffix e.g. DDDD or III
597 std::vector
<unsigned char> track_
;
607 }; // class global_alignment
612 #endif // SEQUENCE_COMPARE_HPP