2 * Copyright (c) 2010 Nicolaus Lance Hepler
5 * Permission is hereby granted, free of charge, to any person
6 * obtaining a copy of this software and associated documentation
7 * files (the "Software"), to deal in the Software without
8 * restriction, including without limitation the rights to use,
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
26 // Note: That's an MIT license.
27 // All double-commented comments below are from Nicolaus Lance Hepler.
28 // Original repository: https://code.google.com/archive/p/swalign/
30 /* Tao Liu made some modifications.... Note to myself: maybe I should use ksw.c/h instead */
37 #define MISMATCH -3.0 /* the scoring parameters are from BLASTN default */
38 // ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz
39 #define TRANS "TVGHEFCDIJMLKNOPQYWAABSXRZ[\\]^_`tvghefcdijmlknopqywaabsxrz"
40 #define TRANS_OFFSET 65
47 // /* reverse a string str0 in place, return str */
48 static char* reverse(char *str
, unsigned int l
) {
50 char *right
= left
+ l
- 1;
52 while (left
< right
) {
61 // Return the reverse complement of a sequence.
62 char* revcomp(char *str
) {
64 char *right
= left
+ strlen(str
) - 1;
67 while (left
< right
) {
68 tmp
= get_char_comp(*left
);
69 *(left
++) = get_char_comp(*right
);
76 // Return the complement of a base.
77 // Uses a simple lookup table: a string with the complements of all possible sequence characters.
78 static char get_char_comp(char c
) {
79 int i
= c
- TRANS_OFFSET
;
80 if (i
< 0 || i
> 57) {
87 // return the alignment of two sequences
88 static align_t
*traceback(seq_pair_t
*problem
, unsigned short *S
, int row
, int col
, float score
, unsigned short *ngap_vertical
, unsigned short *ngap_horizontal
) {
90 unsigned int m
= problem
->alen
+ 1;
91 unsigned int n
= problem
->blen
+ 1;
92 align_t
*result
= malloc(sizeof(align_t
));
93 seq_pair_t
*seqs
= malloc(sizeof(seq_pair_t
));
95 int max_len
= problem
->alen
+ problem
->blen
; /* maximum length of alignment */
97 char reversed1
[max_len
]; /* reversed sequence #1 */
98 char reversed2
[max_len
]; /* reversed sequence #2 */
99 char reversed3
[max_len
]; /* reversed markup */
101 unsigned int len1
= 0; /* length of seq #1 in alignment */
102 unsigned int len2
= 0; /* length of seq #2 in alignment */
103 unsigned int len3
= 0; /* length of the markup line in alignment */
105 unsigned int identity
= 0; // count of identitcal pairs
106 unsigned int gaps
= 0; // count of gaps
110 int i
= row
; // traceback start row
111 int j
= col
; // traceback start col
113 bool still_going
= true; // traceback flag: true -> continue & false -> stop
115 while ( still_going
) {
118 for (l
= 0, l2
= ngap_vertical
[k
+ j
]; l
< l2
; l
++) {
119 reversed1
[len1
++] = problem
->a
[--i
];
120 reversed2
[len2
++] = '-';
121 reversed3
[len3
++] = ' ';
127 c1
= problem
->a
[--i
];
128 c2
= problem
->b
[--j
];
130 reversed1
[len1
++] = c1
;
131 reversed2
[len2
++] = c2
;
133 reversed3
[len3
++] = '|';
136 reversed3
[len3
++] = '.';
139 for (l
= 0, l2
= ngap_horizontal
[k
+ j
]; l
< l2
; l
++) {
140 reversed1
[len1
++] = '-';
141 reversed2
[len2
++] = problem
->b
[--j
];
142 reversed3
[len3
++] = ' ';
151 seqs
->a
= malloc(sizeof(char) * (len1
+ 1) );
152 seqs
->b
= malloc(sizeof(char) * (len2
+ 1) );
153 result
->markup
= malloc(sizeof(char) * (len3
+ 1) );
155 memset(seqs
->a
, '\0', sizeof(char) * (len1
+ 1));
156 memset(seqs
->b
, '\0', sizeof(char) * (len2
+ 1));
157 memset(result
->markup
, '\0', sizeof(char) * (len3
+ 1) );
159 reverse(reversed1
, len1
);
160 reverse(reversed2
, len2
);
161 reverse(reversed3
, len3
);
163 strcpy(seqs
->a
, reversed1
);
164 strcpy(seqs
->b
, reversed2
);
165 strcpy(result
->markup
, reversed3
);
171 result
->score
= score
;
172 result
->matches
= identity
;
179 //printf("%d %d %d",len1, len2, len3);
184 void destroy_seq_pair(seq_pair_t
*pair
) {
191 void destroy_align(align_t
*ali
) {
192 destroy_seq_pair( ali
->seqs
);
197 //align_t *smith_waterman(seq_pair_t *problem, bool local) {
198 align_t
*smith_waterman(seq_pair_t
*problem
) {
199 unsigned int m
= problem
->alen
+ 1;
200 unsigned int n
= problem
->blen
+ 1;
202 /* traceback matrix */
203 unsigned short *S
= malloc(sizeof(unsigned short) * m
* n
); /* 0 = STOP; 1 = LEFT; 2 = DIAGONAL; 3 = UP */
205 /* number of vertical gaps for each cell */
206 unsigned short *ngap_vertical
= malloc(sizeof(unsigned short) * m
* n
);
207 /* number of horizontal gaps for each cell */
208 unsigned short *ngap_horizontal
= malloc(sizeof(unsigned short) * m
* n
);
211 unsigned int i
, j
, k
, l
;
213 float f
; /* score of final alignment */
214 float * g
= malloc(sizeof(float) * n
); /* score if x_i aligns to a gap after y_i */
215 float h
; /* score if y_i aligns to a gap after x_i */
216 float * v
= malloc(sizeof(float) * n
); /* best of score of alignment x_1 ... x_i to y_1 ... y_i */
219 float sim_score
, g1
, g2
, h1
, h2
;
221 /* row and col number, and score of optimal cell in the matrix */
222 unsigned int t_row
, t_col
;
225 /* Initialize traceback matrix */
226 for ( i
= 0, k
= 0; i
< m
; i
++, k
+= n
)
228 for ( j
= 1; j
< n
; j
++ )
231 /* set number of gaps */
232 for (i
= 0, k
= 0; i
< m
; i
++, k
+= n
)
233 for (j
= 0; j
< n
; j
++)
234 ngap_vertical
[k
+ j
] = ngap_horizontal
[k
+ j
] = 1;
245 for ( j
= 1; j
< n
; j
++ ) {
250 for (i
= 1, k
= n
; i
< m
; i
++, k
+= n
) { /* i for row number/# on seq1; k for position of first column in m*n matrix; */
253 for (j
= 1, l
= k
+ 1; j
< n
; j
++, l
++) { /* j for column number/# on seq2; l for position of ith row jth column in m*n matrix; */
254 sim_score
= (strncmp(problem
->a
+(i
-1), problem
->b
+(j
-1), 1) == 0) ? MATCH
: MISMATCH
;
256 /* set direction in traceback matrix */
257 f
= v_diagonal
+ sim_score
;
262 if ( g1
> g2
) { /* perfer gap extension in vertical direction (seq 1) */
264 ngap_vertical
[l
] = (short) (ngap_vertical
[l
- n
] + 1);
265 } else { /* prefer gap openning */
272 if (h1
> h2
) { /* prefer gap extention in horizontal direction ( seq 2) */
274 ngap_horizontal
[l
] = (short) (ngap_horizontal
[l
- 1] + 1);
280 //v[j] = max( f, g[j], h, 0 ); /* the final score */
282 /* Determine the traceback direction */
283 if ( f
<= 0 && g
[j
] <=0 && h
<=0 ) { /* 0 is max */
287 else if ( g
[j
] <= f
&& h
<= f
) { /* or f is max */
291 else if ( h
<= g
[j
]) { /* or g[j] is max */
295 else { /* or h is max */
300 // Set the traceback start at the current cell i, j and score
301 if (v
[j
] > t_score
) {
309 result
= traceback(problem
, S
, t_row
, t_col
, t_score
, ngap_vertical
, ngap_horizontal
);
311 // print_matrix(S, problem);
317 free(ngap_horizontal
);
322 void print_alignment(align_t
*result
) {
323 printf("Score: %0.0f Matches: %d Gaps: %d\n", result
->score
, result
->matches
, result
->gaps
);
324 printf("Target: %3d %s %-3d\n", result
->start_a
, result
->seqs
->a
, result
->end_a
);
325 printf(" %s \n", result
->markup
);
326 printf("Query: %3d %s %-3d\n", result
->start_b
, result
->seqs
->b
, result
->end_b
);
329 int main(int argc
, const char **argv
) {
332 printf("usage: swalign TARGET_SEQ QUERY_SEQ\n");
339 char c
[strlen(argv
[1])], d
[strlen(argv
[2])];
345 problem
.alen
= strlen(problem
.a
);
347 problem
.blen
= strlen(problem
.b
);
349 result
= smith_waterman(&problem
);
351 print_alignment(result
);