implement PETrackII for fragment files from scATAC experiments
[MACS.git] / MACS3 / Signal / swalign.c
blobd28d0f4a6c2c6034ea465a3d90b6a1f0ce7d5a49
1 /*
2 * Copyright (c) 2010 Nicolaus Lance Hepler
3 *
4 *
5 * Permission is hereby granted, free of charge, to any person
6 * obtaining a copy of this software and associated documentation
7 * files (the "Software"), to deal in the Software without
8 * restriction, including without limitation the rights to use,
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following
12 * conditions:
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
26 // Note: That's an MIT license.
27 // All double-commented comments below are from Nicolaus Lance Hepler.
28 // Original repository: https://code.google.com/archive/p/swalign/
30 /* Tao Liu made some modifications.... Note to myself: maybe I should use ksw.c/h instead */
32 #include "swalign.h"
34 #define GAPO -10.0
35 #define GAPE -2.0
36 #define MATCH 2.0
37 #define MISMATCH -3.0 /* the scoring parameters are from BLASTN default */
38 // ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz
39 #define TRANS "TVGHEFCDIJMLKNOPQYWAABSXRZ[\\]^_`tvghefcdijmlknopqywaabsxrz"
40 #define TRANS_OFFSET 65
42 #define STOP 0
43 #define LEFT 1
44 #define DIAGONAL 2
45 #define UP 3
47 // /* reverse a string str0 in place, return str */
48 static char* reverse(char *str, unsigned int l) {
49 char *left = str;
50 char *right = left + l - 1;
51 char tmp;
52 while (left < right) {
53 tmp = *left;
54 *(left++) = *right;
55 *(right--) = tmp;
57 *(str+l) = '\0';
58 return str;
61 // Return the reverse complement of a sequence.
62 char* revcomp(char *str) {
63 char *left = str;
64 char *right = left + strlen(str) - 1;
65 char tmp;
67 while (left < right) {
68 tmp = get_char_comp(*left);
69 *(left++) = get_char_comp(*right);
70 *(right--) = tmp;
73 return str;
76 // Return the complement of a base.
77 // Uses a simple lookup table: a string with the complements of all possible sequence characters.
78 static char get_char_comp(char c) {
79 int i = c - TRANS_OFFSET;
80 if (i < 0 || i > 57) {
81 return c;
82 } else {
83 return TRANS[i];
87 // return the alignment of two sequences
88 static align_t *traceback(seq_pair_t *problem, unsigned short *S, int row, int col, float score, unsigned short *ngap_vertical, unsigned short *ngap_horizontal) {
89 unsigned int l, l2;
90 unsigned int m = problem->alen + 1;
91 unsigned int n = problem->blen + 1;
92 align_t *result = malloc(sizeof(align_t));
93 seq_pair_t *seqs = malloc(sizeof(seq_pair_t));
95 int max_len = problem->alen + problem->blen; /* maximum length of alignment */
97 char reversed1[max_len]; /* reversed sequence #1 */
98 char reversed2[max_len]; /* reversed sequence #2 */
99 char reversed3[max_len]; /* reversed markup */
101 unsigned int len1 = 0; /* length of seq #1 in alignment */
102 unsigned int len2 = 0; /* length of seq #2 in alignment */
103 unsigned int len3 = 0; /* length of the markup line in alignment */
105 unsigned int identity = 0; // count of identitcal pairs
106 unsigned int gaps = 0; // count of gaps
108 char c1, c2;
110 int i = row; // traceback start row
111 int j = col; // traceback start col
112 int k = i * n;
113 bool still_going = true; // traceback flag: true -> continue & false -> stop
115 while ( still_going ) {
116 switch ( S[k+j] ) {
117 case UP:
118 for (l = 0, l2 = ngap_vertical[k + j]; l < l2; l++) {
119 reversed1[len1++] = problem->a[--i];
120 reversed2[len2++] = '-';
121 reversed3[len3++] = ' ';
122 k -= n;
123 gaps++;
125 break;
126 case DIAGONAL:
127 c1 = problem->a[--i];
128 c2 = problem->b[--j];
129 k -= n;
130 reversed1[len1++] = c1;
131 reversed2[len2++] = c2;
132 if (c1 == c2) {
133 reversed3[len3++] = '|';
134 identity++;
135 } else
136 reversed3[len3++] = '.';
137 break;
138 case LEFT:
139 for (l = 0, l2 = ngap_horizontal[k + j]; l < l2; l++) {
140 reversed1[len1++] = '-';
141 reversed2[len2++] = problem->b[--j];
142 reversed3[len3++] = ' ';
143 gaps++;
145 break;
146 case STOP:
147 still_going = false;
151 seqs->a = malloc(sizeof(char) * (len1 + 1) );
152 seqs->b = malloc(sizeof(char) * (len2 + 1) );
153 result->markup = malloc(sizeof(char) * (len3 + 1) );
155 memset(seqs->a, '\0', sizeof(char) * (len1 + 1));
156 memset(seqs->b, '\0', sizeof(char) * (len2 + 1));
157 memset(result->markup, '\0', sizeof(char) * (len3 + 1) );
159 reverse(reversed1, len1);
160 reverse(reversed2, len2);
161 reverse(reversed3, len3);
163 strcpy(seqs->a, reversed1);
164 strcpy(seqs->b, reversed2);
165 strcpy(result->markup, reversed3);
167 seqs->alen = k;
168 seqs->blen = k;
170 result->seqs = seqs;
171 result->score = score;
172 result->matches = identity;
173 result->gaps = gaps;
174 result->start_a = i;
175 result->start_b = j;
176 result->end_a = row;
177 result->end_b = col;
179 //printf("%d %d %d",len1, len2, len3);
181 return result;
184 void destroy_seq_pair(seq_pair_t *pair) {
185 free(pair->a);
186 free(pair->b);
187 free(pair);
188 return;
191 void destroy_align(align_t *ali) {
192 destroy_seq_pair( ali->seqs );
193 free(ali->markup);
194 return;
197 //align_t *smith_waterman(seq_pair_t *problem, bool local) {
198 align_t *smith_waterman(seq_pair_t *problem) {
199 unsigned int m = problem->alen + 1;
200 unsigned int n = problem->blen + 1;
202 /* traceback matrix */
203 unsigned short *S = malloc(sizeof(unsigned short) * m * n); /* 0 = STOP; 1 = LEFT; 2 = DIAGONAL; 3 = UP */
205 /* number of vertical gaps for each cell */
206 unsigned short *ngap_vertical = malloc(sizeof(unsigned short) * m * n);
207 /* number of horizontal gaps for each cell */
208 unsigned short *ngap_horizontal = malloc(sizeof(unsigned short) * m * n);
210 align_t *result;
211 unsigned int i, j, k, l;
213 float f; /* score of final alignment */
214 float * g = malloc(sizeof(float) * n); /* score if x_i aligns to a gap after y_i */
215 float h; /* score if y_i aligns to a gap after x_i */
216 float * v = malloc(sizeof(float) * n); /* best of score of alignment x_1 ... x_i to y_1 ... y_i */
217 float v_diagonal;
219 float sim_score, g1, g2, h1, h2;
221 /* row and col number, and score of optimal cell in the matrix */
222 unsigned int t_row, t_col;
223 float t_score;
225 /* Initialize traceback matrix */
226 for ( i = 0, k = 0; i < m; i++, k+= n )
227 S[k] = STOP;
228 for ( j = 1; j < n; j++ )
229 S[j] = STOP;
231 /* set number of gaps */
232 for (i = 0, k = 0; i < m; i++, k += n)
233 for (j = 0; j < n; j++)
234 ngap_vertical[k + j] = ngap_horizontal[k + j] = 1;
237 g[0] = -INFINITY;
238 h = -INFINITY;
239 v[0] = 0;
241 t_row = 0;
242 t_col = 0;
243 t_score = -INFINITY;
245 for ( j = 1; j < n; j++ ) {
246 g[j] = -INFINITY;
247 v[j] = 0;
250 for (i = 1, k = n; i < m; i++, k += n) { /* i for row number/# on seq1; k for position of first column in m*n matrix; */
251 h = -INFINITY;
252 v_diagonal = v[0];
253 for (j = 1, l = k + 1; j < n; j++, l++) { /* j for column number/# on seq2; l for position of ith row jth column in m*n matrix; */
254 sim_score = (strncmp(problem->a+(i-1), problem->b+(j-1), 1) == 0) ? MATCH : MISMATCH;
256 /* set direction in traceback matrix */
257 f = v_diagonal + sim_score;
259 g1 = g[j] + GAPE;
260 g2 = v[j] + GAPO;
262 if ( g1 > g2 ) { /* perfer gap extension in vertical direction (seq 1) */
263 g[j] = g1;
264 ngap_vertical[l] = (short) (ngap_vertical[l - n] + 1);
265 } else { /* prefer gap openning */
266 g[j] = g2;
269 h1 = h + GAPE;
270 h2 = v[j-1] + GAPO;
272 if (h1 > h2) { /* prefer gap extention in horizontal direction ( seq 2) */
273 h = h1;
274 ngap_horizontal[l] = (short) (ngap_horizontal[l - 1] + 1);
275 } else {
276 h = h2;
279 v_diagonal = v[j];
280 //v[j] = max( f, g[j], h, 0 ); /* the final score */
282 /* Determine the traceback direction */
283 if ( f <= 0 && g[j] <=0 && h <=0 ) { /* 0 is max */
284 v[j] = 0;
285 S[l] = STOP;
287 else if ( g[j] <= f && h <= f ) { /* or f is max */
288 v[j] = f;
289 S[l] = DIAGONAL;
291 else if ( h <= g[j]) { /* or g[j] is max */
292 v[j] = g[j];
293 S[l] = UP;
295 else { /* or h is max */
296 v[j] = h;
297 S[l] = LEFT;
300 // Set the traceback start at the current cell i, j and score
301 if (v[j] > t_score) {
302 t_row = i;
303 t_col = j;
304 t_score = v[j];
309 result = traceback(problem, S, t_row, t_col, t_score, ngap_vertical, ngap_horizontal);
311 // print_matrix(S, problem);
313 free(S);
314 free(g);
315 free(v);
316 free(ngap_vertical);
317 free(ngap_horizontal);
319 return result;
322 void print_alignment(align_t *result) {
323 printf("Score: %0.0f Matches: %d Gaps: %d\n", result->score, result->matches, result->gaps);
324 printf("Target: %3d %s %-3d\n", result->start_a, result->seqs->a, result->end_a);
325 printf(" %s \n", result->markup);
326 printf("Query: %3d %s %-3d\n", result->start_b, result->seqs->b, result->end_b);
329 int main(int argc, const char **argv) {
331 if (argc != 3) {
332 printf("usage: swalign TARGET_SEQ QUERY_SEQ\n");
333 exit(1);
337 seq_pair_t problem;
338 align_t *result;
339 char c[strlen(argv[1])], d[strlen(argv[2])];
341 strcpy(c, argv[1]);
342 strcpy(d, argv[2]);
344 problem.a = c;
345 problem.alen = strlen(problem.a);
346 problem.b = d;
347 problem.blen = strlen(problem.b);
349 result = smith_waterman(&problem);
351 print_alignment(result);
354 exit(0);