modified: makefile
[GalaxyCodeBases.git] / funny / string_alignment / nbt0704-909-S1.c
blob9eec8154d197fa1d60da12dec8829649a29b101a
1 /* An example of global sequence alignment by dynamic programming.
3 * The program is ANSI C and should compile on any machine that has
4 * a C compiler, with a command line like:
5 * gcc -o global global.c
6 * To run the program:
7 * ./global
9 * To change the sequences or the scoring system, you have to edit the
10 * appropriate #define's at the top of the program, and recompile.
11 * This is meant to be a simple working example of the mechanics of
12 * dynamic programming sequence alignment. The details of reading
13 * FASTA files and such are left as an exercise to the reader...
15 * SRE, Wed Jun 2 08:54:40 2004
20 /* Define the problem here: two sequences SEQX and SEQY,
21 * and the simple scoring system (MATCH, MISMATCH, and
22 * the linear gap score INDEL).
24 #define SEQX "TTCATA"
25 #define SEQY "TGCTCGTA"
27 #define MATCH 5
28 #define MISMATCH -2
29 #define INDEL -6
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <string.h>
37 int
38 main(void)
40 char *x,*y; /* the two sequences, x_1..M and y_1..N */
41 int M,N; /* lengths of sequences x,y */
42 int i,j; /* residue coords in x and y */
43 int **S; /* dynamic programming matrix */
44 char *ax,*ay; /* result: the two aligned sequences */
45 int K; /* length of pairwise alignment */
46 int k; /* index in pairwise alignment, 0..K-1 */
47 int sc; /* tmp variable used to find best path */
48 char tmp; /* tmp variable used to swap chars */
51 /*****************************************************************
52 * Some initial bookkeeping.
53 *****************************************************************/
54 /* Initialize the sequences x,y, using SEQX and SEQY.
56 * In C, arrays are indexed 0..N-1, but we want our sequences to be
57 * indexed 1..N, and we need row/column 0 in the DP matrix for
58 * initialization conditions. That's why we do some +1 fiddling
59 * here, so our sequences are x_1..x_M and y_1..y_N.
61 M = strlen(SEQX);
62 N = strlen(SEQY);
63 x = malloc(sizeof(char) * (M+2)); /* +2: leading blank, and trailing \0 */
64 y = malloc(sizeof(char) * (N+2));
65 strcpy(x+1, SEQX); /* x_1..x_M now defined; x_0 undefined */
66 strcpy(y+1, SEQY); /* y_1..y_N now defined; y_0 undefined */
68 /* Allocate the dynamic programming matrix, 0..M rows by 0..N columns.
70 S = malloc(sizeof(int *) * (M+1));
71 for (i = 0; i <= M; i++)
72 S[i] = malloc(sizeof(int) * (N+1));
77 /*****************************************************************
78 * Recursion: the heart of the DP algorithm.
79 *****************************************************************/
80 /* Initialization.
82 S[0][0] = 0;
83 for (i = 1; i <= M; i++) S[i][0] = i * INDEL;
84 for (j = 1; j <= N; j++) S[0][j] = j * INDEL;
86 /* The dynamic programming, global alignment (Needleman/Wunsch) recursion.
88 for (i = 1; i <= M; i++)
89 for (j = 1; j <= N; j++)
91 /* case #1: i,j are aligned */
92 if (x[i] == y[j]) S[i][j] = S[i-1][j-1] + MATCH;
93 else S[i][j] = S[i-1][j-1] + MISMATCH;
95 sc = S[i-1][j] + INDEL; /* case #2: i aligned to - */
96 if (sc > S[i][j]) S[i][j] = sc;
98 sc = S[i][j-1] + INDEL; /* case #3: j aligned to - */
99 if (sc > S[i][j]) S[i][j] = sc;
101 /* The result (optimal alignment score) is now in S[M][N]. */
108 /*****************************************************************
109 * Traceback
110 *****************************************************************
112 * Traceback is a little fiddly, maybe a little obvious, and
113 * there's more than one way to do it, so it often gets glossed
114 * over in intros to DP.
116 * There are two general strategies that people use for
117 * tracebacks. You can keep a "shadow matrix" of traceback
118 * pointers in the recursion, so that you remember which
119 * case won in every cell S(i,j). Alternatively, you can
120 * recapitulate the calculations to figure out which case
121 * won.
123 * Recapitulation is usually a little simpler, for small examples;
124 * but shadow matrices are probably more common in real-world
125 * implementations. Recapitulation is used here.
127 * Because we trace backwards, and we don't know the length of
128 * the alignment 'til we're done, we have a bookkeeping problem
129 * with creating the aligned sequences. There's more than one
130 * way to solve it. A simple (but brutish) way is to allocate
131 * for the maximum length alignment (M+N; happens if x,y are
132 * completely unaligned to each other), make the alignment backwards,
133 * then reverse the aligned strings.
136 /* Allocate for our aligned strings (which will be 0..K-1)
138 ax = malloc(sizeof(char) * (M+N+1));
139 ay = malloc(sizeof(char) * (M+N+1));
141 /* Start at M,N; work backwards */
142 i = M;
143 j = N;
144 k = 0;
145 while (i > 0 && j > 0)
147 /* Case 1: best was x_i aligned to x_j?
149 if (i > 0 && j > 0) { /* watch out for boundaries */
150 sc = S[i-1][j-1];
151 if (x[i] == y[j]) sc += MATCH; else sc += MISMATCH;
152 if (sc == S[i][j]) { /* residue i aligns to j: */
153 ax[k] = x[i]; ay[k] = y[j]; k++; /* record the alignment */
154 i--; j--; /* move to next cell in trace */
155 continue;
159 /* Case 2: best was x_i aligned to -?
161 if (i > 0) { /* watch out for boundaries */
162 if (S[i-1][j] + INDEL == S[i][j]) {
163 ax[k] = x[i]; ay[k] = '-'; k++; /* record the alignment */
164 i--; /* move to next cell in trace */
165 continue;
169 /* Case 3: best was - aligned to y_j?
171 if (j > 0) { /* watch out for boundaries */
172 if (S[i][j-1] + INDEL == S[i][j]) {
173 ax[k] = '-'; ay[k] = y[j]; k++; /* record the alignment */
174 j--; /* move to next cell in trace */
175 continue;
179 /* Done with the traceback.
180 * Now ax[0..k-1] and ay[0..k-1] are aligned strings, but backwards;
181 * and k is our alignment length.
182 * Reverse the strings and we're done.
184 K = k; /* K = remember the alignment length */
185 for (k = 0; k < K/2; k++)
186 { /* a sly, efficient in-place reversal by swapping pairs of chars: */
187 tmp = ax[K-k-1]; ax[K-k-1] = ax[k]; ax[k] = tmp;
188 tmp = ay[K-k-1]; ay[K-k-1] = ay[k]; ay[k] = tmp;
192 /*****************************************************************
193 * Output
194 *****************************************************************
196 printf("Sequence X: %s\n", x+1);
197 printf("Sequence Y: %s\n", y+1);
198 printf("Scoring system: %d for match; %d for mismatch; %d for gap\n",
199 MATCH, MISMATCH, INDEL);
200 printf("\n");
202 printf("Dynamic programming matrix:\n");
203 for (j = -1; j <= N; j++)
204 printf(" %c ", j > 0 ? y[j] : ' ');
205 printf("\n");
206 for (i = 0; i <= M; i++)
208 printf(" %c ", i > 0 ? x[i] : ' ');
209 for (j = 0; j <= N; j++)
210 printf("%4d ", S[i][j]);
211 printf("\n");
213 printf("\n");
215 printf("Alignment:\n");
216 printf("X: %s\n", ax);
217 printf("Y: %s\n", ay);
219 printf("Optimum alignment score: %d\n\n", S[M][N]);
221 /* Free the memory we allocated; then exit.
223 free(x); free(y);
224 free(ax); free(ay);
225 for (i = 0; i <= M; i++) free(S[i]);
226 free(S);
227 exit(0);