1 /* An example of global sequence alignment by dynamic programming.
3 * The program is ANSI C and should compile on any machine that has
4 * a C compiler, with a command line like:
5 * gcc -o global global.c
9 * To change the sequences or the scoring system, you have to edit the
10 * appropriate #define's at the top of the program, and recompile.
11 * This is meant to be a simple working example of the mechanics of
12 * dynamic programming sequence alignment. The details of reading
13 * FASTA files and such are left as an exercise to the reader...
15 * SRE, Wed Jun 2 08:54:40 2004
20 /* Define the problem here: two sequences SEQX and SEQY,
21 * and the simple scoring system (MATCH, MISMATCH, and
22 * the linear gap score INDEL).
25 #define SEQY "TGCTCGTA"
40 char *x
,*y
; /* the two sequences, x_1..M and y_1..N */
41 int M
,N
; /* lengths of sequences x,y */
42 int i
,j
; /* residue coords in x and y */
43 int **S
; /* dynamic programming matrix */
44 char *ax
,*ay
; /* result: the two aligned sequences */
45 int K
; /* length of pairwise alignment */
46 int k
; /* index in pairwise alignment, 0..K-1 */
47 int sc
; /* tmp variable used to find best path */
48 char tmp
; /* tmp variable used to swap chars */
51 /*****************************************************************
52 * Some initial bookkeeping.
53 *****************************************************************/
54 /* Initialize the sequences x,y, using SEQX and SEQY.
56 * In C, arrays are indexed 0..N-1, but we want our sequences to be
57 * indexed 1..N, and we need row/column 0 in the DP matrix for
58 * initialization conditions. That's why we do some +1 fiddling
59 * here, so our sequences are x_1..x_M and y_1..y_N.
63 x
= malloc(sizeof(char) * (M
+2)); /* +2: leading blank, and trailing \0 */
64 y
= malloc(sizeof(char) * (N
+2));
65 strcpy(x
+1, SEQX
); /* x_1..x_M now defined; x_0 undefined */
66 strcpy(y
+1, SEQY
); /* y_1..y_N now defined; y_0 undefined */
68 /* Allocate the dynamic programming matrix, 0..M rows by 0..N columns.
70 S
= malloc(sizeof(int *) * (M
+1));
71 for (i
= 0; i
<= M
; i
++)
72 S
[i
] = malloc(sizeof(int) * (N
+1));
77 /*****************************************************************
78 * Recursion: the heart of the DP algorithm.
79 *****************************************************************/
83 for (i
= 1; i
<= M
; i
++) S
[i
][0] = i
* INDEL
;
84 for (j
= 1; j
<= N
; j
++) S
[0][j
] = j
* INDEL
;
86 /* The dynamic programming, global alignment (Needleman/Wunsch) recursion.
88 for (i
= 1; i
<= M
; i
++)
89 for (j
= 1; j
<= N
; j
++)
91 /* case #1: i,j are aligned */
92 if (x
[i
] == y
[j
]) S
[i
][j
] = S
[i
-1][j
-1] + MATCH
;
93 else S
[i
][j
] = S
[i
-1][j
-1] + MISMATCH
;
95 sc
= S
[i
-1][j
] + INDEL
; /* case #2: i aligned to - */
96 if (sc
> S
[i
][j
]) S
[i
][j
] = sc
;
98 sc
= S
[i
][j
-1] + INDEL
; /* case #3: j aligned to - */
99 if (sc
> S
[i
][j
]) S
[i
][j
] = sc
;
101 /* The result (optimal alignment score) is now in S[M][N]. */
108 /*****************************************************************
110 *****************************************************************
112 * Traceback is a little fiddly, maybe a little obvious, and
113 * there's more than one way to do it, so it often gets glossed
114 * over in intros to DP.
116 * There are two general strategies that people use for
117 * tracebacks. You can keep a "shadow matrix" of traceback
118 * pointers in the recursion, so that you remember which
119 * case won in every cell S(i,j). Alternatively, you can
120 * recapitulate the calculations to figure out which case
123 * Recapitulation is usually a little simpler, for small examples;
124 * but shadow matrices are probably more common in real-world
125 * implementations. Recapitulation is used here.
127 * Because we trace backwards, and we don't know the length of
128 * the alignment 'til we're done, we have a bookkeeping problem
129 * with creating the aligned sequences. There's more than one
130 * way to solve it. A simple (but brutish) way is to allocate
131 * for the maximum length alignment (M+N; happens if x,y are
132 * completely unaligned to each other), make the alignment backwards,
133 * then reverse the aligned strings.
136 /* Allocate for our aligned strings (which will be 0..K-1)
138 ax
= malloc(sizeof(char) * (M
+N
+1));
139 ay
= malloc(sizeof(char) * (M
+N
+1));
141 /* Start at M,N; work backwards */
145 while (i
> 0 && j
> 0)
147 /* Case 1: best was x_i aligned to x_j?
149 if (i
> 0 && j
> 0) { /* watch out for boundaries */
151 if (x
[i
] == y
[j
]) sc
+= MATCH
; else sc
+= MISMATCH
;
152 if (sc
== S
[i
][j
]) { /* residue i aligns to j: */
153 ax
[k
] = x
[i
]; ay
[k
] = y
[j
]; k
++; /* record the alignment */
154 i
--; j
--; /* move to next cell in trace */
159 /* Case 2: best was x_i aligned to -?
161 if (i
> 0) { /* watch out for boundaries */
162 if (S
[i
-1][j
] + INDEL
== S
[i
][j
]) {
163 ax
[k
] = x
[i
]; ay
[k
] = '-'; k
++; /* record the alignment */
164 i
--; /* move to next cell in trace */
169 /* Case 3: best was - aligned to y_j?
171 if (j
> 0) { /* watch out for boundaries */
172 if (S
[i
][j
-1] + INDEL
== S
[i
][j
]) {
173 ax
[k
] = '-'; ay
[k
] = y
[j
]; k
++; /* record the alignment */
174 j
--; /* move to next cell in trace */
179 /* Done with the traceback.
180 * Now ax[0..k-1] and ay[0..k-1] are aligned strings, but backwards;
181 * and k is our alignment length.
182 * Reverse the strings and we're done.
184 K
= k
; /* K = remember the alignment length */
185 for (k
= 0; k
< K
/2; k
++)
186 { /* a sly, efficient in-place reversal by swapping pairs of chars: */
187 tmp
= ax
[K
-k
-1]; ax
[K
-k
-1] = ax
[k
]; ax
[k
] = tmp
;
188 tmp
= ay
[K
-k
-1]; ay
[K
-k
-1] = ay
[k
]; ay
[k
] = tmp
;
192 /*****************************************************************
194 *****************************************************************
196 printf("Sequence X: %s\n", x
+1);
197 printf("Sequence Y: %s\n", y
+1);
198 printf("Scoring system: %d for match; %d for mismatch; %d for gap\n",
199 MATCH
, MISMATCH
, INDEL
);
202 printf("Dynamic programming matrix:\n");
203 for (j
= -1; j
<= N
; j
++)
204 printf(" %c ", j
> 0 ? y
[j
] : ' ');
206 for (i
= 0; i
<= M
; i
++)
208 printf(" %c ", i
> 0 ? x
[i
] : ' ');
209 for (j
= 0; j
<= N
; j
++)
210 printf("%4d ", S
[i
][j
]);
215 printf("Alignment:\n");
216 printf("X: %s\n", ax
);
217 printf("Y: %s\n", ay
);
219 printf("Optimum alignment score: %d\n\n", S
[M
][N
]);
221 /* Free the memory we allocated; then exit.
225 for (i
= 0; i
<= M
; i
++) free(S
[i
]);