modified: nfig1.py
[GalaxyCodeBases.git] / BGI / BASE / src / 2bwt / 2BWT-Sample.c
blobffe9c7c0d03b79d8e4d570f55590f541dc969fdf
1 /*
3 2BWT-Sample.c SAMPLE CODE FOR 2BWT-LIB
5 Copyright (C) 2011, Edward Wu.
6 Website: http://www.cs.hku.hk/~mkewu/2bwt
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License
10 as published by the Free Software Foundation; either version 2
11 of the License, or (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 Date : 19th June 2011
23 Author : Edward MK Wu
24 Change : Sample file to use 2BWT index.
26 This program performs the following with 2BWT library
27 1. BWT backward search on a given pattern
28 2. BWT forward search on a given pattern
29 3. BWT bi-directional search on a given pattern
30 3. BWT 1-mismatch search on a given pattern
31 4. Report the first occurrences of (1).
33 For complete guide on what the 2BWT index library provided
34 please check out the file 2BWT-Interface.h for the description
35 of all library calls.
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include "2BWT-Interface.h"
43 int main() {
44 int i,j,k,c;
46 //Variables for backward and forward search
47 unsigned long long l,r,rev_l,rev_r;
49 //Variables for search all sa ranges functions
50 unsigned long long result_l[ALPHABET_SIZE];
51 unsigned long long result_r[ALPHABET_SIZE];
52 unsigned long long result_rev_l[ALPHABET_SIZE];
53 unsigned long long result_rev_r[ALPHABET_SIZE];
55 //Variables for result
56 unsigned long long offset;
57 unsigned int sequenceId;
58 unsigned long long saCount;
60 //Variables for pattern
61 char pattern[1024];
62 strcpy(pattern,"AACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA");
63 int patternLength = strlen(pattern);
65 // Load up the index with the below statement
66 printf("Loading index ... ");
67 fflush(stdout);
68 Idx2BWT * idx2BWT = BWTLoad2BWT("ncbi.genome.fa.index_64.a8",".sa4");
69 printf("DONE\n\n");
71 // Convert the pattern into 2BWT recognised coding scheme
72 BWTConvertPattern(idx2BWT,pattern,patternLength,(unsigned char*) pattern);
79 // The following performs a backward search of the pattern
80 // ===================================================================================
81 // |
82 printf("Performing backward search of the pattern..\n");
83 BWTSARangeInitial(idx2BWT,pattern[patternLength-1],&l,&r);
84 for (i=patternLength-2;i>=0;i--) {
85 BWTSARangeBackward(idx2BWT,pattern[i],&l,&r);
87 printf("SA Range being = %llu %llu (%llu)\n\n",l,r,r-l+1);
88 // |
89 // ===================================================================================
99 // The following performs a forward search of the pattern
100 // ===================================================================================
101 // |
102 printf("Performing forward search of the pattern..\n");
103 BWTSARangeInitial(idx2BWT,pattern[0],&l,&r);
104 BWTSARangeInitial(idx2BWT,pattern[0],&rev_l,&rev_r);
105 for (i=1;i<patternLength;i++) {
106 BWTSARangeForward_Bidirection(idx2BWT,pattern[i],&l,&r,&rev_l,&rev_r);
108 printf("SA Range being = %llu %llu %llu %llu (%llu)\n\n",l,r,rev_l,rev_r,r-l+1);
109 // |
110 // ===================================================================================
113 // The following performs a bi-directional search of the pattern
114 // Starting from the middle of the pattern, first move right, then move left.
115 // ===================================================================================
116 // |
117 printf("Performing bi-directional search of the pattern..\n");
118 j = patternLength / 2;
119 BWTSARangeInitial(idx2BWT,pattern[j],&l,&r);
120 BWTSARangeInitial(idx2BWT,pattern[j],&rev_l,&rev_r);
121 for (i=j+1;i<patternLength;i++) {
122 BWTSARangeForward_Bidirection(idx2BWT,pattern[i],&l,&r,&rev_l,&rev_r);
124 for (i=j-1;i>=0;i--) {
125 BWTSARangeBackward_Bidirection(idx2BWT,pattern[i],&l,&r,&rev_l,&rev_r);
127 printf("SA Range being = %llu %llu %llu %llu (%llu)\n\n",l,r,rev_l,rev_r,r-l+1);
128 // |
129 // ===================================================================================
132 // The following performs a 1-mismatch search of the pattern
133 // ===================================================================================
134 // |
135 // |
136 printf("Performing 1-mismatch search of the pattern..\n");
137 saCount = 0;
138 j = patternLength / 2;
139 BWTSARangeInitial(idx2BWT,pattern[patternLength-1],&l,&r);
140 for (i=patternLength-2;i>j-1;i--) { BWTSARangeBackward(idx2BWT,pattern[i],&l,&r); }
142 for (i=j-1;i>=0;i--) {
143 BWTAllSARangesBackward(idx2BWT,l,r,result_l,result_r);
144 for (c=0;c<ALPHABET_SIZE;c++) {
145 if (c==pattern[i]) continue;
146 unsigned long long err_l=result_l[c];
147 unsigned long long err_r=result_r[c];
148 for (k=i-1;k>=0;k--) {
149 if (err_l>err_r) break;
150 BWTSARangeBackward(idx2BWT,pattern[k],&err_l,&err_r);
152 if (err_l<=err_r && k<0) {
153 //An SA range of occurrence is found (err_l,err_r)
154 saCount+=err_r-err_l+1;
157 l=result_l[pattern[i]];
158 r=result_r[pattern[i]];
161 BWTSARangeInitial(idx2BWT,pattern[0],&l,&r);
162 BWTSARangeInitial(idx2BWT,pattern[0],&rev_l,&rev_r);
163 for (i=1;i<j;i++) { BWTSARangeForward_Bidirection(idx2BWT,pattern[i],&l,&r,&rev_l,&rev_r); }
164 for (i=j;i<patternLength;i++) {
165 BWTAllSARangesForward_Bidirection(idx2BWT,l,r,rev_l,rev_r,result_l,result_r,result_rev_l,result_rev_r);
166 for (c=0;c<ALPHABET_SIZE;c++) {
167 if (c==pattern[i]) continue;
168 unsigned long long err_l=result_l[c];
169 unsigned long long err_r=result_r[c];
170 unsigned long long rev_err_l=result_rev_l[c];
171 unsigned long long rev_err_r=result_rev_r[c];
172 for (k=i+1;k<patternLength;k++) {
173 if (err_l>err_r) break;
174 BWTSARangeForward_Bidirection(idx2BWT,pattern[k],&err_l,&err_r,&rev_err_l,&rev_err_r);
176 if (err_l<=err_r && k>=patternLength) {
177 //An SA range of occurrence is found (err_l,err_r)
178 saCount+=err_r-err_l+1;
181 l=result_l[pattern[i]];
182 r=result_r[pattern[i]];
183 rev_l=result_rev_l[pattern[i]];
184 rev_r=result_rev_r[pattern[i]];
186 printf("%llu SA-indexes/occurrences were found.\n\n",saCount);
187 // |
188 // |
189 // ===================================================================================
196 // The following output the first 5 position of the pattern
197 // ===================================================================================
198 // |
199 // |
200 j=(r-l+1<5)?r-l+1:5;
201 printf("Reporting %d arbitrary occurrences..\n",j);
202 for (i=0;i<j;i++) {
203 BWTRetrievePositionFromSAIndex(idx2BWT,l+i,&sequenceId,&offset);
204 printf("Occurrence found in sequence #%u with offset %llu\n",sequenceId,offset);
206 // |
207 // |
208 // ===================================================================================
215 // Free up the 2BWT index
216 printf("\nFree index ... ");
217 fflush(stdout);
218 BWTFree2BWT(idx2BWT);
219 printf("DONE\n");
221 return 0;