limit fstBC to 30bp in Python3 ver.
[GalaxyCodeBases.git] / BGI / soap_src / soap_builder / HSP.c
blob1119029143d795b51aaa3dcb6cf7a1faac80c43d
1 /*
3 HSP.c BWTBlastn functions
5 This module contains miscellaneous BWTBlastn functions.
7 Copyright (C) 2004, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <math.h>
29 #include <stdint.h>
30 #include "TextConverter.h"
31 #include "MiscUtilities.h"
32 #include "Socket.h"
33 #include "r250.h"
34 #include "HSP.h"
35 #include "HSPstatistic.h"
38 extern double stat_expectationValue;
40 void HSPFillCharMap(unsigned char charMap[255]) {
42 int i;
44 for (i=0; i<255; i++) {
45 charMap[i] = nonMatchDnaCharIndex;
47 for (i=0; i<16; i++) {
48 charMap[(int)dnaChar[i]] = (unsigned char)i;
49 charMap[(int)dnaChar[i] - 'A' + 'a'] = (unsigned char)i;
54 void HSPFillComplementMap(unsigned char complementMap[255]) {
56 int i;
58 for (i=0; i<255; i++) {
59 complementMap[i] = nonMatchDnaCharIndex;
61 for (i=0; i<16; i++) {
62 complementMap[(int)dnaComplement[i]] = (unsigned int)i;
63 complementMap[(int)dnaComplement[i] - 'A' + 'a'] = (unsigned int)i;
69 HSP *HSPLoad(MMPool *mmPool, const char *PackedDNAFileName, const char *AnnotationFileName, const char *AmbiguityFileName) {
71 HSP *hsp;
73 FILE *annotationFile = NULL ;
75 hsp = MMPoolDispatch(mmPool, sizeof(HSP));
77 // Load packed DNA
78 if (PackedDNAFileName != NULL && PackedDNAFileName[0] != '\0' && PackedDNAFileName[0] != '-') {
79 hsp->packedDNA = DNALoadPacked(PackedDNAFileName, &hsp->dnaLength, TRUE);
80 } else {
81 hsp->packedDNA = NULL;
82 hsp->dnaLength = 0;
84 return hsp;
89 void HSPFree(MMPool *mmPool, HSP *hsp) {
91 if (hsp->packedDNA != NULL) {
92 DNAFreePacked(hsp->packedDNA, hsp->dnaLength);
94 MMUnitFree(hsp->seqOffset, (hsp->numOfSeq+1) * sizeof(SeqOffset));
95 MMUnitFree(hsp->annotation, (hsp->numOfSeq+1) * sizeof(Annotation));
96 MMUnitFree(hsp->ambiguity, (hsp->numOfAmbiguity+2) * sizeof(Ambiguity));
98 MMPoolReturn(mmPool, hsp, sizeof(hsp));
102 unsigned int HSPParseFASTAToPacked(const char * FASTAFileName, const char * annotationFileName, const char * packedDNAFileName, const char* ambiguityFileName, const char * repeatFileName, const unsigned int FASTARandomSeed, const int maskLowerCase) {
104 FILE *FASTAFile, *annotationFile, *packedDNAFile, *ambiguityFile, *repeatFile;
106 NewAnnotation *chrAnnotation;
107 int chrAnnAllocated = 256;
108 int blockAllocated = 256;
110 char c, *ch;
111 int chrNum, blockNum;
112 unsigned int i, l;
113 int nCount;
114 unsigned int chrLen, usefulCharNum, numCharInBuffer, totalNumChar;
115 unsigned int numRepeat = 0;
116 unsigned char charMap[255];
117 char *chrSeq, *p;
118 unsigned int chrAllocated = 65536;
119 unsigned char buffer[PACKED_BUFFER_SIZE];
120 unsigned char packedBuffer[PACKED_BUFFER_SIZE / 4];
121 chrLen = usefulCharNum = numCharInBuffer = totalNumChar = chrNum = blockNum = i = l = nCount = 0;
122 FASTAFile = (FILE*)fopen64(FASTAFileName, "r");
123 if (FASTAFile == NULL) {
124 fprintf(stderr, "ParseFASTToPacked() : Cannot open FASTAFileName!\n");
125 exit(1);
128 annotationFile = (FILE*)fopen64(annotationFileName, "w");
129 if (annotationFile == NULL) {
130 fprintf(stderr, "ParseFASTToPacked() : Cannot open annotationFileName!\n");
131 exit(1);
134 packedDNAFile = (FILE*)fopen64(packedDNAFileName, "wb");
135 if (packedDNAFile == NULL) {
136 fprintf(stderr, "ParseFASTToPacked() : Cannot open packedDNAFileName!\n");
137 exit(1);
140 ambiguityFile = (FILE*)fopen64(ambiguityFileName, "w");
141 if (ambiguityFile == NULL) {
142 fprintf(stderr, "ParseFASTToPacked() : Cannot open ambiguityFileName!\n");
143 exit(1);
146 repeatFile = (FILE *)fopen64(repeatFileName, "w");
147 if (maskLowerCase && repeatFile == NULL ) {
148 fprintf(stderr, "ParseFASTToPacked() : Cannot open repeatFileName!\n");
149 exit(1);
151 //*/
153 HSPFillCharMap(charMap);
155 c = (char)getc(FASTAFile);
156 if (c != '>') {
157 fprintf(stderr, "ParseFASTToPacked() : FASTA file does not begin with '>'!\n");
158 exit(1);
160 chrAnnotation = (NewAnnotation *)malloc(sizeof(NewAnnotation)*chrAnnAllocated);
161 chrSeq = (char*)malloc(sizeof(char)*chrAllocated);
162 chrNum = blockNum = usefulCharNum = numCharInBuffer = 0;
163 while(!feof(FASTAFile)){
164 if (feof(FASTAFile)) break;
165 if (chrNum == chrAnnAllocated){
166 chrAnnAllocated <<= 1;
167 chrAnnotation = (NewAnnotation *)realloc(chrAnnotation, sizeof(NewAnnotation)*chrAnnAllocated);
168 // printf("%d\n", chrNum);
171 l=0;
172 c = (char)getc(FASTAFile);
173 while(!feof(FASTAFile) && c!='\t' && c!=' ' && c!='\n' && l<MAX_SEQ_NAME_LENGTH){
174 chrAnnotation[chrNum].chrName[l]=c;
175 l++;
176 c=(char)getc(FASTAFile);
178 chrAnnotation[chrNum].chrName[l]='\0';
179 while(c!='\n'){
180 c=(char)getc(FASTAFile);
182 chrLen = 0;
183 while(c!='>' && !feof(FASTAFile)){
184 if (c!='\n'){
185 if (!maskLowerCase && c>='a' && c<='z'){
186 c += 'A'-'a';
188 if (chrLen >= chrAllocated){
189 chrAllocated <<= 1;
190 chrSeq = (char*)realloc(chrSeq, sizeof(char)*chrAllocated);
192 *(chrSeq+chrLen) = c;
193 chrLen += 1;
195 c=(char)getc(FASTAFile);
197 if (maskLowerCase) {
198 p = chrSeq;
199 unsigned int repeat_beg , repeat_end = 0;
200 repeat_end = repeat_beg = 0;
201 i = 0;
202 while (p != chrSeq + chrLen) {
203 if (*p >= 'a' && *p <= 'z') {
204 repeat_beg = i;
205 while (*p >= 'a' && *p <= 'z' && p != chrSeq + chrLen) {
206 *p += 'A' - 'a';
207 i ++; p ++;
209 repeat_end = i;
211 i++; p++;
212 numRepeat++;
213 fprintf(repeatFile, "%d\t%u\t%u\n", chrNum, repeat_beg, repeat_end);
216 if (chrLen <= 75) {
217 fprintf(stderr, "ParseFASTToPacked(): reference <= 75 bp is filtered. Continue ...\n");
218 continue;
221 i=0;
222 p=chrSeq;
223 while (ambiguityCount[charMap[(int)*p]] == 1 && i++ != chrLen) p++;
224 if (i == chrLen) {
225 blockNum = 1;
226 chrAnnotation[chrNum].blockInChr = (ChrBlock *)malloc(sizeof(ChrBlock)*blockNum);
227 chrAnnotation[chrNum].chrStart = usefulCharNum;
228 chrAnnotation[chrNum].blockNum = blockNum;
229 chrAnnotation[chrNum].blockInChr[0].blockStart = usefulCharNum;
230 chrAnnotation[chrNum].blockInChr[0].ori = 0;
231 usefulCharNum += chrLen;
232 chrAnnotation[chrNum].chrEnd = usefulCharNum-1;
233 chrAnnotation[chrNum].blockInChr[0].blockEnd = usefulCharNum-1;
234 i=0;
235 while(i<chrLen){
236 if (numCharInBuffer >= PACKED_BUFFER_SIZE) {
237 ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE);
238 fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile);
239 numCharInBuffer = 0;
241 buffer[numCharInBuffer++] = chrSeq[i++];
243 } else {
244 i=0;
245 p = chrSeq;
246 while (ambiguityCount[charMap[(int)*p]]!=1 && i++!=chrLen) p++;
247 if (i<10) i=0;
248 blockNum = 1;
249 chrAnnotation[chrNum].blockInChr = (ChrBlock *)malloc(sizeof(ChrBlock)*blockAllocated);
250 chrAnnotation[chrNum].chrStart = usefulCharNum;
251 chrAnnotation[chrNum].blockInChr[blockNum-1].ori = i;
252 chrAnnotation[chrNum].blockInChr[blockNum-1].blockStart = usefulCharNum;
253 int len=0;
254 while (i<chrLen) {
255 if(ambiguityCount[charMap[(int)*p]] == 1){
256 if (numCharInBuffer >= PACKED_BUFFER_SIZE) {
257 ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE);
258 fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile);
259 numCharInBuffer = 0;
261 buffer[numCharInBuffer++] = *p++;
262 i++;
263 usefulCharNum++;
264 len++;
265 }else{
266 nCount = 0;
267 while((ambiguityCount[charMap[(int)*p]]!=1) && i<chrLen){
268 nCount++;
269 i++;
270 p++;
272 if (nCount<10) {
273 do {
274 if (numCharInBuffer >= PACKED_BUFFER_SIZE) {
275 ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, PACKED_BUFFER_SIZE);
276 fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / 4, packedDNAFile);
277 numCharInBuffer = 0;
279 buffer[numCharInBuffer++] = 'G';
280 usefulCharNum++;
281 len++;
282 } while(--nCount>0);
283 } else {
284 if (i<chrLen) {
285 chrAnnotation[chrNum].blockInChr[blockNum-1].blockEnd = usefulCharNum -1;
286 chrAnnotation[chrNum].blockInChr[blockNum-1].ori = i-nCount-len;
287 if (blockNum == blockAllocated){
288 blockAllocated <<= 1;
289 chrAnnotation[chrNum].blockInChr = (ChrBlock *)realloc(chrAnnotation[chrNum].blockInChr, sizeof(ChrBlock)*blockAllocated);
291 blockNum++;
292 len=0;
293 chrAnnotation[chrNum].blockInChr[blockNum-1].blockStart = usefulCharNum;
294 } else {
295 i-=nCount;
296 break;
301 chrAnnotation[chrNum].blockInChr[blockNum-1].blockEnd = usefulCharNum-1;
302 chrAnnotation[chrNum].blockInChr[blockNum-1].ori = i-len;
303 chrAnnotation[chrNum].blockNum = blockNum;
304 chrAnnotation[chrNum].chrEnd = usefulCharNum-1;
306 //*/
307 chrNum++;
308 totalNumChar += chrLen;
310 if (numCharInBuffer > 0) {
311 ConvertTextToBytePacked(buffer, packedBuffer, charMap, 4, numCharInBuffer);
312 fwrite(packedBuffer, 1, (numCharInBuffer + 3) / 4, packedDNAFile);
313 numCharInBuffer = 0;
315 if (totalNumChar % 4 == 0) {
316 c = 0;
317 fwrite(&c, 1, 1, packedDNAFile);
319 c = (char)(totalNumChar % 4);
320 fwrite(&c, 1, 1, packedDNAFile);
321 fclose(packedDNAFile);
322 fprintf(annotationFile, "%u\t%d\t%d\n", totalNumChar, chrNum, FASTARandomSeed);
323 int j=0;
324 int total = 0;
325 for (i=0;i<chrNum;i++) {
326 fprintf(annotationFile, "%d\t%s\n", (int)strlen(chrAnnotation[i].chrName), chrAnnotation[i].chrName);
327 total += chrAnnotation[i].blockNum;
329 fprintf(annotationFile, "%d\n", total);
330 for(i=0;i<chrNum;i++){
331 for(j=0;j<chrAnnotation[i].blockNum;j++){
332 fprintf(annotationFile,"%d\t%u\t%u\t%u\n",i, chrAnnotation[i].blockInChr[j].blockStart, chrAnnotation[i].blockInChr[j].blockEnd, chrAnnotation[i].blockInChr[j].ori);
334 free(chrAnnotation[i].blockInChr);
337 if (maskLowerCase) {
338 fprintf(repeatFile, "%u\n", numRepeat);
339 fclose(repeatFile);
342 free(chrAnnotation);
343 fclose(annotationFile);
344 return chrNum;
347 NewAnnotation *AnnLoad(const char *annotationFileName, int *chrNum){
348 FILE *annotationFile;
349 if ((annotationFile = fopen(annotationFileName, "r")) == NULL){
350 fprintf(stderr, "Load Annotation File error\n");
351 exit(1);
353 NewAnnotation *chrAnn;
354 int tmp, blockNum, totalNumChar, FASTARandomSeed;
355 fscanf(annotationFile, "%u\t%d\t%d\n", &totalNumChar, &tmp, &FASTARandomSeed);
356 chrAnn = (NewAnnotation *)malloc(sizeof(NewAnnotation)*tmp);
357 int i,j;
358 for(i=0;i<tmp;i++){
359 fscanf(annotationFile, "%s\t%u\t%u\t%d\n", chrAnn[i].chrName, &chrAnn[i].chrStart, &chrAnn[i].chrEnd, &chrAnn[i].blockNum);
360 chrAnn[i].blockInChr = (ChrBlock*) malloc(sizeof(ChrBlock)*chrAnn[i].blockNum);
361 for(j=0;j<chrAnn[i].blockNum;j++){
362 fscanf(annotationFile,"%u\t%u\t%u\n", &chrAnn[i].blockInChr[j].blockStart, &chrAnn[i].blockInChr[j].blockEnd, &chrAnn[i].blockInChr[j].ori);
365 fclose(annotationFile);
366 *chrNum=tmp;
367 return chrAnn;
372 unsigned int HSPPackedToFASTA(const char* FASTAFileName, const char* annotationFileName, const char* packedDNAFileName, const char* ambiguityFileName) {
374 HSP *hsp;
375 FILE *FASTAFile;
377 hsp = HSPLoad(NULL, packedDNAFileName, annotationFileName, ambiguityFileName);
379 // Generate FASTA from packed
380 FASTAFile = (FILE*)fopen64(FASTAFileName, "w");
381 if (FASTAFile == NULL) {
382 fprintf(stderr, "Cannot open FASTA file!\n");
383 exit(1);
386 // to be done...
387 fprintf(stderr, "HSPPackedToFASTA(): Function not complete!\n");
389 fclose(FASTAFile);
391 HSPFree(NULL, hsp);
393 return 0;