3 HSP.c BWTBlastn functions
5 This module contains miscellaneous BWTBlastn functions.
7 Copyright (C) 2004, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
30 #include "TextConverter.h"
31 #include "MiscUtilities.h"
35 #include "HSPstatistic.h"
38 extern double stat_expectationValue
;
40 void HSPFillCharMap(unsigned char charMap
[255]) {
44 for (i
=0; i
<255; i
++) {
45 charMap
[i
] = nonMatchDnaCharIndex
;
47 for (i
=0; i
<16; i
++) {
48 charMap
[(int)dnaChar
[i
]] = (unsigned char)i
;
49 charMap
[(int)dnaChar
[i
] - 'A' + 'a'] = (unsigned char)i
;
54 void HSPFillComplementMap(unsigned char complementMap
[255]) {
58 for (i
=0; i
<255; i
++) {
59 complementMap
[i
] = nonMatchDnaCharIndex
;
61 for (i
=0; i
<16; i
++) {
62 complementMap
[(int)dnaComplement
[i
]] = (unsigned int)i
;
63 complementMap
[(int)dnaComplement
[i
] - 'A' + 'a'] = (unsigned int)i
;
69 HSP
*HSPLoad(MMPool
*mmPool
, const char *PackedDNAFileName
, const char *AnnotationFileName
, const char *AmbiguityFileName
) {
73 FILE *annotationFile
= NULL
;
75 hsp
= MMPoolDispatch(mmPool
, sizeof(HSP
));
78 if (PackedDNAFileName
!= NULL
&& PackedDNAFileName
[0] != '\0' && PackedDNAFileName
[0] != '-') {
79 hsp
->packedDNA
= DNALoadPacked(PackedDNAFileName
, &hsp
->dnaLength
, TRUE
);
81 hsp
->packedDNA
= NULL
;
89 void HSPFree(MMPool
*mmPool
, HSP
*hsp
) {
91 if (hsp
->packedDNA
!= NULL
) {
92 DNAFreePacked(hsp
->packedDNA
, hsp
->dnaLength
);
94 MMUnitFree(hsp
->seqOffset
, (hsp
->numOfSeq
+1) * sizeof(SeqOffset
));
95 MMUnitFree(hsp
->annotation
, (hsp
->numOfSeq
+1) * sizeof(Annotation
));
96 MMUnitFree(hsp
->ambiguity
, (hsp
->numOfAmbiguity
+2) * sizeof(Ambiguity
));
98 MMPoolReturn(mmPool
, hsp
, sizeof(hsp
));
102 unsigned int HSPParseFASTAToPacked(const char * FASTAFileName
, const char * annotationFileName
, const char * packedDNAFileName
, const char* ambiguityFileName
, const char * repeatFileName
, const unsigned int FASTARandomSeed
, const int maskLowerCase
) {
104 FILE *FASTAFile
, *annotationFile
, *packedDNAFile
, *ambiguityFile
, *repeatFile
;
106 NewAnnotation
*chrAnnotation
;
107 int chrAnnAllocated
= 256;
108 int blockAllocated
= 256;
111 int chrNum
, blockNum
;
114 unsigned int chrLen
, usefulCharNum
, numCharInBuffer
, totalNumChar
;
115 unsigned int numRepeat
= 0;
116 unsigned char charMap
[255];
118 unsigned int chrAllocated
= 65536;
119 unsigned char buffer
[PACKED_BUFFER_SIZE
];
120 unsigned char packedBuffer
[PACKED_BUFFER_SIZE
/ 4];
121 chrLen
= usefulCharNum
= numCharInBuffer
= totalNumChar
= chrNum
= blockNum
= i
= l
= nCount
= 0;
122 FASTAFile
= (FILE*)fopen64(FASTAFileName
, "r");
123 if (FASTAFile
== NULL
) {
124 fprintf(stderr
, "ParseFASTToPacked() : Cannot open FASTAFileName!\n");
128 annotationFile
= (FILE*)fopen64(annotationFileName
, "w");
129 if (annotationFile
== NULL
) {
130 fprintf(stderr
, "ParseFASTToPacked() : Cannot open annotationFileName!\n");
134 packedDNAFile
= (FILE*)fopen64(packedDNAFileName
, "wb");
135 if (packedDNAFile
== NULL
) {
136 fprintf(stderr
, "ParseFASTToPacked() : Cannot open packedDNAFileName!\n");
140 ambiguityFile
= (FILE*)fopen64(ambiguityFileName
, "w");
141 if (ambiguityFile
== NULL
) {
142 fprintf(stderr
, "ParseFASTToPacked() : Cannot open ambiguityFileName!\n");
146 repeatFile = (FILE *)fopen64(repeatFileName, "w");
147 if (maskLowerCase && repeatFile == NULL ) {
148 fprintf(stderr, "ParseFASTToPacked() : Cannot open repeatFileName!\n");
153 HSPFillCharMap(charMap
);
155 c
= (char)getc(FASTAFile
);
157 fprintf(stderr
, "ParseFASTToPacked() : FASTA file does not begin with '>'!\n");
160 chrAnnotation
= (NewAnnotation
*)malloc(sizeof(NewAnnotation
)*chrAnnAllocated
);
161 chrSeq
= (char*)malloc(sizeof(char)*chrAllocated
);
162 chrNum
= blockNum
= usefulCharNum
= numCharInBuffer
= 0;
163 while(!feof(FASTAFile
)){
164 if (feof(FASTAFile
)) break;
165 if (chrNum
== chrAnnAllocated
){
166 chrAnnAllocated
<<= 1;
167 chrAnnotation
= (NewAnnotation
*)realloc(chrAnnotation
, sizeof(NewAnnotation
)*chrAnnAllocated
);
168 // printf("%d\n", chrNum);
172 c
= (char)getc(FASTAFile
);
173 while(!feof(FASTAFile
) && c
!='\t' && c
!=' ' && c
!='\n' && l
<MAX_SEQ_NAME_LENGTH
){
174 chrAnnotation
[chrNum
].chrName
[l
]=c
;
176 c
=(char)getc(FASTAFile
);
178 chrAnnotation
[chrNum
].chrName
[l
]='\0';
180 c
=(char)getc(FASTAFile
);
183 while(c
!='>' && !feof(FASTAFile
)){
185 if (!maskLowerCase
&& c
>='a' && c
<='z'){
188 if (chrLen
>= chrAllocated
){
190 chrSeq
= (char*)realloc(chrSeq
, sizeof(char)*chrAllocated
);
192 *(chrSeq
+chrLen
) = c
;
195 c
=(char)getc(FASTAFile
);
199 unsigned int repeat_beg
, repeat_end
= 0;
200 repeat_end
= repeat_beg
= 0;
202 while (p
!= chrSeq
+ chrLen
) {
203 if (*p
>= 'a' && *p
<= 'z') {
205 while (*p
>= 'a' && *p
<= 'z' && p
!= chrSeq
+ chrLen
) {
213 fprintf(repeatFile
, "%d\t%u\t%u\n", chrNum
, repeat_beg
, repeat_end
);
217 fprintf(stderr
, "ParseFASTToPacked(): reference <= 75 bp is filtered. Continue ...\n");
223 while (ambiguityCount
[charMap
[(int)*p
]] == 1 && i
++ != chrLen
) p
++;
226 chrAnnotation
[chrNum
].blockInChr
= (ChrBlock
*)malloc(sizeof(ChrBlock
)*blockNum
);
227 chrAnnotation
[chrNum
].chrStart
= usefulCharNum
;
228 chrAnnotation
[chrNum
].blockNum
= blockNum
;
229 chrAnnotation
[chrNum
].blockInChr
[0].blockStart
= usefulCharNum
;
230 chrAnnotation
[chrNum
].blockInChr
[0].ori
= 0;
231 usefulCharNum
+= chrLen
;
232 chrAnnotation
[chrNum
].chrEnd
= usefulCharNum
-1;
233 chrAnnotation
[chrNum
].blockInChr
[0].blockEnd
= usefulCharNum
-1;
236 if (numCharInBuffer
>= PACKED_BUFFER_SIZE
) {
237 ConvertTextToBytePacked(buffer
, packedBuffer
, charMap
, 4, PACKED_BUFFER_SIZE
);
238 fwrite(packedBuffer
, 1, PACKED_BUFFER_SIZE
/ 4, packedDNAFile
);
241 buffer
[numCharInBuffer
++] = chrSeq
[i
++];
246 while (ambiguityCount
[charMap
[(int)*p
]]!=1 && i
++!=chrLen
) p
++;
249 chrAnnotation
[chrNum
].blockInChr
= (ChrBlock
*)malloc(sizeof(ChrBlock
)*blockAllocated
);
250 chrAnnotation
[chrNum
].chrStart
= usefulCharNum
;
251 chrAnnotation
[chrNum
].blockInChr
[blockNum
-1].ori
= i
;
252 chrAnnotation
[chrNum
].blockInChr
[blockNum
-1].blockStart
= usefulCharNum
;
255 if(ambiguityCount
[charMap
[(int)*p
]] == 1){
256 if (numCharInBuffer
>= PACKED_BUFFER_SIZE
) {
257 ConvertTextToBytePacked(buffer
, packedBuffer
, charMap
, 4, PACKED_BUFFER_SIZE
);
258 fwrite(packedBuffer
, 1, PACKED_BUFFER_SIZE
/ 4, packedDNAFile
);
261 buffer
[numCharInBuffer
++] = *p
++;
267 while((ambiguityCount
[charMap
[(int)*p
]]!=1) && i
<chrLen
){
274 if (numCharInBuffer
>= PACKED_BUFFER_SIZE
) {
275 ConvertTextToBytePacked(buffer
, packedBuffer
, charMap
, 4, PACKED_BUFFER_SIZE
);
276 fwrite(packedBuffer
, 1, PACKED_BUFFER_SIZE
/ 4, packedDNAFile
);
279 buffer
[numCharInBuffer
++] = 'G';
285 chrAnnotation
[chrNum
].blockInChr
[blockNum
-1].blockEnd
= usefulCharNum
-1;
286 chrAnnotation
[chrNum
].blockInChr
[blockNum
-1].ori
= i
-nCount
-len
;
287 if (blockNum
== blockAllocated
){
288 blockAllocated
<<= 1;
289 chrAnnotation
[chrNum
].blockInChr
= (ChrBlock
*)realloc(chrAnnotation
[chrNum
].blockInChr
, sizeof(ChrBlock
)*blockAllocated
);
293 chrAnnotation
[chrNum
].blockInChr
[blockNum
-1].blockStart
= usefulCharNum
;
301 chrAnnotation
[chrNum
].blockInChr
[blockNum
-1].blockEnd
= usefulCharNum
-1;
302 chrAnnotation
[chrNum
].blockInChr
[blockNum
-1].ori
= i
-len
;
303 chrAnnotation
[chrNum
].blockNum
= blockNum
;
304 chrAnnotation
[chrNum
].chrEnd
= usefulCharNum
-1;
308 totalNumChar
+= chrLen
;
310 if (numCharInBuffer
> 0) {
311 ConvertTextToBytePacked(buffer
, packedBuffer
, charMap
, 4, numCharInBuffer
);
312 fwrite(packedBuffer
, 1, (numCharInBuffer
+ 3) / 4, packedDNAFile
);
315 if (totalNumChar
% 4 == 0) {
317 fwrite(&c
, 1, 1, packedDNAFile
);
319 c
= (char)(totalNumChar
% 4);
320 fwrite(&c
, 1, 1, packedDNAFile
);
321 fclose(packedDNAFile
);
322 fprintf(annotationFile
, "%u\t%d\t%d\n", totalNumChar
, chrNum
, FASTARandomSeed
);
325 for (i
=0;i
<chrNum
;i
++) {
326 fprintf(annotationFile
, "%d\t%s\n", (int)strlen(chrAnnotation
[i
].chrName
), chrAnnotation
[i
].chrName
);
327 total
+= chrAnnotation
[i
].blockNum
;
329 fprintf(annotationFile
, "%d\n", total
);
330 for(i
=0;i
<chrNum
;i
++){
331 for(j
=0;j
<chrAnnotation
[i
].blockNum
;j
++){
332 fprintf(annotationFile
,"%d\t%u\t%u\t%u\n",i
, chrAnnotation
[i
].blockInChr
[j
].blockStart
, chrAnnotation
[i
].blockInChr
[j
].blockEnd
, chrAnnotation
[i
].blockInChr
[j
].ori
);
334 free(chrAnnotation
[i
].blockInChr
);
338 fprintf(repeatFile
, "%u\n", numRepeat
);
343 fclose(annotationFile
);
347 NewAnnotation
*AnnLoad(const char *annotationFileName
, int *chrNum
){
348 FILE *annotationFile
;
349 if ((annotationFile
= fopen(annotationFileName
, "r")) == NULL
){
350 fprintf(stderr
, "Load Annotation File error\n");
353 NewAnnotation
*chrAnn
;
354 int tmp
, blockNum
, totalNumChar
, FASTARandomSeed
;
355 fscanf(annotationFile
, "%u\t%d\t%d\n", &totalNumChar
, &tmp
, &FASTARandomSeed
);
356 chrAnn
= (NewAnnotation
*)malloc(sizeof(NewAnnotation
)*tmp
);
359 fscanf(annotationFile
, "%s\t%u\t%u\t%d\n", chrAnn
[i
].chrName
, &chrAnn
[i
].chrStart
, &chrAnn
[i
].chrEnd
, &chrAnn
[i
].blockNum
);
360 chrAnn
[i
].blockInChr
= (ChrBlock
*) malloc(sizeof(ChrBlock
)*chrAnn
[i
].blockNum
);
361 for(j
=0;j
<chrAnn
[i
].blockNum
;j
++){
362 fscanf(annotationFile
,"%u\t%u\t%u\n", &chrAnn
[i
].blockInChr
[j
].blockStart
, &chrAnn
[i
].blockInChr
[j
].blockEnd
, &chrAnn
[i
].blockInChr
[j
].ori
);
365 fclose(annotationFile
);
372 unsigned int HSPPackedToFASTA(const char* FASTAFileName
, const char* annotationFileName
, const char* packedDNAFileName
, const char* ambiguityFileName
) {
377 hsp
= HSPLoad(NULL
, packedDNAFileName
, annotationFileName
, ambiguityFileName
);
379 // Generate FASTA from packed
380 FASTAFile
= (FILE*)fopen64(FASTAFileName
, "w");
381 if (FASTAFile
== NULL
) {
382 fprintf(stderr
, "Cannot open FASTA file!\n");
387 fprintf(stderr
, "HSPPackedToFASTA(): Function not complete!\n");