Merge pull request #589 from tjni/clean-up-build-dependencies
[MACS.git] / MACS3 / Signal / RACollection.pyx
blobc0650deee1ff1507484ce4fe2e0903de5198b2c4
1 # cython: language_level=3
2 # cython: profile=True
3 # Time-stamp: <2021-03-10 23:39:52 Tao Liu>
5 """Module for SAPPER BAMParser class
7 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
9 This code is free software; you can redistribute it and/or modify it
10 under the terms of the BSD License (see the file COPYING included
11 with the distribution).
13 @status: experimental
14 @version: $Revision$
15 @author: Tao Liu
16 @contact: tliu4@buffalo.edu
17 """
18 # ------------------------------------
19 # python modules
20 # ------------------------------------
21 import struct
22 from collections import Counter
23 from operator import itemgetter
24 from copy import copy
26 from MACS3.Signal.ReadAlignment import ReadAlignment
27 from MACS3.Signal.PosReadsInfo import PosReadsInfo
28 from MACS3.Signal.UnitigRACollection import UnitigRAs, UnitigCollection
29 from MACS3.IO.PeakIO import PeakIO
31 from cpython cimport bool
32 from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
34 import numpy as np
35 cimport numpy as np
36 from numpy cimport uint32_t, uint64_t, int32_t, int64_t
38 from libc.stdlib cimport malloc, free, realloc
40 cdef extern from "stdlib.h":
41 ctypedef unsigned int size_t
42 size_t strlen(char *s)
43 #void *malloc(size_t size)
44 void *calloc(size_t n, size_t size)
45 #void free(void *ptr)
46 int strcmp(char *a, char *b)
47 char * strcpy(char *a, char *b)
48 long atol(char *bytes)
49 int atoi(char *bytes)
52 # --- fermi-lite functions ---
53 #define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default)
54 #define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default)
55 #define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default)
57 cdef extern from "fml.h":
58 ctypedef struct bseq1_t:
59 int32_t l_seq
60 char *seq
61 char *qual # NULL-terminated strings; length expected to match $l_seq
63 ctypedef struct magopt_t:
64 int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth
65 float min_dratio1, max_bcov, max_bfrac
67 ctypedef struct fml_opt_t:
68 int n_threads # number of threads; don't use multi-threading for small data sets
69 int ec_k # k-mer length for error correction; 0 for auto estimate
70 int min_cnt, max_cnt # both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt]
71 int min_asm_ovlp # min overlap length during assembly
72 int min_merge_len # during assembly, don't explicitly merge an overlap if shorter than this value
73 magopt_t mag_opt # graph cleaning options
75 ctypedef struct fml_ovlp_t:
76 uint32_t len_, from_, id_, to_
77 #unit32_t from # $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end
78 #uint32_t id
79 #uint32_t to # $id: unitig number
81 ctypedef struct fml_utg_t:
82 int32_t len # length of sequence
83 int32_t nsr # number of supporting reads
84 char *seq # unitig sequence
85 char *cov # cov[i]-33 gives per-base coverage at i
86 int n_ovlp[2] # number of 5'-end [0] and 3'-end [1] overlaps
87 fml_ovlp_t *ovlp # overlaps, of size n_ovlp[0]+n_ovlp[1]
89 void fml_opt_init(fml_opt_t *opt)
90 fml_utg_t* fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg)
91 void fml_utg_destroy(int n_utg, fml_utg_t *utg)
92 void fml_utg_print(int n_utgs, const fml_utg_t *utg)
93 bseq1_t *bseq_read(const char *fn, int *n)
95 # --- end of fermi-lite functions ---
97 # --- smith-waterman alignment functions ---
99 cdef extern from "swalign.h":
100 ctypedef struct seq_pair_t:
101 char *a
102 unsigned int alen
103 char *b
104 unsigned int blen
105 ctypedef struct align_t:
106 seq_pair_t *seqs
107 char *markup;
108 int start_a
109 int start_b
110 int end_a
111 int end_b
112 int matches
113 int gaps
114 double score
115 align_t *smith_waterman(seq_pair_t *problem)
116 void destroy_seq_pair(seq_pair_t *pair)
117 void destroy_align(align_t *ali)
119 # ------------------------------------
120 # constants
121 # ------------------------------------
122 __version__ = "Parser $Revision$"
123 __author__ = "Tao Liu <tliu4@buffalo.edu>"
124 __doc__ = "All Parser classes"
126 __DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A.
128 __CIGARCODE__ = "MIDNSHP=X"
130 # ------------------------------------
131 # Misc functions
132 # ------------------------------------
134 # ------------------------------------
135 # Classes
136 # ------------------------------------
138 cdef class RACollection:
139 """A collection of ReadAlignment objects and the corresponding
140 PeakIO.
143 cdef:
144 bytes chrom
145 object peak # A PeakIO object
146 list RAlists # contain ReadAlignment lists for treatment (0) and control (1)
147 long left # left position of peak
148 long right # right position of peak
149 long length # length of peak
150 long RAs_left # left position of all RAs in the collection
151 long RAs_right # right position of all RAs in the collection
152 bool sorted # if sorted by lpos
153 bytes peak_refseq # reference sequence in peak region b/w left and right
154 bytes peak_refseq_ext # reference sequence in peak region with extension on both sides b/w RAs_left and RAs_right
156 def __init__ ( self, chrom, peak, RAlist_T, RAlist_C=[] ):
157 """Create RACollection object by taking:
159 1. peak: a PeakIO object indicating the peak region.
161 2. RAlist: a python list of ReadAlignment objects containing
162 all the reads overlapping the peak region. If no RAlist_C
163 given, it will be [].
166 if len(RAlist_T) == 0:
167 # no reads, return None
168 raise Exception("No reads from ChIP sample to construct RAcollection!")
169 self.chrom = chrom
170 self.peak = peak
171 #print(len(RAlist_T),"\n")
172 #print(len(RAlist_C),"\n")
173 self.RAlists = [ RAlist_T, RAlist_C ]
174 self.left = peak["start"]
175 self.right = peak["end"]
176 self.length = self.right - self.left
177 if RAlist_T:
178 self.RAs_left = RAlist_T[0]["lpos"] # initial assignment of RAs_left
179 self.RAs_right = RAlist_T[-1]["rpos"] # initial assignment of RAs_right
180 self.sort() # it will set self.sorted = True
181 else:
182 self.RAs_left = -1
183 self.RAs_right = -1
184 # check RAs_left and RAs_right
185 for ra in RAlist_T:
186 if ra[ "lpos" ] < self.RAs_left:
187 self.RAs_left = ra[ "lpos" ]
188 if ra[ "rpos" ] > self.RAs_right:
189 self.RAs_right = ra[ "rpos" ]
191 for ra in RAlist_C:
192 if ra[ "lpos" ] < self.RAs_left:
193 self.RAs_left = ra[ "lpos" ]
194 if ra[ "rpos" ] > self.RAs_right:
195 self.RAs_right = ra[ "rpos" ]
196 (self.peak_refseq, self.peak_refseq_ext) = self.__get_peak_REFSEQ()
198 def __getitem__ ( self, keyname ):
199 if keyname == "chrom":
200 return self.chrom
201 elif keyname == "left":
202 return self.left
203 elif keyname == "right":
204 return self.right
205 elif keyname == "RAs_left":
206 return self.RAs_left
207 elif keyname == "RAs_right":
208 return self.RAs_right
209 elif keyname == "length":
210 return self.length
211 elif keyname == "count":
212 return len( self.RAlists[ 0 ] )+ len( self.RAlists[ 1 ] )
213 elif keyname == "count_T":
214 return len( self.RAlists[ 0 ] )
215 elif keyname == "count_C":
216 return len( self.RAlists[ 1 ] )
217 elif keyname == "peak_refseq":
218 return self.peak_refseq
219 elif keyname == "peak_refseq_ext":
220 return self.peak_refseq_ext
221 else:
222 raise KeyError("Unavailable key:", keyname)
224 def __getstate__ ( self ):
225 #return {"chrom":self.chrom, "peak":self.peak, "RAlists":self.RAlists,
226 # "left":self.left, "right":self.right, "length": self.length,
227 # "RAs_left":self.RAs_left, "RAs_right":self.RAs_right}
228 return (self.chrom, self.peak, self.RAlists, self.left, self.right, self.length, self.RAs_left, self.RAs_right, self.peak_refseq, self.peak_refseq_ext)
230 def __setstate__ ( self, state ):
231 (self.chrom, self.peak, self.RAlists, self.left, self.right, self.length, self.RAs_left, self.RAs_right, self.peak_refseq, self.peak_refseq_ext) = state
233 cpdef sort ( self ):
234 """Sort RAs according to lpos. Should be used after realignment.
237 if self.RAlists[ 0 ]:
238 self.RAlists[ 0 ].sort(key=itemgetter("lpos"))
239 if self.RAlists[ 1 ]:
240 self.RAlists[ 1 ].sort(key=itemgetter("lpos"))
241 self.sorted = True
242 return
244 cpdef remove_outliers ( self, int percent = 5 ):
245 """ Remove outliers with too many n_edits. The outliers with
246 n_edits in top p% will be removed.
248 Default: remove top 5% of reads that have too many differences
249 with reference genome.
251 cdef:
252 list n_edits_list
253 object read # ReadAlignment object
254 int highest_n_edits
255 list new_RAlist
256 int i
258 n_edits_list = []
259 for ralist in self.RAlists:
260 for read in ralist:
261 n_edits_list.append( read["n_edits"] )
262 n_edits_list.sort()
263 highest_n_edits = n_edits_list[ int( len( n_edits_list ) * (1 - percent * .01) ) ]
265 for i in ( range(len(self.RAlists)) ):
266 new_RAlist = []
267 for read in self.RAlists[ i ]:
268 if read["n_edits"] <= highest_n_edits:
269 new_RAlist.append( read )
270 self.RAlists[ i ] = new_RAlist
272 return
274 cpdef n_edits_sum ( self ):
277 cdef:
278 list n_edits_list
279 object read
280 int highest_n_edits
282 n_edits_list = []
284 for ralist in self.RAlists:
285 for read in ralist:
286 n_edits_list.append( read["n_edits"] )
288 n_edits_list.sort()
289 # print ( n_edits_list )
290 c = Counter( n_edits_list )
291 #print( c )
293 cdef tuple __get_peak_REFSEQ ( self ):
294 """Get the reference sequence within the peak region.
297 cdef:
298 bytearray peak_refseq
299 int i
300 long prev_r #remember the previous filled right end
301 long start
302 long end
303 long ind, ind_r
304 object read
305 bytearray read_refseq_ext
306 bytearray read_refseq
308 start = min( self.RAs_left, self.left )
309 end = max( self.RAs_right, self.right )
310 #print ("left",start,"right",end)
311 peak_refseq_ext = bytearray( b'N' * ( end - start ) )
313 # for treatment.
314 peak_refseq_ext = self.__fill_refseq ( peak_refseq_ext, self.RAlists[0] )
315 # and control if available.
316 if self.RAlists[1]:
317 peak_refseq_ext = self.__fill_refseq ( peak_refseq_ext, self.RAlists[1] )
319 # trim
320 peak_refseq = peak_refseq_ext[ self.left - start: self.right - start ]
321 return ( bytes( peak_refseq ), bytes( peak_refseq_ext ) )
323 cdef bytearray __fill_refseq ( self, bytearray seq, list ralist ):
324 """Fill refseq sequence of whole peak with refseq sequences of
325 each read in ralist.
328 cdef:
329 long prev_r # previous right position of last
330 # filled
331 long ind, ind_r
332 long start, end
333 object read
334 bytearray read_refseq
336 start = min( self.RAs_left, self.left )
338 #print(len(ralist),"\n")
339 prev_r = ralist[0]["lpos"]
341 for i in range( len( ralist ) ):
342 read = ralist[ i ]
343 if read[ "lpos" ] > prev_r:
344 read = ralist[ i - 1 ]
345 read_refseq = read.get_REFSEQ()
346 ind = read["lpos"] - start
347 ind_r = ind + read["rpos"] - read["lpos"]
348 seq[ ind: ind_r ] = read_refseq
349 prev_r = read[ "rpos" ]
350 # last
351 read = ralist[ -1 ]
352 read_refseq = read.get_REFSEQ()
353 ind = read["lpos"] - start
354 ind_r = ind + read["rpos"] - read["lpos"]
355 seq[ ind: ind_r ] = read_refseq
356 return seq
358 cpdef get_PosReadsInfo_ref_pos ( self, long ref_pos, bytes ref_nt, int Q=20 ):
359 """Generate a PosReadsInfo object for a given reference genome
360 position.
362 Return a PosReadsInfo object.
365 cdef:
366 bytearray s
367 bytearray bq
368 int strand
369 object ra
370 list bq_list_t = []
371 list bq_list_c = []
372 int i
373 int pos
374 bool tip
376 posreadsinfo_p = PosReadsInfo( ref_pos, ref_nt )
378 #Treatment group
379 for i in range( len( self.RAlists[ 0 ] ) ):
380 ra = self.RAlists[ 0 ][ i ]
381 if ra[ "lpos" ] <= ref_pos and ra[ "rpos" ] > ref_pos:
382 ( s, bq, strand, tip, pos) = ra.get_variant_bq_by_ref_pos( ref_pos )
383 posreadsinfo_p.add_T( i, bytes( s ), bq[ 0 ], strand, tip, Q=Q )
385 #Control group
386 for i in range( len( self.RAlists[ 1 ] ) ):
387 ra = self.RAlists[ 1 ][ i ]
388 if ra[ "lpos" ] <= ref_pos and ra[ "rpos" ] > ref_pos:
389 ( s, bq, strand, tip, pos ) = ra.get_variant_bq_by_ref_pos( ref_pos )
390 posreadsinfo_p.add_C( i, bytes( s ), bq[ 0 ], strand, Q=Q )
392 return posreadsinfo_p
394 cpdef bytearray get_FASTQ ( self ):
395 """Get FASTQ file for all reads in RACollection.
398 cdef:
399 object ra
400 bytearray fastq_text
402 fastq_text = bytearray(b"")
404 for ra in self.RAlists[0]:
405 fastq_text += ra.get_FASTQ()
407 for ra in self.RAlists[1]:
408 fastq_text += ra.get_FASTQ()
410 return fastq_text
412 cdef list fermi_assemble( self, int fermiMinOverlap, int opt_flag = 0x80 ):
413 """A wrapper function to call Fermi unitig building functions.
415 cdef:
416 fml_opt_t *opt
417 int c, n_seqs
418 int * n_utg
419 bseq1_t *seqs
420 fml_utg_t *utg
421 fml_utg_t p
423 int unitig_k, merge_min_len
424 bytes tmps
425 bytes tmpq
426 int ec_k = -1
427 int64_t l
428 char * cseq
429 char * cqual
430 int i, j
431 bytes tmpunitig
432 bytes unitig #final unitig
433 list unitig_list # contain list of sequences in bytes format
434 int * n
436 n_seqs = len(self.RAlists[0]) + len(self.RAlists[1])
438 # print n_seqs
440 # prepare seq and qual, note, we only extract SEQ according to the +
441 # strand of reference sequence.
442 seqs = <bseq1_t *> malloc( n_seqs * sizeof(bseq1_t) ) # we rely on fermi-lite to free this mem
444 i = 0
445 for ra in self.RAlists[0]:
446 tmps = ra["SEQ"]
447 tmpq = ra["QUAL"]
448 l = len(tmps)
449 cseq = <char *>malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem
450 cqual = <char *>malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem
451 for j in range(l):
452 cseq[ j ] = tmps[ j ]
453 cqual[ j ] = tmpq[ j ] + 33
454 cseq[ l ] = b'\x00'
455 cqual[ l ]= b'\x00'
457 seqs[ i ].seq = cseq
458 seqs[ i ].qual = cqual
459 seqs[ i ].l_seq = len(tmps)
460 i += 1
462 # print "@",ra["readname"].decode()
463 # print cseq.decode()
464 # print "+"
465 # print cqual.decode()
467 for ra in self.RAlists[1]:
468 tmps = ra["SEQ"]
469 tmpq = ra["QUAL"]
470 l = len(tmps)
471 cseq = <char *>malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem
472 cqual = <char *>malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem
473 for j in range(l):
474 cseq[ j ] = tmps[ j ]
475 cqual[ j ] = tmpq[ j ] + 33
476 cseq[ l ] = b'\x00'
477 cqual[ l ]= b'\x00'
479 seqs[ i ].seq = cseq
480 seqs[ i ].qual = cqual
481 seqs[ i ].l_seq = len(tmps)
482 i += 1
483 # print "@",ra["readname"].decode()
484 # print cseq.decode()
485 # print "+"
486 # print cqual.decode()
488 # if self.RAlists[1]:
489 # unitig_k=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*fermiOverlapMinRatio)
491 # merge_min_len=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*0.5)
492 # else:
493 # unitig_k = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio)
495 # merge_min_len=int(self.RAlists[0][0]["l"]*0.5)
496 #fermiMinOverlap = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio)
498 # minimum overlap to merge, default 0
499 # merge_min_len= max( 25, int(self.RAlists[0][0]["l"]*0.5) )
500 # merge_min_len= int(self.RAlists[0][0]["l"]*0.5)
502 opt = <fml_opt_t *> PyMem_Malloc( sizeof(fml_opt_t) )
503 n_utg = <int *> PyMem_Malloc( sizeof(int) )
505 fml_opt_init(opt)
506 # k-mer length for error correction (0 for auto; -1 to disable)
507 #opt.ec_k = 0
509 # min overlap length during initial assembly
510 opt.min_asm_ovlp = fermiMinOverlap
512 # minimum length to merge, during assembly, don't explicitly merge an overlap if shorter than this value
513 # opt.min_merge_len = merge_min_len
515 # there are more 'options' for mag clean:
516 # flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth, min_dratio1, max_bcov, max_bfrac
517 # min_elen (300) will be adjusted
518 # min_ensr (4), min_insr (3) will be computed
519 # min_merge_len (0) will be updated using opt.min_merge_len
521 # We can adjust: flag (0x40|0x80), min_ovlp (0), min_dratio1 (0.7), max_bdiff (50), max_bdist (512), max_bvtx (64), trim_len (0), trim_depth (6), max_bcov (10.), max_bfrac (0.15)
523 # 0x20: MAG_F_AGGRESSIVE pop variant bubbles
524 # 0x40: MAG_F_POPOPEN aggressive tip trimming
525 # 0x80: MAG_F_NO_SIMPL skip bubble simplification
526 opt.mag_opt.flag = opt_flag
528 # mag_opt.min_ovlp
529 #opt.mag_opt.min_ovlp = fermiMinOverlap
531 # drop an overlap if its length is below maxOvlpLen*FLOAT
532 #opt.mag_opt.min_dratio1 = 0.5
534 # retain a bubble if one side is longer than the other side by >INT-bp
535 #opt.mag_opt.max_bdiff = 10#merge_min_len
537 # trim_len:
538 # trim_depth: Parameter used to trim the open end/tip. If trim_len == 0, do nothing
540 # max_bdist:
541 # max_bvtx: Parameter used to simply bubble while 0x80 flag is set.
542 #opt.mag_opt.max_bdist = 1024
543 #opt.mag_opt.max_bvtx = 128
545 # max_bcov:
546 # max_bfrac: Parameter used when aggressive bubble removal is not used. Bubble will be removed if its average coverage lower than max_bcov and fraction (cov1/(cov1+cov2)) is lower than max_bfrac
547 #opt.mag_opt.max_bcov = 10.
548 #opt.mag_opt.max_bfrac = 0.01
550 utg = fml_assemble(opt, n_seqs, seqs, n_utg)
551 # get results
552 unitig_list = []
553 for i in range( n_utg[0] ):
554 p = utg[ i ]
555 if (p.len < 0):
556 continue
557 #unitig = b''
558 #for j in range( p.len ):
559 # unitig += [b'A',b'C',b'G',b'T',b'N'][int(p.seq[j]) - 1]
560 #unitig_list.append( unitig )
561 unitig_list.append( p.seq )
563 fml_utg_destroy(n_utg[0], utg)
565 PyMem_Free( opt )
566 PyMem_Free( n_utg )
568 return unitig_list
570 cdef tuple align_unitig_to_REFSEQ ( self, list unitig_list ):
571 """Note: we use smith waterman, but we don't use linear gap
572 penalty at this time.
574 Also, if unitig is mapped to - strand, we will revcomp the
575 unitig. So the unitig_list will be changed in this case.
578 cdef:
579 bytes unitig
580 seq_pair_t problem
581 align_t * results
582 char * tmp
583 bytes target
584 bytes reference
585 bytes target_aln_f, target_aln_r
586 bytes reference_aln_f, reference_aln_r
587 bytes markup_aln_f, markup_aln_r
588 double score_f, score_r
589 list target_alns = []
590 list reference_alns = []
591 list markup_alns = []
592 list aln_scores = []
593 int i
595 reference = copy(self.peak_refseq_ext+b'\x00')
597 for i in range( len(unitig_list) ):
598 unitig = unitig_list[ i ]
599 target = copy(unitig + b'\x00')
600 # we use swalign.c for local alignment (without affine gap
601 # penalty). Will revise later.
602 problem.a = target
603 problem.alen = len( unitig )
604 problem.b = reference
605 problem.blen = len( self.peak_refseq_ext )
606 results = smith_waterman( &problem )
607 target_aln_f = results.seqs.a
608 reference_aln_f = results.seqs.b
609 markup_aln_f = results.markup
610 score_f = results.score
611 free( results.seqs.a )
612 free( results.seqs.b )
613 free( results.markup )
614 free( results )
615 # end of local alignment
617 # try reverse complement
618 target = copy(unitig[::-1] + b'\x00')
619 target = target.translate( __DNACOMPLEMENT__ )
620 problem.a = target
621 problem.alen = len( unitig )
622 problem.b = reference
623 problem.blen = len( self.peak_refseq_ext )
624 results = smith_waterman( &problem )
625 target_aln_r = results.seqs.a
626 reference_aln_r = results.seqs.b
627 markup_aln_r = results.markup
628 score_r = results.score
629 free( results.seqs.a )
630 free( results.seqs.b )
631 free( results.markup )
632 free( results )
633 # end of local alignment
635 if score_f > score_r:
636 target_alns.append( target_aln_f )
637 reference_alns.append( reference_aln_f )
638 markup_alns.append( markup_aln_f )
639 aln_scores.append( score_f )
640 else:
641 target_alns.append( target_aln_r )
642 reference_alns.append( reference_aln_r )
643 markup_alns.append( markup_aln_r )
644 aln_scores.append( score_r )
645 # we will revcomp unitig
646 unitig = unitig[::-1]
647 unitig_list[ i ] = unitig.translate( __DNACOMPLEMENT__ )
649 return ( target_alns, reference_alns, aln_scores, markup_alns )
651 cdef verify_alns( self, unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns, float min_score_100 = 150 ):
652 """Remove aln/unitig if it contains too many edits in a small region
654 default min score is 150, which means under 2/-3/-5/-2 scoring schema, there are 10 mismatches within 100bps region.
656 cdef:
657 int i
658 for i in range( len( unitig_list )-1, -1, -1 ):
659 #print i, aln_scores[ i ]
660 #print unitig_alns[ i ]
661 #print markup_alns[ i ]
662 #print reference_alns[ i ]
663 if aln_scores[ i ] * 100 /len( markup_alns[ i ] ) < min_score_100:
664 unitig_list.pop( i )
665 unitig_alns.pop( i )
666 reference_alns.pop( i )
667 aln_scores.pop( i )
668 markup_alns.pop( i )
669 return
672 cdef tuple filter_unitig_with_bad_aln ( self, list unitig_list, list target_alns, list reference_alns, float gratio = 0.25 ):
673 """Remove unitigs that has too much gaps (both on target and reference) during alignments.
675 pass
677 cdef list remap_RAs_w_unitigs ( self, list unitig_list ):
678 """Remap RAs to unitigs, requiring perfect match.
680 Return RAlists_T, RAlists_C, unmapped_racollection.
682 cdef:
683 list RAlists_T = [] # lists of list of RAs of ChIP mapped to each unitig
684 list RAlists_C = []
685 list unmapped_RAlist_T = [] # list of RAs of ChIP unmappable to unitigs
686 list unmapped_RAlist_C = []
687 #RACollection unmapped_ra_collection
688 int flag = 0
689 int i
690 object tmp_ra
691 bytes tmp_ra_seq, unitig
693 for i in range( len(unitig_list) ):
694 RAlists_T.append([]) # for each unitig, there is another list of RAs
695 RAlists_C.append([])
697 # assign RAs to unitigs
699 for tmp_ra in self.RAlists[0]:
700 flag = 0
701 tmp_ra_seq = tmp_ra["SEQ"]
702 for i in range( len(unitig_list) ):
703 unitig = unitig_list[ i ]
704 if tmp_ra_seq in unitig:
705 flag = 1
706 RAlists_T[ i ].append( tmp_ra )
707 break
708 if flag == 0:
709 unmapped_RAlist_T.append( tmp_ra )
710 #print "unmapped:", tmp_ra["SEQ"]
712 for tmp_ra in self.RAlists[1]:
713 flag = 0
714 tmp_ra_seq = tmp_ra["SEQ"]
715 for i in range( len(unitig_list) ):
716 unitig = unitig_list[ i ]
717 if tmp_ra_seq in unitig:
718 flag = 1
719 RAlists_C[ i ].append( tmp_ra )
720 break
721 if flag == 0:
722 unmapped_RAlist_C.append( tmp_ra )
723 #print "unmapped:", tmp_ra["SEQ"]
725 #if unmapped_RAlist_T:
726 #unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C )
727 return [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ]
729 cdef list add_to_unitig_list ( self, unitig_list, unitigs_2nd ):
732 cdef:
733 int i,j
734 int flag
735 bytes u0, u1
736 list new_unitig_list
738 new_unitig_list = []
740 for i in range( len(unitigs_2nd) ):
741 flag = 0 # initial value: can't be found in unitig_list
742 u0 = unitigs_2nd[ i ]
743 for j in range( len( unitig_list ) ):
744 u1 = unitig_list[ j ]
745 if u1.find( u0 ) != -1:
746 flag = 1
747 break
748 u1 = u1[::-1].translate(__DNACOMPLEMENT__)
749 if u1.find( u0 ) != -1:
750 flag = 1
751 break
752 if not flag:
753 new_unitig_list.append( u0 )
754 new_unitig_list.extend( unitig_list )
755 return new_unitig_list
758 cpdef object build_unitig_collection ( self, fermiMinOverlap ):
759 """unitig_list and tuple_alns are in the same order!
761 return UnitigCollection object.
764 cdef:
765 long start, end
766 list unitigs_2nd
767 bytes u
768 list target_alns, reference_alns, aln_scores, markup_alns
769 list target_alns_2nd, reference_alns_2nd, aln_scores_2nd
770 list RAlists_T = [] # lists of list of RAs of ChIP mapped to each unitig
771 list RAlists_C = []
772 list unmapped_RAlist_T = []
773 list unmapped_RAlist_C = []
774 bytes tmp_unitig_seq, tmp_reference_seq
775 bytes tmp_unitig_aln, tmp_reference_aln,
776 int i, j
777 long left_padding_ref, right_padding_ref
778 long left_padding_unitig, right_padding_unitig
779 list ura_list = []
780 RACollection unmapped_ra_collection
781 int flag = 0
782 int n_unmapped, n_unitigs_0, n_unitigs_1
784 # first round of assembly
785 # print (" First round to assemble unitigs")
786 unitig_list = self.fermi_assemble( fermiMinOverlap, opt_flag = 0x80 )
787 if len(unitig_list) == 0:
788 return 0
790 n_unitigs_0 = -1
791 n_unitigs_1 = len( unitig_list )
792 #print " # of Unitigs:", n_unitigs_1
793 #print " Map reads to unitigs"
794 ( unitig_alns, reference_alns, aln_scores, markup_alns) = self.align_unitig_to_REFSEQ( unitig_list )
796 self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns )
797 if len(unitig_list) == 0:
798 # if stop here, it raises a flag that the region may contain too many mismapped reads, we return -1
799 return -1
800 # print (" # of Unitigs:", n_unitigs_1)
802 # assign RAs to unitigs
803 [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = self.remap_RAs_w_unitigs( unitig_list )
804 #print unmapped_ra_collection.get_FASTQ().decode()
806 n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C )
808 while len( unmapped_RAlist_T ) > 0 and n_unitigs_1 != n_unitigs_0:
809 # if there are unmapped reads AND we can get more unitigs
810 # from last round of assembly, do assembly again
812 # print (" # of RAs not mapped, will be assembled again:", n_unmapped)
813 n_unitigs_0 = n_unitigs_1
814 # another round of assembly
815 unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C )
816 unitigs_2nd = unmapped_ra_collection.fermi_assemble( fermiMinOverlap, opt_flag = 0x80 )
818 if unitigs_2nd:
819 unitig_list = self.add_to_unitig_list ( unitig_list, unitigs_2nd )
820 n_unitigs_1 = len( unitig_list )
821 #print " # of Unitigs:", n_unitigs_1
822 #print " Map reads to unitigs"
823 ( unitig_alns, reference_alns, aln_scores, markup_alns ) = self.align_unitig_to_REFSEQ( unitig_list )
824 self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns )
825 [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = self.remap_RAs_w_unitigs( unitig_list )
826 n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C )
827 #else:
828 # for r in unmapped_RAlist_T:
829 # print r.get_FASTQ().decode().lstrip()
831 # print (" # of RAs not mapped, will be assembled again with 1/2 of fermiMinOverlap:", n_unmapped)
832 # another round of assembly
833 unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C )
834 unitigs_2nd = unmapped_ra_collection.fermi_assemble( fermiMinOverlap/2, opt_flag = 0x80 )
836 if unitigs_2nd:
837 unitig_list = self.add_to_unitig_list ( unitig_list, unitigs_2nd )
838 n_unitigs_1 = len( unitig_list )
839 #print " # of Unitigs:", n_unitigs_1
840 #print " Map reads to unitigs"
841 ( unitig_alns, reference_alns, aln_scores, markup_alns ) = self.align_unitig_to_REFSEQ( unitig_list )
842 self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns )
843 [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = self.remap_RAs_w_unitigs( unitig_list )
844 n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C )
845 #else:
846 # for r in unmapped_RAlist_T:
847 # print r.get_FASTQ().decode().lstrip()
848 if len(unitig_list) == 0:
849 raise Exception("Shouldn't reach here")
850 # print (" # of Unitigs:", n_unitigs_1)
852 if len(unitig_list) == 0:
853 return None
854 #print (" Final round: # of Unitigs:", len(unitig_list))
855 #print (" Final round: # of RAs not mapped:", n_unmapped)
857 start = min( self.left, self.RAs_left )
858 end = max( self.right, self.RAs_right )
860 # create UnitigCollection
861 for i in range( len( unitig_list ) ):
862 #b'---------------------------AAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAA--------------'
863 #b'GATATAAATAGGATGTTATGAGTTTTCAAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAAAATACAAATGTATT'
864 tmp_unitig_aln = unitig_alns[ i ]
865 tmp_reference_aln = reference_alns[ i ]
866 tmp_unitig_seq = tmp_unitig_aln.replace(b'-',b'')
867 tmp_reference_seq = tmp_reference_aln.replace(b'-',b'')
869 # print tmp_unitig_aln
870 # print tmp_reference_aln
871 # print tmp_unitig_seq
872 # print tmp_reference_aln
874 # find the position on self.peak_refseq_ext
875 left_padding_ref = self.peak_refseq_ext.find( tmp_reference_seq ) # this number of nts should be skipped on refseq_ext from left
876 right_padding_ref = len(self.peak_refseq_ext) - left_padding_ref - len(tmp_reference_seq) # this number of nts should be skipped on refseq_ext from right
878 #now, decide the lpos and rpos on reference of this unitig
879 #first, trim left padding '-'
880 left_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.lstrip(b'-'))
881 right_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.rstrip(b'-'))
883 tmp_lpos = start + left_padding_ref
884 tmp_rpos = end - right_padding_ref
886 for j in range( left_padding_unitig ):
887 if tmp_reference_aln[ j ] != b'-':
888 tmp_lpos += 1
889 for j in range( 1, right_padding_unitig + 1 ):
890 if tmp_reference_aln[ -j ] != b'-':
891 tmp_rpos -= 1
893 tmp_unitig_aln = tmp_unitig_aln[ left_padding_unitig:(len(tmp_unitig_aln)-right_padding_unitig)]
894 tmp_reference_aln = tmp_reference_aln[ left_padding_unitig:(len(tmp_reference_aln)-right_padding_unitig)]
896 ura_list.append( UnitigRAs( self.chrom, tmp_lpos, tmp_rpos, tmp_unitig_aln, tmp_reference_aln, [RAlists_T[i], RAlists_C[i]] ) )
898 return UnitigCollection( self.chrom, self.peak, ura_list )