1 # cython: language_level=3
3 # Time-stamp: <2021-03-10 23:39:52 Tao Liu>
5 """Module for SAPPER BAMParser class
7 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
9 This code is free software; you can redistribute it and/or modify it
10 under the terms of the BSD License (see the file COPYING included
11 with the distribution).
16 @contact: tliu4@buffalo.edu
18 # ------------------------------------
20 # ------------------------------------
22 from collections
import Counter
23 from operator
import itemgetter
26 from MACS3
.Signal
.ReadAlignment
import ReadAlignment
27 from MACS3
.Signal
.PosReadsInfo
import PosReadsInfo
28 from MACS3
.Signal
.UnitigRACollection
import UnitigRAs
, UnitigCollection
29 from MACS3
.IO
.PeakIO
import PeakIO
31 from cpython cimport bool
32 from cpython
.mem cimport PyMem_Malloc
, PyMem_Realloc
, PyMem_Free
36 from numpy cimport uint32_t
, uint64_t
, int32_t
, int64_t
38 from libc
.stdlib cimport malloc
, free
, realloc
40 cdef extern from "stdlib.h":
41 ctypedef unsigned
int size_t
42 size_t strlen
(char *s
)
43 #void *malloc(size_t size)
44 void *calloc
(size_t n
, size_t size
)
46 int strcmp
(char *a
, char *b
)
47 char * strcpy
(char *a
, char *b
)
48 long atol
(char *bytes
)
52 # --- fermi-lite functions ---
53 #define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default)
54 #define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default)
55 #define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default)
57 cdef extern from "fml.h":
58 ctypedef struct bseq1_t
:
61 char *qual
# NULL-terminated strings; length expected to match $l_seq
63 ctypedef struct magopt_t
:
64 int flag
, min_ovlp
, min_elen
, min_ensr
, min_insr
, max_bdist
, max_bdiff
, max_bvtx
, min_merge_len
, trim_len
, trim_depth
65 float min_dratio1
, max_bcov
, max_bfrac
67 ctypedef struct fml_opt_t
:
68 int n_threads
# number of threads; don't use multi-threading for small data sets
69 int ec_k
# k-mer length for error correction; 0 for auto estimate
70 int min_cnt
, max_cnt
# both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt]
71 int min_asm_ovlp
# min overlap length during assembly
72 int min_merge_len
# during assembly, don't explicitly merge an overlap if shorter than this value
73 magopt_t mag_opt
# graph cleaning options
75 ctypedef struct fml_ovlp_t
:
76 uint32_t len_
, from_
, id_
, to_
77 #unit32_t from # $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end
79 #uint32_t to # $id: unitig number
81 ctypedef struct fml_utg_t
:
82 int32_t
len # length of sequence
83 int32_t nsr
# number of supporting reads
84 char *seq
# unitig sequence
85 char *cov
# cov[i]-33 gives per-base coverage at i
86 int n_ovlp
[2] # number of 5'-end [0] and 3'-end [1] overlaps
87 fml_ovlp_t
*ovlp
# overlaps, of size n_ovlp[0]+n_ovlp[1]
89 void fml_opt_init
(fml_opt_t
*opt
)
90 fml_utg_t
* fml_assemble
(const fml_opt_t
*opt
, int n_seqs
, bseq1_t
*seqs
, int *n_utg
)
91 void fml_utg_destroy
(int n_utg
, fml_utg_t
*utg
)
92 void fml_utg_print
(int n_utgs
, const fml_utg_t
*utg
)
93 bseq1_t
*bseq_read
(const
char *fn
, int *n
)
95 # --- end of fermi-lite functions ---
97 # --- smith-waterman alignment functions ---
99 cdef extern from "swalign.h":
100 ctypedef struct seq_pair_t
:
105 ctypedef struct align_t
:
115 align_t
*smith_waterman
(seq_pair_t
*problem
)
116 void destroy_seq_pair
(seq_pair_t
*pair
)
117 void destroy_align
(align_t
*ali
)
119 # ------------------------------------
121 # ------------------------------------
122 __version__
= "Parser $Revision$"
123 __author__
= "Tao Liu <tliu4@buffalo.edu>"
124 __doc__
= "All Parser classes"
126 __DNACOMPLEMENT__
= b
'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A.
128 __CIGARCODE__
= "MIDNSHP=X"
130 # ------------------------------------
132 # ------------------------------------
134 # ------------------------------------
136 # ------------------------------------
138 cdef class RACollection
:
139 """A collection of ReadAlignment objects and the corresponding
145 object peak
# A PeakIO object
146 list RAlists
# contain ReadAlignment lists for treatment (0) and control (1)
147 long left
# left position of peak
148 long right
# right position of peak
149 long length
# length of peak
150 long RAs_left
# left position of all RAs in the collection
151 long RAs_right
# right position of all RAs in the collection
152 bool sorted
# if sorted by lpos
153 bytes peak_refseq
# reference sequence in peak region b/w left and right
154 bytes peak_refseq_ext
# reference sequence in peak region with extension on both sides b/w RAs_left and RAs_right
156 def __init__
( self, chrom
, peak
, RAlist_T
, RAlist_C
=[] ):
157 """Create RACollection object by taking:
159 1. peak: a PeakIO object indicating the peak region.
161 2. RAlist: a python list of ReadAlignment objects containing
162 all the reads overlapping the peak region. If no RAlist_C
163 given, it will be [].
166 if len(RAlist_T
) == 0:
167 # no reads, return None
168 raise Exception("No reads from ChIP sample to construct RAcollection!")
171 #print(len(RAlist_T),"\n")
172 #print(len(RAlist_C),"\n")
173 self.RAlists
= [ RAlist_T
, RAlist_C
]
174 self.left
= peak
["start"]
175 self.right
= peak
["end"]
176 self.length
= self.right
- self.left
178 self.RAs_left
= RAlist_T
[0]["lpos"] # initial assignment of RAs_left
179 self.RAs_right
= RAlist_T
[-1]["rpos"] # initial assignment of RAs_right
180 self.sort
() # it will set self.sorted = True
184 # check RAs_left and RAs_right
186 if ra
[ "lpos" ] < self.RAs_left
:
187 self.RAs_left
= ra
[ "lpos" ]
188 if ra
[ "rpos" ] > self.RAs_right
:
189 self.RAs_right
= ra
[ "rpos" ]
192 if ra
[ "lpos" ] < self.RAs_left
:
193 self.RAs_left
= ra
[ "lpos" ]
194 if ra
[ "rpos" ] > self.RAs_right
:
195 self.RAs_right
= ra
[ "rpos" ]
196 (self.peak_refseq
, self.peak_refseq_ext
) = self.__get_peak_REFSEQ
()
198 def __getitem__
( self, keyname
):
199 if keyname
== "chrom":
201 elif keyname
== "left":
203 elif keyname
== "right":
205 elif keyname
== "RAs_left":
207 elif keyname
== "RAs_right":
208 return self.RAs_right
209 elif keyname
== "length":
211 elif keyname
== "count":
212 return len( self.RAlists
[ 0 ] )+ len( self.RAlists
[ 1 ] )
213 elif keyname
== "count_T":
214 return len( self.RAlists
[ 0 ] )
215 elif keyname
== "count_C":
216 return len( self.RAlists
[ 1 ] )
217 elif keyname
== "peak_refseq":
218 return self.peak_refseq
219 elif keyname
== "peak_refseq_ext":
220 return self.peak_refseq_ext
222 raise KeyError("Unavailable key:", keyname
)
224 def __getstate__
( self ):
225 #return {"chrom":self.chrom, "peak":self.peak, "RAlists":self.RAlists,
226 # "left":self.left, "right":self.right, "length": self.length,
227 # "RAs_left":self.RAs_left, "RAs_right":self.RAs_right}
228 return (self.chrom
, self.peak
, self.RAlists
, self.left
, self.right
, self.length
, self.RAs_left
, self.RAs_right
, self.peak_refseq
, self.peak_refseq_ext
)
230 def __setstate__
( self, state
):
231 (self.chrom
, self.peak
, self.RAlists
, self.left
, self.right
, self.length
, self.RAs_left
, self.RAs_right
, self.peak_refseq
, self.peak_refseq_ext
) = state
234 """Sort RAs according to lpos. Should be used after realignment.
237 if self.RAlists
[ 0 ]:
238 self.RAlists
[ 0 ].sort
(key
=itemgetter
("lpos"))
239 if self.RAlists
[ 1 ]:
240 self.RAlists
[ 1 ].sort
(key
=itemgetter
("lpos"))
244 cpdef remove_outliers
( self, int percent
= 5 ):
245 """ Remove outliers with too many n_edits. The outliers with
246 n_edits in top p% will be removed.
248 Default: remove top 5% of reads that have too many differences
249 with reference genome.
253 object read
# ReadAlignment object
259 for ralist
in self.RAlists
:
261 n_edits_list
.append
( read
["n_edits"] )
263 highest_n_edits
= n_edits_list
[ int( len( n_edits_list
) * (1 - percent
* .01) ) ]
265 for i
in ( range(len(self.RAlists
)) ):
267 for read
in self.RAlists
[ i
]:
268 if read
["n_edits"] <= highest_n_edits
:
269 new_RAlist
.append
( read
)
270 self.RAlists
[ i
] = new_RAlist
274 cpdef n_edits_sum
( self ):
284 for ralist
in self.RAlists
:
286 n_edits_list
.append
( read
["n_edits"] )
289 # print ( n_edits_list )
290 c
= Counter
( n_edits_list
)
293 cdef tuple __get_peak_REFSEQ
( self ):
294 """Get the reference sequence within the peak region.
298 bytearray peak_refseq
300 long prev_r
#remember the previous filled right end
305 bytearray read_refseq_ext
306 bytearray read_refseq
308 start
= min( self.RAs_left
, self.left
)
309 end
= max( self.RAs_right
, self.right
)
310 #print ("left",start,"right",end)
311 peak_refseq_ext
= bytearray
( b
'N' * ( end
- start
) )
314 peak_refseq_ext
= self.__fill_refseq
( peak_refseq_ext
, self.RAlists
[0] )
315 # and control if available.
317 peak_refseq_ext
= self.__fill_refseq
( peak_refseq_ext
, self.RAlists
[1] )
320 peak_refseq
= peak_refseq_ext
[ self.left
- start
: self.right
- start
]
321 return ( bytes
( peak_refseq
), bytes
( peak_refseq_ext
) )
323 cdef bytearray __fill_refseq
( self, bytearray seq
, list ralist
):
324 """Fill refseq sequence of whole peak with refseq sequences of
329 long prev_r
# previous right position of last
334 bytearray read_refseq
336 start
= min( self.RAs_left
, self.left
)
338 #print(len(ralist),"\n")
339 prev_r
= ralist
[0]["lpos"]
341 for i
in range( len( ralist
) ):
343 if read
[ "lpos" ] > prev_r
:
344 read
= ralist
[ i
- 1 ]
345 read_refseq
= read
.get_REFSEQ
()
346 ind
= read
["lpos"] - start
347 ind_r
= ind
+ read
["rpos"] - read
["lpos"]
348 seq
[ ind
: ind_r
] = read_refseq
349 prev_r
= read
[ "rpos" ]
352 read_refseq
= read
.get_REFSEQ
()
353 ind
= read
["lpos"] - start
354 ind_r
= ind
+ read
["rpos"] - read
["lpos"]
355 seq
[ ind
: ind_r
] = read_refseq
358 cpdef get_PosReadsInfo_ref_pos
( self, long ref_pos
, bytes ref_nt
, int Q
=20 ):
359 """Generate a PosReadsInfo object for a given reference genome
362 Return a PosReadsInfo object.
376 posreadsinfo_p
= PosReadsInfo
( ref_pos
, ref_nt
)
379 for i
in range( len( self.RAlists
[ 0 ] ) ):
380 ra
= self.RAlists
[ 0 ][ i
]
381 if ra
[ "lpos" ] <= ref_pos
and ra
[ "rpos" ] > ref_pos
:
382 ( s
, bq
, strand
, tip
, pos
) = ra
.get_variant_bq_by_ref_pos
( ref_pos
)
383 posreadsinfo_p
.add_T
( i
, bytes
( s
), bq
[ 0 ], strand
, tip
, Q
=Q
)
386 for i
in range( len( self.RAlists
[ 1 ] ) ):
387 ra
= self.RAlists
[ 1 ][ i
]
388 if ra
[ "lpos" ] <= ref_pos
and ra
[ "rpos" ] > ref_pos
:
389 ( s
, bq
, strand
, tip
, pos
) = ra
.get_variant_bq_by_ref_pos
( ref_pos
)
390 posreadsinfo_p
.add_C
( i
, bytes
( s
), bq
[ 0 ], strand
, Q
=Q
)
392 return posreadsinfo_p
394 cpdef bytearray get_FASTQ
( self ):
395 """Get FASTQ file for all reads in RACollection.
402 fastq_text
= bytearray
(b
"")
404 for ra
in self.RAlists
[0]:
405 fastq_text
+= ra
.get_FASTQ
()
407 for ra
in self.RAlists
[1]:
408 fastq_text
+= ra
.get_FASTQ
()
412 cdef list fermi_assemble
( self, int fermiMinOverlap
, int opt_flag
= 0x80 ):
413 """A wrapper function to call Fermi unitig building functions.
423 int unitig_k
, merge_min_len
432 bytes unitig
#final unitig
433 list unitig_list
# contain list of sequences in bytes format
436 n_seqs
= len(self.RAlists
[0]) + len(self.RAlists
[1])
440 # prepare seq and qual, note, we only extract SEQ according to the +
441 # strand of reference sequence.
442 seqs
= <bseq1_t
*> malloc
( n_seqs
* sizeof(bseq1_t
) ) # we rely on fermi-lite to free this mem
445 for ra
in self.RAlists
[0]:
449 cseq
= <char *>malloc
( (l
+1)*sizeof(char))# we rely on fermi-lite to free this mem
450 cqual
= <char *>malloc
( (l
+1)*sizeof(char))# we rely on fermi-lite to free this mem
452 cseq
[ j
] = tmps
[ j
]
453 cqual
[ j
] = tmpq
[ j
] + 33
458 seqs
[ i
].qual
= cqual
459 seqs
[ i
].l_seq
= len(tmps
)
462 # print "@",ra["readname"].decode()
463 # print cseq.decode()
465 # print cqual.decode()
467 for ra
in self.RAlists
[1]:
471 cseq
= <char *>malloc
( (l
+1)*sizeof(char))# we rely on fermi-lite to free this mem
472 cqual
= <char *>malloc
( (l
+1)*sizeof(char))# we rely on fermi-lite to free this mem
474 cseq
[ j
] = tmps
[ j
]
475 cqual
[ j
] = tmpq
[ j
] + 33
480 seqs
[ i
].qual
= cqual
481 seqs
[ i
].l_seq
= len(tmps
)
483 # print "@",ra["readname"].decode()
484 # print cseq.decode()
486 # print cqual.decode()
488 # if self.RAlists[1]:
489 # unitig_k=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*fermiOverlapMinRatio)
491 # merge_min_len=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*0.5)
493 # unitig_k = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio)
495 # merge_min_len=int(self.RAlists[0][0]["l"]*0.5)
496 #fermiMinOverlap = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio)
498 # minimum overlap to merge, default 0
499 # merge_min_len= max( 25, int(self.RAlists[0][0]["l"]*0.5) )
500 # merge_min_len= int(self.RAlists[0][0]["l"]*0.5)
502 opt
= <fml_opt_t
*> PyMem_Malloc
( sizeof(fml_opt_t
) )
503 n_utg
= <int *> PyMem_Malloc
( sizeof(int) )
506 # k-mer length for error correction (0 for auto; -1 to disable)
509 # min overlap length during initial assembly
510 opt
.min_asm_ovlp
= fermiMinOverlap
512 # minimum length to merge, during assembly, don't explicitly merge an overlap if shorter than this value
513 # opt.min_merge_len = merge_min_len
515 # there are more 'options' for mag clean:
516 # flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth, min_dratio1, max_bcov, max_bfrac
517 # min_elen (300) will be adjusted
518 # min_ensr (4), min_insr (3) will be computed
519 # min_merge_len (0) will be updated using opt.min_merge_len
521 # We can adjust: flag (0x40|0x80), min_ovlp (0), min_dratio1 (0.7), max_bdiff (50), max_bdist (512), max_bvtx (64), trim_len (0), trim_depth (6), max_bcov (10.), max_bfrac (0.15)
523 # 0x20: MAG_F_AGGRESSIVE pop variant bubbles
524 # 0x40: MAG_F_POPOPEN aggressive tip trimming
525 # 0x80: MAG_F_NO_SIMPL skip bubble simplification
526 opt
.mag_opt
.flag
= opt_flag
529 #opt.mag_opt.min_ovlp = fermiMinOverlap
531 # drop an overlap if its length is below maxOvlpLen*FLOAT
532 #opt.mag_opt.min_dratio1 = 0.5
534 # retain a bubble if one side is longer than the other side by >INT-bp
535 #opt.mag_opt.max_bdiff = 10#merge_min_len
538 # trim_depth: Parameter used to trim the open end/tip. If trim_len == 0, do nothing
541 # max_bvtx: Parameter used to simply bubble while 0x80 flag is set.
542 #opt.mag_opt.max_bdist = 1024
543 #opt.mag_opt.max_bvtx = 128
546 # max_bfrac: Parameter used when aggressive bubble removal is not used. Bubble will be removed if its average coverage lower than max_bcov and fraction (cov1/(cov1+cov2)) is lower than max_bfrac
547 #opt.mag_opt.max_bcov = 10.
548 #opt.mag_opt.max_bfrac = 0.01
550 utg
= fml_assemble
(opt
, n_seqs
, seqs
, n_utg
)
553 for i
in range( n_utg
[0] ):
558 #for j in range( p.len ):
559 # unitig += [b'A',b'C',b'G',b'T',b'N'][int(p.seq[j]) - 1]
560 #unitig_list.append( unitig )
561 unitig_list
.append
( p
.seq
)
563 fml_utg_destroy
(n_utg
[0], utg
)
570 cdef tuple align_unitig_to_REFSEQ
( self, list unitig_list
):
571 """Note: we use smith waterman, but we don't use linear gap
572 penalty at this time.
574 Also, if unitig is mapped to - strand, we will revcomp the
575 unitig. So the unitig_list will be changed in this case.
585 bytes target_aln_f
, target_aln_r
586 bytes reference_aln_f
, reference_aln_r
587 bytes markup_aln_f
, markup_aln_r
588 double score_f
, score_r
589 list target_alns
= []
590 list reference_alns
= []
591 list markup_alns
= []
595 reference
= copy
(self.peak_refseq_ext
+b
'\x00')
597 for i
in range( len(unitig_list
) ):
598 unitig
= unitig_list
[ i
]
599 target
= copy
(unitig
+ b
'\x00')
600 # we use swalign.c for local alignment (without affine gap
601 # penalty). Will revise later.
603 problem
.alen
= len( unitig
)
604 problem
.b
= reference
605 problem
.blen
= len( self.peak_refseq_ext
)
606 results
= smith_waterman
( &problem
)
607 target_aln_f
= results
.seqs
.a
608 reference_aln_f
= results
.seqs
.b
609 markup_aln_f
= results
.markup
610 score_f
= results
.score
611 free
( results
.seqs
.a
)
612 free
( results
.seqs
.b
)
613 free
( results
.markup
)
615 # end of local alignment
617 # try reverse complement
618 target
= copy
(unitig
[::-1] + b
'\x00')
619 target
= target
.translate
( __DNACOMPLEMENT__
)
621 problem
.alen
= len( unitig
)
622 problem
.b
= reference
623 problem
.blen
= len( self.peak_refseq_ext
)
624 results
= smith_waterman
( &problem
)
625 target_aln_r
= results
.seqs
.a
626 reference_aln_r
= results
.seqs
.b
627 markup_aln_r
= results
.markup
628 score_r
= results
.score
629 free
( results
.seqs
.a
)
630 free
( results
.seqs
.b
)
631 free
( results
.markup
)
633 # end of local alignment
635 if score_f
> score_r
:
636 target_alns
.append
( target_aln_f
)
637 reference_alns
.append
( reference_aln_f
)
638 markup_alns
.append
( markup_aln_f
)
639 aln_scores
.append
( score_f
)
641 target_alns
.append
( target_aln_r
)
642 reference_alns
.append
( reference_aln_r
)
643 markup_alns
.append
( markup_aln_r
)
644 aln_scores
.append
( score_r
)
645 # we will revcomp unitig
646 unitig
= unitig
[::-1]
647 unitig_list
[ i
] = unitig
.translate
( __DNACOMPLEMENT__
)
649 return ( target_alns
, reference_alns
, aln_scores
, markup_alns
)
651 cdef verify_alns
( self, unitig_list
, unitig_alns
, reference_alns
, aln_scores
, markup_alns
, float min_score_100
= 150 ):
652 """Remove aln/unitig if it contains too many edits in a small region
654 default min score is 150, which means under 2/-3/-5/-2 scoring schema, there are 10 mismatches within 100bps region.
658 for i
in range( len( unitig_list
)-1, -1, -1 ):
659 #print i, aln_scores[ i ]
660 #print unitig_alns[ i ]
661 #print markup_alns[ i ]
662 #print reference_alns[ i ]
663 if aln_scores
[ i
] * 100 /len( markup_alns
[ i
] ) < min_score_100
:
666 reference_alns
.pop
( i
)
672 cdef tuple filter_unitig_with_bad_aln
( self, list unitig_list
, list target_alns
, list reference_alns
, float gratio
= 0.25 ):
673 """Remove unitigs that has too much gaps (both on target and reference) during alignments.
677 cdef list remap_RAs_w_unitigs
( self, list unitig_list
):
678 """Remap RAs to unitigs, requiring perfect match.
680 Return RAlists_T, RAlists_C, unmapped_racollection.
683 list RAlists_T
= [] # lists of list of RAs of ChIP mapped to each unitig
685 list unmapped_RAlist_T
= [] # list of RAs of ChIP unmappable to unitigs
686 list unmapped_RAlist_C
= []
687 #RACollection unmapped_ra_collection
691 bytes tmp_ra_seq
, unitig
693 for i
in range( len(unitig_list
) ):
694 RAlists_T
.append
([]) # for each unitig, there is another list of RAs
697 # assign RAs to unitigs
699 for tmp_ra
in self.RAlists
[0]:
701 tmp_ra_seq
= tmp_ra
["SEQ"]
702 for i
in range( len(unitig_list
) ):
703 unitig
= unitig_list
[ i
]
704 if tmp_ra_seq
in unitig
:
706 RAlists_T
[ i
].append
( tmp_ra
)
709 unmapped_RAlist_T
.append
( tmp_ra
)
710 #print "unmapped:", tmp_ra["SEQ"]
712 for tmp_ra
in self.RAlists
[1]:
714 tmp_ra_seq
= tmp_ra
["SEQ"]
715 for i
in range( len(unitig_list
) ):
716 unitig
= unitig_list
[ i
]
717 if tmp_ra_seq
in unitig
:
719 RAlists_C
[ i
].append
( tmp_ra
)
722 unmapped_RAlist_C
.append
( tmp_ra
)
723 #print "unmapped:", tmp_ra["SEQ"]
725 #if unmapped_RAlist_T:
726 #unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C )
727 return [ RAlists_T
, RAlists_C
, unmapped_RAlist_T
, unmapped_RAlist_C
]
729 cdef list add_to_unitig_list
( self, unitig_list
, unitigs_2nd
):
740 for i
in range( len(unitigs_2nd
) ):
741 flag
= 0 # initial value: can't be found in unitig_list
742 u0
= unitigs_2nd
[ i
]
743 for j
in range( len( unitig_list
) ):
744 u1
= unitig_list
[ j
]
745 if u1
.find
( u0
) != -1:
748 u1
= u1
[::-1].translate
(__DNACOMPLEMENT__
)
749 if u1
.find
( u0
) != -1:
753 new_unitig_list
.append
( u0
)
754 new_unitig_list
.extend
( unitig_list
)
755 return new_unitig_list
758 cpdef
object build_unitig_collection
( self, fermiMinOverlap
):
759 """unitig_list and tuple_alns are in the same order!
761 return UnitigCollection object.
768 list target_alns
, reference_alns
, aln_scores
, markup_alns
769 list target_alns_2nd
, reference_alns_2nd
, aln_scores_2nd
770 list RAlists_T
= [] # lists of list of RAs of ChIP mapped to each unitig
772 list unmapped_RAlist_T
= []
773 list unmapped_RAlist_C
= []
774 bytes tmp_unitig_seq
, tmp_reference_seq
775 bytes tmp_unitig_aln
, tmp_reference_aln
,
777 long left_padding_ref
, right_padding_ref
778 long left_padding_unitig
, right_padding_unitig
780 RACollection unmapped_ra_collection
782 int n_unmapped
, n_unitigs_0
, n_unitigs_1
784 # first round of assembly
785 # print (" First round to assemble unitigs")
786 unitig_list
= self.fermi_assemble
( fermiMinOverlap
, opt_flag
= 0x80 )
787 if len(unitig_list
) == 0:
791 n_unitigs_1
= len( unitig_list
)
792 #print " # of Unitigs:", n_unitigs_1
793 #print " Map reads to unitigs"
794 ( unitig_alns
, reference_alns
, aln_scores
, markup_alns
) = self.align_unitig_to_REFSEQ
( unitig_list
)
796 self.verify_alns
( unitig_list
, unitig_alns
, reference_alns
, aln_scores
, markup_alns
)
797 if len(unitig_list
) == 0:
798 # if stop here, it raises a flag that the region may contain too many mismapped reads, we return -1
800 # print (" # of Unitigs:", n_unitigs_1)
802 # assign RAs to unitigs
803 [ RAlists_T
, RAlists_C
, unmapped_RAlist_T
, unmapped_RAlist_C
] = self.remap_RAs_w_unitigs
( unitig_list
)
804 #print unmapped_ra_collection.get_FASTQ().decode()
806 n_unmapped
= len( unmapped_RAlist_T
) + len( unmapped_RAlist_C
)
808 while len( unmapped_RAlist_T
) > 0 and n_unitigs_1
!= n_unitigs_0
:
809 # if there are unmapped reads AND we can get more unitigs
810 # from last round of assembly, do assembly again
812 # print (" # of RAs not mapped, will be assembled again:", n_unmapped)
813 n_unitigs_0
= n_unitigs_1
814 # another round of assembly
815 unmapped_ra_collection
= RACollection
( self.chrom
, self.peak
, unmapped_RAlist_T
, unmapped_RAlist_C
)
816 unitigs_2nd
= unmapped_ra_collection
.fermi_assemble
( fermiMinOverlap
, opt_flag
= 0x80 )
819 unitig_list
= self.add_to_unitig_list
( unitig_list
, unitigs_2nd
)
820 n_unitigs_1
= len( unitig_list
)
821 #print " # of Unitigs:", n_unitigs_1
822 #print " Map reads to unitigs"
823 ( unitig_alns
, reference_alns
, aln_scores
, markup_alns
) = self.align_unitig_to_REFSEQ
( unitig_list
)
824 self.verify_alns
( unitig_list
, unitig_alns
, reference_alns
, aln_scores
, markup_alns
)
825 [ RAlists_T
, RAlists_C
, unmapped_RAlist_T
, unmapped_RAlist_C
] = self.remap_RAs_w_unitigs
( unitig_list
)
826 n_unmapped
= len( unmapped_RAlist_T
) + len( unmapped_RAlist_C
)
828 # for r in unmapped_RAlist_T:
829 # print r.get_FASTQ().decode().lstrip()
831 # print (" # of RAs not mapped, will be assembled again with 1/2 of fermiMinOverlap:", n_unmapped)
832 # another round of assembly
833 unmapped_ra_collection
= RACollection
( self.chrom
, self.peak
, unmapped_RAlist_T
, unmapped_RAlist_C
)
834 unitigs_2nd
= unmapped_ra_collection
.fermi_assemble
( fermiMinOverlap
/2, opt_flag
= 0x80 )
837 unitig_list
= self.add_to_unitig_list
( unitig_list
, unitigs_2nd
)
838 n_unitigs_1
= len( unitig_list
)
839 #print " # of Unitigs:", n_unitigs_1
840 #print " Map reads to unitigs"
841 ( unitig_alns
, reference_alns
, aln_scores
, markup_alns
) = self.align_unitig_to_REFSEQ
( unitig_list
)
842 self.verify_alns
( unitig_list
, unitig_alns
, reference_alns
, aln_scores
, markup_alns
)
843 [ RAlists_T
, RAlists_C
, unmapped_RAlist_T
, unmapped_RAlist_C
] = self.remap_RAs_w_unitigs
( unitig_list
)
844 n_unmapped
= len( unmapped_RAlist_T
) + len( unmapped_RAlist_C
)
846 # for r in unmapped_RAlist_T:
847 # print r.get_FASTQ().decode().lstrip()
848 if len(unitig_list
) == 0:
849 raise Exception("Shouldn't reach here")
850 # print (" # of Unitigs:", n_unitigs_1)
852 if len(unitig_list
) == 0:
854 #print (" Final round: # of Unitigs:", len(unitig_list))
855 #print (" Final round: # of RAs not mapped:", n_unmapped)
857 start
= min( self.left
, self.RAs_left
)
858 end
= max( self.right
, self.RAs_right
)
860 # create UnitigCollection
861 for i
in range( len( unitig_list
) ):
862 #b'---------------------------AAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAA--------------'
863 #b'GATATAAATAGGATGTTATGAGTTTTCAAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAAAATACAAATGTATT'
864 tmp_unitig_aln
= unitig_alns
[ i
]
865 tmp_reference_aln
= reference_alns
[ i
]
866 tmp_unitig_seq
= tmp_unitig_aln
.replace
(b
'-',b
'')
867 tmp_reference_seq
= tmp_reference_aln
.replace
(b
'-',b
'')
869 # print tmp_unitig_aln
870 # print tmp_reference_aln
871 # print tmp_unitig_seq
872 # print tmp_reference_aln
874 # find the position on self.peak_refseq_ext
875 left_padding_ref
= self.peak_refseq_ext
.find
( tmp_reference_seq
) # this number of nts should be skipped on refseq_ext from left
876 right_padding_ref
= len(self.peak_refseq_ext
) - left_padding_ref
- len(tmp_reference_seq
) # this number of nts should be skipped on refseq_ext from right
878 #now, decide the lpos and rpos on reference of this unitig
879 #first, trim left padding '-'
880 left_padding_unitig
= len(tmp_unitig_aln
) - len(tmp_unitig_aln
.lstrip
(b
'-'))
881 right_padding_unitig
= len(tmp_unitig_aln
) - len(tmp_unitig_aln
.rstrip
(b
'-'))
883 tmp_lpos
= start
+ left_padding_ref
884 tmp_rpos
= end
- right_padding_ref
886 for j
in range( left_padding_unitig
):
887 if tmp_reference_aln
[ j
] != b
'-':
889 for j
in range( 1, right_padding_unitig
+ 1 ):
890 if tmp_reference_aln
[ -j
] != b
'-':
893 tmp_unitig_aln
= tmp_unitig_aln
[ left_padding_unitig
:(len(tmp_unitig_aln
)-right_padding_unitig
)]
894 tmp_reference_aln
= tmp_reference_aln
[ left_padding_unitig
:(len(tmp_reference_aln
)-right_padding_unitig
)]
896 ura_list
.append
( UnitigRAs
( self.chrom
, tmp_lpos
, tmp_rpos
, tmp_unitig_aln
, tmp_reference_aln
, [RAlists_T
[i
], RAlists_C
[i
]] ) )
898 return UnitigCollection
( self.chrom
, self.peak
, ura_list
)