Merge pull request #589 from tjni/clean-up-build-dependencies
[MACS.git] / MACS3 / Signal / Region.pyx
blob6165b6469a7ed1d0dc000b3e09ddea787116b116
1 # cython: language_level=3
2 # cython: profile=True
3 # Time-stamp: <2022-09-15 16:13:48 Tao Liu>
5 """Module for Region classe.
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
9 the distribution).
10 """
12 # ------------------------------------
13 # python modules
14 # ------------------------------------
15 #import random
16 #import re
17 #import sys
19 # ------------------------------------
20 # MACS3 modules
21 # ------------------------------------
23 from MACS3.Utilities.Constants import *
25 # ------------------------------------
26 # Other modules
27 # ------------------------------------
29 from cpython cimport bool
31 # ------------------------------------
32 # constants
33 # ------------------------------------
34 __version__ = "Region $Revision$"
35 __author__ = "Tao Liu <vladimir.liu@gmail.com>"
36 __doc__ = "Region class"
38 # ------------------------------------
39 # Misc functions
40 # ------------------------------------
42 # ------------------------------------
43 # Classes
44 # ------------------------------------
45 cdef class Regions:
46 """For plain region of chrom, start and end
47 """
48 cdef:
49 public dict regions
50 public int total
51 bool __flag_sorted
53 def __init__ (self):
54 self.regions= {}
55 self.__flag_sorted = False
56 self.total = 0
58 def __getitem__ (self, chrom):
59 return self.regions[ chrom ]
61 cpdef pop ( self, int n ):
62 # when called, pop the first n regions in Regions class. Self will be modified.
63 cdef:
64 list clist # for chromosomes
65 int tmp_l
66 bytes chrom
67 int n_taken # remember the number of regions prepared
68 object ret # returned Regions
70 if self.total == 0:
71 raise Exception("None left")
73 clist = sorted(list(self.regions.keys()))
74 n_taken = n
75 ret = Regions()
76 for chrom in sorted(clist):
77 ret.regions[chrom] = self.regions[chrom][:n_taken]
78 self.regions[chrom] = self.regions[chrom][n_taken:]
79 if not self.regions[chrom]:
80 # remove this chromosome if there is none left
81 self.regions.pop( chrom )
82 tmp_l = len( ret.regions[chrom] )
83 ret.total += tmp_l
84 # calculate remained
85 self.total -= tmp_l
86 #print( ret.total, self.total )
87 n_taken -= tmp_l
88 if not n_taken:
89 # when there is no need, quit loop
90 break
91 return ret
93 cpdef init_from_PeakIO ( self, object peaks ):
94 """Initialize the object with a PeakIO object.
96 Note: I intentionally forgot to check if peakio is actually a
97 PeakIO...
98 """
99 cdef:
100 bytes chrom
101 list ps
102 object p
103 int i
105 peaks.sort()
106 self.total = 0
107 for chrom in sorted(peaks.get_chr_names()):
108 ps = peaks.get_data_from_chrom( chrom )
109 self.regions[chrom] = []
110 for i in range( len( ps ) ):
111 p = ps[ i ]
112 self.regions[chrom].append( ( p['start'], p['end'] ) )
113 self.total += 1
114 self.sort()
116 cpdef add_loc ( self, bytes chrom, int start, int end ):
117 if self.regions.has_key(chrom):
118 self.regions[chrom].append( (start,end) )
119 else:
120 self.regions[chrom] = [ (start,end), ]
121 self.total += 1
122 self.__flag_sorted = False
123 return
125 cpdef sort (self):
126 cdef:
127 bytes chrom
129 if self.__flag_sorted:
130 return
131 for chrom in sorted(self.regions.keys()):
132 self.regions[chrom].sort()
133 self.__flag_sorted = True
135 cpdef set get_chr_names (self):
136 return set( sorted(self.regions.keys()) )
138 cpdef expand ( self, int flanking ):
139 cdef:
140 bytes chrom
141 list ps
142 int i
144 self.sort()
145 for chrom in sorted(self.regions.keys()):
146 ps = self.regions[ chrom ]
147 for i in range( len( ps ) ):
148 ps[i] = ( max(0, ps[i][0] - flanking), ps[i][1] + flanking )
149 ps.sort()
150 self.regions[chrom] = ps
152 cpdef merge_overlap ( self ):
154 merge overlapping regions
156 cdef:
157 bytes chrom
158 int s_new_region, e_new_region, i, j
159 dict regions, new_regions
160 list chrs
161 list regions_chr
162 tuple prev_region
164 self.sort()
165 regions = self.regions
166 new_regions = {}
168 chrs = list(regions.keys())
169 chrs.sort()
170 self.total = 0
171 for i in range(len(chrs)):
172 chrom = chrs[i]
173 new_regions[chrom]=[]
174 n_append = new_regions[chrom].append
175 prev_region = ()
176 regions_chr = regions[chrom]
177 for i in range( len(regions_chr) ):
178 if not prev_region:
179 prev_region = regions_chr[i]
180 continue
181 else:
182 if regions_chr[i][0] <= prev_region[1]:
183 s_new_region = prev_region[0]
184 e_new_region = regions_chr[i][1]
185 prev_region = (s_new_region,e_new_region)
186 else:
187 n_append(prev_region)
188 prev_region = regions_chr[i]
189 if prev_region:
190 n_append(prev_region)
191 self.total += len( new_regions[chrom] )
192 self.regions = new_regions
193 self.sort()
194 return True
196 cpdef write_to_bed (self, fhd ):
197 cdef:
198 int i
199 bytes chrom
200 tuple region
202 chrs = list(self.regions.keys())
203 chrs.sort()
204 for i in range( len(chrs) ):
205 chrom = chrs[i]
206 for region in self.regions[chrom]:
207 fhd.write( "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] ) )
209 def __str__ ( self ):
210 cdef:
211 int i
212 bytes chrom
213 tuple region
214 str ret
216 ret = ""
217 chrs = list(self.regions.keys())
218 chrs.sort()
219 for i in range( len(chrs) ):
220 chrom = chrs[i]
221 for region in self.regions[chrom]:
222 ret += "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] )
223 return ret
225 # cpdef object randomly_pick ( self, int n, int seed = 12345 ):
226 # """Shuffle the regions and get n regions out of it. Return a
227 # new Regions object.
228 # """
230 # cdef:
231 # list all_pc
232 # list chrs
233 # bytes chrom
234 # object ret_peakio, p
235 # assert n > 0
236 # chrs = sorted(list(self.peaks.keys()))
237 # all_pc = []
238 # for chrom in chrs:
239 # all_pc.extend(self.peaks[chrom])
240 # random.seed( seed )
241 # all_pc = random.shuffle( all_pc )[:n]
242 # ret_peakio = PeakIO()
243 # for p in all_pc:
244 # ret_peakio.add_PeakContent ( p["chrom"], p )
245 # return ret_peakio
248 cpdef void exclude (self, object regionio2):
249 """ Remove overlapping regions in regionio2, another Region
250 object.
253 cdef:
254 dict regions1, regions2
255 list chrs1, chrs2
256 bytes k
257 dict ret_regionss
258 bool overlap_found
259 tuple r1, r2
260 long n_rl1, n_rl2
262 self.sort()
263 regions1 = self.regions
264 self.total = 0
265 assert isinstance(regionio2,Regions)
266 regionio2.sort()
267 regions2 = regionio2.regions
269 ret_regions = dict()
270 chrs1 = list(regions1.keys())
271 chrs2 = list(regions2.keys())
272 for k in chrs1:
273 #print(f"chromosome {k}")
274 if not chrs2.count(k):
275 # no such chromosome in peaks1, then don't touch the peaks in this chromosome
276 ret_regions[ k ] = regions1[ k ]
277 self.total += len( ret_regions[ k ] )
278 continue
279 ret_regions[ k ] = []
280 n_rl1 = len( regions1[k] )
281 n_rl2 = len( regions2[k] )
282 rl1_k = iter( regions1[k] ).__next__
283 rl2_k = iter( regions2[k] ).__next__
284 overlap_found = False
285 r1 = rl1_k()
286 n_rl1 -= 1
287 r2 = rl2_k()
288 n_rl2 -= 1
289 while ( True ):
290 # we do this until there is no r1 or r2 left.
291 if r2[0] < r1[1] and r1[0] < r2[1]:
292 # since we found an overlap, r1 will be skipped/excluded
293 # and move to the next r1
294 overlap_found = True
295 n_rl1 -= 1
296 if n_rl1 >= 0:
297 r1 = rl1_k()
298 overlap_found = False
299 continue
300 else:
301 break
302 if r1[1] < r2[1]:
303 # in this case, we need to move to the next r1,
304 # we will check if overlap_found is true, if not, we put r1 in a new dict
305 if not overlap_found:
306 ret_regions[ k ].append( r1 )
307 n_rl1 -= 1
308 if n_rl1 >= 0:
309 r1 = rl1_k()
310 overlap_found = False
311 else:
312 # no more r1 left
313 break
314 else:
315 # in this case, we need to move the next r2
316 if n_rl2:
317 r2 = rl2_k()
318 n_rl2 -= 1
319 else:
320 # no more r2 left
321 break
322 if n_rl1 >= 0:
323 ret_regions[ k ].extend( regions1[ k ][-n_rl1-1:] )
324 self.total += len( ret_regions[ k ] )
326 self.regions = ret_regions
327 self.__flag_sorted = False
328 self.sort()
329 return