simplify output msgs of callvar
[MACS.git] / MACS3 / IO / PeakIO.pyx
blob644b7fb56ac730583c325e3f78efa1ee5a8ec460
1 # cython: language_level=3
2 # cython: profile=True
3 # Time-stamp: <2020-11-24 17:11:29 Tao Liu>
5 """Module for PeakIO IO classes.
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
9 the distribution).
10 """
12 # ------------------------------------
13 # python modules
14 # ------------------------------------
15 from itertools import groupby
16 from operator import itemgetter
17 import re
18 import sys
20 # ------------------------------------
21 # MACS3 modules
22 # ------------------------------------
24 from MACS3.Utilities.Constants import *
26 # ------------------------------------
27 # Other modules
28 # ------------------------------------
30 from cpython cimport bool
32 # ------------------------------------
33 # constants
34 # ------------------------------------
35 __version__ = "PeakIO $Revision$"
36 __author__ = "Tao Liu <taoliu@jimmy.harvard.edu>"
37 __doc__ = "PeakIO class"
39 # ------------------------------------
40 # Misc functions
41 # ------------------------------------
42 cdef str subpeak_letters( int i):
43 if i < 26:
44 return chr(97+i)
45 else:
46 return subpeak_letters(i // 26) + chr(97 + (i % 26))
48 # ------------------------------------
49 # Classes
50 # ------------------------------------
52 cdef class PeakContent:
53 cdef:
54 int start
55 int end
56 int length
57 int summit
58 float score
59 float pileup
60 float pscore
61 float fc
62 float qscore
63 bytes name
65 def __init__ ( self, int start, int end, int summit,
66 float peak_score, float pileup,
67 float pscore, float fold_change, float qscore,
68 bytes name= b"NA" ):
69 self.start = start
70 self.end = end
71 self.length = end - start
72 self.summit = summit
73 self.score = peak_score
74 self.pileup = pileup
75 self.pscore = pscore
76 self.fc = fold_change
77 self.qscore = qscore
78 self.name = name
80 def __getitem__ ( self, a ):
81 if a == "start":
82 return self.start
83 elif a == "end":
84 return self.end
85 elif a == "length":
86 return self.length
87 elif a == "summit":
88 return self.summit
89 elif a == "score":
90 return self.score
91 elif a == "pileup":
92 return self.pileup
93 elif a == "pscore":
94 return self.pscore
95 elif a == "fc":
96 return self.fc
97 elif a == "qscore":
98 return self.qscore
99 elif a == "name":
100 return self.name
102 def __setitem__ ( self, a, v ):
103 if a == "start":
104 self.start = v
105 elif a == "end":
106 self.end = v
107 elif a == "length":
108 self.length = v
109 elif a == "summit":
110 self.summit = v
111 elif a == "score":
112 self.score = v
113 elif a == "pileup":
114 self.pileup = v
115 elif a == "pscore":
116 self.pscore = v
117 elif a == "fc":
118 self.fc = v
119 elif a == "qscore":
120 self.qscore = v
121 elif a == "name":
122 self.name = v
124 def __str__ (self):
125 return "start:%d;end:%d;score:%f" % ( self.start, self.end, self.score )
127 cdef class PeakIO:
128 """IO for peak information.
131 cdef:
132 dict peaks
134 def __init__ (self):
135 self.peaks = {}
137 cpdef add (self, bytes chromosome, int start, int end, int summit = 0,
138 float peak_score = 0, float pileup = 0,
139 float pscore = 0, float fold_change = 0, float qscore = 0,
140 bytes name = b"NA"):
141 """items:
142 start:start
143 end:end,
144 length:end-start,
145 summit:summit,
146 score:peak_score,
147 pileup:pileup,
148 pscore:pscore,
149 fc:fold_change,
150 qscore:qscore
152 if not self.peaks.has_key(chromosome):
153 self.peaks[chromosome]=[]
154 self.peaks[chromosome].append(PeakContent( start, end, summit, peak_score, pileup, pscore, fold_change, qscore, name))
156 cpdef add_PeakContent ( self, bytes chromosome, object peakcontent ):
157 if not self.peaks.has_key(chromosome):
158 self.peaks[chromosome]=[]
159 self.peaks[chromosome].append(peakcontent)
161 def get_data_from_chrom (self, bytes chrom):
162 if not self.peaks.has_key( chrom ):
163 self.peaks[chrom]= []
164 return self.peaks[chrom]
166 cpdef set get_chr_names (self):
167 return set(sorted(self.peaks.keys()))
169 def sort ( self ):
170 # sort by position
171 chrs = sorted(list(self.peaks.keys()))
172 for chrom in chrs:
173 self.peaks[chrom].sort(key=lambda x:x['start'])
174 return
177 def filter_pscore (self, double pscore_cut ):
178 cdef bytes chrom
180 peaks = self.peaks
181 new_peaks = {}
182 chrs = sorted(list(peaks.keys()))
184 for chrom in chrs:
185 new_peaks[chrom]=[p for p in peaks[chrom] if p['pscore'] >= pscore_cut]
186 self.peaks = new_peaks
188 def filter_qscore (self, double qscore_cut ):
189 cdef bytes chrom
191 peaks = self.peaks
192 new_peaks = {}
193 chrs = sorted(list(peaks.keys()))
195 for chrom in chrs:
196 new_peaks[chrom]=[p for p in peaks[chrom] if p['qscore'] >= qscore_cut]
197 self.peaks = new_peaks
199 def filter_fc (self, fc_low, fc_up=None ):
200 """Filter peaks in a given fc range.
202 If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up)
205 peaks = self.peaks
206 new_peaks = {}
207 chrs = list(peaks.keys())
208 chrs.sort()
209 if fc_up:
210 for chrom in chrs:
211 new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low and p['fc']<fc_up]
212 else:
213 for chrom in chrs:
214 new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low]
215 self.peaks = new_peaks
217 def total (self):
218 peaks = self.peaks
219 chrs = list(peaks.keys())
220 chrs.sort()
221 x = 0
222 for chrom in chrs:
223 x += len(peaks[chrom])
224 return x
226 # these methods are very fast, specifying types is unnecessary
227 def write_to_xls (self, fhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS"):
228 return self._to_xls(name_prefix=name_prefix, name=name,
229 print_func=fhd.write)
231 cdef _to_xls (self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS", print_func=sys.stdout.write):
233 if self.peaks:
234 print_func("\t".join(("chr","start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
235 else:
236 return
238 try: peakprefix = name_prefix % name
239 except: peakprefix = name_prefix
241 peaks = self.peaks
242 chrs = list(peaks.keys())
243 chrs.sort()
244 n_peak = 0
245 for chrom in chrs:
246 for end, group in groupby(peaks[chrom], key=itemgetter("end")):
247 n_peak += 1
248 these_peaks = list(group)
249 if len(these_peaks) > 1:
250 for i, peak in enumerate(these_peaks):
251 peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i))
252 #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
253 print_func("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
254 print_func("\t%d" % (peak['summit']+1)) # summit position
255 print_func("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
256 print_func("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
257 print_func("\t%.6g" % (peak['fc'])) # fold change at summit
258 print_func("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
259 print_func("\t%s" % peakname)
260 print_func("\n")
261 else:
262 peak = these_peaks[0]
263 peakname = "%s%d" % (peakprefix.decode(), n_peak)
264 #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
265 print_func("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
266 print_func("\t%d" % (peak['summit']+1)) # summit position
267 print_func("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
268 print_func("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
269 print_func("\t%.6g" % (peak['fc'])) # fold change at summit
270 print_func("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
271 print_func("\t%s" % peakname)
272 print_func("\n")
273 return
275 cdef _to_bed(self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS",
276 bytes description=b"%s", str score_column="score",
277 trackline=False, print_func=sys.stdout.write):
279 generalization of tobed and write_to_bed
281 #print(description)
282 chrs = list(self.peaks.keys())
283 chrs.sort()
284 n_peak = 0
285 try: peakprefix = name_prefix % name
286 except: peakprefix = name_prefix
287 try: desc = description % name
288 except: desc = description
289 if trackline:
290 try: print_func('track name="%s (peaks)" description="%s" visibility=1\n' % ( name.replace(b"\"", b"\\\"").decode(),\
291 desc.replace(b"\"", b"\\\"").decode() ) )
292 except: print_func('track name=MACS description=Unknown\n')
293 for chrom in chrs:
294 for end, group in groupby(self.peaks[chrom], key=itemgetter("end")):
295 n_peak += 1
296 peaks = list(group)
297 if len(peaks) > 1:
298 for i, peak in enumerate(peaks):
299 print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(),peak['start'],peak['end'],peakprefix.decode(),n_peak,subpeak_letters(i),peak[score_column]))
300 else:
301 peak = peaks[0]
302 print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(),peak['start'],peak['end'],peakprefix.decode(),n_peak,peak[score_column]))
304 cdef _to_summits_bed(self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS",
305 bytes description = b"%s", str score_column="score",
306 bool trackline=False, print_func=sys.stdout.write):
308 generalization of to_summits_bed and write_to_summit_bed
310 chrs = list(self.peaks.keys())
311 chrs.sort()
312 n_peak = 0
313 try: peakprefix = name_prefix % name
314 except: peakprefix = name_prefix
315 try: desc = description % name
316 except: desc = description
317 if trackline:
318 try: print_func('track name="%s (summits)" description="%s" visibility=1\n' % ( name.replace(b"\"", b"\\\"").decode(),\
319 desc.replace(b"\"", b"\\\"").decode() ) )
320 except: print_func('track name=MACS description=Unknown')
321 for chrom in chrs:
322 for end, group in groupby(self.peaks[chrom], key=itemgetter("end")):
323 n_peak += 1
324 peaks = list(group)
325 if len(peaks) > 1:
326 for i, peak in enumerate(peaks):
327 summit_p = peak['summit']
328 print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(),summit_p,summit_p+1,peakprefix.decode(),n_peak,subpeak_letters(i),peak[score_column]))
329 else:
330 peak = peaks[0]
331 summit_p = peak['summit']
332 print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(),summit_p,summit_p+1,peakprefix.decode(),n_peak,peak[score_column]))
334 def tobed (self):
335 """Print out peaks in BED5 format.
337 Five columns are chromosome, peak start, peak end, peak name, and peak height.
339 start:start
340 end:end,
341 length:end-start,
342 summit:summit,
343 score:peak_score,
344 pileup:pileup,
345 pscore:pvalue,
346 fc:fold_change,
347 qscore:qvalue
349 return self._to_bed(name_prefix=b"peak_", score_column="score", name=b"", description=b"")
351 def to_summits_bed (self):
352 """Print out peak summits in BED5 format.
354 Five columns are chromosome, summit start, summit end, peak name, and peak height.
357 return self._to_summits_bed(name_prefix=b"peak_", score_column="score", name=b"", description=b"")
359 # these methods are very fast, specifying types is unnecessary
360 def write_to_bed (self, fhd, bytes name_prefix=b"peak_", bytes name=b"MACS",
361 bytes description = b"%s", str score_column="score", trackline=True):
362 """Write peaks in BED5 format in a file handler. Score (5th
363 column) is decided by score_column setting. Check the
364 following list. Name column ( 4th column) is made by putting
365 name_prefix together with an ascending number.
367 Five columns are chromosome, peak start, peak end, peak name,
368 and peak score.
370 items in peak hash object:
372 start:start
373 end:end,
374 length:end-start,
375 summit:summit,
376 score:peak_score,
377 pileup:pileup,
378 pscore:pvalue,
379 fc:fold_change,
380 qscore:qvalue
382 #print(description)
383 return self._to_bed(name_prefix=name_prefix, name=name,
384 description=description, score_column=score_column,
385 print_func=fhd.write, trackline=trackline)
387 def write_to_summit_bed (self, fhd, bytes name_prefix = b"peak_", bytes name = b"MACS",
388 bytes description = b"%s", str score_column ="score", trackline=True):
389 """Write peak summits in BED5 format in a file handler. Score
390 (5th column) is decided by score_column setting. Check the
391 following list. Name column ( 4th column) is made by putting
392 name_prefix together with an ascending number.
394 Five columns are chromosome, summit start, summit end, peak name, and peak score.
396 items in peak object:
398 start:start
399 end:end,
400 length:end-start,
401 summit:summit,
402 score:peak_score,
403 pileup:pileup,
404 pscore:pvalue,
405 fc:fold_change,
406 qscore:qvalue
408 return self._to_summits_bed(name_prefix=name_prefix, name=name,
409 description=description, score_column=score_column,
410 print_func=fhd.write, trackline=trackline)
412 def write_to_narrowPeak (self, fhd, bytes name_prefix = b"peak_", bytes name = b"peak", str score_column="score", trackline=True):
413 """Print out peaks in narrowPeak format.
415 This format is designed for ENCODE project, and basically a
416 BED6+4 format.
418 +-----------+------+----------------------------------------+
419 |field |type |description |
420 +-----------+------+----------------------------------------+
421 |chrom |string|Name of the chromosome |
422 +-----------+------+----------------------------------------+
423 |chromStart |int |The starting position of the feature in |
424 | | |the chromosome. The first base in a |
425 | | |chromosome is numbered 0. |
426 +-----------+------+----------------------------------------+
427 |chromEnd |int |The ending position of the feature in |
428 | | |the chromosome or scaffold. The chromEnd|
429 | | |base is not included in the display of |
430 | | |the feature. For example, the first 100|
431 | | |bases of a chromosome are defined as |
432 | | |chromStart=0, chromEnd=100, and span the|
433 | | |bases numbered 0-99. |
434 +-----------+------+----------------------------------------+
435 |name |string|Name given to a region (preferably |
436 | | |unique). Use '.' if no name is assigned.|
437 +-----------+------+----------------------------------------+
438 |score |int |Indicates how dark the peak will be |
439 |(-logpvalue| |displayed in the browser (1-1000). If |
440 |in MACS3 * | |'0', the DCC will assign this based on |
441 |10) | |signal value. Ideally average |
442 | | |signalValue per base spread between |
443 | | |100-1000. |
444 +-----------+------+----------------------------------------+
445 |strand |char |+/- to denote strand or orientation |
446 |(always .) | |(whenever applicable). Use '.' if no |
447 | | |orientation is assigned. |
448 +-----------+------+----------------------------------------+
449 |signalValue|float |Measurement of overall (usually, |
450 |(fc) | |average) enrichment for the region. |
451 +-----------+------+----------------------------------------+
452 |pValue |float |Measurement of statistical signficance |
453 | | |(-log10). Use -1 if no pValue is |
454 | | |assigned. |
455 +-----------+------+----------------------------------------+
456 |qValue |float |Measurement of statistical significance |
457 | | |using false discovery rate. Use -1 if no|
458 | | |qValue is assigned. |
459 +-----------+------+----------------------------------------+
460 |peak |int |Point-source called for this peak; |
461 | | |0-based offset from chromStart. Use -1 |
462 | | |if no point-source called. |
463 +-----------+------+----------------------------------------+
466 cdef int n_peak
467 cdef bytes chrom
468 cdef long s
469 cdef str peakname
471 chrs = list(self.peaks.keys())
472 chrs.sort()
473 n_peak = 0
474 write = fhd.write
475 try: peakprefix = name_prefix % name
476 except: peakprefix = name_prefix
477 if trackline:
478 write("track type=narrowPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name.decode(), name.decode()))
479 for chrom in chrs:
480 for end, group in groupby(self.peaks[chrom], key=itemgetter("end")):
481 n_peak += 1
482 these_peaks = list(group)
483 if len(these_peaks) > 1: # from call-summits
484 for i, peak in enumerate(these_peaks):
485 peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i))
486 if peak['summit'] == -1:
487 s = -1
488 else:
489 s = peak['summit'] - peak['start']
490 fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n"
492 (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]),
493 peak['fc'],peak['pscore'],peak['qscore'],s) )
494 else:
495 peak = these_peaks[0]
496 peakname = "%s%d" % (peakprefix.decode(), n_peak)
497 if peak['summit'] == -1:
498 s = -1
499 else:
500 s = peak['summit'] - peak['start']
501 fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n"
503 (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]),
504 peak['fc'],peak['pscore'],peak['qscore'],s) )
505 return
507 def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MACS"):
508 """Save the peak results in a tab-delimited plain text file
509 with suffix .xls.
512 wait... why I have two write_to_xls in this class?
515 write = ofhd.write
516 write("\t".join(("chr","start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
518 try: peakprefix = name_prefix % name
519 except: peakprefix = name_prefix
521 peaks = self.peaks
522 chrs = list(peaks.keys())
523 chrs.sort()
524 n_peak = 0
525 for chrom in chrs:
526 for end, group in groupby(peaks[chrom], key=itemgetter("end")):
527 n_peak += 1
528 these_peaks = list(group)
529 if len(these_peaks) > 1:
530 for i, peak in enumerate(these_peaks):
531 peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i))
532 #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
533 write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
534 write("\t%d" % (peak['summit']+1)) # summit position
535 write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
536 write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
537 write("\t%.6g" % (peak['fc'])) # fold change at summit
538 write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
539 write("\t%s" % peakname)
540 write("\n")
541 else:
542 peak = these_peaks[0]
543 peakname = "%s%d" % (peakprefix.decode(), n_peak)
544 #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
545 write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
546 write("\t%d" % (peak['summit']+1)) # summit position
547 write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
548 write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
549 write("\t%.6g" % (peak['fc'])) # fold change at summit
550 write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
551 write("\t%s" % peakname)
552 write("\n")
553 return
556 def overlap_with_other_peaks (self, peaks2, double cover=0):
557 """Peaks2 is a PeakIO object or dictionary with can be
558 initialized as a PeakIO. check __init__ for PeakIO for detail.
560 return how many peaks are intersected by peaks2 by percentage
561 coverage on peaks2(if 50%, cover = 0.5).
563 cdef int total_num
564 cdef list chrs1, chrs2, a
565 cdef bytes k
567 peaks1 = self.peaks
568 if isinstance(peaks2,PeakIO):
569 peaks2 = peaks2.peaks
570 total_num = 0
571 chrs1 = list(peaks1.keys())
572 chrs2 = list(peaks2.keys())
573 for k in chrs1:
574 if not chrs2.count(k):
575 continue
576 rl1_k = iter(peaks1[k])
577 rl2_k = iter(peaks2[k])
578 tmp_n = False
579 try:
580 r1 = rl1_k.next()
581 r2 = rl2_k.next()
582 while (True):
583 if r2[0] < r1[1] and r1[0] < r2[1]:
584 a = sorted([r1[0],r1[1],r2[0],r2[1]])
585 if float(a[2]-a[1]+1)/r2[2] > cover:
586 if not tmp_n:
587 total_num+=1
588 tmp_n = True
589 if r1[1] < r2[1]:
590 r1 = rl1_k.next()
591 tmp_n = False
592 else:
593 r2 = rl2_k.next()
594 except StopIteration:
595 continue
596 return total_num
598 def read_from_xls (self, ofhd):
599 """Save the peak results in a tab-delimited plain text file
600 with suffix .xls.
603 cdef:
604 bytes line = b''
605 bytes chrom = b''
606 int n_peak = 0
607 int start, end, length, summit
608 float pileup, pscore, fc, qscore
609 list fields
610 while True:
611 if not (line.startswith('#') or line.strip() == ''): break
612 line = ofhd.readline()
614 # sanity check
615 columns = line.rstrip().split('\t')
616 for a,b in zip(columns, ("chr","start", "end", "length", "abs_summit",
617 "pileup", "-log10(pvalue)", "fold_enrichment",
618 "-log10(qvalue)", "name")):
619 if not a==b: raise NotImplementedError('column %s not recognized', a)
621 add = self.add
622 split = str.split
623 rstrip = str.rstrip
624 for i, line in enumerate(ofhd.readlines()):
625 fields = split(line, '\t')
626 peak = {}
627 chrom = fields[0].encode()
628 start = int(fields[1]) - 1
629 end = int(fields[2])
630 length = int(fields[3])
631 if end - start != length:
632 raise UserWarning('Malformed peak at line %d:\n%s' % (i, line))
633 summit = int(fields[4]) - 1
634 pileup = float(fields[5])
635 pscore = float(fields[6])
636 fc = float(fields[7])
637 qscore = float(fields[8])
638 peakname = rstrip(fields[9])
639 add(chrom, start, end, summit, qscore, pileup, pscore, fc, qscore,
640 peakname)
642 cpdef parse_peakname(peakname):
643 """returns peaknumber, subpeak
645 cdef:
646 bytes peak_id, peaknumber, subpeak
647 peak_id = peakname.split(b'_')[-1]
648 x = re.split('(\D.*)', peak_id)
649 peaknumber = int(x[0])
650 try:
651 subpeak = x[1]
652 except IndexError:
653 subpeak = b''
654 return (peaknumber, subpeak)
656 cdef class Region:
657 """For plain region of chrom, start and end
659 cdef:
660 dict regions
661 bool __flag_sorted
663 def __init__ (self):
664 self.regions= {}
665 self.__flag_sorted = False
667 def add_loc ( self, bytes chrom, int start, int end ):
668 if self.regions.has_key(chrom):
669 self.regions[chrom].append( (start,end) )
670 else:
671 self.regions[chrom] = [(start,end), ]
672 self.__flag_sorted = False
673 return
675 def sort (self):
676 cdef bytes chrom
678 for chrom in list(self.regions.keys()):
679 self.regions[chrom].sort()
680 self.__flag_sorted = True
682 def merge_overlap ( self ):
683 cdef bytes chrom
684 cdef int s_new_region, e_new_region, i, j
686 if not self.__flag_sorted:
687 self.sort()
688 regions = self.regions
689 new_regions = {}
690 chrs = list(regions.keys())
691 chrs.sort()
692 for i in range(len(chrs)):
693 chrom = chrs[i]
694 #for chrom in chrs:
695 new_regions[chrom]=[]
696 n_append = new_regions[chrom].append
697 prev_region = None
698 regions_chr = regions[chrom]
699 for i in range(len(regions_chr)):
700 if not prev_region:
701 prev_region = regions_chr[i]
702 continue
703 else:
704 if regions_chr[i][0] <= prev_region[1]:
705 s_new_region = prev_region[0]
706 e_new_region = regions_chr[i][1]
707 prev_region = (s_new_region,e_new_region)
708 else:
709 n_append(prev_region)
710 prev_region = regions_chr[i]
711 if prev_region:
712 n_append(prev_region)
713 self.regions = new_regions
714 self.sort()
715 return True
717 def write_to_bed (self, fhd ):
718 cdef int i
719 cdef bytes chrom
721 chrs = list(self.regions.keys())
722 chrs.sort()
723 for i in range( len(chrs) ):
724 chrom = chrs[i]
725 for region in self.regions[chrom]:
726 fhd.write( "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] ) )
729 cdef class BroadPeakContent:
730 cdef:
731 long start
732 long end
733 long length
734 float score
735 bytes thickStart
736 bytes thickEnd
737 long blockNum
738 bytes blockSizes
739 bytes blockStarts
740 float pileup
741 float pscore
742 float fc
743 float qscore
744 bytes name
746 def __init__ ( self, long start, long end, float score,
747 bytes thickStart, bytes thickEnd,
748 long blockNum, bytes blockSizes,
749 bytes blockStarts, float pileup,
750 float pscore, float fold_change,
751 float qscore, bytes name = b"NA" ):
752 self.start = start
753 self.end = end
754 self.score = score
755 self.thickStart = thickStart
756 self.thickEnd = thickEnd
757 self.blockNum = blockNum
758 self.blockSizes = blockSizes
759 self.blockStarts = blockStarts
761 self.length = end - start
762 self.pileup = pileup
763 self.pscore = pscore
764 self.fc = fold_change
765 self.qscore = qscore
766 self.name = name
768 def __getitem__ ( self, a ):
769 if a == "start":
770 return self.start
771 elif a == "end":
772 return self.end
773 elif a == "length":
774 return self.length
775 elif a == "score":
776 return self.score
777 elif a == "thickStart":
778 return self.thickStart
779 elif a == "thickEnd":
780 return self.thickEnd
781 elif a == "blockNum":
782 return self.blockNum
783 elif a == "blockSizes":
784 return self.blockSizes
785 elif a == "blockStarts":
786 return self.blockStarts
787 elif a == "pileup":
788 return self.pileup
789 elif a == "pscore":
790 return self.pscore
791 elif a == "fc":
792 return self.fc
793 elif a == "qscore":
794 return self.qscore
795 elif a == "name":
796 return self.name
798 def __str__ (self):
799 return "start:%d;end:%d;score:%f" % ( self.start, self.end, self.score )
802 cdef class BroadPeakIO:
803 """IO for broad peak information.
806 cdef:
807 dict peaks
809 def __init__ (self):
810 self.peaks = {}
812 def add (self, char * chromosome, long start, long end, long score = 0,
813 bytes thickStart=b".", bytes thickEnd=b".",
814 long blockNum=0, bytes blockSizes=b".",
815 bytes blockStarts=b".", float pileup = 0,
816 float pscore = 0, float fold_change = 0,
817 float qscore = 0, bytes name = b"NA" ):
818 """items
819 chromosome : chromosome name,
820 start : broad region start,
821 end : broad region end,
822 score : average score in all blocks,
823 thickStart : start of highly enriched region, # could be b'.'
824 thickEnd : end of highly enriched region, # could be b'.'
825 blockNum : number of blocks, # could be 0
826 blockSizes : sizes of blocks, # could be b'.'
827 blockStarts: starts of blocks # could be b'.'
828 pileup : median pileup in region # could be 0
829 pscore : median pvalue score in region # could be 0
830 fold_change: median fold change in region # could be 0
831 qscore : median pvalue score in region # could be 0
832 name : peak name # could be b'NA'
834 if not self.peaks.has_key(chromosome):
835 self.peaks[chromosome] = []
836 self.peaks[chromosome].append( BroadPeakContent( start, end, score, thickStart, thickEnd,
837 blockNum, blockSizes, blockStarts,
838 pileup, pscore, fold_change, qscore, name ) )
840 def filter_pscore (self, double pscore_cut ):
841 cdef:
842 bytes chrom
843 dict peaks
844 dict new_peaks
845 list chrs
846 BroadPeakContent p
848 peaks = self.peaks
849 new_peaks = {}
850 chrs = sorted(list(peaks.keys()))
852 for chrom in chrs:
853 new_peaks[chrom]=[p for p in peaks[chrom] if p['pscore'] >= pscore_cut]
854 self.peaks = new_peaks
856 def filter_qscore (self, double qscore_cut ):
857 cdef:
858 bytes chrom
859 dict peaks
860 dict new_peaks
861 list chrs
862 BroadPeakContent p
864 peaks = self.peaks
865 new_peaks = {}
866 chrs = sorted(list(peaks.keys()))
868 for chrom in chrs:
869 new_peaks[chrom]=[p for p in peaks[chrom] if p['qscore'] >= qscore_cut]
870 self.peaks = new_peaks
872 def filter_fc (self, fc_low, fc_up=None ):
873 """Filter peaks in a given fc range.
875 If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up)
878 cdef:
879 bytes chrom
880 dict peaks
881 dict new_peaks
882 list chrs
883 BroadPeakContent p
885 peaks = self.peaks
886 new_peaks = {}
887 chrs = list(peaks.keys())
888 chrs.sort()
889 if fc_up:
890 for chrom in chrs:
891 new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low and p['fc']<fc_up]
892 else:
893 for chrom in chrs:
894 new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low]
895 self.peaks = new_peaks
897 def total (self):
898 cdef:
899 bytes chrom
900 dict peaks
901 list chrs
902 long x
904 peaks = self.peaks
905 chrs = list(peaks.keys())
906 chrs.sort()
907 x = 0
908 for chrom in chrs:
909 x += len(peaks[chrom])
910 return x
912 def write_to_gappedPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True):
913 """Print out peaks in gappedBed format. Only those with stronger enrichment regions are saved.
915 This format is basically a BED12+3 format.
917 +--------------+------+----------------------------------------+
918 |field |type |description |
919 +--------------+------+----------------------------------------+
920 |chrom |string|Name of the chromosome |
921 +--------------+------+----------------------------------------+
922 |chromStart |int |The starting position of the feature in |
923 | | |the chromosome. The first base in a |
924 | | |chromosome is numbered 0. |
925 +--------------+------+----------------------------------------+
926 |chromEnd |int |The ending position of the feature in |
927 | | |the chromosome or scaffold. The chromEnd|
928 | | |base is not included in the display of |
929 | | |the feature. For example, the first 100|
930 | | |bases of a chromosome are defined as |
931 | | |chromStart=0, chromEnd=100, and span the|
932 | | |bases numbered 0-99. |
933 +--------------+------+----------------------------------------+
934 |name |string|Name given to a region (preferably |
935 | | |unique). Use '.' if no name is assigned.|
936 +--------------+------+----------------------------------------+
937 |score |int |Indicates how dark the peak will be |
938 |(always use | |displayed in the browser (1-1000). If |
939 |1000 for | |'0', the DCC will assign this based on |
940 |the | |signal value. Ideally average |
941 |thickest | |signalValue per base spread between |
942 |color) | |100-1000. |
943 +--------------+------+----------------------------------------+
944 |strand |char |+/- to denote strand or orientation |
945 |(always .) | |(whenever applicable). Use '.' if no |
946 | | |orientation is assigned. |
947 +--------------+------+----------------------------------------+
948 |thickStart |int | The starting position at which the |
949 | | |feature is drawn thickly. Mark the start|
950 | | |of highly enriched regions. |
951 | | | |
952 +--------------+------+----------------------------------------+
953 |thickEnd |int | The ending position at which the |
954 | | |feature is drawn thickly. Mark the end |
955 | | |of highly enriched regions. |
956 +--------------+------+----------------------------------------+
957 |itemRGB |string| Not used. Set it as 0. |
958 +--------------+------+----------------------------------------+
959 |blockCounts |int | The number of blocks (exons) in the BED|
960 | | |line. |
961 +--------------+------+----------------------------------------+
962 |blockSizes |string| A comma-separated list of the block |
963 | | |sizes. |
964 +--------------+------+----------------------------------------+
965 |blockStarts |string| A comma-separated list of block starts.|
966 +--------------+------+----------------------------------------+
967 |signalValue |float |Measurement of overall (usually, |
968 |(fc) | |average) enrichment for the region. |
969 +--------------+------+----------------------------------------+
970 |pValue |float |Measurement of statistical signficance |
971 | | |(-log10). Use -1 if no pValue is |
972 | | |assigned. |
973 +--------------+------+----------------------------------------+
974 |qValue |float |Measurement of statistical significance |
975 | | |using false discovery rate. Use -1 if no|
976 | | |qValue is assigned. |
977 +--------------+------+----------------------------------------+
980 chrs = list(self.peaks.keys())
981 chrs.sort()
982 n_peak = 0
983 try: peakprefix = name_prefix % name
984 except: peakprefix = name_prefix
985 try: desc = description % name
986 except: desc = description
987 if trackline:
988 fhd.write("track name=\"%s\" description=\"%s\" type=gappedPeak nextItemButton=on\n" % (name.decode(), desc.decode()) )
989 for chrom in chrs:
990 for peak in self.peaks[chrom]:
991 n_peak += 1
992 if peak["thickStart"] != b".":
993 fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n"
995 (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]),
996 peak["thickStart"].decode(),peak["thickEnd"].decode(),
997 peak["blockNum"],peak["blockSizes"].decode(),peak["blockStarts"].decode(), peak['fc'], peak['pscore'], peak['qscore'] ) )
999 def write_to_Bed12 (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True):
1000 """Print out peaks in Bed12 format.
1002 +--------------+------+----------------------------------------+
1003 |field |type |description |
1004 +--------------+------+----------------------------------------+
1005 |chrom |string|Name of the chromosome |
1006 +--------------+------+----------------------------------------+
1007 |chromStart |int |The starting position of the feature in |
1008 | | |the chromosome. The first base in a |
1009 | | |chromosome is numbered 0. |
1010 +--------------+------+----------------------------------------+
1011 |chromEnd |int |The ending position of the feature in |
1012 | | |the chromosome or scaffold. The chromEnd|
1013 | | |base is not included in the display of |
1014 | | |the feature. For example, the first 100|
1015 | | |bases of a chromosome are defined as |
1016 | | |chromStart=0, chromEnd=100, and span the|
1017 | | |bases numbered 0-99. |
1018 +--------------+------+----------------------------------------+
1019 |name |string|Name given to a region (preferably |
1020 | | |unique). Use '.' if no name is assigned.|
1021 +--------------+------+----------------------------------------+
1022 |score |int |Indicates how dark the peak will be |
1023 |(always use | |displayed in the browser (1-1000). If |
1024 |1000 for | |'0', the DCC will assign this based on |
1025 |the | |signal value. Ideally average |
1026 |thickest | |signalValue per base spread between |
1027 |color) | |100-1000. |
1028 +--------------+------+----------------------------------------+
1029 |strand |char |+/- to denote strand or orientation |
1030 |(always .) | |(whenever applicable). Use '.' if no |
1031 | | |orientation is assigned. |
1032 +--------------+------+----------------------------------------+
1033 |thickStart |int | The starting position at which the |
1034 | | |feature is drawn thickly. Mark the start|
1035 | | |of highly enriched regions. |
1036 | | | |
1037 +--------------+------+----------------------------------------+
1038 |thickEnd |int | The ending position at which the |
1039 | | |feature is drawn thickly. Mark the end |
1040 | | |of highly enriched regions. |
1041 +--------------+------+----------------------------------------+
1042 |itemRGB |string| Not used. Set it as 0. |
1043 +--------------+------+----------------------------------------+
1044 |blockCounts |int | The number of blocks (exons) in the BED|
1045 | | |line. |
1046 +--------------+------+----------------------------------------+
1047 |blockSizes |string| A comma-separated list of the block |
1048 | | |sizes. |
1049 +--------------+------+----------------------------------------+
1050 |blockStarts |string| A comma-separated list of block starts.|
1051 +--------------+------+----------------------------------------+
1054 chrs = list(self.peaks.keys())
1055 chrs.sort()
1056 n_peak = 0
1057 try: peakprefix = name_prefix % name
1058 except: peakprefix = name_prefix
1059 try: desc = description % name
1060 except: desc = description
1061 if trackline:
1062 fhd.write("track name=\"%s\" description=\"%s\" type=bed nextItemButton=on\n" % (name.decode(), desc.decode()) )
1063 for chrom in chrs:
1064 for peak in self.peaks[chrom]:
1065 n_peak += 1
1066 if peak["thickStart"] == b".":
1067 # this will violate gappedPeak format, since it's a complement like broadPeak line.
1068 fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\n"
1070 (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]) ) )
1071 else:
1072 fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\n"
1074 (chrom.decode(), peak["start"], peak["end"], peakprefix.decode(), n_peak, int(10*peak[score_column]),
1075 peak["thickStart"].decode(), peak["thickEnd"].decode(),
1076 peak["blockNum"], peak["blockSizes"].decode(), peak["blockStarts"].decode() ))
1079 def write_to_broadPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True):
1080 """Print out peaks in broadPeak format.
1082 This format is designed for ENCODE project, and basically a
1083 BED6+3 format.
1085 +-----------+------+----------------------------------------+
1086 |field |type |description |
1087 +-----------+------+----------------------------------------+
1088 |chrom |string|Name of the chromosome |
1089 +-----------+------+----------------------------------------+
1090 |chromStart |int |The starting position of the feature in |
1091 | | |the chromosome. The first base in a |
1092 | | |chromosome is numbered 0. |
1093 +-----------+------+----------------------------------------+
1094 |chromEnd |int |The ending position of the feature in |
1095 | | |the chromosome or scaffold. The chromEnd|
1096 | | |base is not included in the display of |
1097 | | |the feature. For example, the first 100|
1098 | | |bases of a chromosome are defined as |
1099 | | |chromStart=0, chromEnd=100, and span the|
1100 | | |bases numbered 0-99. |
1101 +-----------+------+----------------------------------------+
1102 |name |string|Name given to a region (preferably |
1103 | | |unique). Use '.' if no name is assigned.|
1104 +-----------+------+----------------------------------------+
1105 |score |int |Indicates how dark the peak will be |
1106 |(-logqvalue| |displayed in the browser (1-1000). If |
1107 |in MACS3 * | |'0', the DCC will assign this based on |
1108 |10) | |signal value. Ideally average |
1109 | | |signalValue per base spread between |
1110 | | |100-1000. |
1111 +-----------+------+----------------------------------------+
1112 |strand |char |+/- to denote strand or orientation |
1113 |(always .) | |(whenever applicable). Use '.' if no |
1114 | | |orientation is assigned. |
1115 +-----------+------+----------------------------------------+
1116 |signalValue|float |Measurement of overall (usually, |
1117 |(fc) | |average) enrichment for the region. |
1118 +-----------+------+----------------------------------------+
1119 |pValue |float |Measurement of statistical signficance |
1120 | | |(-log10). Use -1 if no pValue is |
1121 | | |assigned. |
1122 +-----------+------+----------------------------------------+
1123 |qValue |float |Measurement of statistical significance |
1124 | | |using false discovery rate. Use -1 if no|
1125 | | |qValue is assigned. |
1126 +-----------+------+----------------------------------------+
1129 cdef int n_peak
1130 cdef bytes chrom
1131 cdef long s
1132 cdef str peakname
1134 chrs = list(self.peaks.keys())
1135 chrs.sort()
1136 n_peak = 0
1137 write = fhd.write
1138 try: peakprefix = name_prefix % name
1139 except: peakprefix = name_prefix
1140 if trackline:
1141 write("track type=broadPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name.decode(), name.decode()))
1142 for chrom in chrs:
1143 for end, group in groupby(self.peaks[chrom], key=itemgetter("end")):
1144 n_peak += 1
1145 these_peaks = list(group)
1146 peak = these_peaks[0]
1147 peakname = "%s%d" % (peakprefix.decode(), n_peak)
1148 fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\n" %
1149 (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]),
1150 peak['fc'],peak['pscore'],peak['qscore'] ) )
1151 return
1154 def write_to_xls (self, ofhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS"):
1155 """Save the peak results in a tab-delimited plain text file
1156 with suffix .xls.
1159 wait... why I have two write_to_xls in this class?
1162 write = ofhd.write
1163 write("\t".join(("chr","start", "end", "length", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
1165 try: peakprefix = name_prefix % name
1166 except: peakprefix = name_prefix
1168 peaks = self.peaks
1169 chrs = list(peaks.keys())
1170 chrs.sort()
1171 n_peak = 0
1172 for chrom in chrs:
1173 for end, group in groupby(peaks[chrom], key=itemgetter("end")):
1174 n_peak += 1
1175 these_peaks = list(group)
1176 peak = these_peaks[0]
1177 peakname = "%s%d" % (peakprefix.decode(), n_peak)
1178 write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
1179 write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
1180 write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
1181 write("\t%.6g" % (peak['fc'])) # fold change at summit
1182 write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
1183 write("\t%s" % peakname)
1184 write("\n")
1185 return