1 # cython: language_level=3
3 # Time-stamp: <2020-11-24 17:11:29 Tao Liu>
5 """Module for PeakIO IO classes.
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
12 # ------------------------------------
14 # ------------------------------------
15 from itertools
import groupby
16 from operator
import itemgetter
20 # ------------------------------------
22 # ------------------------------------
24 from MACS3
.Utilities
.Constants
import *
26 # ------------------------------------
28 # ------------------------------------
30 from cpython cimport bool
32 # ------------------------------------
34 # ------------------------------------
35 __version__
= "PeakIO $Revision$"
36 __author__
= "Tao Liu <taoliu@jimmy.harvard.edu>"
37 __doc__
= "PeakIO class"
39 # ------------------------------------
41 # ------------------------------------
42 cdef str subpeak_letters
( int i
):
46 return subpeak_letters
(i
// 26) + chr(97 + (i
% 26))
48 # ------------------------------------
50 # ------------------------------------
52 cdef class PeakContent
:
65 def __init__
( self, int start
, int end
, int summit
,
66 float peak_score
, float pileup
,
67 float pscore
, float fold_change
, float qscore
,
71 self.length
= end
- start
73 self.score
= peak_score
80 def __getitem__
( self, a
):
102 def __setitem__
( self, a
, v
):
125 return "start:%d;end:%d;score:%f" % ( self.start
, self.end
, self.score
)
128 """IO for peak information.
137 cpdef add
(self, bytes chromosome
, int start
, int end
, int summit
= 0,
138 float peak_score
= 0, float pileup
= 0,
139 float pscore
= 0, float fold_change
= 0, float qscore
= 0,
152 if not self.peaks
.has_key
(chromosome
):
153 self.peaks
[chromosome
]=[]
154 self.peaks
[chromosome
].append
(PeakContent
( start
, end
, summit
, peak_score
, pileup
, pscore
, fold_change
, qscore
, name
))
156 cpdef add_PeakContent
( self, bytes chromosome
, object peakcontent
):
157 if not self.peaks
.has_key
(chromosome
):
158 self.peaks
[chromosome
]=[]
159 self.peaks
[chromosome
].append
(peakcontent
)
161 def get_data_from_chrom
(self, bytes chrom
):
162 if not self.peaks
.has_key
( chrom
):
163 self.peaks
[chrom
]= []
164 return self.peaks
[chrom
]
166 cpdef set get_chr_names
(self):
167 return set
(sorted
(self.peaks
.keys
()))
171 chrs
= sorted
(list(self.peaks
.keys
()))
173 self.peaks
[chrom
].sort
(key
=lambda x
:x
['start'])
177 def filter_pscore
(self, double pscore_cut
):
182 chrs
= sorted
(list(peaks
.keys
()))
185 new_peaks
[chrom
]=[p
for p
in peaks
[chrom
] if p
['pscore'] >= pscore_cut
]
186 self.peaks
= new_peaks
188 def filter_qscore
(self, double qscore_cut
):
193 chrs
= sorted
(list(peaks
.keys
()))
196 new_peaks
[chrom
]=[p
for p
in peaks
[chrom
] if p
['qscore'] >= qscore_cut
]
197 self.peaks
= new_peaks
199 def filter_fc
(self, fc_low
, fc_up
=None
):
200 """Filter peaks in a given fc range.
202 If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up)
207 chrs
= list(peaks
.keys
())
211 new_peaks
[chrom
]=[p
for p
in peaks
[chrom
] if p
['fc'] >= fc_low
and p
['fc']<fc_up
]
214 new_peaks
[chrom
]=[p
for p
in peaks
[chrom
] if p
['fc'] >= fc_low
]
215 self.peaks
= new_peaks
219 chrs
= list(peaks
.keys
())
223 x
+= len(peaks
[chrom
])
226 # these methods are very fast, specifying types is unnecessary
227 def write_to_xls
(self, fhd
, bytes name_prefix
=b
"%s_peak_", bytes name
=b
"MACS"):
228 return self._to_xls
(name_prefix
=name_prefix
, name
=name
,
229 print_func
=fhd
.write
)
231 cdef _to_xls
(self, bytes name_prefix
=b
"%s_peak_", bytes name
=b
"MACS", print_func
=sys
.stdout
.write
):
234 print_func
("\t".join
(("chr","start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
238 try: peakprefix
= name_prefix
% name
239 except: peakprefix
= name_prefix
242 chrs
= list(peaks
.keys
())
246 for end
, group
in groupby
(peaks
[chrom
], key
=itemgetter
("end")):
248 these_peaks
= list(group
)
249 if len(these_peaks
) > 1:
250 for i
, peak
in enumerate
(these_peaks
):
251 peakname
= "%s%d%s" % (peakprefix
.decode
(), n_peak
, subpeak_letters
(i
))
252 #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
253 print_func
("%s\t%d\t%d\t%d" % (chrom
.decode
(),peak
['start']+1,peak
['end'],peak
['length']))
254 print_func
("\t%d" % (peak
['summit']+1)) # summit position
255 print_func
("\t%.6g" % (round(peak
['pileup'],2))) # pileup height at summit
256 print_func
("\t%.6g" % (peak
['pscore'])) # -log10pvalue at summit
257 print_func
("\t%.6g" % (peak
['fc'])) # fold change at summit
258 print_func
("\t%.6g" % (peak
['qscore'])) # -log10qvalue at summit
259 print_func
("\t%s" % peakname
)
262 peak
= these_peaks
[0]
263 peakname
= "%s%d" % (peakprefix
.decode
(), n_peak
)
264 #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
265 print_func
("%s\t%d\t%d\t%d" % (chrom
.decode
(),peak
['start']+1,peak
['end'],peak
['length']))
266 print_func
("\t%d" % (peak
['summit']+1)) # summit position
267 print_func
("\t%.6g" % (round(peak
['pileup'],2))) # pileup height at summit
268 print_func
("\t%.6g" % (peak
['pscore'])) # -log10pvalue at summit
269 print_func
("\t%.6g" % (peak
['fc'])) # fold change at summit
270 print_func
("\t%.6g" % (peak
['qscore'])) # -log10qvalue at summit
271 print_func
("\t%s" % peakname
)
275 cdef _to_bed
(self, bytes name_prefix
=b
"%s_peak_", bytes name
=b
"MACS",
276 bytes description
=b
"%s", str score_column
="score",
277 trackline
=False
, print_func
=sys
.stdout
.write
):
279 generalization of tobed and write_to_bed
282 chrs
= list(self.peaks
.keys
())
285 try: peakprefix
= name_prefix
% name
286 except: peakprefix
= name_prefix
287 try: desc
= description
% name
288 except: desc
= description
290 try: print_func
('track name="%s (peaks)" description="%s" visibility=1\n' % ( name
.replace
(b
"\"", b
"\\\"").decode
(),\
291 desc
.replace
(b
"\"", b
"\\\"").decode
() ) )
292 except: print_func
('track name=MACS description=Unknown\n')
294 for end
, group
in groupby
(self.peaks
[chrom
], key
=itemgetter
("end")):
298 for i
, peak
in enumerate
(peaks
):
299 print_func
("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom
.decode
(),peak
['start'],peak
['end'],peakprefix
.decode
(),n_peak
,subpeak_letters
(i
),peak
[score_column
]))
302 print_func
("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom
.decode
(),peak
['start'],peak
['end'],peakprefix
.decode
(),n_peak
,peak
[score_column
]))
304 cdef _to_summits_bed
(self, bytes name_prefix
=b
"%s_peak_", bytes name
=b
"MACS",
305 bytes description
= b
"%s", str score_column
="score",
306 bool trackline
=False
, print_func
=sys
.stdout
.write
):
308 generalization of to_summits_bed and write_to_summit_bed
310 chrs
= list(self.peaks
.keys
())
313 try: peakprefix
= name_prefix
% name
314 except: peakprefix
= name_prefix
315 try: desc
= description
% name
316 except: desc
= description
318 try: print_func
('track name="%s (summits)" description="%s" visibility=1\n' % ( name
.replace
(b
"\"", b
"\\\"").decode
(),\
319 desc
.replace
(b
"\"", b
"\\\"").decode
() ) )
320 except: print_func
('track name=MACS description=Unknown')
322 for end
, group
in groupby
(self.peaks
[chrom
], key
=itemgetter
("end")):
326 for i
, peak
in enumerate
(peaks
):
327 summit_p
= peak
['summit']
328 print_func
("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom
.decode
(),summit_p
,summit_p
+1,peakprefix
.decode
(),n_peak
,subpeak_letters
(i
),peak
[score_column
]))
331 summit_p
= peak
['summit']
332 print_func
("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom
.decode
(),summit_p
,summit_p
+1,peakprefix
.decode
(),n_peak
,peak
[score_column
]))
335 """Print out peaks in BED5 format.
337 Five columns are chromosome, peak start, peak end, peak name, and peak height.
349 return self._to_bed
(name_prefix
=b
"peak_", score_column
="score", name
=b
"", description
=b
"")
351 def to_summits_bed
(self):
352 """Print out peak summits in BED5 format.
354 Five columns are chromosome, summit start, summit end, peak name, and peak height.
357 return self._to_summits_bed
(name_prefix
=b
"peak_", score_column
="score", name
=b
"", description
=b
"")
359 # these methods are very fast, specifying types is unnecessary
360 def write_to_bed
(self, fhd
, bytes name_prefix
=b
"peak_", bytes name
=b
"MACS",
361 bytes description
= b
"%s", str score_column
="score", trackline
=True
):
362 """Write peaks in BED5 format in a file handler. Score (5th
363 column) is decided by score_column setting. Check the
364 following list. Name column ( 4th column) is made by putting
365 name_prefix together with an ascending number.
367 Five columns are chromosome, peak start, peak end, peak name,
370 items in peak hash object:
383 return self._to_bed
(name_prefix
=name_prefix
, name
=name
,
384 description
=description
, score_column
=score_column
,
385 print_func
=fhd
.write
, trackline
=trackline
)
387 def write_to_summit_bed
(self, fhd
, bytes name_prefix
= b
"peak_", bytes name
= b
"MACS",
388 bytes description
= b
"%s", str score_column
="score", trackline
=True
):
389 """Write peak summits in BED5 format in a file handler. Score
390 (5th column) is decided by score_column setting. Check the
391 following list. Name column ( 4th column) is made by putting
392 name_prefix together with an ascending number.
394 Five columns are chromosome, summit start, summit end, peak name, and peak score.
396 items in peak object:
408 return self._to_summits_bed
(name_prefix
=name_prefix
, name
=name
,
409 description
=description
, score_column
=score_column
,
410 print_func
=fhd
.write
, trackline
=trackline
)
412 def write_to_narrowPeak
(self, fhd
, bytes name_prefix
= b
"peak_", bytes name
= b
"peak", str score_column
="score", trackline
=True
):
413 """Print out peaks in narrowPeak format.
415 This format is designed for ENCODE project, and basically a
418 +-----------+------+----------------------------------------+
419 |field |type |description |
420 +-----------+------+----------------------------------------+
421 |chrom |string|Name of the chromosome |
422 +-----------+------+----------------------------------------+
423 |chromStart |int |The starting position of the feature in |
424 | | |the chromosome. The first base in a |
425 | | |chromosome is numbered 0. |
426 +-----------+------+----------------------------------------+
427 |chromEnd |int |The ending position of the feature in |
428 | | |the chromosome or scaffold. The chromEnd|
429 | | |base is not included in the display of |
430 | | |the feature. For example, the first 100|
431 | | |bases of a chromosome are defined as |
432 | | |chromStart=0, chromEnd=100, and span the|
433 | | |bases numbered 0-99. |
434 +-----------+------+----------------------------------------+
435 |name |string|Name given to a region (preferably |
436 | | |unique). Use '.' if no name is assigned.|
437 +-----------+------+----------------------------------------+
438 |score |int |Indicates how dark the peak will be |
439 |(-logpvalue| |displayed in the browser (1-1000). If |
440 |in MACS3 * | |'0', the DCC will assign this based on |
441 |10) | |signal value. Ideally average |
442 | | |signalValue per base spread between |
444 +-----------+------+----------------------------------------+
445 |strand |char |+/- to denote strand or orientation |
446 |(always .) | |(whenever applicable). Use '.' if no |
447 | | |orientation is assigned. |
448 +-----------+------+----------------------------------------+
449 |signalValue|float |Measurement of overall (usually, |
450 |(fc) | |average) enrichment for the region. |
451 +-----------+------+----------------------------------------+
452 |pValue |float |Measurement of statistical signficance |
453 | | |(-log10). Use -1 if no pValue is |
455 +-----------+------+----------------------------------------+
456 |qValue |float |Measurement of statistical significance |
457 | | |using false discovery rate. Use -1 if no|
458 | | |qValue is assigned. |
459 +-----------+------+----------------------------------------+
460 |peak |int |Point-source called for this peak; |
461 | | |0-based offset from chromStart. Use -1 |
462 | | |if no point-source called. |
463 +-----------+------+----------------------------------------+
471 chrs
= list(self.peaks
.keys
())
475 try: peakprefix
= name_prefix
% name
476 except: peakprefix
= name_prefix
478 write
("track type=narrowPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name
.decode
(), name
.decode
()))
480 for end
, group
in groupby
(self.peaks
[chrom
], key
=itemgetter
("end")):
482 these_peaks
= list(group
)
483 if len(these_peaks
) > 1: # from call-summits
484 for i
, peak
in enumerate
(these_peaks
):
485 peakname
= "%s%d%s" % (peakprefix
.decode
(), n_peak
, subpeak_letters
(i
))
486 if peak
['summit'] == -1:
489 s
= peak
['summit'] - peak
['start']
490 fhd
.write
( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n"
492 (chrom
.decode
(),peak
['start'],peak
['end'],peakname
,int(10*peak
[score_column
]),
493 peak
['fc'],peak
['pscore'],peak
['qscore'],s
) )
495 peak
= these_peaks
[0]
496 peakname
= "%s%d" % (peakprefix
.decode
(), n_peak
)
497 if peak
['summit'] == -1:
500 s
= peak
['summit'] - peak
['start']
501 fhd
.write
( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n"
503 (chrom
.decode
(),peak
['start'],peak
['end'],peakname
,int(10*peak
[score_column
]),
504 peak
['fc'],peak
['pscore'],peak
['qscore'],s
) )
507 def write_to_xls
(self, ofhd
, bytes name_prefix
= b
"%s_peak_", bytes name
= b
"MACS"):
508 """Save the peak results in a tab-delimited plain text file
512 wait... why I have two write_to_xls in this class?
516 write
("\t".join
(("chr","start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
518 try: peakprefix
= name_prefix
% name
519 except: peakprefix
= name_prefix
522 chrs
= list(peaks
.keys
())
526 for end
, group
in groupby
(peaks
[chrom
], key
=itemgetter
("end")):
528 these_peaks
= list(group
)
529 if len(these_peaks
) > 1:
530 for i
, peak
in enumerate
(these_peaks
):
531 peakname
= "%s%d%s" % (peakprefix
.decode
(), n_peak
, subpeak_letters
(i
))
532 #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
533 write
("%s\t%d\t%d\t%d" % (chrom
.decode
(),peak
['start']+1,peak
['end'],peak
['length']))
534 write
("\t%d" % (peak
['summit']+1)) # summit position
535 write
("\t%.6g" % (round(peak
['pileup'],2))) # pileup height at summit
536 write
("\t%.6g" % (peak
['pscore'])) # -log10pvalue at summit
537 write
("\t%.6g" % (peak
['fc'])) # fold change at summit
538 write
("\t%.6g" % (peak
['qscore'])) # -log10qvalue at summit
539 write
("\t%s" % peakname
)
542 peak
= these_peaks
[0]
543 peakname
= "%s%d" % (peakprefix
.decode
(), n_peak
)
544 #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
545 write
("%s\t%d\t%d\t%d" % (chrom
.decode
(),peak
['start']+1,peak
['end'],peak
['length']))
546 write
("\t%d" % (peak
['summit']+1)) # summit position
547 write
("\t%.6g" % (round(peak
['pileup'],2))) # pileup height at summit
548 write
("\t%.6g" % (peak
['pscore'])) # -log10pvalue at summit
549 write
("\t%.6g" % (peak
['fc'])) # fold change at summit
550 write
("\t%.6g" % (peak
['qscore'])) # -log10qvalue at summit
551 write
("\t%s" % peakname
)
556 def overlap_with_other_peaks
(self, peaks2
, double cover
=0):
557 """Peaks2 is a PeakIO object or dictionary with can be
558 initialized as a PeakIO. check __init__ for PeakIO for detail.
560 return how many peaks are intersected by peaks2 by percentage
561 coverage on peaks2(if 50%, cover = 0.5).
564 cdef list chrs1
, chrs2
, a
568 if isinstance(peaks2
,PeakIO
):
569 peaks2
= peaks2
.peaks
571 chrs1
= list(peaks1
.keys
())
572 chrs2
= list(peaks2
.keys
())
574 if not chrs2
.count
(k
):
576 rl1_k
= iter
(peaks1
[k
])
577 rl2_k
= iter
(peaks2
[k
])
583 if r2
[0] < r1
[1] and r1
[0] < r2
[1]:
584 a
= sorted
([r1
[0],r1
[1],r2
[0],r2
[1]])
585 if float(a
[2]-a
[1]+1)/r2
[2] > cover
:
594 except StopIteration
:
598 def read_from_xls
(self, ofhd
):
599 """Save the peak results in a tab-delimited plain text file
607 int start
, end
, length
, summit
608 float pileup
, pscore
, fc
, qscore
611 if not (line
.startswith
('#') or line
.strip
() == ''): break
612 line
= ofhd
.readline
()
615 columns
= line
.rstrip
().split
('\t')
616 for a
,b
in zip
(columns
, ("chr","start", "end", "length", "abs_summit",
617 "pileup", "-log10(pvalue)", "fold_enrichment",
618 "-log10(qvalue)", "name")):
619 if not a
==b
: raise NotImplementedError('column %s not recognized', a
)
624 for i
, line
in enumerate
(ofhd
.readlines
()):
625 fields
= split
(line
, '\t')
627 chrom
= fields
[0].encode
()
628 start
= int(fields
[1]) - 1
630 length
= int(fields
[3])
631 if end
- start
!= length
:
632 raise UserWarning
('Malformed peak at line %d:\n%s' % (i
, line
))
633 summit
= int(fields
[4]) - 1
634 pileup
= float(fields
[5])
635 pscore
= float(fields
[6])
636 fc
= float(fields
[7])
637 qscore
= float(fields
[8])
638 peakname
= rstrip
(fields
[9])
639 add
(chrom
, start
, end
, summit
, qscore
, pileup
, pscore
, fc
, qscore
,
642 cpdef parse_peakname
(peakname
):
643 """returns peaknumber, subpeak
646 bytes peak_id
, peaknumber
, subpeak
647 peak_id
= peakname
.split
(b
'_')[-1]
648 x
= re
.split
('(\D.*)', peak_id
)
649 peaknumber
= int(x
[0])
654 return (peaknumber
, subpeak
)
657 """For plain region of chrom, start and end
665 self.__flag_sorted
= False
667 def add_loc
( self, bytes chrom
, int start
, int end
):
668 if self.regions
.has_key
(chrom
):
669 self.regions
[chrom
].append
( (start
,end
) )
671 self.regions
[chrom
] = [(start
,end
), ]
672 self.__flag_sorted
= False
678 for chrom
in list(self.regions
.keys
()):
679 self.regions
[chrom
].sort
()
680 self.__flag_sorted
= True
682 def merge_overlap
( self ):
684 cdef int s_new_region
, e_new_region
, i
, j
686 if not self.__flag_sorted
:
688 regions
= self.regions
690 chrs
= list(regions
.keys
())
692 for i
in range(len(chrs
)):
695 new_regions
[chrom
]=[]
696 n_append
= new_regions
[chrom
].append
698 regions_chr
= regions
[chrom
]
699 for i
in range(len(regions_chr
)):
701 prev_region
= regions_chr
[i
]
704 if regions_chr
[i
][0] <= prev_region
[1]:
705 s_new_region
= prev_region
[0]
706 e_new_region
= regions_chr
[i
][1]
707 prev_region
= (s_new_region
,e_new_region
)
709 n_append
(prev_region
)
710 prev_region
= regions_chr
[i
]
712 n_append
(prev_region
)
713 self.regions
= new_regions
717 def write_to_bed
(self, fhd
):
721 chrs
= list(self.regions
.keys
())
723 for i
in range( len(chrs
) ):
725 for region
in self.regions
[chrom
]:
726 fhd
.write
( "%s\t%d\t%d\n" % (chrom
.decode
(),region
[0],region
[1] ) )
729 cdef class BroadPeakContent
:
746 def __init__
( self, long start
, long end
, float score
,
747 bytes thickStart
, bytes thickEnd
,
748 long blockNum
, bytes blockSizes
,
749 bytes blockStarts
, float pileup
,
750 float pscore
, float fold_change
,
751 float qscore
, bytes name
= b
"NA" ):
755 self.thickStart
= thickStart
756 self.thickEnd
= thickEnd
757 self.blockNum
= blockNum
758 self.blockSizes
= blockSizes
759 self.blockStarts
= blockStarts
761 self.length
= end
- start
764 self.fc
= fold_change
768 def __getitem__
( self, a
):
777 elif a
== "thickStart":
778 return self.thickStart
779 elif a
== "thickEnd":
781 elif a
== "blockNum":
783 elif a
== "blockSizes":
784 return self.blockSizes
785 elif a
== "blockStarts":
786 return self.blockStarts
799 return "start:%d;end:%d;score:%f" % ( self.start
, self.end
, self.score
)
802 cdef class BroadPeakIO
:
803 """IO for broad peak information.
812 def add
(self, char * chromosome
, long start
, long end
, long score
= 0,
813 bytes thickStart
=b
".", bytes thickEnd
=b
".",
814 long blockNum
=0, bytes blockSizes
=b
".",
815 bytes blockStarts
=b
".", float pileup
= 0,
816 float pscore
= 0, float fold_change
= 0,
817 float qscore
= 0, bytes name
= b
"NA" ):
819 chromosome : chromosome name,
820 start : broad region start,
821 end : broad region end,
822 score : average score in all blocks,
823 thickStart : start of highly enriched region, # could be b'.'
824 thickEnd : end of highly enriched region, # could be b'.'
825 blockNum : number of blocks, # could be 0
826 blockSizes : sizes of blocks, # could be b'.'
827 blockStarts: starts of blocks # could be b'.'
828 pileup : median pileup in region # could be 0
829 pscore : median pvalue score in region # could be 0
830 fold_change: median fold change in region # could be 0
831 qscore : median pvalue score in region # could be 0
832 name : peak name # could be b'NA'
834 if not self.peaks
.has_key
(chromosome
):
835 self.peaks
[chromosome
] = []
836 self.peaks
[chromosome
].append
( BroadPeakContent
( start
, end
, score
, thickStart
, thickEnd
,
837 blockNum
, blockSizes
, blockStarts
,
838 pileup
, pscore
, fold_change
, qscore
, name
) )
840 def filter_pscore
(self, double pscore_cut
):
850 chrs
= sorted
(list(peaks
.keys
()))
853 new_peaks
[chrom
]=[p
for p
in peaks
[chrom
] if p
['pscore'] >= pscore_cut
]
854 self.peaks
= new_peaks
856 def filter_qscore
(self, double qscore_cut
):
866 chrs
= sorted
(list(peaks
.keys
()))
869 new_peaks
[chrom
]=[p
for p
in peaks
[chrom
] if p
['qscore'] >= qscore_cut
]
870 self.peaks
= new_peaks
872 def filter_fc
(self, fc_low
, fc_up
=None
):
873 """Filter peaks in a given fc range.
875 If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up)
887 chrs
= list(peaks
.keys
())
891 new_peaks
[chrom
]=[p
for p
in peaks
[chrom
] if p
['fc'] >= fc_low
and p
['fc']<fc_up
]
894 new_peaks
[chrom
]=[p
for p
in peaks
[chrom
] if p
['fc'] >= fc_low
]
895 self.peaks
= new_peaks
905 chrs
= list(peaks
.keys
())
909 x
+= len(peaks
[chrom
])
912 def write_to_gappedPeak
(self, fhd
, bytes name_prefix
=b
"peak_", bytes name
=b
'peak', bytes description
=b
"%s", str score_column
="score", trackline
=True
):
913 """Print out peaks in gappedBed format. Only those with stronger enrichment regions are saved.
915 This format is basically a BED12+3 format.
917 +--------------+------+----------------------------------------+
918 |field |type |description |
919 +--------------+------+----------------------------------------+
920 |chrom |string|Name of the chromosome |
921 +--------------+------+----------------------------------------+
922 |chromStart |int |The starting position of the feature in |
923 | | |the chromosome. The first base in a |
924 | | |chromosome is numbered 0. |
925 +--------------+------+----------------------------------------+
926 |chromEnd |int |The ending position of the feature in |
927 | | |the chromosome or scaffold. The chromEnd|
928 | | |base is not included in the display of |
929 | | |the feature. For example, the first 100|
930 | | |bases of a chromosome are defined as |
931 | | |chromStart=0, chromEnd=100, and span the|
932 | | |bases numbered 0-99. |
933 +--------------+------+----------------------------------------+
934 |name |string|Name given to a region (preferably |
935 | | |unique). Use '.' if no name is assigned.|
936 +--------------+------+----------------------------------------+
937 |score |int |Indicates how dark the peak will be |
938 |(always use | |displayed in the browser (1-1000). If |
939 |1000 for | |'0', the DCC will assign this based on |
940 |the | |signal value. Ideally average |
941 |thickest | |signalValue per base spread between |
942 |color) | |100-1000. |
943 +--------------+------+----------------------------------------+
944 |strand |char |+/- to denote strand or orientation |
945 |(always .) | |(whenever applicable). Use '.' if no |
946 | | |orientation is assigned. |
947 +--------------+------+----------------------------------------+
948 |thickStart |int | The starting position at which the |
949 | | |feature is drawn thickly. Mark the start|
950 | | |of highly enriched regions. |
952 +--------------+------+----------------------------------------+
953 |thickEnd |int | The ending position at which the |
954 | | |feature is drawn thickly. Mark the end |
955 | | |of highly enriched regions. |
956 +--------------+------+----------------------------------------+
957 |itemRGB |string| Not used. Set it as 0. |
958 +--------------+------+----------------------------------------+
959 |blockCounts |int | The number of blocks (exons) in the BED|
961 +--------------+------+----------------------------------------+
962 |blockSizes |string| A comma-separated list of the block |
964 +--------------+------+----------------------------------------+
965 |blockStarts |string| A comma-separated list of block starts.|
966 +--------------+------+----------------------------------------+
967 |signalValue |float |Measurement of overall (usually, |
968 |(fc) | |average) enrichment for the region. |
969 +--------------+------+----------------------------------------+
970 |pValue |float |Measurement of statistical signficance |
971 | | |(-log10). Use -1 if no pValue is |
973 +--------------+------+----------------------------------------+
974 |qValue |float |Measurement of statistical significance |
975 | | |using false discovery rate. Use -1 if no|
976 | | |qValue is assigned. |
977 +--------------+------+----------------------------------------+
980 chrs
= list(self.peaks
.keys
())
983 try: peakprefix
= name_prefix
% name
984 except: peakprefix
= name_prefix
985 try: desc
= description
% name
986 except: desc
= description
988 fhd
.write
("track name=\"%s\" description=\"%s\" type=gappedPeak nextItemButton=on\n" % (name
.decode
(), desc
.decode
()) )
990 for peak
in self.peaks
[chrom
]:
992 if peak
["thickStart"] != b
".":
993 fhd
.write
( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n"
995 (chrom
.decode
(),peak
["start"],peak
["end"],peakprefix
.decode
(),n_peak
,int(10*peak
[score_column
]),
996 peak
["thickStart"].decode
(),peak
["thickEnd"].decode
(),
997 peak
["blockNum"],peak
["blockSizes"].decode
(),peak
["blockStarts"].decode
(), peak
['fc'], peak
['pscore'], peak
['qscore'] ) )
999 def write_to_Bed12
(self, fhd
, bytes name_prefix
=b
"peak_", bytes name
=b
'peak', bytes description
=b
"%s", str score_column
="score", trackline
=True
):
1000 """Print out peaks in Bed12 format.
1002 +--------------+------+----------------------------------------+
1003 |field |type |description |
1004 +--------------+------+----------------------------------------+
1005 |chrom |string|Name of the chromosome |
1006 +--------------+------+----------------------------------------+
1007 |chromStart |int |The starting position of the feature in |
1008 | | |the chromosome. The first base in a |
1009 | | |chromosome is numbered 0. |
1010 +--------------+------+----------------------------------------+
1011 |chromEnd |int |The ending position of the feature in |
1012 | | |the chromosome or scaffold. The chromEnd|
1013 | | |base is not included in the display of |
1014 | | |the feature. For example, the first 100|
1015 | | |bases of a chromosome are defined as |
1016 | | |chromStart=0, chromEnd=100, and span the|
1017 | | |bases numbered 0-99. |
1018 +--------------+------+----------------------------------------+
1019 |name |string|Name given to a region (preferably |
1020 | | |unique). Use '.' if no name is assigned.|
1021 +--------------+------+----------------------------------------+
1022 |score |int |Indicates how dark the peak will be |
1023 |(always use | |displayed in the browser (1-1000). If |
1024 |1000 for | |'0', the DCC will assign this based on |
1025 |the | |signal value. Ideally average |
1026 |thickest | |signalValue per base spread between |
1027 |color) | |100-1000. |
1028 +--------------+------+----------------------------------------+
1029 |strand |char |+/- to denote strand or orientation |
1030 |(always .) | |(whenever applicable). Use '.' if no |
1031 | | |orientation is assigned. |
1032 +--------------+------+----------------------------------------+
1033 |thickStart |int | The starting position at which the |
1034 | | |feature is drawn thickly. Mark the start|
1035 | | |of highly enriched regions. |
1037 +--------------+------+----------------------------------------+
1038 |thickEnd |int | The ending position at which the |
1039 | | |feature is drawn thickly. Mark the end |
1040 | | |of highly enriched regions. |
1041 +--------------+------+----------------------------------------+
1042 |itemRGB |string| Not used. Set it as 0. |
1043 +--------------+------+----------------------------------------+
1044 |blockCounts |int | The number of blocks (exons) in the BED|
1046 +--------------+------+----------------------------------------+
1047 |blockSizes |string| A comma-separated list of the block |
1049 +--------------+------+----------------------------------------+
1050 |blockStarts |string| A comma-separated list of block starts.|
1051 +--------------+------+----------------------------------------+
1054 chrs
= list(self.peaks
.keys
())
1057 try: peakprefix
= name_prefix
% name
1058 except: peakprefix
= name_prefix
1059 try: desc
= description
% name
1060 except: desc
= description
1062 fhd
.write
("track name=\"%s\" description=\"%s\" type=bed nextItemButton=on\n" % (name
.decode
(), desc
.decode
()) )
1064 for peak
in self.peaks
[chrom
]:
1066 if peak
["thickStart"] == b
".":
1067 # this will violate gappedPeak format, since it's a complement like broadPeak line.
1068 fhd
.write
( "%s\t%d\t%d\t%s%d\t%d\t.\n"
1070 (chrom
.decode
(),peak
["start"],peak
["end"],peakprefix
.decode
(),n_peak
,int(10*peak
[score_column
]) ) )
1072 fhd
.write
( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\n"
1074 (chrom
.decode
(), peak
["start"], peak
["end"], peakprefix
.decode
(), n_peak
, int(10*peak
[score_column
]),
1075 peak
["thickStart"].decode
(), peak
["thickEnd"].decode
(),
1076 peak
["blockNum"], peak
["blockSizes"].decode
(), peak
["blockStarts"].decode
() ))
1079 def write_to_broadPeak
(self, fhd
, bytes name_prefix
=b
"peak_", bytes name
=b
'peak', bytes description
=b
"%s", str score_column
="score", trackline
=True
):
1080 """Print out peaks in broadPeak format.
1082 This format is designed for ENCODE project, and basically a
1085 +-----------+------+----------------------------------------+
1086 |field |type |description |
1087 +-----------+------+----------------------------------------+
1088 |chrom |string|Name of the chromosome |
1089 +-----------+------+----------------------------------------+
1090 |chromStart |int |The starting position of the feature in |
1091 | | |the chromosome. The first base in a |
1092 | | |chromosome is numbered 0. |
1093 +-----------+------+----------------------------------------+
1094 |chromEnd |int |The ending position of the feature in |
1095 | | |the chromosome or scaffold. The chromEnd|
1096 | | |base is not included in the display of |
1097 | | |the feature. For example, the first 100|
1098 | | |bases of a chromosome are defined as |
1099 | | |chromStart=0, chromEnd=100, and span the|
1100 | | |bases numbered 0-99. |
1101 +-----------+------+----------------------------------------+
1102 |name |string|Name given to a region (preferably |
1103 | | |unique). Use '.' if no name is assigned.|
1104 +-----------+------+----------------------------------------+
1105 |score |int |Indicates how dark the peak will be |
1106 |(-logqvalue| |displayed in the browser (1-1000). If |
1107 |in MACS3 * | |'0', the DCC will assign this based on |
1108 |10) | |signal value. Ideally average |
1109 | | |signalValue per base spread between |
1111 +-----------+------+----------------------------------------+
1112 |strand |char |+/- to denote strand or orientation |
1113 |(always .) | |(whenever applicable). Use '.' if no |
1114 | | |orientation is assigned. |
1115 +-----------+------+----------------------------------------+
1116 |signalValue|float |Measurement of overall (usually, |
1117 |(fc) | |average) enrichment for the region. |
1118 +-----------+------+----------------------------------------+
1119 |pValue |float |Measurement of statistical signficance |
1120 | | |(-log10). Use -1 if no pValue is |
1122 +-----------+------+----------------------------------------+
1123 |qValue |float |Measurement of statistical significance |
1124 | | |using false discovery rate. Use -1 if no|
1125 | | |qValue is assigned. |
1126 +-----------+------+----------------------------------------+
1134 chrs
= list(self.peaks
.keys
())
1138 try: peakprefix
= name_prefix
% name
1139 except: peakprefix
= name_prefix
1141 write
("track type=broadPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name
.decode
(), name
.decode
()))
1143 for end
, group
in groupby
(self.peaks
[chrom
], key
=itemgetter
("end")):
1145 these_peaks
= list(group
)
1146 peak
= these_peaks
[0]
1147 peakname
= "%s%d" % (peakprefix
.decode
(), n_peak
)
1148 fhd
.write
( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\n" %
1149 (chrom
.decode
(),peak
['start'],peak
['end'],peakname
,int(10*peak
[score_column
]),
1150 peak
['fc'],peak
['pscore'],peak
['qscore'] ) )
1154 def write_to_xls
(self, ofhd
, bytes name_prefix
=b
"%s_peak_", bytes name
=b
"MACS"):
1155 """Save the peak results in a tab-delimited plain text file
1159 wait... why I have two write_to_xls in this class?
1163 write
("\t".join
(("chr","start", "end", "length", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
1165 try: peakprefix
= name_prefix
% name
1166 except: peakprefix
= name_prefix
1169 chrs
= list(peaks
.keys
())
1173 for end
, group
in groupby
(peaks
[chrom
], key
=itemgetter
("end")):
1175 these_peaks
= list(group
)
1176 peak
= these_peaks
[0]
1177 peakname
= "%s%d" % (peakprefix
.decode
(), n_peak
)
1178 write
("%s\t%d\t%d\t%d" % (chrom
.decode
(),peak
['start']+1,peak
['end'],peak
['length']))
1179 write
("\t%.6g" % (round(peak
['pileup'],2))) # pileup height at summit
1180 write
("\t%.6g" % (peak
['pscore'])) # -log10pvalue at summit
1181 write
("\t%.6g" % (peak
['fc'])) # fold change at summit
1182 write
("\t%.6g" % (peak
['qscore'])) # -log10qvalue at summit
1183 write
("\t%s" % peakname
)