MACS3/IO/PeakIO.pyx

   1 # cython: language_level=3
   2 # cython: profile=True
   3 # Time-stamp: <2020-11-24 17:11:29 Tao Liu>
   4
   5 """Module for PeakIO IO classes.
   6
   7 This code is free software; you can redistribute it and/or modify it
   8 under the terms of the BSD License (see the file LICENSE included with
   9 the distribution).
  10 """
  11
  12 # ------------------------------------
  13 # python modules
  14 # ------------------------------------
  15 from itertools import groupby
  16 from operator import itemgetter
  17 import re
  18 import sys
  19
  20 # ------------------------------------
  21 # MACS3 modules
  22 # ------------------------------------
  23
  24 from MACS3.Utilities.Constants import *
  25
  26 # ------------------------------------
  27 # Other modules
  28 # ------------------------------------
  29
  30 from cpython cimport bool
  31
  32 # ------------------------------------
  33 # constants
  34 # ------------------------------------
  35 __version__ = "PeakIO $Revision$"
  36 __author__ = "Tao Liu <taoliu@jimmy.harvard.edu>"
  37 __doc__ = "PeakIO class"
  38
  39 # ------------------------------------
  40 # Misc functions
  41 # ------------------------------------
  42 cdef str subpeak_letters( int i):
  43     if i < 26:
  44         return chr(97+i)
  45     else:
  46         return subpeak_letters(i // 26) + chr(97 + (i % 26))
  47
  48 # ------------------------------------
  49 # Classes
  50 # ------------------------------------
  51
  52 cdef class PeakContent:
  53     cdef:
  54         int start
  55         int end
  56         int length
  57         int summit
  58         float score
  59         float pileup
  60         float pscore
  61         float fc
  62         float qscore
  63         bytes name
  64
  65     def __init__ ( self, int start, int end, int summit,
  66                    float peak_score, float pileup,
  67                    float pscore, float fold_change, float qscore,
  68                    bytes name= b"NA" ):
  69         self.start = start
  70         self.end = end
  71         self.length = end - start
  72         self.summit = summit
  73         self.score = peak_score
  74         self.pileup = pileup
  75         self.pscore = pscore
  76         self.fc = fold_change
  77         self.qscore = qscore
  78         self.name = name
  79
  80     def __getitem__ ( self, a ):
  81         if a == "start":
  82             return self.start
  83         elif a == "end":
  84             return self.end
  85         elif a == "length":
  86             return self.length
  87         elif a == "summit":
  88             return self.summit
  89         elif a == "score":
  90             return self.score
  91         elif a == "pileup":
  92             return self.pileup
  93         elif a == "pscore":
  94             return self.pscore
  95         elif a == "fc":
  96             return self.fc
  97         elif a == "qscore":
  98             return self.qscore
  99         elif a == "name":
 100             return self.name
 101
 102     def __setitem__ ( self, a, v ):
 103         if a == "start":
 104             self.start = v
 105         elif a == "end":
 106             self.end = v
 107         elif a == "length":
 108             self.length = v
 109         elif a == "summit":
 110             self.summit = v
 111         elif a == "score":
 112             self.score = v
 113         elif a == "pileup":
 114             self.pileup = v
 115         elif a == "pscore":
 116             self.pscore = v
 117         elif a == "fc":
 118             self.fc = v
 119         elif a == "qscore":
 120             self.qscore = v
 121         elif a == "name":
 122             self.name = v
 123
 124     def __str__ (self):
 125         return "start:%d;end:%d;score:%f" % ( self.start, self.end, self.score )
 126
 127 cdef class PeakIO:
 128     """IO for peak information.
 129
 130     """
 131     cdef:
 132         dict peaks
 133
 134     def __init__ (self):
 135         self.peaks = {}
 136
 137     cpdef add (self, bytes chromosome, int start, int end, int summit = 0,
 138                float peak_score = 0, float pileup = 0,
 139                float pscore = 0, float fold_change = 0, float qscore = 0,
 140                bytes name = b"NA"):
 141         """items:
 142         start:start
 143         end:end,
 144         length:end-start,
 145         summit:summit,
 146         score:peak_score,
 147         pileup:pileup,
 148         pscore:pscore,
 149         fc:fold_change,
 150         qscore:qscore
 151         """
 152         if not self.peaks.has_key(chromosome):
 153             self.peaks[chromosome]=[]
 154         self.peaks[chromosome].append(PeakContent( start, end, summit, peak_score, pileup, pscore, fold_change, qscore, name))
 155
 156     cpdef add_PeakContent ( self, bytes chromosome, object peakcontent ):
 157         if not self.peaks.has_key(chromosome):
 158             self.peaks[chromosome]=[]
 159         self.peaks[chromosome].append(peakcontent)
 160
 161     def get_data_from_chrom (self, bytes chrom):
 162         if not self.peaks.has_key( chrom ):
 163             self.peaks[chrom]= []
 164         return self.peaks[chrom]
 165
 166     cpdef set get_chr_names (self):
 167         return set(sorted(self.peaks.keys()))
 168
 169     def sort ( self ):
 170         # sort by position
 171         chrs = sorted(list(self.peaks.keys()))
 172         for chrom in chrs:
 173             self.peaks[chrom].sort(key=lambda x:x['start'])
 174         return
 175
 176
 177     def filter_pscore (self, double pscore_cut ):
 178         cdef bytes chrom
 179
 180         peaks = self.peaks
 181         new_peaks = {}
 182         chrs = sorted(list(peaks.keys()))
 183
 184         for chrom in chrs:
 185             new_peaks[chrom]=[p for p in peaks[chrom] if p['pscore'] >= pscore_cut]
 186         self.peaks = new_peaks
 187
 188     def filter_qscore (self, double qscore_cut ):
 189         cdef bytes chrom
 190
 191         peaks = self.peaks
 192         new_peaks = {}
 193         chrs = sorted(list(peaks.keys()))
 194
 195         for chrom in chrs:
 196             new_peaks[chrom]=[p for p in peaks[chrom] if p['qscore'] >= qscore_cut]
 197         self.peaks = new_peaks
 198
 199     def filter_fc (self, fc_low, fc_up=None ):
 200         """Filter peaks in a given fc range.
 201
 202         If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up)
 203
 204         """
 205         peaks = self.peaks
 206         new_peaks = {}
 207         chrs = list(peaks.keys())
 208         chrs.sort()
 209         if fc_up:
 210             for chrom in chrs:
 211                 new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low and p['fc']<fc_up]
 212         else:
 213             for chrom in chrs:
 214                 new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low]
 215         self.peaks = new_peaks
 216
 217     def total (self):
 218         peaks = self.peaks
 219         chrs = list(peaks.keys())
 220         chrs.sort()
 221         x = 0
 222         for chrom in chrs:
 223             x += len(peaks[chrom])
 224         return x
 225
 226     # these methods are very fast, specifying types is unnecessary
 227     def write_to_xls (self, fhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS"):
 228         return self._to_xls(name_prefix=name_prefix, name=name,
 229                             print_func=fhd.write)
 230
 231     cdef _to_xls (self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS", print_func=sys.stdout.write):
 232
 233         if self.peaks:
 234             print_func("\t".join(("chr","start", "end",  "length",  "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
 235         else:
 236             return
 237
 238         try: peakprefix = name_prefix % name
 239         except: peakprefix = name_prefix
 240
 241         peaks = self.peaks
 242         chrs = list(peaks.keys())
 243         chrs.sort()
 244         n_peak = 0
 245         for chrom in chrs:
 246             for end, group in groupby(peaks[chrom], key=itemgetter("end")):
 247                 n_peak += 1
 248                 these_peaks = list(group)
 249                 if len(these_peaks) > 1:
 250                     for i, peak in enumerate(these_peaks):
 251                         peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i))
 252                         #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
 253                         print_func("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
 254                         print_func("\t%d" % (peak['summit']+1)) # summit position
 255                         print_func("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
 256                         print_func("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
 257                         print_func("\t%.6g" % (peak['fc'])) # fold change at summit
 258                         print_func("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
 259                         print_func("\t%s" % peakname)
 260                         print_func("\n")
 261                 else:
 262                     peak = these_peaks[0]
 263                     peakname = "%s%d" % (peakprefix.decode(), n_peak)
 264                     #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
 265                     print_func("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
 266                     print_func("\t%d" % (peak['summit']+1)) # summit position
 267                     print_func("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
 268                     print_func("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
 269                     print_func("\t%.6g" % (peak['fc'])) # fold change at summit
 270                     print_func("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
 271                     print_func("\t%s" % peakname)
 272                     print_func("\n")
 273         return
 274
 275     cdef _to_bed(self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS",
 276                 bytes description=b"%s", str score_column="score",
 277                 trackline=False, print_func=sys.stdout.write):
 278         """
 279         generalization of tobed and write_to_bed
 280         """
 281         #print(description)
 282         chrs = list(self.peaks.keys())
 283         chrs.sort()
 284         n_peak = 0
 285         try: peakprefix = name_prefix % name
 286         except: peakprefix = name_prefix
 287         try: desc = description % name
 288         except: desc = description
 289         if trackline:
 290             try: print_func('track name="%s (peaks)" description="%s" visibility=1\n' % ( name.replace(b"\"", b"\\\"").decode(),\
 291                                                                                           desc.replace(b"\"", b"\\\"").decode() ) )
 292             except: print_func('track name=MACS description=Unknown\n')
 293         for chrom in chrs:
 294             for end, group in groupby(self.peaks[chrom], key=itemgetter("end")):
 295                 n_peak += 1
 296                 peaks = list(group)
 297                 if len(peaks) > 1:
 298                     for i, peak in enumerate(peaks):
 299                         print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(),peak['start'],peak['end'],peakprefix.decode(),n_peak,subpeak_letters(i),peak[score_column]))
 300                 else:
 301                     peak = peaks[0]
 302                     print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(),peak['start'],peak['end'],peakprefix.decode(),n_peak,peak[score_column]))
 303
 304     cdef _to_summits_bed(self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS",
 305                         bytes description = b"%s", str score_column="score",
 306                         bool trackline=False, print_func=sys.stdout.write):
 307         """
 308         generalization of to_summits_bed and write_to_summit_bed
 309         """
 310         chrs = list(self.peaks.keys())
 311         chrs.sort()
 312         n_peak = 0
 313         try: peakprefix = name_prefix % name
 314         except: peakprefix = name_prefix
 315         try: desc = description % name
 316         except: desc = description
 317         if trackline:
 318             try: print_func('track name="%s (summits)" description="%s" visibility=1\n' % ( name.replace(b"\"", b"\\\"").decode(),\
 319                                                                                             desc.replace(b"\"", b"\\\"").decode() ) )
 320             except: print_func('track name=MACS description=Unknown')
 321         for chrom in chrs:
 322             for end, group in groupby(self.peaks[chrom], key=itemgetter("end")):
 323                 n_peak += 1
 324                 peaks = list(group)
 325                 if len(peaks) > 1:
 326                     for i, peak in enumerate(peaks):
 327                         summit_p = peak['summit']
 328                         print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(),summit_p,summit_p+1,peakprefix.decode(),n_peak,subpeak_letters(i),peak[score_column]))
 329                 else:
 330                     peak = peaks[0]
 331                     summit_p = peak['summit']
 332                     print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(),summit_p,summit_p+1,peakprefix.decode(),n_peak,peak[score_column]))
 333
 334     def tobed (self):
 335         """Print out peaks in BED5 format.
 336
 337         Five columns are chromosome, peak start, peak end, peak name, and peak height.
 338
 339         start:start
 340         end:end,
 341         length:end-start,
 342         summit:summit,
 343         score:peak_score,
 344         pileup:pileup,
 345         pscore:pvalue,
 346         fc:fold_change,
 347         qscore:qvalue
 348         """
 349         return self._to_bed(name_prefix=b"peak_", score_column="score", name=b"", description=b"")
 350
 351     def to_summits_bed (self):
 352         """Print out peak summits in BED5 format.
 353
 354         Five columns are chromosome, summit start, summit end, peak name, and peak height.
 355
 356         """
 357         return self._to_summits_bed(name_prefix=b"peak_", score_column="score", name=b"", description=b"")
 358
 359     # these methods are very fast, specifying types is unnecessary
 360     def write_to_bed (self, fhd, bytes name_prefix=b"peak_", bytes name=b"MACS",
 361                         bytes description = b"%s", str score_column="score", trackline=True):
 362         """Write peaks in BED5 format in a file handler. Score (5th
 363         column) is decided by score_column setting. Check the
 364         following list. Name column ( 4th column) is made by putting
 365         name_prefix together with an ascending number.
 366
 367         Five columns are chromosome, peak start, peak end, peak name,
 368         and peak score.
 369
 370         items in peak hash object:
 371
 372         start:start
 373         end:end,
 374         length:end-start,
 375         summit:summit,
 376         score:peak_score,
 377         pileup:pileup,
 378         pscore:pvalue,
 379         fc:fold_change,
 380         qscore:qvalue
 381         """
 382         #print(description)
 383         return self._to_bed(name_prefix=name_prefix, name=name,
 384                             description=description, score_column=score_column,
 385                             print_func=fhd.write, trackline=trackline)
 386
 387     def write_to_summit_bed (self, fhd, bytes name_prefix = b"peak_", bytes name = b"MACS",
 388                              bytes description = b"%s", str score_column ="score", trackline=True):
 389         """Write peak summits in BED5 format in a file handler. Score
 390         (5th column) is decided by score_column setting. Check the
 391         following list. Name column ( 4th column) is made by putting
 392         name_prefix together with an ascending number.
 393
 394         Five columns are chromosome, summit start, summit end, peak name, and peak score.
 395
 396         items in peak object:
 397
 398         start:start
 399         end:end,
 400         length:end-start,
 401         summit:summit,
 402         score:peak_score,
 403         pileup:pileup,
 404         pscore:pvalue,
 405         fc:fold_change,
 406         qscore:qvalue
 407         """
 408         return self._to_summits_bed(name_prefix=name_prefix, name=name,
 409                                     description=description, score_column=score_column,
 410                                     print_func=fhd.write, trackline=trackline)
 411
 412     def write_to_narrowPeak (self, fhd, bytes name_prefix = b"peak_", bytes name = b"peak", str score_column="score", trackline=True):
 413         """Print out peaks in narrowPeak format.
 414
 415         This format is designed for ENCODE project, and basically a
 416         BED6+4 format.
 417
 418         +-----------+------+----------------------------------------+
 419         |field      |type  |description                             |
 420         +-----------+------+----------------------------------------+
 421         |chrom      |string|Name of the chromosome                  |
 422         +-----------+------+----------------------------------------+
 423         |chromStart |int   |The starting position of the feature in |
 424         |           |      |the chromosome. The first base in a     |
 425         |           |      |chromosome is numbered 0.               |
 426         +-----------+------+----------------------------------------+
 427         |chromEnd   |int   |The ending position of the feature in   |
 428         |           |      |the chromosome or scaffold. The chromEnd|
 429         |           |      |base is not included in the display of  |
 430         |           |      |the feature.  For example, the first 100|
 431         |           |      |bases of a chromosome are defined as    |
 432         |           |      |chromStart=0, chromEnd=100, and span the|
 433         |           |      |bases numbered 0-99.                    |
 434         +-----------+------+----------------------------------------+
 435         |name       |string|Name given to a region (preferably      |
 436         |           |      |unique). Use '.' if no name is assigned.|
 437         +-----------+------+----------------------------------------+
 438         |score      |int   |Indicates how dark the peak will be     |
 439         |(-logpvalue|      |displayed in the browser (1-1000). If   |
 440         |in MACS3 * |      |'0', the DCC will assign this based on  |
 441         |10)        |      |signal value. Ideally average           |
 442         |           |      |signalValue per base spread between     |
 443         |           |      |100-1000.                               |
 444         +-----------+------+----------------------------------------+
 445         |strand     |char  |+/- to denote strand or orientation     |
 446         |(always .) |      |(whenever applicable). Use '.' if no    |
 447         |           |      |orientation is assigned.                |
 448         +-----------+------+----------------------------------------+
 449         |signalValue|float |Measurement of overall (usually,        |
 450         |(fc)       |      |average) enrichment for the region.     |
 451         +-----------+------+----------------------------------------+
 452         |pValue     |float |Measurement of statistical signficance  |
 453         |           |      |(-log10). Use -1 if no pValue is        |
 454         |           |      |assigned.                               |
 455         +-----------+------+----------------------------------------+
 456         |qValue     |float |Measurement of statistical significance |
 457         |           |      |using false discovery rate. Use -1 if no|
 458         |           |      |qValue is assigned.                     |
 459         +-----------+------+----------------------------------------+
 460         |peak       |int   |Point-source called for this peak;      |
 461         |           |      |0-based offset from chromStart. Use -1  |
 462         |           |      |if no point-source called.              |
 463         +-----------+------+----------------------------------------+
 464
 465         """
 466         cdef int n_peak
 467         cdef bytes chrom
 468         cdef long s
 469         cdef str peakname
 470
 471         chrs = list(self.peaks.keys())
 472         chrs.sort()
 473         n_peak = 0
 474         write = fhd.write
 475         try: peakprefix = name_prefix % name
 476         except: peakprefix = name_prefix
 477         if trackline:
 478             write("track type=narrowPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name.decode(), name.decode()))
 479         for chrom in chrs:
 480             for end, group in groupby(self.peaks[chrom], key=itemgetter("end")):
 481                 n_peak += 1
 482                 these_peaks = list(group)
 483                 if len(these_peaks) > 1: # from call-summits
 484                     for i, peak in enumerate(these_peaks):
 485                         peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i))
 486                         if peak['summit'] == -1:
 487                             s = -1
 488                         else:
 489                             s = peak['summit'] - peak['start']
 490                         fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n"
 491                                    %
 492                                    (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]),
 493                                     peak['fc'],peak['pscore'],peak['qscore'],s) )
 494                 else:
 495                     peak = these_peaks[0]
 496                     peakname = "%s%d" % (peakprefix.decode(), n_peak)
 497                     if peak['summit'] == -1:
 498                         s = -1
 499                     else:
 500                         s = peak['summit'] - peak['start']
 501                     fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n"
 502                                %
 503                                (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]),
 504                                 peak['fc'],peak['pscore'],peak['qscore'],s) )
 505         return
 506
 507     def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MACS"):
 508         """Save the peak results in a tab-delimited plain text file
 509         with suffix .xls.
 510
 511
 512         wait... why I have two write_to_xls in this class?
 513
 514         """
 515         write = ofhd.write
 516         write("\t".join(("chr","start", "end",  "length",  "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
 517
 518         try: peakprefix = name_prefix % name
 519         except: peakprefix = name_prefix
 520
 521         peaks = self.peaks
 522         chrs = list(peaks.keys())
 523         chrs.sort()
 524         n_peak = 0
 525         for chrom in chrs:
 526             for end, group in groupby(peaks[chrom], key=itemgetter("end")):
 527                 n_peak += 1
 528                 these_peaks = list(group)
 529                 if len(these_peaks) > 1:
 530                     for i, peak in enumerate(these_peaks):
 531                         peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i))
 532                         #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
 533                         write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
 534                         write("\t%d" % (peak['summit']+1)) # summit position
 535                         write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
 536                         write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
 537                         write("\t%.6g" % (peak['fc'])) # fold change at summit
 538                         write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
 539                         write("\t%s" % peakname)
 540                         write("\n")
 541                 else:
 542                     peak = these_peaks[0]
 543                     peakname = "%s%d" % (peakprefix.decode(), n_peak)
 544                     #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue]
 545                     write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
 546                     write("\t%d" % (peak['summit']+1)) # summit position
 547                     write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
 548                     write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
 549                     write("\t%.6g" % (peak['fc'])) # fold change at summit
 550                     write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
 551                     write("\t%s" % peakname)
 552                     write("\n")
 553         return
 554
 555
 556     def overlap_with_other_peaks (self, peaks2, double cover=0):
 557         """Peaks2 is a PeakIO object or dictionary with can be
 558         initialized as a PeakIO. check __init__ for PeakIO for detail.
 559
 560         return how many peaks are intersected by peaks2 by percentage
 561         coverage on peaks2(if 50%, cover = 0.5).
 562         """
 563         cdef int total_num
 564         cdef list chrs1, chrs2, a
 565         cdef bytes k
 566
 567         peaks1 = self.peaks
 568         if isinstance(peaks2,PeakIO):
 569             peaks2 = peaks2.peaks
 570         total_num = 0
 571         chrs1 = list(peaks1.keys())
 572         chrs2 = list(peaks2.keys())
 573         for k in chrs1:
 574             if not chrs2.count(k):
 575                 continue
 576             rl1_k = iter(peaks1[k])
 577             rl2_k = iter(peaks2[k])
 578             tmp_n = False
 579             try:
 580                 r1 = rl1_k.next()
 581                 r2 = rl2_k.next()
 582                 while (True):
 583                     if r2[0] < r1[1] and r1[0] < r2[1]:
 584                         a = sorted([r1[0],r1[1],r2[0],r2[1]])
 585                         if float(a[2]-a[1]+1)/r2[2] > cover:
 586                             if not tmp_n:
 587                                 total_num+=1
 588                                 tmp_n = True
 589                     if r1[1] < r2[1]:
 590                         r1 = rl1_k.next()
 591                         tmp_n = False
 592                     else:
 593                         r2 = rl2_k.next()
 594             except StopIteration:
 595                 continue
 596         return total_num
 597
 598     def read_from_xls (self, ofhd):
 599         """Save the peak results in a tab-delimited plain text file
 600         with suffix .xls.
 601
 602         """
 603         cdef:
 604             bytes line = b''
 605             bytes chrom = b''
 606             int n_peak = 0
 607             int start, end, length, summit
 608             float pileup, pscore, fc, qscore
 609             list fields
 610         while True:
 611             if not (line.startswith('#') or line.strip() == ''): break
 612             line = ofhd.readline()
 613
 614         # sanity check
 615         columns = line.rstrip().split('\t')
 616         for a,b in zip(columns, ("chr","start", "end",  "length", "abs_summit",
 617                                  "pileup", "-log10(pvalue)", "fold_enrichment",
 618                                  "-log10(qvalue)", "name")):
 619             if not a==b: raise NotImplementedError('column %s not recognized', a)
 620
 621         add = self.add
 622         split = str.split
 623         rstrip = str.rstrip
 624         for i, line in enumerate(ofhd.readlines()):
 625             fields = split(line, '\t')
 626             peak = {}
 627             chrom = fields[0].encode()
 628             start = int(fields[1]) - 1
 629             end = int(fields[2])
 630             length = int(fields[3])
 631             if end - start != length:
 632                 raise UserWarning('Malformed peak at line %d:\n%s' % (i, line))
 633             summit = int(fields[4]) - 1
 634             pileup = float(fields[5])
 635             pscore = float(fields[6])
 636             fc = float(fields[7])
 637             qscore = float(fields[8])
 638             peakname = rstrip(fields[9])
 639             add(chrom, start, end, summit, qscore, pileup, pscore, fc, qscore,
 640                 peakname)
 641
 642 cpdef parse_peakname(peakname):
 643     """returns peaknumber, subpeak
 644     """
 645     cdef:
 646         bytes peak_id, peaknumber, subpeak
 647     peak_id = peakname.split(b'_')[-1]
 648     x = re.split('(\D.*)', peak_id)
 649     peaknumber = int(x[0])
 650     try:
 651         subpeak = x[1]
 652     except IndexError:
 653         subpeak = b''
 654     return (peaknumber, subpeak)
 655
 656 cdef class Region:
 657     """For plain region of chrom, start and end
 658     """
 659     cdef:
 660         dict regions
 661         bool __flag_sorted
 662
 663     def __init__ (self):
 664         self.regions= {}
 665         self.__flag_sorted = False
 666
 667     def add_loc ( self, bytes chrom, int start, int end ):
 668         if self.regions.has_key(chrom):
 669             self.regions[chrom].append( (start,end) )
 670         else:
 671             self.regions[chrom] = [(start,end), ]
 672         self.__flag_sorted = False
 673         return
 674
 675     def sort (self):
 676         cdef bytes chrom
 677
 678         for chrom in list(self.regions.keys()):
 679             self.regions[chrom].sort()
 680         self.__flag_sorted = True
 681
 682     def merge_overlap ( self ):
 683         cdef bytes chrom
 684         cdef int s_new_region, e_new_region, i, j
 685
 686         if not self.__flag_sorted:
 687             self.sort()
 688         regions = self.regions
 689         new_regions = {}
 690         chrs = list(regions.keys())
 691         chrs.sort()
 692         for i in range(len(chrs)):
 693             chrom = chrs[i]
 694         #for chrom in chrs:
 695             new_regions[chrom]=[]
 696             n_append = new_regions[chrom].append
 697             prev_region = None
 698             regions_chr = regions[chrom]
 699             for i in range(len(regions_chr)):
 700                 if not prev_region:
 701                     prev_region = regions_chr[i]
 702                     continue
 703                 else:
 704                     if regions_chr[i][0] <= prev_region[1]:
 705                         s_new_region = prev_region[0]
 706                         e_new_region = regions_chr[i][1]
 707                         prev_region = (s_new_region,e_new_region)
 708                     else:
 709                         n_append(prev_region)
 710                         prev_region = regions_chr[i]
 711             if prev_region:
 712                 n_append(prev_region)
 713         self.regions = new_regions
 714         self.sort()
 715         return True
 716
 717     def write_to_bed (self, fhd ):
 718         cdef int i
 719         cdef bytes chrom
 720
 721         chrs = list(self.regions.keys())
 722         chrs.sort()
 723         for i in range( len(chrs) ):
 724             chrom = chrs[i]
 725             for region in self.regions[chrom]:
 726                 fhd.write( "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] ) )
 727
 728
 729 cdef class BroadPeakContent:
 730     cdef:
 731         long start
 732         long end
 733         long length
 734         float score
 735         bytes thickStart
 736         bytes thickEnd
 737         long blockNum
 738         bytes  blockSizes
 739         bytes  blockStarts
 740         float pileup
 741         float pscore
 742         float fc
 743         float qscore
 744         bytes name
 745
 746     def __init__ ( self, long start, long end, float score,
 747                    bytes thickStart, bytes thickEnd,
 748                    long blockNum, bytes blockSizes,
 749                    bytes blockStarts, float pileup,
 750                    float pscore, float fold_change,
 751                    float qscore, bytes name = b"NA" ):
 752         self.start = start
 753         self.end = end
 754         self.score = score
 755         self.thickStart = thickStart
 756         self.thickEnd = thickEnd
 757         self.blockNum = blockNum
 758         self.blockSizes = blockSizes
 759         self.blockStarts = blockStarts
 760
 761         self.length = end - start
 762         self.pileup = pileup
 763         self.pscore = pscore
 764         self.fc = fold_change
 765         self.qscore = qscore
 766         self.name = name
 767
 768     def __getitem__ ( self, a ):
 769         if a == "start":
 770             return self.start
 771         elif a == "end":
 772             return self.end
 773         elif a == "length":
 774             return self.length
 775         elif a == "score":
 776             return self.score
 777         elif a == "thickStart":
 778             return self.thickStart
 779         elif a == "thickEnd":
 780             return self.thickEnd
 781         elif a == "blockNum":
 782             return self.blockNum
 783         elif a == "blockSizes":
 784             return self.blockSizes
 785         elif a == "blockStarts":
 786             return self.blockStarts
 787         elif a == "pileup":
 788             return self.pileup
 789         elif a == "pscore":
 790             return self.pscore
 791         elif a == "fc":
 792             return self.fc
 793         elif a == "qscore":
 794             return self.qscore
 795         elif a == "name":
 796             return self.name
 797
 798     def __str__ (self):
 799         return "start:%d;end:%d;score:%f" % ( self.start, self.end, self.score )
 800
 801
 802 cdef class BroadPeakIO:
 803     """IO for broad peak information.
 804
 805     """
 806     cdef:
 807         dict peaks
 808
 809     def __init__ (self):
 810         self.peaks = {}
 811
 812     def add (self, char * chromosome, long start, long end, long score = 0,
 813              bytes thickStart=b".", bytes thickEnd=b".",
 814              long blockNum=0, bytes blockSizes=b".",
 815              bytes blockStarts=b".", float pileup = 0,
 816              float pscore = 0, float fold_change = 0,
 817              float qscore = 0, bytes name = b"NA" ):
 818         """items
 819         chromosome : chromosome name,
 820         start      : broad region start,
 821         end        : broad region end,
 822         score      : average score in all blocks,
 823         thickStart : start of highly enriched region, # could be b'.'
 824         thickEnd   : end of highly enriched region,   # could be b'.'
 825         blockNum   : number of blocks,                # could be 0
 826         blockSizes : sizes of blocks,                 # could be b'.'
 827         blockStarts: starts of blocks                 # could be b'.'
 828         pileup     : median pileup in region          # could be 0
 829         pscore     : median pvalue score in region    # could be 0
 830         fold_change: median fold change in region     # could be 0
 831         qscore     : median pvalue score in region    # could be 0
 832         name       : peak name                        # could be b'NA'
 833         """
 834         if not self.peaks.has_key(chromosome):
 835             self.peaks[chromosome] = []
 836         self.peaks[chromosome].append( BroadPeakContent( start, end, score, thickStart, thickEnd,
 837                                                          blockNum, blockSizes, blockStarts,
 838                                                          pileup, pscore, fold_change, qscore, name ) )
 839
 840     def filter_pscore (self, double pscore_cut ):
 841         cdef:
 842             bytes chrom
 843             dict peaks
 844             dict new_peaks
 845             list chrs
 846             BroadPeakContent p
 847
 848         peaks = self.peaks
 849         new_peaks = {}
 850         chrs = sorted(list(peaks.keys()))
 851
 852         for chrom in chrs:
 853             new_peaks[chrom]=[p for p in peaks[chrom] if p['pscore'] >= pscore_cut]
 854         self.peaks = new_peaks
 855
 856     def filter_qscore (self, double qscore_cut ):
 857         cdef:
 858             bytes chrom
 859             dict peaks
 860             dict new_peaks
 861             list chrs
 862             BroadPeakContent p
 863
 864         peaks = self.peaks
 865         new_peaks = {}
 866         chrs = sorted(list(peaks.keys()))
 867
 868         for chrom in chrs:
 869             new_peaks[chrom]=[p for p in peaks[chrom] if p['qscore'] >= qscore_cut]
 870         self.peaks = new_peaks
 871
 872     def filter_fc (self, fc_low, fc_up=None ):
 873         """Filter peaks in a given fc range.
 874
 875         If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up)
 876
 877         """
 878         cdef:
 879             bytes chrom
 880             dict peaks
 881             dict new_peaks
 882             list chrs
 883             BroadPeakContent p
 884
 885         peaks = self.peaks
 886         new_peaks = {}
 887         chrs = list(peaks.keys())
 888         chrs.sort()
 889         if fc_up:
 890             for chrom in chrs:
 891                 new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low and p['fc']<fc_up]
 892         else:
 893             for chrom in chrs:
 894                 new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low]
 895         self.peaks = new_peaks
 896
 897     def total (self):
 898         cdef:
 899             bytes chrom
 900             dict peaks
 901             list chrs
 902             long x
 903
 904         peaks = self.peaks
 905         chrs = list(peaks.keys())
 906         chrs.sort()
 907         x = 0
 908         for chrom in chrs:
 909             x += len(peaks[chrom])
 910         return x
 911
 912     def write_to_gappedPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True):
 913         """Print out peaks in gappedBed format. Only those with stronger enrichment regions are saved.
 914
 915         This format is basically a BED12+3 format.
 916
 917         +--------------+------+----------------------------------------+
 918         |field         |type  |description                             |
 919         +--------------+------+----------------------------------------+
 920         |chrom         |string|Name of the chromosome                  |
 921         +--------------+------+----------------------------------------+
 922         |chromStart    |int   |The starting position of the feature in |
 923         |              |      |the chromosome. The first base in a     |
 924         |              |      |chromosome is numbered 0.               |
 925         +--------------+------+----------------------------------------+
 926         |chromEnd      |int   |The ending position of the feature in   |
 927         |              |      |the chromosome or scaffold. The chromEnd|
 928         |              |      |base is not included in the display of  |
 929         |              |      |the feature.  For example, the first 100|
 930         |              |      |bases of a chromosome are defined as    |
 931         |              |      |chromStart=0, chromEnd=100, and span the|
 932         |              |      |bases numbered 0-99.                    |
 933         +--------------+------+----------------------------------------+
 934         |name          |string|Name given to a region (preferably      |
 935         |              |      |unique). Use '.' if no name is assigned.|
 936         +--------------+------+----------------------------------------+
 937         |score         |int   |Indicates how dark the peak will be     |
 938         |(always use   |      |displayed in the browser (1-1000). If   |
 939         |1000 for      |      |'0', the DCC will assign this based on  |
 940         |the           |      |signal value. Ideally average           |
 941         |thickest      |      |signalValue per base spread between     |
 942         |color)        |      |100-1000.                               |
 943         +--------------+------+----------------------------------------+
 944         |strand        |char  |+/- to denote strand or orientation     |
 945         |(always .)    |      |(whenever applicable). Use '.' if no    |
 946         |              |      |orientation is assigned.                |
 947         +--------------+------+----------------------------------------+
 948         |thickStart    |int   | The starting position at which the     |
 949         |              |      |feature is drawn thickly. Mark the start|
 950         |              |      |of highly enriched regions.             |
 951         |              |      |                                        |
 952         +--------------+------+----------------------------------------+
 953         |thickEnd      |int   | The ending position at which the       |
 954         |              |      |feature is drawn thickly. Mark the end  |
 955         |              |      |of highly enriched regions.             |
 956         +--------------+------+----------------------------------------+
 957         |itemRGB       |string| Not used. Set it as 0.                 |
 958         +--------------+------+----------------------------------------+
 959         |blockCounts   |int   | The number of blocks (exons) in the BED|
 960         |              |      |line.                                   |
 961         +--------------+------+----------------------------------------+
 962         |blockSizes    |string| A comma-separated list of the block    |
 963         |              |      |sizes.                                  |
 964         +--------------+------+----------------------------------------+
 965         |blockStarts   |string| A comma-separated list of block starts.|
 966         +--------------+------+----------------------------------------+
 967         |signalValue   |float |Measurement of overall (usually,        |
 968         |(fc)          |      |average) enrichment for the region.     |
 969         +--------------+------+----------------------------------------+
 970         |pValue        |float |Measurement of statistical signficance  |
 971         |              |      |(-log10). Use -1 if no pValue is        |
 972         |              |      |assigned.                               |
 973         +--------------+------+----------------------------------------+
 974         |qValue        |float |Measurement of statistical significance |
 975         |              |      |using false discovery rate. Use -1 if no|
 976         |              |      |qValue is assigned.                     |
 977         +--------------+------+----------------------------------------+
 978
 979         """
 980         chrs = list(self.peaks.keys())
 981         chrs.sort()
 982         n_peak = 0
 983         try: peakprefix = name_prefix % name
 984         except: peakprefix = name_prefix
 985         try: desc = description % name
 986         except: desc = description
 987         if trackline:
 988             fhd.write("track name=\"%s\" description=\"%s\" type=gappedPeak nextItemButton=on\n" % (name.decode(), desc.decode()) )
 989         for chrom in chrs:
 990             for peak in self.peaks[chrom]:
 991                 n_peak += 1
 992                 if peak["thickStart"] != b".":
 993                     fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n"
 994                                %
 995                                (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]),
 996                                 peak["thickStart"].decode(),peak["thickEnd"].decode(),
 997                                 peak["blockNum"],peak["blockSizes"].decode(),peak["blockStarts"].decode(), peak['fc'], peak['pscore'], peak['qscore'] ) )
 998
 999     def write_to_Bed12 (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True):
1000         """Print out peaks in Bed12 format.
1001
1002         +--------------+------+----------------------------------------+
1003         |field         |type  |description                             |
1004         +--------------+------+----------------------------------------+
1005         |chrom         |string|Name of the chromosome                  |
1006         +--------------+------+----------------------------------------+
1007         |chromStart    |int   |The starting position of the feature in |
1008         |              |      |the chromosome. The first base in a     |
1009         |              |      |chromosome is numbered 0.               |
1010         +--------------+------+----------------------------------------+
1011         |chromEnd      |int   |The ending position of the feature in   |
1012         |              |      |the chromosome or scaffold. The chromEnd|
1013         |              |      |base is not included in the display of  |
1014         |              |      |the feature.  For example, the first 100|
1015         |              |      |bases of a chromosome are defined as    |
1016         |              |      |chromStart=0, chromEnd=100, and span the|
1017         |              |      |bases numbered 0-99.                    |
1018         +--------------+------+----------------------------------------+
1019         |name          |string|Name given to a region (preferably      |
1020         |              |      |unique). Use '.' if no name is assigned.|
1021         +--------------+------+----------------------------------------+
1022         |score         |int   |Indicates how dark the peak will be     |
1023         |(always use   |      |displayed in the browser (1-1000). If   |
1024         |1000 for      |      |'0', the DCC will assign this based on  |
1025         |the           |      |signal value. Ideally average           |
1026         |thickest      |      |signalValue per base spread between     |
1027         |color)        |      |100-1000.                               |
1028         +--------------+------+----------------------------------------+
1029         |strand        |char  |+/- to denote strand or orientation     |
1030         |(always .)    |      |(whenever applicable). Use '.' if no    |
1031         |              |      |orientation is assigned.                |
1032         +--------------+------+----------------------------------------+
1033         |thickStart    |int   | The starting position at which the     |
1034         |              |      |feature is drawn thickly. Mark the start|
1035         |              |      |of highly enriched regions.             |
1036         |              |      |                                        |
1037         +--------------+------+----------------------------------------+
1038         |thickEnd      |int   | The ending position at which the       |
1039         |              |      |feature is drawn thickly. Mark the end  |
1040         |              |      |of highly enriched regions.             |
1041         +--------------+------+----------------------------------------+
1042         |itemRGB       |string| Not used. Set it as 0.                 |
1043         +--------------+------+----------------------------------------+
1044         |blockCounts   |int   | The number of blocks (exons) in the BED|
1045         |              |      |line.                                   |
1046         +--------------+------+----------------------------------------+
1047         |blockSizes    |string| A comma-separated list of the block    |
1048         |              |      |sizes.                                  |
1049         +--------------+------+----------------------------------------+
1050         |blockStarts   |string| A comma-separated list of block starts.|
1051         +--------------+------+----------------------------------------+
1052
1053         """
1054         chrs = list(self.peaks.keys())
1055         chrs.sort()
1056         n_peak = 0
1057         try: peakprefix = name_prefix % name
1058         except: peakprefix = name_prefix
1059         try: desc = description % name
1060         except: desc = description
1061         if trackline:
1062             fhd.write("track name=\"%s\" description=\"%s\" type=bed nextItemButton=on\n" % (name.decode(), desc.decode()) )
1063         for chrom in chrs:
1064             for peak in self.peaks[chrom]:
1065                 n_peak += 1
1066                 if peak["thickStart"] == b".":
1067                     # this will violate gappedPeak format, since it's a complement like broadPeak line.
1068                     fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\n"
1069                                %
1070                                (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]) ) )
1071                 else:
1072                     fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\n"
1073                                %
1074                                (chrom.decode(), peak["start"], peak["end"], peakprefix.decode(), n_peak, int(10*peak[score_column]),
1075                                 peak["thickStart"].decode(), peak["thickEnd"].decode(),
1076                                 peak["blockNum"], peak["blockSizes"].decode(), peak["blockStarts"].decode() ))
1077
1078
1079     def write_to_broadPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True):
1080         """Print out peaks in broadPeak format.
1081
1082         This format is designed for ENCODE project, and basically a
1083         BED6+3 format.
1084
1085         +-----------+------+----------------------------------------+
1086         |field      |type  |description                             |
1087         +-----------+------+----------------------------------------+
1088         |chrom      |string|Name of the chromosome                  |
1089         +-----------+------+----------------------------------------+
1090         |chromStart |int   |The starting position of the feature in |
1091         |           |      |the chromosome. The first base in a     |
1092         |           |      |chromosome is numbered 0.               |
1093         +-----------+------+----------------------------------------+
1094         |chromEnd   |int   |The ending position of the feature in   |
1095         |           |      |the chromosome or scaffold. The chromEnd|
1096         |           |      |base is not included in the display of  |
1097         |           |      |the feature.  For example, the first 100|
1098         |           |      |bases of a chromosome are defined as    |
1099         |           |      |chromStart=0, chromEnd=100, and span the|
1100         |           |      |bases numbered 0-99.                    |
1101         +-----------+------+----------------------------------------+
1102         |name       |string|Name given to a region (preferably      |
1103         |           |      |unique). Use '.' if no name is assigned.|
1104         +-----------+------+----------------------------------------+
1105         |score      |int   |Indicates how dark the peak will be     |
1106         |(-logqvalue|      |displayed in the browser (1-1000). If   |
1107         |in MACS3 * |      |'0', the DCC will assign this based on  |
1108         |10)        |      |signal value. Ideally average           |
1109         |           |      |signalValue per base spread between     |
1110         |           |      |100-1000.                               |
1111         +-----------+------+----------------------------------------+
1112         |strand     |char  |+/- to denote strand or orientation     |
1113         |(always .) |      |(whenever applicable). Use '.' if no    |
1114         |           |      |orientation is assigned.                |
1115         +-----------+------+----------------------------------------+
1116         |signalValue|float |Measurement of overall (usually,        |
1117         |(fc)       |      |average) enrichment for the region.     |
1118         +-----------+------+----------------------------------------+
1119         |pValue     |float |Measurement of statistical signficance  |
1120         |           |      |(-log10). Use -1 if no pValue is        |
1121         |           |      |assigned.                               |
1122         +-----------+------+----------------------------------------+
1123         |qValue     |float |Measurement of statistical significance |
1124         |           |      |using false discovery rate. Use -1 if no|
1125         |           |      |qValue is assigned.                     |
1126         +-----------+------+----------------------------------------+
1127
1128         """
1129         cdef int n_peak
1130         cdef bytes chrom
1131         cdef long s
1132         cdef str peakname
1133
1134         chrs = list(self.peaks.keys())
1135         chrs.sort()
1136         n_peak = 0
1137         write = fhd.write
1138         try: peakprefix = name_prefix % name
1139         except: peakprefix = name_prefix
1140         if trackline:
1141             write("track type=broadPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name.decode(), name.decode()))
1142         for chrom in chrs:
1143             for end, group in groupby(self.peaks[chrom], key=itemgetter("end")):
1144                 n_peak += 1
1145                 these_peaks = list(group)
1146                 peak = these_peaks[0]
1147                 peakname = "%s%d" % (peakprefix.decode(), n_peak)
1148                 fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\n" %
1149                            (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]),
1150                             peak['fc'],peak['pscore'],peak['qscore'] ) )
1151         return
1152
1153
1154     def write_to_xls (self, ofhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS"):
1155         """Save the peak results in a tab-delimited plain text file
1156         with suffix .xls.
1157
1158
1159         wait... why I have two write_to_xls in this class?
1160
1161         """
1162         write = ofhd.write
1163         write("\t".join(("chr","start", "end",  "length",  "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n")
1164
1165         try: peakprefix = name_prefix % name
1166         except: peakprefix = name_prefix
1167
1168         peaks = self.peaks
1169         chrs = list(peaks.keys())
1170         chrs.sort()
1171         n_peak = 0
1172         for chrom in chrs:
1173             for end, group in groupby(peaks[chrom], key=itemgetter("end")):
1174                 n_peak += 1
1175                 these_peaks = list(group)
1176                 peak = these_peaks[0]
1177                 peakname = "%s%d" % (peakprefix.decode(), n_peak)
1178                 write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length']))
1179                 write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit
1180                 write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit
1181                 write("\t%.6g" % (peak['fc'])) # fold change at summit
1182                 write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit
1183                 write("\t%s" % peakname)
1184                 write("\n")
1185         return