MACS3/Commands/hmmratac_cmd.py

   1 # Time-stamp: <2024-10-08 10:53:48 Tao Liu>
   2
   3
   4 """Description: Main HMMR command
   5
   6 This code is free software; you can redistribute it and/or modify it
   7 under the terms of the BSD License (see the file LICENSE included with
   8 the distribution).
   9 """
  10
  11 # ------------------------------------
  12 # python modules
  13 # ------------------------------------
  14
  15 import os
  16 import sys
  17 import gc
  18 import numpy as np
  19 import tempfile
  20
  21 # ------------------------------------
  22 # own python modules
  23 # ------------------------------------
  24 # from MACS3.Utilities.Constants import *
  25 from MACS3.Utilities.OptValidator import opt_validate_hmmratac
  26 from MACS3.IO.PeakIO import PeakIO
  27 from MACS3.IO.Parser import BAMPEParser, BEDPEParser  # BAMaccessor
  28 from MACS3.Signal.HMMR_EM import HMMR_EM
  29 from MACS3.Signal.HMMR_Signal_Processing import (generate_weight_mapping,
  30                                                  generate_digested_signals,
  31                                                  extract_signals_from_regions)
  32 from MACS3.Signal.HMMR_HMM import (hmm_training,
  33                                    hmm_predict,
  34                                    hmm_model_init,
  35                                    hmm_model_save)
  36 from MACS3.Signal.Region import Regions
  37 from MACS3.IO.BedGraphIO import bedGraphIO
  38
  39 # ------------------------------------
  40 # constants
  41 # ------------------------------------
  42
  43 # ------------------------------------
  44 # Misc functions
  45 # ------------------------------------
  46
  47 # ------------------------------------
  48 # Main function
  49 # ------------------------------------
  50
  51
  52 def run(args):
  53     """The HMMRATAC function/pipeline for MACS.
  54
  55     """
  56     options = opt_validate_hmmratac(args)
  57
  58     #############################################
  59     # 0. Names of output files
  60     #############################################
  61     short_bdgfile = os.path.join(options.outdir,
  62                                  options.name+"_digested_short.bdg")
  63     mono_bdgfile = os.path.join(options.outdir,
  64                                 options.name+"_digested_mono.bdg")
  65     di_bdgfile = os.path.join(options.outdir,
  66                               options.name+"_digested_di.bdg")
  67     tri_bdgfile = os.path.join(options.outdir,
  68                                options.name+"_digested_tri.bdg")
  69     training_region_bedfile = os.path.join(options.outdir,
  70                                            options.name+"_training_regions.bed")
  71     training_datafile = os.path.join(options.outdir,
  72                                      options.name+"_training_data.txt")
  73     training_datalengthfile = os.path.join(options.outdir,
  74                                            options.name+"_training_lengths.txt")
  75
  76     hmm_modelfile = os.path.join(options.outdir,
  77                                  options.name+"_model.json")
  78
  79     open_state_bdgfile = os.path.join(options.outdir,
  80                                       options.name+"_open.bdg")
  81     nuc_state_bdgfile = os.path.join(options.outdir,
  82                                      options.name+"_nuc.bdg")
  83     bg_state_bdgfile = os.path.join(options.outdir,
  84                                     options.name+"_bg.bdg")
  85
  86     states_file = os.path.join(options.outdir,
  87                                options.name+"_states.bed")
  88
  89     accessible_file = os.path.join(options.outdir,
  90                                    options.name+"_accessible_regions.narrowPeak")
  91
  92     cutoffanalysis_file = os.path.join(options.outdir,
  93                                        options.name+"_cutoff_analysis.tsv")
  94
  95     #############################################
  96     # 1. Read the input files
  97     #############################################
  98     options.info("\n" + options.argtxt)
  99
 100     if options.format == "BAMPE":
 101         options.info("#1 Read fragments from BAMPE file...")
 102         parser = BAMPEParser
 103     elif options.format == "BEDPE":
 104         options.info("#1 Read fragments from BEDPE file...")
 105         parser = BEDPEParser
 106     else:
 107         raise Exception("wrong format")
 108
 109     alignment = parser(options.input_file[0], buffer_size=options.buffer_size)
 110     petrack = alignment.build_petrack()
 111     if len(options.input_file) > 1:
 112         # multiple input
 113         for inputfile in options.input_file[1:]:
 114             alignment = parser(inputfile, buffer_size=options.buffer_size)
 115             petrack = alignment.append_petrack(petrack)
 116     # remember to finalize the petrack
 117     petrack.finalize()
 118
 119     # filter duplicates if needed
 120     if options.misc_keep_duplicates:
 121         petrack.filter_dup(maxnum=1)
 122
 123     # read in blacklisted if option entered
 124     if options.blacklist:
 125         options.info("#  Read blacklist file...")
 126         peakio = open(options.blacklist)
 127         blacklist = PeakIO()
 128         i = 0
 129         for l_p in peakio:
 130             fs = l_p.rstrip().split()
 131             i += 1
 132             blacklist.add(fs[0].encode(),
 133                           int(fs[1]),
 134                           int(fs[2]),
 135                           name=b"%d" % i)
 136             blacklist.sort()
 137         blacklist_regions = Regions()
 138         blacklist_regions.init_from_PeakIO(blacklist)
 139
 140     #############################################
 141     # 2. EM
 142     #############################################
 143     if options.em_skip:
 144         # Skip EM and use the options.em_means and options.em_stddevs
 145         em_means = options.em_means
 146         em_stddevs = options.em_stddevs
 147         options.info("#2 EM is skipped. The following means and stddevs will be used:")
 148     else:
 149         # we will use EM to get the best means/stddevs for the mono-, di- and tri- modes of fragment sizes
 150         options.info("#2 Use EM algorithm to estimate means and stddevs of fragment lengths")
 151         options.info("#  for mono-, di-, and tri-nucleosomal signals...")
 152         em_trainer = HMMR_EM(petrack, options.em_means[1:4],
 153                              options.em_stddevs[1:4], seed=options.hmm_randomSeed)
 154         # the mean and stddev after EM training
 155         em_means = [options.em_means[0],]
 156         em_means.extend(em_trainer.fragMeans)
 157         em_stddevs = [options.em_stddevs[0],]
 158         em_stddevs.extend(em_trainer.fragStddevs)
 159         # we will round to 1 decimal digit
 160         for i in range(len(em_means)):
 161             em_means[i] = round(em_means[i], 1)
 162         for i in range(len(em_stddevs)):
 163             em_stddevs[i] = round(em_stddevs[i], 1)
 164         options.info("#  The means and stddevs after EM:")
 165
 166     options.info("#                    {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
 167     options.info("#             means: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_means))
 168     options.info("#           stddevs: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_stddevs))
 169
 170     # to finalize the EM training, we will decompose ATAC-seq into four signal tracks
 171     options.info("#  Compute the weights for each fragment length for each of the four signal types")
 172     fl_dict = petrack.count_fraglengths()
 173     fl_list = list(fl_dict.keys())
 174     fl_list.sort()
 175
 176     # now we will prepare the weights for each fragment length for
 177     # each of the four distributions based on the EM results
 178     weight_mapping = generate_weight_mapping(fl_list, em_means, em_stddevs,
 179                                              min_frag_p=options.min_frag_p)
 180
 181     options.info("#  Generate short, mono-, di-, and tri-nucleosomal signals")
 182     digested_atac_signals = generate_digested_signals(petrack, weight_mapping)
 183
 184     # save three types of signals if needed
 185     if options.save_digested:
 186         bdgshort = bedGraphIO(short_bdgfile, data=digested_atac_signals[0])
 187         bdgshort.write_bedGraph("short", "short")
 188
 189         bdgmono = bedGraphIO(mono_bdgfile, data=digested_atac_signals[1])
 190         bdgmono.write_bedGraph("mono", "mono")
 191
 192         bdgdi = bedGraphIO(di_bdgfile, data=digested_atac_signals[2])
 193         bdgdi.write_bedGraph("di", "di")
 194
 195         bdgtri = bedGraphIO(tri_bdgfile, data=digested_atac_signals[3])
 196         bdgtri.write_bedGraph("tri", "tri")
 197
 198     minlen = int(petrack.average_template_length)
 199     # if options.pileup_short is on, we pile up only the short
 200     #  fragments to identify training regions and to prescan for
 201     #  candidate regions for decoding.
 202     if options.pileup_short:
 203         options.info("#  Pile up ONLY short fragments")
 204         fc_bdg = digested_atac_signals[0]
 205     else:
 206         options.info("#  Pile up all fragments")
 207         fc_bdg = petrack.pileup_bdg([1.0, ], baseline_value=0)
 208     (sum_v, n_v, max_v, min_v, mean_v, std_v) = fc_bdg.summary()
 209     options.info("#  Convert pileup to fold-change over average signal")
 210     fc_bdg.apply_func(lambda x: x/mean_v)
 211
 212     # if cutoff_analysis only, generate and save the report and quit
 213     if options.cutoff_analysis_only:
 214         # we will run cutoff analysis only and quit
 215         options.info(f"#3 Generate cutoff analysis report from {petrack.total} fragments")
 216         options.info(f"#   Please review the cutoff analysis result in {cutoffanalysis_file}")
 217
 218         # Let MACS3 do the cutoff analysis to help decide the lower
 219         # and upper cutoffs
 220         with open(cutoffanalysis_file, "w") as ofhd_cutoff:
 221             ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen,
 222                                                      max_gap=options.hmm_training_flanking,
 223                                                      max_score=options.cutoff_analysis_max,
 224                                                      steps=options.cutoff_analysis_steps))
 225         # raise Exception("Cutoff analysis only.")
 226         sys.exit(1)
 227
 228     #############################################
 229     # 3. Define training set by peak calling
 230     #############################################
 231
 232     if options.hmm_file:
 233         # skip this step if hmm_file is given
 234         options.info("#3 Skip this step of looking for training set since a Hidden Markov Model file has been provided!")
 235     elif options.hmm_training_regions:
 236         # if a training region file is provided - need to read in the bedfile and skip the peak calling step
 237         options.info(f"#3 Read training regions from BED file: {options.hmm_training_regions}")
 238         # from refinepeak_cmd.py:
 239         peakio = open(options.hmm_training_regions, "rb")
 240         peaks = PeakIO()
 241         for l_p in peakio:
 242             fs = l_p.rstrip().split()
 243             peaks.add(chromosome=fs[0], start=int(fs[1]), end=int(fs[2]))  # change based on what expected input file should contain
 244         peakio.close()
 245         training_regions = Regions()
 246         training_regions.init_from_PeakIO(peaks)
 247         options.info("#  Training regions have been read from bedfile")
 248     else:
 249         # Find regions with fold change within determined range to use as training sites.
 250         # Find regions with zscore values above certain cutoff to exclude from viterbi.
 251         #
 252         options.info(f"#3 Look for training set from {petrack.total} fragments")
 253         options.info(f"#  Call peak above within fold-change range of {options.hmm_lower} and {options.hmm_upper}.")
 254         options.info(f"#   The minimum length of the region is set as the average template/fragment length in the dataset: {minlen}")
 255         options.info(f"#   The maximum gap to merge nearby significant regions is set as the flanking size to extend training regions: {options.hmm_training_flanking}")
 256         peaks = fc_bdg.call_peaks(cutoff=options.hmm_lower, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False)
 257         options.info(f"#  Total training regions called after applying the lower cutoff {options.hmm_lower}: {peaks.total}")
 258         peaks.filter_score(options.hmm_lower, options.hmm_upper)
 259         options.info(f"#  Total training regions after filtering with upper cutoff {options.hmm_upper}: {peaks.total}")
 260
 261         options.info( "#  **IMPORTANT**")
 262         options.info(f"#  Please review the cutoff analysis result in {cutoffanalysis_file} to verify")
 263         options.info( "#   if the choices of lower, upper and prescanning cutoff are appropriate.")
 264         options.info( "#   Please read the message in the section 'Choices of cutoff values' by running")
 265         options.info( "#   `macs3 hmmratac -h` for detail.")
 266         options.info( "#  ****")
 267
 268         # Let MACS3 do the cutoff analysis to help decide the lower and upper cutoffs
 269         with open(cutoffanalysis_file, "w") as ofhd_cutoff:
 270             ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen, max_gap=options.hmm_training_flanking, max_score=options.cutoff_analysis_max))
 271
 272         # we will check if anything left after filtering
 273         if peaks.total > options.hmm_maxTrain:
 274             peaks = peaks.randomly_pick(options.hmm_maxTrain, seed=options.hmm_randomSeed)
 275             options.info(f"#  We randomly pick {options.hmm_maxTrain} regions for training")
 276         elif peaks.total == 0:
 277             options.error("# No training regions found. Please adjust the lower or upper cutoff.")
 278             raise Exception("Not enough training regions!")
 279
 280         # Now we convert PeakIO to Regions and filter blacklisted regions
 281         training_regions = Regions()
 282         training_regions.init_from_PeakIO(peaks)
 283         # We will expand the regions to both directions and merge overlap
 284         options.info(f"#  We expand the training regions with {options.hmm_training_flanking} basepairs and merge overlap")
 285         training_regions.expand(options.hmm_training_flanking)
 286         training_regions.merge_overlap()
 287
 288         # remove peaks overlapping with blacklisted regions
 289         if options.blacklist:
 290             training_regions.exclude(blacklist_regions)
 291             options.info(f"#  after removing those overlapping with provided blacklisted regions, we have {training_regions.total} left")
 292         if options.save_train:
 293             fhd = open(training_region_bedfile, "w")
 294             training_regions.write_to_bed(fhd)
 295             fhd.close()
 296             options.info(f"#  Training regions have been saved to `{options.name}_training_regions.bed` ")
 297
 298     #############################################
 299     # 4. Train HMM
 300     #############################################
 301     # if model file is provided, we skip this step
 302     # include options.hmm_type and make it backwards compatible, if no hmm_type default is gaussian
 303     if options.hmm_file:
 304         options.info("#4 Load Hidden Markov Model from given model file")
 305         hmm_model, i_open_region, i_background_region, i_nucleosomal_region, options.hmm_binsize, options.hmm_type = hmm_model_init(options.hmm_file)
 306     else:
 307         options.info("#4 Train Hidden Markov Model with Multivariate Gaussian Emission")
 308
 309         # extract signals within peak using the given binsize
 310         options.info(f"#  Extract signals in training regions with bin size of {options.hmm_binsize}")
 311         [training_bins, training_data, training_data_lengths] = extract_signals_from_regions(digested_atac_signals, training_regions, binsize=options.hmm_binsize, hmm_type=options.hmm_type)
 312
 313         if options.save_train:
 314             f = open(training_datafile, "w")
 315             for i in range(len(training_data)):
 316                 v = training_data[i]
 317                 p = training_bins[i]
 318                 f.write(f"{p[0]}\t{p[1]}\t{v[0]}\t{v[1]}\t{v[2]}\t{v[3]}\n")
 319             f.close()
 320
 321             f = open(training_datalengthfile, "w")
 322             for v in training_data_lengths:
 323                 f.write(f"{v}\n")
 324             f.close()
 325
 326         options.info("#  Use Baum-Welch algorithm to train the HMM")
 327
 328         hmm_model = hmm_training(training_data,
 329                                  training_data_lengths,
 330                                  random_seed=options.hmm_randomSeed,
 331                                  hmm_type=options.hmm_type)
 332
 333         options.info(f"#   HMM converged: {hmm_model.monitor_.converged}")
 334
 335         # label hidden states
 336         if options.hmm_type == "gaussian":
 337             means_sum = np.sum(hmm_model.means_, axis=1)
 338         if options.hmm_type == "poisson":
 339             means_sum = np.sum(hmm_model.lambdas_, axis=1)
 340
 341         # first, the state with the highest overall emission is the open state
 342         i_open_region = np.where(means_sum == max(means_sum))[0][0]
 343
 344         # second, the state with lowest overall emission is the bg state
 345         i_background_region = np.where(means_sum == min(means_sum))[0][0]
 346
 347         # last one is the nuc state (note it may not be accurate though
 348         i_nucleosomal_region = list(set([0, 1, 2]) - set([i_open_region, i_background_region]))[0]
 349
 350         # write hmm into model file
 351         options.info(f"#  Write HMM parameters into JSON: {hmm_modelfile}")
 352         hmm_model_save(hmm_modelfile, hmm_model, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region, options.hmm_type)
 353
 354         # if --modelonly option provided, exit script after hmm model is saved
 355         if options.hmm_modelonly:
 356             options.info("#  Complete - HMM model was saved, program exited (--modelonly option was provided) ")
 357             sys.exit()
 358
 359     # Now tell users the parameters of the HMM
 360     assignments = ["", "", ""]
 361     assignments[i_open_region] = "open"
 362     assignments[i_nucleosomal_region] = "nuc"
 363     assignments[i_background_region] = "bg"
 364
 365     options.info(f"#  The Hidden Markov Model for signals of binsize of {options.hmm_binsize} basepairs:")
 366     options.info(f"#   open state index: state{i_open_region}")
 367     options.info(f"#   nucleosomal state index: state{i_nucleosomal_region}")
 368     options.info(f"#   background state index: state{i_background_region}")
 369     options.info( "#   Starting probabilities of states:")
 370     options.info( "#                    {0[0]:>10s} {0[1]:>10s} {0[2]:>10s}".format(assignments))
 371     options.info( "#                    {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g}".format(hmm_model.startprob_))
 372     options.info( "#   HMM Transition probabilities:")
 373     options.info( "#                    {0[0]:>10s} {0[1]:>10s} {0[2]:>10s}".format(assignments))
 374     options.info( "#       {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[0], hmm_model.transmat_[0]))
 375     options.info( "#       {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[1], hmm_model.transmat_[1]))
 376     options.info( "#       {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[2], hmm_model.transmat_[2]))
 377
 378     if options.hmm_type == 'gaussian':
 379         options.info("#   HMM Emissions (means): ")
 380         options.info( "#                    {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
 381         options.info( "#       {0:>10s}:  {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.means_[0]))
 382         options.info( "#       {0:>10s}:  {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.means_[1]))
 383         options.info( "#       {0:>10s}:  {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.means_[2]))
 384     if options.hmm_type == 'poisson':
 385         options.info( "#   HMM Emissions (lambdas): ")
 386         options.info( "#                    {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
 387         options.info( "#       {0:>10s}:  {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.lambdas_[0]))
 388         options.info( "#       {0:>10s}:  {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.lambdas_[1]))
 389         options.info( "#       {0:>10s}:  {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.lambdas_[2]))
 390
 391
 392 #############################################
 393 # 5. Predict
 394 #############################################
 395     # Our prediction strategy will be different with HMMRATAC, we will first ask MACS call peaks with loose cutoff, then for each peak we will run HMM prediction to figure out labels. And for the rest of genomic regions, just mark them as 'background'.
 396     options.info("#5 Decode with Viterbi to predict states")
 397     # the following /4 is totally arbitrary, we may need to fix it
 398     candidate_peaks = fc_bdg.call_peaks(cutoff=options.prescan_cutoff, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False)
 399     options.info(f"#5  Total candidate peaks : {candidate_peaks.total}")
 400
 401     # Now we convert PeakIO to Regions and filter blacklisted regions
 402     candidate_regions = Regions()
 403     candidate_regions.init_from_PeakIO(candidate_peaks)
 404     # We will expand the regions to both directions and merge overlap
 405     options.info(f"#  We expand the candidate regions with {options.hmm_training_flanking} and merge overlap")
 406     candidate_regions.expand(options.hmm_training_flanking)
 407     candidate_regions.merge_overlap()
 408     options.info(f"#   after expanding and merging, we have {candidate_regions.total} candidate regions")
 409
 410     # remove peaks overlapping with blacklisted regions
 411     if options.blacklist:
 412         candidate_regions.exclude(blacklist_regions)
 413         options.info(f"#   after removing those overlapping with provided blacklisted regions, we have {candidate_regions.total} left")
 414
 415     # extract signals
 416     options.info("#  Extract signals in candidate regions and decode with HMM")
 417     # we will do the extraction and prediction in a step of 10000 regions by default
 418
 419     # Note: we can implement in a different way to extract then predict for each candidate region.
 420     # predicted_results = hmm_decode_each_region (digested_atac_signals, candidate_regions, hmm_model, binsize = options.hmm_binsize)
 421     # Note: we implement in a way that we will decode the candidate regions 10000 regions at a time so 1. we can make it running in parallel in the future; 2. we can reduce the memory usage.
 422     options.info("#  Use HMM to predict states")
 423     n = 0
 424
 425     # we create a temporary file to save the proba predicted from hmm
 426     predicted_proba_file = tempfile.TemporaryFile(mode="w+b")
 427
 428     while candidate_regions.total != 0:
 429         n += 1
 430         # we get DECODING_STEPS number of candidate regions first
 431         cr = candidate_regions.pop(options.decoding_steps)
 432         options.info("#    decoding %d..." % (n * options.decoding_steps))
 433
 434         # then extrac data from digested signals, create cr_bins, cr_data, and cr_data_lengths
 435         [cr_bins, cr_data, cr_data_lengths] = extract_signals_from_regions(digested_atac_signals, cr, binsize=options.hmm_binsize, hmm_type=options.hmm_type)
 436         # options.debug("#     extract_signals_from_regions complete")
 437
 438         prob_data = hmm_predict(cr_data, cr_data_lengths, hmm_model)
 439         assert len(prob_data) == len(cr_bins)
 440         for i in range(len(prob_data)):
 441             predicted_proba_file.write(b"%s,%d" % cr_bins[i])
 442             predicted_proba_file.write(b",%f,%f,%f\n" % tuple(prob_data[i]))
 443
 444         cr_data = []
 445         cr_data_lengths = []
 446         cr_bins = []
 447         prob_data = []
 448         gc.collect()
 449
 450     predicted_proba_file.seek(0)  # reset
 451     options.info("# predicted_proba files written...")
 452
 453 #############################################
 454 # 6. Output - add to OutputWriter
 455 #############################################
 456     options.info("# Write the output...")
 457     # Now taken the candidate_bins and predicted_proba, we can generate various
 458     # outputs
 459
 460     # One thing to remember about candidate_bins is that the position
 461     # in this array is the 'end' of the bin, the actual region is the
 462     # 'end'-'binsize' to the 'end'.
 463
 464     # First, the likelihoods for each of the three states in a bedGraph
 465     if options.save_likelihoods:
 466         options.info(f"# Write the likelihoods for each states into three bedGraph files {options.name}_open.bdg, {options.name}_nuc.bdg, and {options.name}_bg.bdg")
 467         open_state_bdg_fhd = open(open_state_bdgfile, "w")
 468         nuc_state_bdg_fhd = open(nuc_state_bdgfile, "w")
 469         bg_state_bdg_fhd = open(bg_state_bdgfile, "w")
 470         save_proba_to_bedGraph(predicted_proba_file, options.hmm_binsize, open_state_bdg_fhd, nuc_state_bdg_fhd, bg_state_bdg_fhd, i_open_region, i_nucleosomal_region, i_background_region)
 471         predicted_proba_file.seek(0)  # reset
 472         open_state_bdg_fhd.close()
 473         nuc_state_bdg_fhd.close()
 474         bg_state_bdg_fhd.close()
 475         options.info("# finished writing proba_to_bedgraph")
 476
 477     # # Generate states path:
 478     states_path = generate_states_path(predicted_proba_file, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region)
 479     options.info("# finished generating states path")
 480     predicted_proba_file.close()          # kill the temp file
 481     # Save states path if needed
 482     # PS: we need to implement extra feature to include those regions NOT in candidate_bins and assign them as 'background state'.
 483     if options.save_states:
 484         options.info(f"# Write states assignments in a BED file: {options.name}_states.bed")
 485         with open(states_file, "w") as f:
 486             save_states_bed(states_path, f)
 487
 488     options.info(f"# Write accessible regions in a narrowPeak file: {options.name}_accessible_regions.narrowPeak")
 489     with open(accessible_file, "w") as ofhd:
 490         save_accessible_regions(states_path, ofhd, options.openregion_minlen, fc_bdg)
 491
 492     options.info("# Finished")
 493
 494
 495 def save_proba_to_bedGraph(predicted_proba_file, binsize, open_state_bdg_file, nuc_state_bdg_file, bg_state_bdg_file, i_open, i_nuc, i_bg):
 496
 497     open_state_bdg_file = bedGraphIO(open_state_bdg_file)
 498     nuc_state_bdg_file = bedGraphIO(nuc_state_bdg_file)
 499     bg_state_bdg_file = bedGraphIO(bg_state_bdg_file)
 500
 501     open_state_bdg = open_state_bdg_file.data
 502     nuc_state_bdg = nuc_state_bdg_file.data
 503     bg_state_bdg = bg_state_bdg_file.data
 504
 505     prev_chrom_name = None
 506     prev_bin_end = None
 507
 508     for pp_line in predicted_proba_file:
 509         pp_data = pp_line.strip().split(b',')
 510
 511         chrname = pp_data[0]
 512         end_pos = int(pp_data[1])
 513         start_pos = end_pos - binsize
 514         pp_open = float(pp_data[i_open+2])
 515         pp_nuc = float(pp_data[i_nuc+2])
 516         pp_bg = float(pp_data[i_bg+2])
 517
 518         if chrname != prev_chrom_name:
 519             # we start a new chromosome
 520             if start_pos > 0:
 521                 # add the first unannotated region as background
 522                 open_state_bdg.add_loc(chrname, 0, start_pos, 0.0)
 523                 nuc_state_bdg.add_loc(chrname, 0, start_pos, 0.0)
 524                 bg_state_bdg.add_loc(chrname, 0, start_pos, 1.0)
 525             prev_chrom_name = chrname
 526         else:
 527             # now check if the prev_bin_end is start_pos, if not, add a gap of background
 528             if prev_bin_end < start_pos:
 529                 open_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 0.0)
 530                 nuc_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 0.0)
 531                 bg_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 1.0)
 532
 533         open_state_bdg.add_loc(chrname, start_pos, end_pos, pp_open)
 534         nuc_state_bdg.add_loc(chrname, start_pos, end_pos, pp_nuc)
 535         bg_state_bdg.add_loc(chrname, start_pos, end_pos, pp_bg)
 536         prev_bin_end = end_pos
 537
 538     open_state_bdg_file.write_bedGraph("Open States", "Likelihoods of being Open States", trackline=False)
 539     nuc_state_bdg_file.write_bedGraph("Nucleosomal States", "Likelihoods of being Nucleosomal States", trackline=False)
 540     bg_state_bdg_file.write_bedGraph("Background States", "Likelihoods of being Background States", trackline=False)
 541     return
 542
 543
 544 def save_states_bed(states_path, states_bedfile):
 545     # we do not need to output background state.
 546     for l_len in range(len(states_path)):
 547         if states_path[l_len][3] != "bg":
 548             states_bedfile.write("%s\t" % states_path[l_len][0].decode())
 549             states_bedfile.write("%d\t%d\t%s\n" % states_path[l_len][1:])
 550     return
 551
 552
 553 def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg):
 554     # predicted_proba_file is a temporary file
 555     ret_states_path = []
 556     labels_list = ["open", "nuc", "bg"]
 557
 558     prev_chrom_name = None
 559     prev_bin_end = None
 560     prev_label = None
 561
 562     for pp_line in predicted_proba_file:
 563         pp_data = pp_line.strip().split(b',')
 564
 565         chrname = pp_data[0]
 566         end_pos = int(pp_data[1])
 567         start_pos = end_pos - binsize
 568         pp_open = float(pp_data[i_open+2])
 569         pp_nuc = float(pp_data[i_nuc+2])
 570         pp_bg = float(pp_data[i_bg+2])
 571
 572         # find the best state as label
 573         label = labels_list[max((pp_open, 0), (pp_nuc, 1), (pp_bg, 2), key=lambda x: x[0])[1]]
 574
 575         if chrname != prev_chrom_name:
 576             # we start a new chromosome
 577             if start_pos > 0:
 578                 # add the first unannotated region as background
 579                 ret_states_path.append((chrname, 0, start_pos, "bg"))
 580             ret_states_path.append((chrname, start_pos, end_pos, label))
 581             prev_label = label
 582             prev_chrom_name = chrname
 583         else:
 584             # now check if the prev_bin_end is start_pos, if not, add a gap of background
 585             if prev_bin_end < start_pos:
 586                 ret_states_path.append((chrname, prev_bin_end, start_pos, "bg"))
 587                 prev_label = "bg"
 588             # same label, just extend
 589             if label == prev_label:
 590                 ret_states_path[-1] = (ret_states_path[-1][0], ret_states_path[-1][1], end_pos, label)
 591             else:
 592                 ret_states_path.append((chrname, start_pos, end_pos, label))
 593                 prev_label = label
 594
 595         prev_bin_end = end_pos
 596     return ret_states_path
 597
 598
 599 def save_accessible_regions(states_path, accessible_region_file, openregion_minlen, bdgscore):
 600     # Function to add regions to the list
 601     def add_regions(i, regions):
 602         for j in range(i, i+3):
 603             if not regions or states_path[j][2] != regions[-1][2]:
 604                 regions.append((states_path[j][0], int(states_path[j][1]), int(states_path[j][2]), states_path[j][3]))
 605         return regions
 606
 607     # Select only accessible regions from _states.bed, look for nuc-open-nuc pattern
 608     # This by default is the only final output from HMMRATAC
 609     accessible_regions = []
 610     for i in range(len(states_path)-2):
 611         if (states_path[i][3] == 'nuc' and
 612             states_path[i+1][3] == 'open' and
 613             states_path[i+2][3] == 'nuc' and
 614             states_path[i][2] == states_path[i+1][1] and
 615             states_path[i+1][2] == states_path[i+2][1] and
 616             states_path[i+2][2] - states_path[i][1] > openregion_minlen):  # require nuc-open-nuc entire region start/endpos > openregion_minlen
 617             accessible_regions = add_regions(i, accessible_regions)
 618
 619     # remove 'nuc' regions:
 620     accessible_regions = [tup for tup in accessible_regions if tup[3] != 'nuc']
 621
 622     # Generate broadpeak object
 623     openpeak = PeakIO()
 624     for region in accessible_regions[:-1]:
 625         openpeak.add(chromosome=region[0], start=region[1], end=region[2])
 626
 627     # refine peak summit and score using bedGraphTrackI with scores
 628     openpeak = bdgscore.refine_peaks(openpeak)
 629     openpeak.write_to_narrowPeak(accessible_region_file)
 630     return