implement PETrackII for fragment files from scATAC experiments
[MACS.git] / MACS3 / Commands / hmmratac_cmd.py
blob57ab2f4ffda7a4710d5647c839dde043e9534669
1 # Time-stamp: <2024-10-08 10:53:48 Tao Liu>
4 """Description: Main HMMR command
6 This code is free software; you can redistribute it and/or modify it
7 under the terms of the BSD License (see the file LICENSE included with
8 the distribution).
9 """
11 # ------------------------------------
12 # python modules
13 # ------------------------------------
15 import os
16 import sys
17 import gc
18 import numpy as np
19 import tempfile
21 # ------------------------------------
22 # own python modules
23 # ------------------------------------
24 # from MACS3.Utilities.Constants import *
25 from MACS3.Utilities.OptValidator import opt_validate_hmmratac
26 from MACS3.IO.PeakIO import PeakIO
27 from MACS3.IO.Parser import BAMPEParser, BEDPEParser # BAMaccessor
28 from MACS3.Signal.HMMR_EM import HMMR_EM
29 from MACS3.Signal.HMMR_Signal_Processing import (generate_weight_mapping,
30 generate_digested_signals,
31 extract_signals_from_regions)
32 from MACS3.Signal.HMMR_HMM import (hmm_training,
33 hmm_predict,
34 hmm_model_init,
35 hmm_model_save)
36 from MACS3.Signal.Region import Regions
37 from MACS3.IO.BedGraphIO import bedGraphIO
39 # ------------------------------------
40 # constants
41 # ------------------------------------
43 # ------------------------------------
44 # Misc functions
45 # ------------------------------------
47 # ------------------------------------
48 # Main function
49 # ------------------------------------
52 def run(args):
53 """The HMMRATAC function/pipeline for MACS.
55 """
56 options = opt_validate_hmmratac(args)
58 #############################################
59 # 0. Names of output files
60 #############################################
61 short_bdgfile = os.path.join(options.outdir,
62 options.name+"_digested_short.bdg")
63 mono_bdgfile = os.path.join(options.outdir,
64 options.name+"_digested_mono.bdg")
65 di_bdgfile = os.path.join(options.outdir,
66 options.name+"_digested_di.bdg")
67 tri_bdgfile = os.path.join(options.outdir,
68 options.name+"_digested_tri.bdg")
69 training_region_bedfile = os.path.join(options.outdir,
70 options.name+"_training_regions.bed")
71 training_datafile = os.path.join(options.outdir,
72 options.name+"_training_data.txt")
73 training_datalengthfile = os.path.join(options.outdir,
74 options.name+"_training_lengths.txt")
76 hmm_modelfile = os.path.join(options.outdir,
77 options.name+"_model.json")
79 open_state_bdgfile = os.path.join(options.outdir,
80 options.name+"_open.bdg")
81 nuc_state_bdgfile = os.path.join(options.outdir,
82 options.name+"_nuc.bdg")
83 bg_state_bdgfile = os.path.join(options.outdir,
84 options.name+"_bg.bdg")
86 states_file = os.path.join(options.outdir,
87 options.name+"_states.bed")
89 accessible_file = os.path.join(options.outdir,
90 options.name+"_accessible_regions.narrowPeak")
92 cutoffanalysis_file = os.path.join(options.outdir,
93 options.name+"_cutoff_analysis.tsv")
95 #############################################
96 # 1. Read the input files
97 #############################################
98 options.info("\n" + options.argtxt)
100 if options.format == "BAMPE":
101 options.info("#1 Read fragments from BAMPE file...")
102 parser = BAMPEParser
103 elif options.format == "BEDPE":
104 options.info("#1 Read fragments from BEDPE file...")
105 parser = BEDPEParser
106 else:
107 raise Exception("wrong format")
109 alignment = parser(options.input_file[0], buffer_size=options.buffer_size)
110 petrack = alignment.build_petrack()
111 if len(options.input_file) > 1:
112 # multiple input
113 for inputfile in options.input_file[1:]:
114 alignment = parser(inputfile, buffer_size=options.buffer_size)
115 petrack = alignment.append_petrack(petrack)
116 # remember to finalize the petrack
117 petrack.finalize()
119 # filter duplicates if needed
120 if options.misc_keep_duplicates:
121 petrack.filter_dup(maxnum=1)
123 # read in blacklisted if option entered
124 if options.blacklist:
125 options.info("# Read blacklist file...")
126 peakio = open(options.blacklist)
127 blacklist = PeakIO()
128 i = 0
129 for l_p in peakio:
130 fs = l_p.rstrip().split()
131 i += 1
132 blacklist.add(fs[0].encode(),
133 int(fs[1]),
134 int(fs[2]),
135 name=b"%d" % i)
136 blacklist.sort()
137 blacklist_regions = Regions()
138 blacklist_regions.init_from_PeakIO(blacklist)
140 #############################################
141 # 2. EM
142 #############################################
143 if options.em_skip:
144 # Skip EM and use the options.em_means and options.em_stddevs
145 em_means = options.em_means
146 em_stddevs = options.em_stddevs
147 options.info("#2 EM is skipped. The following means and stddevs will be used:")
148 else:
149 # we will use EM to get the best means/stddevs for the mono-, di- and tri- modes of fragment sizes
150 options.info("#2 Use EM algorithm to estimate means and stddevs of fragment lengths")
151 options.info("# for mono-, di-, and tri-nucleosomal signals...")
152 em_trainer = HMMR_EM(petrack, options.em_means[1:4],
153 options.em_stddevs[1:4], seed=options.hmm_randomSeed)
154 # the mean and stddev after EM training
155 em_means = [options.em_means[0],]
156 em_means.extend(em_trainer.fragMeans)
157 em_stddevs = [options.em_stddevs[0],]
158 em_stddevs.extend(em_trainer.fragStddevs)
159 # we will round to 1 decimal digit
160 for i in range(len(em_means)):
161 em_means[i] = round(em_means[i], 1)
162 for i in range(len(em_stddevs)):
163 em_stddevs[i] = round(em_stddevs[i], 1)
164 options.info("# The means and stddevs after EM:")
166 options.info("# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
167 options.info("# means: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_means))
168 options.info("# stddevs: {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g} {0[3]:>10.4g}".format(em_stddevs))
170 # to finalize the EM training, we will decompose ATAC-seq into four signal tracks
171 options.info("# Compute the weights for each fragment length for each of the four signal types")
172 fl_dict = petrack.count_fraglengths()
173 fl_list = list(fl_dict.keys())
174 fl_list.sort()
176 # now we will prepare the weights for each fragment length for
177 # each of the four distributions based on the EM results
178 weight_mapping = generate_weight_mapping(fl_list, em_means, em_stddevs,
179 min_frag_p=options.min_frag_p)
181 options.info("# Generate short, mono-, di-, and tri-nucleosomal signals")
182 digested_atac_signals = generate_digested_signals(petrack, weight_mapping)
184 # save three types of signals if needed
185 if options.save_digested:
186 bdgshort = bedGraphIO(short_bdgfile, data=digested_atac_signals[0])
187 bdgshort.write_bedGraph("short", "short")
189 bdgmono = bedGraphIO(mono_bdgfile, data=digested_atac_signals[1])
190 bdgmono.write_bedGraph("mono", "mono")
192 bdgdi = bedGraphIO(di_bdgfile, data=digested_atac_signals[2])
193 bdgdi.write_bedGraph("di", "di")
195 bdgtri = bedGraphIO(tri_bdgfile, data=digested_atac_signals[3])
196 bdgtri.write_bedGraph("tri", "tri")
198 minlen = int(petrack.average_template_length)
199 # if options.pileup_short is on, we pile up only the short
200 # fragments to identify training regions and to prescan for
201 # candidate regions for decoding.
202 if options.pileup_short:
203 options.info("# Pile up ONLY short fragments")
204 fc_bdg = digested_atac_signals[0]
205 else:
206 options.info("# Pile up all fragments")
207 fc_bdg = petrack.pileup_bdg([1.0, ], baseline_value=0)
208 (sum_v, n_v, max_v, min_v, mean_v, std_v) = fc_bdg.summary()
209 options.info("# Convert pileup to fold-change over average signal")
210 fc_bdg.apply_func(lambda x: x/mean_v)
212 # if cutoff_analysis only, generate and save the report and quit
213 if options.cutoff_analysis_only:
214 # we will run cutoff analysis only and quit
215 options.info(f"#3 Generate cutoff analysis report from {petrack.total} fragments")
216 options.info(f"# Please review the cutoff analysis result in {cutoffanalysis_file}")
218 # Let MACS3 do the cutoff analysis to help decide the lower
219 # and upper cutoffs
220 with open(cutoffanalysis_file, "w") as ofhd_cutoff:
221 ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen,
222 max_gap=options.hmm_training_flanking,
223 max_score=options.cutoff_analysis_max,
224 steps=options.cutoff_analysis_steps))
225 # raise Exception("Cutoff analysis only.")
226 sys.exit(1)
228 #############################################
229 # 3. Define training set by peak calling
230 #############################################
232 if options.hmm_file:
233 # skip this step if hmm_file is given
234 options.info("#3 Skip this step of looking for training set since a Hidden Markov Model file has been provided!")
235 elif options.hmm_training_regions:
236 # if a training region file is provided - need to read in the bedfile and skip the peak calling step
237 options.info(f"#3 Read training regions from BED file: {options.hmm_training_regions}")
238 # from refinepeak_cmd.py:
239 peakio = open(options.hmm_training_regions, "rb")
240 peaks = PeakIO()
241 for l_p in peakio:
242 fs = l_p.rstrip().split()
243 peaks.add(chromosome=fs[0], start=int(fs[1]), end=int(fs[2])) # change based on what expected input file should contain
244 peakio.close()
245 training_regions = Regions()
246 training_regions.init_from_PeakIO(peaks)
247 options.info("# Training regions have been read from bedfile")
248 else:
249 # Find regions with fold change within determined range to use as training sites.
250 # Find regions with zscore values above certain cutoff to exclude from viterbi.
252 options.info(f"#3 Look for training set from {petrack.total} fragments")
253 options.info(f"# Call peak above within fold-change range of {options.hmm_lower} and {options.hmm_upper}.")
254 options.info(f"# The minimum length of the region is set as the average template/fragment length in the dataset: {minlen}")
255 options.info(f"# The maximum gap to merge nearby significant regions is set as the flanking size to extend training regions: {options.hmm_training_flanking}")
256 peaks = fc_bdg.call_peaks(cutoff=options.hmm_lower, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False)
257 options.info(f"# Total training regions called after applying the lower cutoff {options.hmm_lower}: {peaks.total}")
258 peaks.filter_score(options.hmm_lower, options.hmm_upper)
259 options.info(f"# Total training regions after filtering with upper cutoff {options.hmm_upper}: {peaks.total}")
261 options.info( "# **IMPORTANT**")
262 options.info(f"# Please review the cutoff analysis result in {cutoffanalysis_file} to verify")
263 options.info( "# if the choices of lower, upper and prescanning cutoff are appropriate.")
264 options.info( "# Please read the message in the section 'Choices of cutoff values' by running")
265 options.info( "# `macs3 hmmratac -h` for detail.")
266 options.info( "# ****")
268 # Let MACS3 do the cutoff analysis to help decide the lower and upper cutoffs
269 with open(cutoffanalysis_file, "w") as ofhd_cutoff:
270 ofhd_cutoff.write(fc_bdg.cutoff_analysis(min_length=minlen, max_gap=options.hmm_training_flanking, max_score=options.cutoff_analysis_max))
272 # we will check if anything left after filtering
273 if peaks.total > options.hmm_maxTrain:
274 peaks = peaks.randomly_pick(options.hmm_maxTrain, seed=options.hmm_randomSeed)
275 options.info(f"# We randomly pick {options.hmm_maxTrain} regions for training")
276 elif peaks.total == 0:
277 options.error("# No training regions found. Please adjust the lower or upper cutoff.")
278 raise Exception("Not enough training regions!")
280 # Now we convert PeakIO to Regions and filter blacklisted regions
281 training_regions = Regions()
282 training_regions.init_from_PeakIO(peaks)
283 # We will expand the regions to both directions and merge overlap
284 options.info(f"# We expand the training regions with {options.hmm_training_flanking} basepairs and merge overlap")
285 training_regions.expand(options.hmm_training_flanking)
286 training_regions.merge_overlap()
288 # remove peaks overlapping with blacklisted regions
289 if options.blacklist:
290 training_regions.exclude(blacklist_regions)
291 options.info(f"# after removing those overlapping with provided blacklisted regions, we have {training_regions.total} left")
292 if options.save_train:
293 fhd = open(training_region_bedfile, "w")
294 training_regions.write_to_bed(fhd)
295 fhd.close()
296 options.info(f"# Training regions have been saved to `{options.name}_training_regions.bed` ")
298 #############################################
299 # 4. Train HMM
300 #############################################
301 # if model file is provided, we skip this step
302 # include options.hmm_type and make it backwards compatible, if no hmm_type default is gaussian
303 if options.hmm_file:
304 options.info("#4 Load Hidden Markov Model from given model file")
305 hmm_model, i_open_region, i_background_region, i_nucleosomal_region, options.hmm_binsize, options.hmm_type = hmm_model_init(options.hmm_file)
306 else:
307 options.info("#4 Train Hidden Markov Model with Multivariate Gaussian Emission")
309 # extract signals within peak using the given binsize
310 options.info(f"# Extract signals in training regions with bin size of {options.hmm_binsize}")
311 [training_bins, training_data, training_data_lengths] = extract_signals_from_regions(digested_atac_signals, training_regions, binsize=options.hmm_binsize, hmm_type=options.hmm_type)
313 if options.save_train:
314 f = open(training_datafile, "w")
315 for i in range(len(training_data)):
316 v = training_data[i]
317 p = training_bins[i]
318 f.write(f"{p[0]}\t{p[1]}\t{v[0]}\t{v[1]}\t{v[2]}\t{v[3]}\n")
319 f.close()
321 f = open(training_datalengthfile, "w")
322 for v in training_data_lengths:
323 f.write(f"{v}\n")
324 f.close()
326 options.info("# Use Baum-Welch algorithm to train the HMM")
328 hmm_model = hmm_training(training_data,
329 training_data_lengths,
330 random_seed=options.hmm_randomSeed,
331 hmm_type=options.hmm_type)
333 options.info(f"# HMM converged: {hmm_model.monitor_.converged}")
335 # label hidden states
336 if options.hmm_type == "gaussian":
337 means_sum = np.sum(hmm_model.means_, axis=1)
338 if options.hmm_type == "poisson":
339 means_sum = np.sum(hmm_model.lambdas_, axis=1)
341 # first, the state with the highest overall emission is the open state
342 i_open_region = np.where(means_sum == max(means_sum))[0][0]
344 # second, the state with lowest overall emission is the bg state
345 i_background_region = np.where(means_sum == min(means_sum))[0][0]
347 # last one is the nuc state (note it may not be accurate though
348 i_nucleosomal_region = list(set([0, 1, 2]) - set([i_open_region, i_background_region]))[0]
350 # write hmm into model file
351 options.info(f"# Write HMM parameters into JSON: {hmm_modelfile}")
352 hmm_model_save(hmm_modelfile, hmm_model, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region, options.hmm_type)
354 # if --modelonly option provided, exit script after hmm model is saved
355 if options.hmm_modelonly:
356 options.info("# Complete - HMM model was saved, program exited (--modelonly option was provided) ")
357 sys.exit()
359 # Now tell users the parameters of the HMM
360 assignments = ["", "", ""]
361 assignments[i_open_region] = "open"
362 assignments[i_nucleosomal_region] = "nuc"
363 assignments[i_background_region] = "bg"
365 options.info(f"# The Hidden Markov Model for signals of binsize of {options.hmm_binsize} basepairs:")
366 options.info(f"# open state index: state{i_open_region}")
367 options.info(f"# nucleosomal state index: state{i_nucleosomal_region}")
368 options.info(f"# background state index: state{i_background_region}")
369 options.info( "# Starting probabilities of states:")
370 options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s}".format(assignments))
371 options.info( "# {0[0]:>10.4g} {0[1]:>10.4g} {0[2]:>10.4g}".format(hmm_model.startprob_))
372 options.info( "# HMM Transition probabilities:")
373 options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s}".format(assignments))
374 options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[0], hmm_model.transmat_[0]))
375 options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[1], hmm_model.transmat_[1]))
376 options.info( "# {0:>10s}-> {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g}".format(assignments[2], hmm_model.transmat_[2]))
378 if options.hmm_type == 'gaussian':
379 options.info("# HMM Emissions (means): ")
380 options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
381 options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.means_[0]))
382 options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.means_[1]))
383 options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.means_[2]))
384 if options.hmm_type == 'poisson':
385 options.info( "# HMM Emissions (lambdas): ")
386 options.info( "# {0[0]:>10s} {0[1]:>10s} {0[2]:>10s} {0[3]:>10s}".format(["short", "mono", "di", "tri"]))
387 options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[0], hmm_model.lambdas_[0]))
388 options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[1], hmm_model.lambdas_[1]))
389 options.info( "# {0:>10s}: {1[0]:>10.4g} {1[1]:>10.4g} {1[2]:>10.4g} {1[3]:>10.4g}".format(assignments[2], hmm_model.lambdas_[2]))
392 #############################################
393 # 5. Predict
394 #############################################
395 # Our prediction strategy will be different with HMMRATAC, we will first ask MACS call peaks with loose cutoff, then for each peak we will run HMM prediction to figure out labels. And for the rest of genomic regions, just mark them as 'background'.
396 options.info("#5 Decode with Viterbi to predict states")
397 # the following /4 is totally arbitrary, we may need to fix it
398 candidate_peaks = fc_bdg.call_peaks(cutoff=options.prescan_cutoff, min_length=minlen, max_gap=options.hmm_training_flanking, call_summits=False)
399 options.info(f"#5 Total candidate peaks : {candidate_peaks.total}")
401 # Now we convert PeakIO to Regions and filter blacklisted regions
402 candidate_regions = Regions()
403 candidate_regions.init_from_PeakIO(candidate_peaks)
404 # We will expand the regions to both directions and merge overlap
405 options.info(f"# We expand the candidate regions with {options.hmm_training_flanking} and merge overlap")
406 candidate_regions.expand(options.hmm_training_flanking)
407 candidate_regions.merge_overlap()
408 options.info(f"# after expanding and merging, we have {candidate_regions.total} candidate regions")
410 # remove peaks overlapping with blacklisted regions
411 if options.blacklist:
412 candidate_regions.exclude(blacklist_regions)
413 options.info(f"# after removing those overlapping with provided blacklisted regions, we have {candidate_regions.total} left")
415 # extract signals
416 options.info("# Extract signals in candidate regions and decode with HMM")
417 # we will do the extraction and prediction in a step of 10000 regions by default
419 # Note: we can implement in a different way to extract then predict for each candidate region.
420 # predicted_results = hmm_decode_each_region (digested_atac_signals, candidate_regions, hmm_model, binsize = options.hmm_binsize)
421 # Note: we implement in a way that we will decode the candidate regions 10000 regions at a time so 1. we can make it running in parallel in the future; 2. we can reduce the memory usage.
422 options.info("# Use HMM to predict states")
423 n = 0
425 # we create a temporary file to save the proba predicted from hmm
426 predicted_proba_file = tempfile.TemporaryFile(mode="w+b")
428 while candidate_regions.total != 0:
429 n += 1
430 # we get DECODING_STEPS number of candidate regions first
431 cr = candidate_regions.pop(options.decoding_steps)
432 options.info("# decoding %d..." % (n * options.decoding_steps))
434 # then extrac data from digested signals, create cr_bins, cr_data, and cr_data_lengths
435 [cr_bins, cr_data, cr_data_lengths] = extract_signals_from_regions(digested_atac_signals, cr, binsize=options.hmm_binsize, hmm_type=options.hmm_type)
436 # options.debug("# extract_signals_from_regions complete")
438 prob_data = hmm_predict(cr_data, cr_data_lengths, hmm_model)
439 assert len(prob_data) == len(cr_bins)
440 for i in range(len(prob_data)):
441 predicted_proba_file.write(b"%s,%d" % cr_bins[i])
442 predicted_proba_file.write(b",%f,%f,%f\n" % tuple(prob_data[i]))
444 cr_data = []
445 cr_data_lengths = []
446 cr_bins = []
447 prob_data = []
448 gc.collect()
450 predicted_proba_file.seek(0) # reset
451 options.info("# predicted_proba files written...")
453 #############################################
454 # 6. Output - add to OutputWriter
455 #############################################
456 options.info("# Write the output...")
457 # Now taken the candidate_bins and predicted_proba, we can generate various
458 # outputs
460 # One thing to remember about candidate_bins is that the position
461 # in this array is the 'end' of the bin, the actual region is the
462 # 'end'-'binsize' to the 'end'.
464 # First, the likelihoods for each of the three states in a bedGraph
465 if options.save_likelihoods:
466 options.info(f"# Write the likelihoods for each states into three bedGraph files {options.name}_open.bdg, {options.name}_nuc.bdg, and {options.name}_bg.bdg")
467 open_state_bdg_fhd = open(open_state_bdgfile, "w")
468 nuc_state_bdg_fhd = open(nuc_state_bdgfile, "w")
469 bg_state_bdg_fhd = open(bg_state_bdgfile, "w")
470 save_proba_to_bedGraph(predicted_proba_file, options.hmm_binsize, open_state_bdg_fhd, nuc_state_bdg_fhd, bg_state_bdg_fhd, i_open_region, i_nucleosomal_region, i_background_region)
471 predicted_proba_file.seek(0) # reset
472 open_state_bdg_fhd.close()
473 nuc_state_bdg_fhd.close()
474 bg_state_bdg_fhd.close()
475 options.info("# finished writing proba_to_bedgraph")
477 # # Generate states path:
478 states_path = generate_states_path(predicted_proba_file, options.hmm_binsize, i_open_region, i_nucleosomal_region, i_background_region)
479 options.info("# finished generating states path")
480 predicted_proba_file.close() # kill the temp file
481 # Save states path if needed
482 # PS: we need to implement extra feature to include those regions NOT in candidate_bins and assign them as 'background state'.
483 if options.save_states:
484 options.info(f"# Write states assignments in a BED file: {options.name}_states.bed")
485 with open(states_file, "w") as f:
486 save_states_bed(states_path, f)
488 options.info(f"# Write accessible regions in a narrowPeak file: {options.name}_accessible_regions.narrowPeak")
489 with open(accessible_file, "w") as ofhd:
490 save_accessible_regions(states_path, ofhd, options.openregion_minlen, fc_bdg)
492 options.info("# Finished")
495 def save_proba_to_bedGraph(predicted_proba_file, binsize, open_state_bdg_file, nuc_state_bdg_file, bg_state_bdg_file, i_open, i_nuc, i_bg):
497 open_state_bdg_file = bedGraphIO(open_state_bdg_file)
498 nuc_state_bdg_file = bedGraphIO(nuc_state_bdg_file)
499 bg_state_bdg_file = bedGraphIO(bg_state_bdg_file)
501 open_state_bdg = open_state_bdg_file.data
502 nuc_state_bdg = nuc_state_bdg_file.data
503 bg_state_bdg = bg_state_bdg_file.data
505 prev_chrom_name = None
506 prev_bin_end = None
508 for pp_line in predicted_proba_file:
509 pp_data = pp_line.strip().split(b',')
511 chrname = pp_data[0]
512 end_pos = int(pp_data[1])
513 start_pos = end_pos - binsize
514 pp_open = float(pp_data[i_open+2])
515 pp_nuc = float(pp_data[i_nuc+2])
516 pp_bg = float(pp_data[i_bg+2])
518 if chrname != prev_chrom_name:
519 # we start a new chromosome
520 if start_pos > 0:
521 # add the first unannotated region as background
522 open_state_bdg.add_loc(chrname, 0, start_pos, 0.0)
523 nuc_state_bdg.add_loc(chrname, 0, start_pos, 0.0)
524 bg_state_bdg.add_loc(chrname, 0, start_pos, 1.0)
525 prev_chrom_name = chrname
526 else:
527 # now check if the prev_bin_end is start_pos, if not, add a gap of background
528 if prev_bin_end < start_pos:
529 open_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 0.0)
530 nuc_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 0.0)
531 bg_state_bdg.add_loc(chrname, prev_bin_end, start_pos, 1.0)
533 open_state_bdg.add_loc(chrname, start_pos, end_pos, pp_open)
534 nuc_state_bdg.add_loc(chrname, start_pos, end_pos, pp_nuc)
535 bg_state_bdg.add_loc(chrname, start_pos, end_pos, pp_bg)
536 prev_bin_end = end_pos
538 open_state_bdg_file.write_bedGraph("Open States", "Likelihoods of being Open States", trackline=False)
539 nuc_state_bdg_file.write_bedGraph("Nucleosomal States", "Likelihoods of being Nucleosomal States", trackline=False)
540 bg_state_bdg_file.write_bedGraph("Background States", "Likelihoods of being Background States", trackline=False)
541 return
544 def save_states_bed(states_path, states_bedfile):
545 # we do not need to output background state.
546 for l_len in range(len(states_path)):
547 if states_path[l_len][3] != "bg":
548 states_bedfile.write("%s\t" % states_path[l_len][0].decode())
549 states_bedfile.write("%d\t%d\t%s\n" % states_path[l_len][1:])
550 return
553 def generate_states_path(predicted_proba_file, binsize, i_open, i_nuc, i_bg):
554 # predicted_proba_file is a temporary file
555 ret_states_path = []
556 labels_list = ["open", "nuc", "bg"]
558 prev_chrom_name = None
559 prev_bin_end = None
560 prev_label = None
562 for pp_line in predicted_proba_file:
563 pp_data = pp_line.strip().split(b',')
565 chrname = pp_data[0]
566 end_pos = int(pp_data[1])
567 start_pos = end_pos - binsize
568 pp_open = float(pp_data[i_open+2])
569 pp_nuc = float(pp_data[i_nuc+2])
570 pp_bg = float(pp_data[i_bg+2])
572 # find the best state as label
573 label = labels_list[max((pp_open, 0), (pp_nuc, 1), (pp_bg, 2), key=lambda x: x[0])[1]]
575 if chrname != prev_chrom_name:
576 # we start a new chromosome
577 if start_pos > 0:
578 # add the first unannotated region as background
579 ret_states_path.append((chrname, 0, start_pos, "bg"))
580 ret_states_path.append((chrname, start_pos, end_pos, label))
581 prev_label = label
582 prev_chrom_name = chrname
583 else:
584 # now check if the prev_bin_end is start_pos, if not, add a gap of background
585 if prev_bin_end < start_pos:
586 ret_states_path.append((chrname, prev_bin_end, start_pos, "bg"))
587 prev_label = "bg"
588 # same label, just extend
589 if label == prev_label:
590 ret_states_path[-1] = (ret_states_path[-1][0], ret_states_path[-1][1], end_pos, label)
591 else:
592 ret_states_path.append((chrname, start_pos, end_pos, label))
593 prev_label = label
595 prev_bin_end = end_pos
596 return ret_states_path
599 def save_accessible_regions(states_path, accessible_region_file, openregion_minlen, bdgscore):
600 # Function to add regions to the list
601 def add_regions(i, regions):
602 for j in range(i, i+3):
603 if not regions or states_path[j][2] != regions[-1][2]:
604 regions.append((states_path[j][0], int(states_path[j][1]), int(states_path[j][2]), states_path[j][3]))
605 return regions
607 # Select only accessible regions from _states.bed, look for nuc-open-nuc pattern
608 # This by default is the only final output from HMMRATAC
609 accessible_regions = []
610 for i in range(len(states_path)-2):
611 if (states_path[i][3] == 'nuc' and
612 states_path[i+1][3] == 'open' and
613 states_path[i+2][3] == 'nuc' and
614 states_path[i][2] == states_path[i+1][1] and
615 states_path[i+1][2] == states_path[i+2][1] and
616 states_path[i+2][2] - states_path[i][1] > openregion_minlen): # require nuc-open-nuc entire region start/endpos > openregion_minlen
617 accessible_regions = add_regions(i, accessible_regions)
619 # remove 'nuc' regions:
620 accessible_regions = [tup for tup in accessible_regions if tup[3] != 'nuc']
622 # Generate broadpeak object
623 openpeak = PeakIO()
624 for region in accessible_regions[:-1]:
625 openpeak.add(chromosome=region[0], start=region[1], end=region[2])
627 # refine peak summit and score using bedGraphTrackI with scores
628 openpeak = bdgscore.refine_peaks(openpeak)
629 openpeak.write_to_narrowPeak(accessible_region_file)
630 return