Merge pull request #678 from kaizhang/master
[MACS.git] / MACS3 / Commands / callvar_cmd.py
blobcbd900fc89c68028a7ec525667c9a1ddf84fc5ea
1 # Time-stamp: <2024-10-11 10:28:07 Tao Liu>
3 """Description: Call variants directly
5 Copyright (c) 2017-2023 Tao Liu <vladimir.liu@gmail.com>
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file COPYING included
9 with the distribution).
11 @status: release candidate
12 @version: $Id$
13 @author: Tao Liu
14 @contact: vladimir.liu@gmail.com
15 """
17 # ------------------------------------
18 # python modules
19 # ------------------------------------
21 import datetime
22 import sys
23 from functools import partial
24 import multiprocessing as mp
26 # from time import time
27 from math import ceil
29 # ------------------------------------
30 # own python modules
31 # ------------------------------------
32 from MACS3.Utilities.Constants import MACS_VERSION
33 from MACS3.Utilities.OptValidator import opt_validate_callvar
34 from MACS3.IO.PeakIO import PeakIO
35 from MACS3.IO.BAM import BAMaccessor
36 from MACS3.Signal.RACollection import RACollection
37 from MACS3.Signal.PeakVariants import PeakVariants
40 VCFHEADER_0 = """##fileformat=VCFv4.1
41 ##fileDate=%s
42 ##source=MACS_V%s
43 ##Program_Args=%s
44 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
45 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
46 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
47 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
48 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
49 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
50 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
51 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
52 ##INFO=<ID=lnLHOMOMAJOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with major allele model">
53 ##INFO=<ID=lnLHOMOMINOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with minor allele model">
54 ##INFO=<ID=lnLHETERNOAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with no allele-specific model">
55 ##INFO=<ID=lnLHETERAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with allele-specific model">
56 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Float,Description="BIC value of homozygous with major allele model">
57 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Float,Description="BIC value of homozygous with minor allele model">
58 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Float,Description="BIC value of heterozygous with no allele-specific model">
59 ##INFO=<ID=BICHETERAS,Number=1,Type=Float,Description="BIC value of heterozygous with allele-specific model">
60 ##INFO=<ID=GQHOMO,Number=1,Type=Integer,Description="Genotype quality of homozygous with major allele model">
61 ##INFO=<ID=GQHETERNOAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with no allele-specific model">
62 ##INFO=<ID=GQHETERAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with allele-specific model">
63 ##INFO=<ID=GQHETERASsig,Number=1,Type=Integer,Description="Genotype quality of allele-specific significance compared with no allele-specific model">
64 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
65 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
66 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="read depth after filtering">
67 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality score">"""
69 VCFHEADER = """##fileformat=VCFv4.1
70 ##fileDate=%s
71 ##source=MACS_V%s
72 ##Program_Args=%s
73 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
74 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
75 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
76 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
77 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
78 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
79 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
80 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
81 ##INFO=<ID=DBIC,Number=.,Type=Float,Description="Difference of BIC of selected model vs second best alternative model">
82 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Integer,Description="BIC of homozygous with major allele model">
83 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Integer,Description="BIC of homozygous with minor allele model">
84 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Integer,Description="BIC of heterozygous with no allele-specific model">
85 ##INFO=<ID=BICHETERAS,Number=1,Type=Integer,Description="BIC of heterozygous with allele-specific model">
86 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
87 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
88 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth after filtering bad reads">
89 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality score">
90 ##FORMAT=<ID=PL,Number=3,Type=Integer,Description="Normalized, Phred-scaled genotype likelihoods for 00, 01, 11 genotype">"""
92 # ------------------------------------
93 # Main function
94 # ------------------------------------
97 def check_names(treat, control, error_stream):
98 """check common chromosome names"""
99 tchrnames = set(treat.get_chr_names())
100 cchrnames = set(control.get_chr_names())
101 commonnames = tchrnames.intersection(cchrnames)
102 if len(commonnames) == 0:
103 error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
104 error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
105 error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
106 sys.exit()
109 def run(args):
110 """The Main function/pipeline for MACS
113 options = opt_validate_callvar(args)
115 info = options.info
116 # warn = options.warn
117 # debug = options.debug
118 # error = options.error
120 peakbedfile = options.peakbed
121 tfile = options.tfile
122 cfile = options.cfile
123 top2allelesminr = options.top2allelesMinRatio
124 min_altallele_count = options.altalleleMinCount
125 max_allowed_ar = options.maxAR
126 NP = options.np
127 if NP <= 0:
128 NP = 1
129 min_homo_GQ = options.GQCutoffHomo
130 min_heter_GQ = options.GQCutoffHetero
131 minQ = options.Q
132 maxDuplicate = options.maxDuplicate
134 # parameter for assembly
135 fermiMinOverlap = options.fermiMinOverlap
136 fermi = options.fermi
138 peakio = open(peakbedfile)
139 peaks = PeakIO()
140 #i = 0
141 for t_peak in peakio:
142 fs = t_peak.rstrip().split()
143 # i += 1
144 peaks.add(fs[0].encode(), int(fs[1]), int(fs[2])) # , name=b"%d" % i)
145 peaks.sort()
147 # chrs = peaks.get_chr_names()
149 tbam = BAMaccessor(tfile)
150 if cfile:
151 cbam = BAMaccessor(cfile)
152 assert tbam.get_chromosomes()[0] in cbam.get_chromosomes() or cbam.get_chromosomes()[0] in tbam.get_chromosomes(), Exception("It seems Treatment and Control BAM use different naming for chromosomes! Check headers of both files.")
153 # assert tbam.get_chromosomes() == cbam.get_chromosomes(), Exception("Treatment and Control BAM files have different orders of sorted chromosomes! Please check BAM Headers and re-sort BAM files.")
154 else:
155 cbam = None
157 # ra_collections = []
159 # prepare and write header of output file (.vcf)
160 ovcf = open(options.ofile, "w")
161 tmpcmdstr = " --fermi " + fermi + " --fermi-overlap "+str(fermiMinOverlap)
162 ovcf.write(VCFHEADER % (datetime.date.today().strftime("%Y%m%d"), MACS_VERSION, " ".join(sys.argv[1:] + ["-Q", str(minQ), "-D", str(maxDuplicate), "--max-ar", str(max_allowed_ar), "--top2alleles-mratio", str(top2allelesminr), "--top2allele-count", str(min_altallele_count), "-g", str(min_heter_GQ), "-G", str(min_homo_GQ), tmpcmdstr])) + "\n")
163 for (chrom, chrlength) in tbam.get_rlengths().items():
164 ovcf.write("##contig=<ID=%s,length=%d,assembly=NA>\n" % (chrom.decode(), chrlength))
165 ovcf.write("\t".join(("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE")) + "\n")
167 # to get time
168 # t_total = 0
169 # t_prepare_ra = 0
170 # t_assemble = 0
171 # t_call_top2alleles = 0
172 # t_call_lnL = 0
173 # t_call_variants = 0
174 # t_call_GT = 0
175 #t_call_to_vcf = 0
176 # t_total_0 = time()
178 for chrom in sorted(tbam.get_chromosomes()):
179 peaks_chr = peaks.get_data_from_chrom(chrom)
180 for peak in peaks_chr:
181 # note, when we extract reads from BAM within a peak
182 # region, we assume BAM should be sorted and the BAM
183 # should be generated from "samtools view -L" process.
185 # print ( "---begin of peak---")
186 info(f"Peak: {chrom.decode()} {peak['start']} {peak['end']}")
188 # flag_todo_lassembly = False
190 # t0 = time()
191 try:
192 if cbam:
193 ra_collection = RACollection(chrom, peak,
194 tbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate),
195 cbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate))
196 else:
197 ra_collection = RACollection(chrom, peak,
198 tbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate))
199 except Exception:
200 info("No reads found in this peak. Skipped")
201 # while there is no reads in peak region, simply skip it.
202 continue
204 ra_collection.remove_outliers(percent=5)
205 # t_prepare_ra += time() - t0
207 # print ( "Reads in Peak:")
208 # print ( ra_collection.get_FASTQ().decode() )
210 s = ra_collection["peak_refseq"]
212 peak_variants = PeakVariants(chrom.decode(), peak["start"], peak["end"], s)
214 if fermi == "auto" or fermi == "off":
215 # first pass to call variant w/o assembly
216 # multiprocessing the following part
217 # t_call_variants_0 = time()
218 info(" Call variants w/o assembly")
220 # -- now make multi processes
221 # divide right-left into NP parts
222 window_size = ceil((ra_collection["right"] - ra_collection["left"]) / NP)
224 P = mp.Pool(NP)
225 # this partial function will only be used in multiprocessing
226 p_call_variants_at_range = partial(call_variants_at_range, s=s, collection=ra_collection, top2allelesminr=top2allelesminr, max_allowed_ar=max_allowed_ar, min_altallele_count=min_altallele_count, min_homo_GQ=min_homo_GQ, min_heter_GQ=min_heter_GQ, minQ=minQ)
228 ranges = []
229 for i in range(NP):
230 l_d = i * window_size + ra_collection["left"]
231 r_d = min((i + 1) * window_size + ra_collection["left"], ra_collection["right"])
232 ranges.append((l_d, r_d))
234 mapresults = P.map_async(p_call_variants_at_range, ranges)
235 P.close()
236 P.join()
237 results = mapresults.get(timeout=window_size*300)
238 for i in range(NP):
239 for result in results[i]:
240 peak_variants.add_variant(result[0], result[1])
242 # t_call_variants += time() - t_call_variants_0
244 # Next, check if we should do local assembly
245 if (fermi == "auto" and (peak_variants.has_indel() or peak_variants.has_refer_biased_01())) or fermi == "on":
246 #print( peak_variants.has_indel() )
247 #print( peak_variants.has_refer_biased_01() )
249 # invoke fermi to assemble local sequence and filter out those can not be mapped to unitigs.
250 info(" Try to call variants w/ fermi-lite assembly")
251 unitig_collection = ra_collection.build_unitig_collection(fermiMinOverlap)
252 if unitig_collection == -1:
253 info(" Too many mismatches found while assembling the sequence, we will skip this region entirely!")
254 continue
255 elif unitig_collection == 0:
256 info(" Failed to assemble unitigs, fall back to previous results")
257 if peak_variants.n_variants() > 0:
258 peak_variants.fix_indels()
259 ovcf.write(peak_variants.toVCF())
260 continue
261 # uncomment the following to print those assembled unitigs and their alignments to reference genome
262 # for u in unitig_collection["URAs_list"]:
263 # print(u["seq"].decode(), u["lpos"], u["rpos"], u["count"])
264 # print("a",u["unitig_aln"].decode())
265 # print("r",u["reference_aln"].decode())
266 else:
267 # if we do not assemble, write results now
268 if peak_variants.n_variants() > 0:
269 peak_variants.fix_indels()
270 ovcf.write(peak_variants.toVCF())
271 continue
273 # reach here only if we need l assembly and the assembly returns result
275 # If a peak has no indel but with refer_biased_01, we
276 # revisit all refer_biased_01 now. We do not use
277 # multiprocessing here for simplicity since there won't be
278 # too many in a peak region.
279 if (fermi == "auto" and (not peak_variants.has_indel()) and peak_variants.has_refer_biased_01()):
280 pos_tobe_revisit = peak_variants.get_refer_biased_01s()
281 for i in pos_tobe_revisit:
282 ref_nt = chr(s[i-ra_collection["left"]]).encode()
283 if ref_nt == b'N':
284 peak_variants.remove_variant(i)
285 continue
286 PRI = unitig_collection.get_PosReadsInfo_ref_pos(i, ref_nt, Q=minQ)
287 if PRI.raw_read_depth(opt="T") == 0: # skip if coverage is 0
288 peak_variants.remove_variant(i)
289 continue
290 PRI.update_top_alleles(top2allelesminr, min_altallele_count, max_allowed_ar)
291 PRI.call_GT(max_allowed_ar)
292 PRI.apply_GQ_cutoff(min_homo_GQ, min_heter_GQ)
293 if not PRI.filterflag():
294 peak_variants.replace_variant(i, PRI.toVariant())
295 else:
296 peak_variants.remove_variant(i)
297 if peak_variants.n_variants() > 0:
298 peak_variants.fix_indels()
299 ovcf.write(peak_variants.toVCF())
300 continue
302 # in this case, we call variants at every locations in the peak based on local assembly.
303 if (fermi == "on" or (fermi == "auto" and peak_variants.has_indel())):
304 peak_variants = PeakVariants(chrom.decode(), peak["start"], peak["end"], s) # reset
306 # --- make multi processes
307 # divide right-left into NP parts
308 window_size = ceil((ra_collection["right"] - ra_collection["left"]) / NP)
309 P = mp.Pool(NP)
310 # this partial function will only be used in multiprocessing
311 p_call_variants_at_range = partial(call_variants_at_range, s=s, collection=unitig_collection, top2allelesminr=top2allelesminr, max_allowed_ar=max_allowed_ar, min_altallele_count=min_altallele_count, min_homo_GQ=min_homo_GQ, min_heter_GQ=min_heter_GQ, minQ=minQ)
312 ranges = []
313 for i in range(NP):
314 l_d = i * window_size + ra_collection["left"]
315 r_d = min((i + 1) * window_size + ra_collection["left"], ra_collection["right"])
316 ranges.append((l_d, r_d))
318 mapresults = P.map_async(p_call_variants_at_range, ranges)
319 P.close()
320 P.join()
321 results = mapresults.get(timeout=window_size*300)
322 for i in range(NP):
323 for result in results[i]:
324 peak_variants.add_variant(result[0], result[1])
325 if peak_variants.n_variants() > 0:
326 peak_variants.fix_indels()
327 ovcf.write(peak_variants.toVCF())
329 # print ("time to retrieve read alignment information from BAM:",t_prepare_ra,"(",round( 100 * t_prepare_ra/t_total, 2),"% )")
330 return
333 def call_variants_at_range(lr, s, collection, top2allelesminr, max_allowed_ar, min_altallele_count, min_homo_GQ, min_heter_GQ, minQ):
334 result = []
335 for i in range(lr[0], lr[1]):
336 ref_nt = chr(s[i-collection["left"]]).encode()
337 if ref_nt == b'N':
338 continue
340 PRI = collection.get_PosReadsInfo_ref_pos(i, ref_nt, Q=minQ)
341 if PRI.raw_read_depth(opt="T") == 0: # skip if coverage is 0
342 continue
344 PRI.update_top_alleles(top2allelesminr, min_altallele_count, max_allowed_ar)
345 if not PRI.filterflag():
346 #PRI.update_top_alleles( top2allelesminr )
347 PRI.call_GT(max_allowed_ar)
348 PRI.apply_GQ_cutoff(min_homo_GQ, min_heter_GQ)
349 if not PRI.filterflag():
350 result.append((i, PRI.toVariant()))
351 return result