1 # Time-stamp: <2024-10-11 10:28:07 Tao Liu>
3 """Description: Call variants directly
5 Copyright (c) 2017-2023 Tao Liu <vladimir.liu@gmail.com>
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file COPYING included
9 with the distribution).
11 @status: release candidate
14 @contact: vladimir.liu@gmail.com
17 # ------------------------------------
19 # ------------------------------------
23 from functools
import partial
24 import multiprocessing
as mp
26 # from time import time
29 # ------------------------------------
31 # ------------------------------------
32 from MACS3
.Utilities
.Constants
import MACS_VERSION
33 from MACS3
.Utilities
.OptValidator
import opt_validate_callvar
34 from MACS3
.IO
.PeakIO
import PeakIO
35 from MACS3
.IO
.BAM
import BAMaccessor
36 from MACS3
.Signal
.RACollection
import RACollection
37 from MACS3
.Signal
.PeakVariants
import PeakVariants
40 VCFHEADER_0
= """##fileformat=VCFv4.1
44 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
45 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
46 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
47 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
48 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
49 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
50 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
51 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
52 ##INFO=<ID=lnLHOMOMAJOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with major allele model">
53 ##INFO=<ID=lnLHOMOMINOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with minor allele model">
54 ##INFO=<ID=lnLHETERNOAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with no allele-specific model">
55 ##INFO=<ID=lnLHETERAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with allele-specific model">
56 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Float,Description="BIC value of homozygous with major allele model">
57 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Float,Description="BIC value of homozygous with minor allele model">
58 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Float,Description="BIC value of heterozygous with no allele-specific model">
59 ##INFO=<ID=BICHETERAS,Number=1,Type=Float,Description="BIC value of heterozygous with allele-specific model">
60 ##INFO=<ID=GQHOMO,Number=1,Type=Integer,Description="Genotype quality of homozygous with major allele model">
61 ##INFO=<ID=GQHETERNOAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with no allele-specific model">
62 ##INFO=<ID=GQHETERAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with allele-specific model">
63 ##INFO=<ID=GQHETERASsig,Number=1,Type=Integer,Description="Genotype quality of allele-specific significance compared with no allele-specific model">
64 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
65 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
66 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="read depth after filtering">
67 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality score">"""
69 VCFHEADER
= """##fileformat=VCFv4.1
73 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
74 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
75 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
76 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
77 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
78 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
79 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
80 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
81 ##INFO=<ID=DBIC,Number=.,Type=Float,Description="Difference of BIC of selected model vs second best alternative model">
82 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Integer,Description="BIC of homozygous with major allele model">
83 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Integer,Description="BIC of homozygous with minor allele model">
84 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Integer,Description="BIC of heterozygous with no allele-specific model">
85 ##INFO=<ID=BICHETERAS,Number=1,Type=Integer,Description="BIC of heterozygous with allele-specific model">
86 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
87 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
88 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth after filtering bad reads">
89 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality score">
90 ##FORMAT=<ID=PL,Number=3,Type=Integer,Description="Normalized, Phred-scaled genotype likelihoods for 00, 01, 11 genotype">"""
92 # ------------------------------------
94 # ------------------------------------
97 def check_names(treat
, control
, error_stream
):
98 """check common chromosome names"""
99 tchrnames
= set(treat
.get_chr_names())
100 cchrnames
= set(control
.get_chr_names())
101 commonnames
= tchrnames
.intersection(cchrnames
)
102 if len(commonnames
) == 0:
103 error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
104 error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames
)))
105 error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames
)))
110 """The Main function/pipeline for MACS
113 options
= opt_validate_callvar(args
)
116 # warn = options.warn
117 # debug = options.debug
118 # error = options.error
120 peakbedfile
= options
.peakbed
121 tfile
= options
.tfile
122 cfile
= options
.cfile
123 top2allelesminr
= options
.top2allelesMinRatio
124 min_altallele_count
= options
.altalleleMinCount
125 max_allowed_ar
= options
.maxAR
129 min_homo_GQ
= options
.GQCutoffHomo
130 min_heter_GQ
= options
.GQCutoffHetero
132 maxDuplicate
= options
.maxDuplicate
134 # parameter for assembly
135 fermiMinOverlap
= options
.fermiMinOverlap
136 fermi
= options
.fermi
138 peakio
= open(peakbedfile
)
141 for t_peak
in peakio
:
142 fs
= t_peak
.rstrip().split()
144 peaks
.add(fs
[0].encode(), int(fs
[1]), int(fs
[2])) # , name=b"%d" % i)
147 # chrs = peaks.get_chr_names()
149 tbam
= BAMaccessor(tfile
)
151 cbam
= BAMaccessor(cfile
)
152 assert tbam
.get_chromosomes()[0] in cbam
.get_chromosomes() or cbam
.get_chromosomes()[0] in tbam
.get_chromosomes(), Exception("It seems Treatment and Control BAM use different naming for chromosomes! Check headers of both files.")
153 # assert tbam.get_chromosomes() == cbam.get_chromosomes(), Exception("Treatment and Control BAM files have different orders of sorted chromosomes! Please check BAM Headers and re-sort BAM files.")
157 # ra_collections = []
159 # prepare and write header of output file (.vcf)
160 ovcf
= open(options
.ofile
, "w")
161 tmpcmdstr
= " --fermi " + fermi
+ " --fermi-overlap "+str(fermiMinOverlap
)
162 ovcf
.write(VCFHEADER
% (datetime
.date
.today().strftime("%Y%m%d"), MACS_VERSION
, " ".join(sys
.argv
[1:] + ["-Q", str(minQ
), "-D", str(maxDuplicate
), "--max-ar", str(max_allowed_ar
), "--top2alleles-mratio", str(top2allelesminr
), "--top2allele-count", str(min_altallele_count
), "-g", str(min_heter_GQ
), "-G", str(min_homo_GQ
), tmpcmdstr
])) + "\n")
163 for (chrom
, chrlength
) in tbam
.get_rlengths().items():
164 ovcf
.write("##contig=<ID=%s,length=%d,assembly=NA>\n" % (chrom
.decode(), chrlength
))
165 ovcf
.write("\t".join(("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE")) + "\n")
171 # t_call_top2alleles = 0
173 # t_call_variants = 0
178 for chrom
in sorted(tbam
.get_chromosomes()):
179 peaks_chr
= peaks
.get_data_from_chrom(chrom
)
180 for peak
in peaks_chr
:
181 # note, when we extract reads from BAM within a peak
182 # region, we assume BAM should be sorted and the BAM
183 # should be generated from "samtools view -L" process.
185 # print ( "---begin of peak---")
186 info(f
"Peak: {chrom.decode()} {peak['start']} {peak['end']}")
188 # flag_todo_lassembly = False
193 ra_collection
= RACollection(chrom
, peak
,
194 tbam
.get_reads_in_region(chrom
, peak
["start"], peak
["end"], maxDuplicate
=maxDuplicate
),
195 cbam
.get_reads_in_region(chrom
, peak
["start"], peak
["end"], maxDuplicate
=maxDuplicate
))
197 ra_collection
= RACollection(chrom
, peak
,
198 tbam
.get_reads_in_region(chrom
, peak
["start"], peak
["end"], maxDuplicate
=maxDuplicate
))
200 info("No reads found in this peak. Skipped")
201 # while there is no reads in peak region, simply skip it.
204 ra_collection
.remove_outliers(percent
=5)
205 # t_prepare_ra += time() - t0
207 # print ( "Reads in Peak:")
208 # print ( ra_collection.get_FASTQ().decode() )
210 s
= ra_collection
["peak_refseq"]
212 peak_variants
= PeakVariants(chrom
.decode(), peak
["start"], peak
["end"], s
)
214 if fermi
== "auto" or fermi
== "off":
215 # first pass to call variant w/o assembly
216 # multiprocessing the following part
217 # t_call_variants_0 = time()
218 info(" Call variants w/o assembly")
220 # -- now make multi processes
221 # divide right-left into NP parts
222 window_size
= ceil((ra_collection
["right"] - ra_collection
["left"]) / NP
)
225 # this partial function will only be used in multiprocessing
226 p_call_variants_at_range
= partial(call_variants_at_range
, s
=s
, collection
=ra_collection
, top2allelesminr
=top2allelesminr
, max_allowed_ar
=max_allowed_ar
, min_altallele_count
=min_altallele_count
, min_homo_GQ
=min_homo_GQ
, min_heter_GQ
=min_heter_GQ
, minQ
=minQ
)
230 l_d
= i
* window_size
+ ra_collection
["left"]
231 r_d
= min((i
+ 1) * window_size
+ ra_collection
["left"], ra_collection
["right"])
232 ranges
.append((l_d
, r_d
))
234 mapresults
= P
.map_async(p_call_variants_at_range
, ranges
)
237 results
= mapresults
.get(timeout
=window_size
*300)
239 for result
in results
[i
]:
240 peak_variants
.add_variant(result
[0], result
[1])
242 # t_call_variants += time() - t_call_variants_0
244 # Next, check if we should do local assembly
245 if (fermi
== "auto" and (peak_variants
.has_indel() or peak_variants
.has_refer_biased_01())) or fermi
== "on":
246 #print( peak_variants.has_indel() )
247 #print( peak_variants.has_refer_biased_01() )
249 # invoke fermi to assemble local sequence and filter out those can not be mapped to unitigs.
250 info(" Try to call variants w/ fermi-lite assembly")
251 unitig_collection
= ra_collection
.build_unitig_collection(fermiMinOverlap
)
252 if unitig_collection
== -1:
253 info(" Too many mismatches found while assembling the sequence, we will skip this region entirely!")
255 elif unitig_collection
== 0:
256 info(" Failed to assemble unitigs, fall back to previous results")
257 if peak_variants
.n_variants() > 0:
258 peak_variants
.fix_indels()
259 ovcf
.write(peak_variants
.toVCF())
261 # uncomment the following to print those assembled unitigs and their alignments to reference genome
262 # for u in unitig_collection["URAs_list"]:
263 # print(u["seq"].decode(), u["lpos"], u["rpos"], u["count"])
264 # print("a",u["unitig_aln"].decode())
265 # print("r",u["reference_aln"].decode())
267 # if we do not assemble, write results now
268 if peak_variants
.n_variants() > 0:
269 peak_variants
.fix_indels()
270 ovcf
.write(peak_variants
.toVCF())
273 # reach here only if we need l assembly and the assembly returns result
275 # If a peak has no indel but with refer_biased_01, we
276 # revisit all refer_biased_01 now. We do not use
277 # multiprocessing here for simplicity since there won't be
278 # too many in a peak region.
279 if (fermi
== "auto" and (not peak_variants
.has_indel()) and peak_variants
.has_refer_biased_01()):
280 pos_tobe_revisit
= peak_variants
.get_refer_biased_01s()
281 for i
in pos_tobe_revisit
:
282 ref_nt
= chr(s
[i
-ra_collection
["left"]]).encode()
284 peak_variants
.remove_variant(i
)
286 PRI
= unitig_collection
.get_PosReadsInfo_ref_pos(i
, ref_nt
, Q
=minQ
)
287 if PRI
.raw_read_depth(opt
="T") == 0: # skip if coverage is 0
288 peak_variants
.remove_variant(i
)
290 PRI
.update_top_alleles(top2allelesminr
, min_altallele_count
, max_allowed_ar
)
291 PRI
.call_GT(max_allowed_ar
)
292 PRI
.apply_GQ_cutoff(min_homo_GQ
, min_heter_GQ
)
293 if not PRI
.filterflag():
294 peak_variants
.replace_variant(i
, PRI
.toVariant())
296 peak_variants
.remove_variant(i
)
297 if peak_variants
.n_variants() > 0:
298 peak_variants
.fix_indels()
299 ovcf
.write(peak_variants
.toVCF())
302 # in this case, we call variants at every locations in the peak based on local assembly.
303 if (fermi
== "on" or (fermi
== "auto" and peak_variants
.has_indel())):
304 peak_variants
= PeakVariants(chrom
.decode(), peak
["start"], peak
["end"], s
) # reset
306 # --- make multi processes
307 # divide right-left into NP parts
308 window_size
= ceil((ra_collection
["right"] - ra_collection
["left"]) / NP
)
310 # this partial function will only be used in multiprocessing
311 p_call_variants_at_range
= partial(call_variants_at_range
, s
=s
, collection
=unitig_collection
, top2allelesminr
=top2allelesminr
, max_allowed_ar
=max_allowed_ar
, min_altallele_count
=min_altallele_count
, min_homo_GQ
=min_homo_GQ
, min_heter_GQ
=min_heter_GQ
, minQ
=minQ
)
314 l_d
= i
* window_size
+ ra_collection
["left"]
315 r_d
= min((i
+ 1) * window_size
+ ra_collection
["left"], ra_collection
["right"])
316 ranges
.append((l_d
, r_d
))
318 mapresults
= P
.map_async(p_call_variants_at_range
, ranges
)
321 results
= mapresults
.get(timeout
=window_size
*300)
323 for result
in results
[i
]:
324 peak_variants
.add_variant(result
[0], result
[1])
325 if peak_variants
.n_variants() > 0:
326 peak_variants
.fix_indels()
327 ovcf
.write(peak_variants
.toVCF())
329 # print ("time to retrieve read alignment information from BAM:",t_prepare_ra,"(",round( 100 * t_prepare_ra/t_total, 2),"% )")
333 def call_variants_at_range(lr
, s
, collection
, top2allelesminr
, max_allowed_ar
, min_altallele_count
, min_homo_GQ
, min_heter_GQ
, minQ
):
335 for i
in range(lr
[0], lr
[1]):
336 ref_nt
= chr(s
[i
-collection
["left"]]).encode()
340 PRI
= collection
.get_PosReadsInfo_ref_pos(i
, ref_nt
, Q
=minQ
)
341 if PRI
.raw_read_depth(opt
="T") == 0: # skip if coverage is 0
344 PRI
.update_top_alleles(top2allelesminr
, min_altallele_count
, max_allowed_ar
)
345 if not PRI
.filterflag():
346 #PRI.update_top_alleles( top2allelesminr )
347 PRI
.call_GT(max_allowed_ar
)
348 PRI
.apply_GQ_cutoff(min_homo_GQ
, min_heter_GQ
)
349 if not PRI
.filterflag():
350 result
.append((i
, PRI
.toVariant()))