1 # Time-stamp: <2021-03-10 23:51:54 Tao Liu>
3 """Description: macs call
5 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file COPYING included
9 with the distribution).
11 @status: release candidate
14 @contact: tliu4@buffalo.edu
17 # ------------------------------------
19 # ------------------------------------
24 from functools
import partial
25 import multiprocessing
as mp
30 # ------------------------------------
32 # ------------------------------------
33 from MACS3
.Utilities
.Constants
import *
34 from MACS3
.Utilities
.OptValidator
import opt_validate_callvar
35 from MACS3
.IO
.PeakIO
import PeakIO
36 from MACS3
.IO
.BAM
import BAMaccessor
37 from MACS3
.Signal
.RACollection
import RACollection
38 from MACS3
.Signal
.PeakVariants
import PeakVariants
41 VCFHEADER_0
="""##fileformat=VCFv4.1
45 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
46 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
47 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
48 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
49 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
50 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
51 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
52 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
53 ##INFO=<ID=lnLHOMOMAJOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with major allele model">
54 ##INFO=<ID=lnLHOMOMINOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with minor allele model">
55 ##INFO=<ID=lnLHETERNOAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with no allele-specific model">
56 ##INFO=<ID=lnLHETERAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with allele-specific model">
57 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Float,Description="BIC value of homozygous with major allele model">
58 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Float,Description="BIC value of homozygous with minor allele model">
59 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Float,Description="BIC value of heterozygous with no allele-specific model">
60 ##INFO=<ID=BICHETERAS,Number=1,Type=Float,Description="BIC value of heterozygous with allele-specific model">
61 ##INFO=<ID=GQHOMO,Number=1,Type=Integer,Description="Genotype quality of homozygous with major allele model">
62 ##INFO=<ID=GQHETERNOAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with no allele-specific model">
63 ##INFO=<ID=GQHETERAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with allele-specific model">
64 ##INFO=<ID=GQHETERASsig,Number=1,Type=Integer,Description="Genotype quality of allele-specific significance compared with no allele-specific model">
65 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
66 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
67 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="read depth after filtering">
68 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality score">"""
70 VCFHEADER
="""##fileformat=VCFv4.1
74 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
75 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
76 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
77 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
78 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
79 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
80 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
81 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
82 ##INFO=<ID=DBIC,Number=.,Type=Float,Description="Difference of BIC of selected model vs second best alternative model">
83 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Integer,Description="BIC of homozygous with major allele model">
84 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Integer,Description="BIC of homozygous with minor allele model">
85 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Integer,Description="BIC of heterozygous with no allele-specific model">
86 ##INFO=<ID=BICHETERAS,Number=1,Type=Integer,Description="BIC of heterozygous with allele-specific model">
87 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
88 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
89 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth after filtering bad reads">
90 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality score">
91 ##FORMAT=<ID=PL,Number=3,Type=Integer,Description="Normalized, Phred-scaled genotype likelihoods for 00, 01, 11 genotype">"""
93 # ------------------------------------
95 # ------------------------------------
97 def check_names(treat
, control
, error_stream
):
98 """check common chromosome names"""
99 tchrnames
= set(treat
.get_chr_names())
100 cchrnames
= set(control
.get_chr_names())
101 commonnames
= tchrnames
.intersection(cchrnames
)
102 if len(commonnames
)==0:
103 error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
104 error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames
)))
105 error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames
)))
110 """The Main function/pipeline for MACS
113 options
= opt_validate_callvar( args
)
117 debug
= options
.debug
118 error
= options
.error
120 peakbedfile
= options
.peakbed
121 peaktfile
= options
.tfile
122 peakcfile
= options
.cfile
123 top2allelesminr
= options
.top2allelesMinRatio
124 min_altallele_count
= options
.altalleleMinCount
125 max_allowed_ar
= options
.maxAR
129 min_homo_GQ
= options
.GQCutoffHomo
130 min_heter_GQ
= options
.GQCutoffHetero
132 maxDuplicate
= options
.maxDuplicate
134 # parameter for assembly
135 fermiMinOverlap
= options
.fermiMinOverlap
136 fermi
= options
.fermi
138 peakio
= open( peakbedfile
)
142 fs
= l
.rstrip().split()
144 peaks
.add( fs
[0].encode(), int(fs
[1]), int(fs
[2]), name
=b
"%d" % i
)
147 chrs
= peaks
.get_chr_names()
149 tbam
= BAMaccessor( peaktfile
)
151 cbam
= BAMaccessor( peakcfile
)
152 assert tbam
.get_chromosomes()[0] in cbam
.get_chromosomes() or cbam
.get_chromosomes()[0] in tbam
.get_chromosomes(), Exception("It seems Treatment and Control BAM use different naming for chromosomes! Check headers of both files.")
153 #assert tbam.get_chromosomes() == cbam.get_chromosomes(), Exception("Treatment and Control BAM files have different orders of sorted chromosomes! Please check BAM Headers and re-sort BAM files.")
160 # prepare and write header of output file (.vcf)
161 ovcf
= open(options
.ofile
, "w")
162 tmpcmdstr
= " --fermi "+ fermi
+ " --fermi-overlap "+str(fermiMinOverlap
)
163 ovcf
.write ( VCFHEADER
% (datetime
.date
.today().strftime("%Y%m%d"), MACS_VERSION
, " ".join(sys
.argv
[1:] + ["-Q", str(minQ
), "-D", str(maxDuplicate
), "--max-ar", str(max_allowed_ar
), "--top2alleles-mratio", str(top2allelesminr
), "--top2allele-count", str(min_altallele_count
), "-g", str(min_heter_GQ
), "-G", str(min_homo_GQ
), tmpcmdstr
]) ) + "\n" )
164 for (chrom
, chrlength
) in tbam
.get_rlengths().items():
165 ovcf
.write( "##contig=<ID=%s,length=%d,assembly=NA>\n" % ( chrom
.decode(), chrlength
) )
166 ovcf
.write ( "\t".join( ("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","SAMPLE") ) + "\n" )
172 #t_call_top2alleles = 0
179 for chrom
in tbam
.get_chromosomes():
180 peaks_chr
= peaks
.get_data_from_chrom( chrom
)
181 for peak
in peaks_chr
:
182 # note, when we extract reads from BAM within a peak
183 # region, we assume BAM should be sorted and the BAM
184 # should be generated from "samtools view -L" process.
186 # print ( "---begin of peak---")
187 info ( f
"Peak: {chrom.decode()} {peak['start']} {peak['end']}" )
189 flag_todo_lassembly
= False
194 ra_collection
= RACollection( chrom
, peak
, tbam
.get_reads_in_region( chrom
, peak
["start"], peak
["end"], maxDuplicate
=maxDuplicate
), cbam
.get_reads_in_region( chrom
, peak
["start"], peak
["end"], maxDuplicate
=maxDuplicate
) )
196 ra_collection
= RACollection( chrom
, peak
, tbam
.get_reads_in_region( chrom
, peak
["start"], peak
["end"], maxDuplicate
=maxDuplicate
) )
198 info ("No reads found in peak: ", chrom
.decode(), peak
["start"], peak
["end"], ". Skipped!")
199 # while there is no reads in peak region, simply skip it.
202 ra_collection
.remove_outliers( percent
= 5 )
203 t_prepare_ra
+= time() - t0
205 # print ( "Reads in Peak:")
206 # print ( ra_collection.get_FASTQ().decode() )
208 s
= ra_collection
["peak_refseq"]
210 peak_variants
= PeakVariants( chrom
.decode(), peak
["start"], peak
["end"], s
)
213 if fermi
== "auto" or fermi
== "off":
214 # first pass to call variant w/o assembly
215 # multiprocessing the following part
216 t_call_variants_0
= time()
217 info ( " Call variants w/o assembly")
219 # -- now make multi processes
220 # divide right-left into NP parts
221 window_size
= ceil( ( ra_collection
["right"] - ra_collection
["left"] ) / NP
)
224 # this partial function will only be used in multiprocessing
225 p_call_variants_at_range
= partial(call_variants_at_range
, s
=s
, collection
=ra_collection
, top2allelesminr
=top2allelesminr
, max_allowed_ar
= max_allowed_ar
, min_altallele_count
= min_altallele_count
, min_homo_GQ
= min_homo_GQ
, min_heter_GQ
= min_heter_GQ
, minQ
=minQ
)
228 for i
in range( NP
):
229 l
= i
* window_size
+ ra_collection
["left"]
230 r
= min( (i
+ 1) * window_size
+ ra_collection
["left"], ra_collection
["right"] )
231 ranges
.append( (l
, r
) )
233 mapresults
= P
.map_async( p_call_variants_at_range
, ranges
)
236 results
= mapresults
.get(timeout
=window_size
*300)
237 for i
in range( NP
):
238 for result
in results
[ i
]:
239 peak_variants
.add_variant( result
[0], result
[1] )
241 t_call_variants
+= time() - t_call_variants_0
243 # Next, check if we should do local assembly
244 if ( fermi
== "auto" and ( peak_variants
.has_indel() or peak_variants
.has_refer_biased_01() ) ) or fermi
== "on":
245 #print( peak_variants.has_indel() )
246 #print( peak_variants.has_refer_biased_01() )
248 # invoke fermi to assemble local sequence and filter out those can not be mapped to unitigs.
249 info ( " Try to call variants w/ fermi-lite assembly")
250 unitig_collection
= ra_collection
.build_unitig_collection( fermiMinOverlap
)
251 if unitig_collection
== -1:
252 info(" Too many mismatches found while assembling the sequence, we will skip this region entirely!")
254 elif unitig_collection
== 0:
255 info ( " Failed to assemble unitigs, fall back to previous results" )
256 if peak_variants
.n_variants() > 0:
257 peak_variants
.fix_indels()
258 ovcf
.write( peak_variants
.toVCF() )
260 # uncomment the following to print those assembled unitigs and their alignments to reference genome
261 #for u in unitig_collection["URAs_list"]:
262 # print( u["seq"].decode(), u["lpos"], u["rpos"], u["count"] )
263 # print( "a",u["unitig_aln"].decode() )
264 # print( "r",u["reference_aln"].decode() )
266 # if we do not assemble, write results now
267 if peak_variants
.n_variants() > 0:
268 peak_variants
.fix_indels()
269 ovcf
.write( peak_variants
.toVCF() )
272 # reach here only if we need l assembly and the assembly returns result
274 # If a peak has no indel but with refer_biased_01, we
275 # revisit all refer_biased_01 now. We do not use
276 # multiprocessing here for simplicity since there won't be
277 # too many in a peak region.
278 if ( fermi
== "auto" and ( not peak_variants
.has_indel() ) and peak_variants
.has_refer_biased_01() ):
279 pos_tobe_revisit
= peak_variants
.get_refer_biased_01s()
280 for i
in pos_tobe_revisit
:
281 ref_nt
= chr(s
[ i
-ra_collection
["left"] ] ).encode()
283 peak_variants
.remove_variant( i
)
285 PRI
= unitig_collection
.get_PosReadsInfo_ref_pos ( i
, ref_nt
, Q
=minQ
)
286 if PRI
.raw_read_depth( opt
="T" ) == 0: # skip if coverage is 0
287 peak_variants
.remove_variant( i
)
289 PRI
.update_top_alleles( top2allelesminr
, min_altallele_count
, max_allowed_ar
)
290 PRI
.call_GT( max_allowed_ar
)
291 PRI
.apply_GQ_cutoff(min_homo_GQ
, min_heter_GQ
)
292 if not PRI
.filterflag():
293 peak_variants
.replace_variant( i
, PRI
.toVariant() )
295 peak_variants
.remove_variant( i
)
296 if peak_variants
.n_variants() > 0:
297 peak_variants
.fix_indels()
298 ovcf
.write( peak_variants
.toVCF() )
301 # in this case, we call variants at every locations in the peak based on local assembly.
302 if ( fermi
== "on" or ( fermi
== "auto" and peak_variants
.has_indel() ) ):
303 peak_variants
= PeakVariants( chrom
.decode(), peak
["start"], peak
["end"], s
) #reset
305 # --- make multi processes
306 # divide right-left into NP parts
307 window_size
= ceil( ( ra_collection
["right"] - ra_collection
["left"] ) / NP
)
309 # this partial function will only be used in multiprocessing
310 p_call_variants_at_range
= partial(call_variants_at_range
, s
=s
, collection
=unitig_collection
, top2allelesminr
=top2allelesminr
, max_allowed_ar
= max_allowed_ar
, min_altallele_count
= min_altallele_count
, min_homo_GQ
= min_homo_GQ
, min_heter_GQ
= min_heter_GQ
, minQ
=minQ
)
312 for i
in range( NP
):
313 l
= i
* window_size
+ ra_collection
["left"]
314 r
= min( (i
+ 1) * window_size
+ ra_collection
["left"], ra_collection
["right"] )
315 ranges
.append( (l
, r
) )
317 mapresults
= P
.map_async( p_call_variants_at_range
, ranges
)
320 results
= mapresults
.get(timeout
=window_size
*300)
321 for i
in range( NP
):
322 for result
in results
[ i
]:
323 peak_variants
.add_variant( result
[0], result
[1] )
325 if peak_variants
.n_variants() > 0:
326 peak_variants
.fix_indels()
327 ovcf
.write( peak_variants
.toVCF() )
329 #print ("time to retrieve read alignment information from BAM:",t_prepare_ra,"(",round( 100 * t_prepare_ra/t_total, 2),"% )")
332 def call_variants_at_range ( lr
, s
, collection
, top2allelesminr
, max_allowed_ar
, min_altallele_count
, min_homo_GQ
, min_heter_GQ
, minQ
):
333 #def call_variants_at_range ( lr, chrom, s, collection, top2allelesminr, max_allowed_ar, min_homo_GQ, min_heter_GQ ):
335 for i
in range( lr
[ 0 ], lr
[ 1 ] ):
336 ref_nt
= chr(s
[ i
-collection
["left"] ] ).encode()
340 PRI
= collection
.get_PosReadsInfo_ref_pos ( i
, ref_nt
, Q
=minQ
)
341 if PRI
.raw_read_depth( opt
="T" ) == 0: # skip if coverage is 0
344 PRI
.update_top_alleles( top2allelesminr
, min_altallele_count
, max_allowed_ar
)
345 if not PRI
.filterflag():
346 #PRI.update_top_alleles( top2allelesminr )
347 PRI
.call_GT( max_allowed_ar
)
348 PRI
.apply_GQ_cutoff(min_homo_GQ
, min_heter_GQ
)
349 if not PRI
.filterflag():
350 result
.append( ( i
, PRI
.toVariant() ) )