MACS3/Commands/callvar_cmd.py

   1 # Time-stamp: <2021-03-10 23:51:54 Tao Liu>
   2
   3 """Description: macs call
   4
   5 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
   6
   7 This code is free software; you can redistribute it and/or modify it
   8 under the terms of the BSD License (see the file COPYING included
   9 with the distribution).
  10
  11 @status: release candidate
  12 @version: $Id$
  13 @author:  Tao Liu
  14 @contact: tliu4@buffalo.edu
  15 """
  16
  17 # ------------------------------------
  18 # python modules
  19 # ------------------------------------
  20
  21 import logging
  22 import datetime
  23 import sys
  24 from functools import partial
  25 import multiprocessing as mp
  26
  27 from time import time
  28 from math import ceil
  29
  30 # ------------------------------------
  31 # own python modules
  32 # ------------------------------------
  33 from MACS3.Utilities.Constants import *
  34 from MACS3.Utilities.OptValidator import opt_validate_callvar
  35 from MACS3.IO.PeakIO import PeakIO
  36 from MACS3.IO.BAM import BAMaccessor
  37 from MACS3.Signal.RACollection import RACollection
  38 from MACS3.Signal.PeakVariants import PeakVariants
  39
  40
  41 VCFHEADER_0="""##fileformat=VCFv4.1
  42 ##fileDate=%s
  43 ##source=MACS_V%s
  44 ##Program_Args=%s
  45 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
  46 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
  47 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
  48 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
  49 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
  50 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
  51 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
  52 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
  53 ##INFO=<ID=lnLHOMOMAJOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with major allele model">
  54 ##INFO=<ID=lnLHOMOMINOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with minor allele model">
  55 ##INFO=<ID=lnLHETERNOAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with no allele-specific model">
  56 ##INFO=<ID=lnLHETERAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with allele-specific model">
  57 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Float,Description="BIC value of homozygous with major allele model">
  58 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Float,Description="BIC value of homozygous with minor allele model">
  59 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Float,Description="BIC value of heterozygous with no allele-specific model">
  60 ##INFO=<ID=BICHETERAS,Number=1,Type=Float,Description="BIC value of heterozygous with allele-specific model">
  61 ##INFO=<ID=GQHOMO,Number=1,Type=Integer,Description="Genotype quality of homozygous with major allele model">
  62 ##INFO=<ID=GQHETERNOAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with no allele-specific model">
  63 ##INFO=<ID=GQHETERAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with allele-specific model">
  64 ##INFO=<ID=GQHETERASsig,Number=1,Type=Integer,Description="Genotype quality of allele-specific significance compared with no allele-specific model">
  65 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
  66 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  67 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="read depth after filtering">
  68 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality score">"""
  69
  70 VCFHEADER="""##fileformat=VCFv4.1
  71 ##fileDate=%s
  72 ##source=MACS_V%s
  73 ##Program_Args=%s
  74 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
  75 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
  76 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
  77 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
  78 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
  79 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
  80 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
  81 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
  82 ##INFO=<ID=DBIC,Number=.,Type=Float,Description="Difference of BIC of selected model vs second best alternative model">
  83 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Integer,Description="BIC of homozygous with major allele model">
  84 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Integer,Description="BIC of homozygous with minor allele model">
  85 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Integer,Description="BIC of heterozygous with no allele-specific model">
  86 ##INFO=<ID=BICHETERAS,Number=1,Type=Integer,Description="BIC of heterozygous with allele-specific model">
  87 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
  88 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  89 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth after filtering bad reads">
  90 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality score">
  91 ##FORMAT=<ID=PL,Number=3,Type=Integer,Description="Normalized, Phred-scaled genotype likelihoods for 00, 01, 11 genotype">"""
  92
  93 # ------------------------------------
  94 # Main function
  95 # ------------------------------------
  96
  97 def check_names(treat, control, error_stream):
  98     """check common chromosome names"""
  99     tchrnames = set(treat.get_chr_names())
 100     cchrnames = set(control.get_chr_names())
 101     commonnames = tchrnames.intersection(cchrnames)
 102     if len(commonnames)==0:
 103         error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
 104         error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
 105         error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
 106         sys.exit()
 107
 108
 109 def run( args ):
 110     """The Main function/pipeline for MACS
 111
 112     """
 113     options = opt_validate_callvar( args )
 114
 115     info = options.info
 116     warn = options.warn
 117     debug = options.debug
 118     error = options.error
 119
 120     peakbedfile = options.peakbed
 121     peaktfile = options.tfile
 122     peakcfile = options.cfile
 123     top2allelesminr = options.top2allelesMinRatio
 124     min_altallele_count = options.altalleleMinCount
 125     max_allowed_ar = options.maxAR
 126     NP = options.np
 127     if NP<=0:
 128         NP = 1
 129     min_homo_GQ = options.GQCutoffHomo
 130     min_heter_GQ = options.GQCutoffHetero
 131     minQ = options.Q
 132     maxDuplicate = options.maxDuplicate
 133
 134     # parameter for assembly
 135     fermiMinOverlap = options.fermiMinOverlap
 136     fermi = options.fermi
 137
 138     peakio = open( peakbedfile )
 139     peaks = PeakIO()
 140     i = 0
 141     for l in peakio:
 142         fs = l.rstrip().split()
 143         i += 1
 144         peaks.add( fs[0].encode(), int(fs[1]), int(fs[2]), name=b"%d" % i )
 145     peaks.sort()
 146
 147     chrs = peaks.get_chr_names()
 148
 149     tbam = BAMaccessor( peaktfile )
 150     if peakcfile:
 151         cbam = BAMaccessor( peakcfile )
 152         assert tbam.get_chromosomes()[0] in cbam.get_chromosomes() or cbam.get_chromosomes()[0] in tbam.get_chromosomes(), Exception("It seems Treatment and Control BAM use different naming for chromosomes! Check headers of both files.")
 153         #assert tbam.get_chromosomes() == cbam.get_chromosomes(), Exception("Treatment and Control BAM files have different orders of sorted chromosomes! Please check BAM Headers and re-sort BAM files.")
 154     else:
 155         cbam = None
 156
 157
 158     ra_collections = []
 159
 160     # prepare and write header of output file (.vcf)
 161     ovcf = open(options.ofile, "w")
 162     tmpcmdstr = " --fermi "+ fermi+ " --fermi-overlap "+str(fermiMinOverlap)
 163     ovcf.write ( VCFHEADER % (datetime.date.today().strftime("%Y%m%d"), MACS_VERSION, " ".join(sys.argv[1:] + ["-Q", str(minQ), "-D", str(maxDuplicate), "--max-ar", str(max_allowed_ar), "--top2alleles-mratio", str(top2allelesminr), "--top2allele-count", str(min_altallele_count), "-g", str(min_heter_GQ), "-G", str(min_homo_GQ), tmpcmdstr]) ) + "\n" )
 164     for (chrom, chrlength) in tbam.get_rlengths().items():
 165         ovcf.write( "##contig=<ID=%s,length=%d,assembly=NA>\n" % ( chrom.decode(), chrlength ) )
 166     ovcf.write ( "\t".join( ("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","SAMPLE") ) + "\n" )
 167
 168     # to get time
 169     t_total = 0
 170     t_prepare_ra = 0
 171     t_assemble = 0
 172     #t_call_top2alleles = 0
 173     #t_call_lnL = 0
 174     t_call_variants = 0
 175     t_call_GT = 0
 176     #t_call_to_vcf = 0
 177     t_total_0 = time()
 178
 179     for chrom in tbam.get_chromosomes():
 180         peaks_chr = peaks.get_data_from_chrom( chrom )
 181         for peak in peaks_chr:
 182             # note, when we extract reads from BAM within a peak
 183             # region, we assume BAM should be sorted and the BAM
 184             # should be generated from "samtools view -L" process.
 185
 186             # print ( "---begin of peak---")
 187             info ( f"Peak: {chrom.decode()} {peak['start']} {peak['end']}" )
 188
 189             flag_todo_lassembly = False
 190
 191             t0 = time()
 192             try:
 193                 if cbam:
 194                     ra_collection = RACollection( chrom, peak, tbam.get_reads_in_region( chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate ), cbam.get_reads_in_region( chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate) )
 195                 else:
 196                     ra_collection = RACollection( chrom, peak, tbam.get_reads_in_region( chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate ) )
 197             except:
 198                 info ("No reads found in peak: ", chrom.decode(), peak["start"], peak["end"], ". Skipped!")
 199                 # while there is no reads in peak region, simply skip it.
 200                 continue
 201
 202             ra_collection.remove_outliers( percent = 5 )
 203             t_prepare_ra += time() - t0
 204
 205             # print ( "Reads in Peak:")
 206             # print ( ra_collection.get_FASTQ().decode() )
 207
 208             s = ra_collection["peak_refseq"]
 209
 210             peak_variants = PeakVariants( chrom.decode(), peak["start"], peak["end"], s )
 211
 212
 213             if fermi == "auto" or fermi == "off":
 214                 # first pass to call variant w/o assembly
 215                 # multiprocessing the following part
 216                 t_call_variants_0 = time()
 217                 info ( " Call variants w/o assembly")
 218
 219                 # -- now make multi processes
 220                 # divide right-left into NP parts
 221                 window_size = ceil( ( ra_collection["right"] - ra_collection["left"] ) / NP )
 222
 223                 P = mp.Pool( NP )
 224                 # this partial function will only be used in multiprocessing
 225                 p_call_variants_at_range =  partial(call_variants_at_range, s=s, collection=ra_collection, top2allelesminr=top2allelesminr, max_allowed_ar = max_allowed_ar, min_altallele_count = min_altallele_count, min_homo_GQ = min_homo_GQ, min_heter_GQ = min_heter_GQ, minQ=minQ)
 226
 227                 ranges = []
 228                 for i in range( NP ):
 229                     l = i * window_size + ra_collection["left"]
 230                     r = min( (i + 1) * window_size + ra_collection["left"], ra_collection["right"] )
 231                     ranges.append( (l, r) )
 232
 233                 mapresults = P.map_async( p_call_variants_at_range, ranges )
 234                 P.close()
 235                 P.join()
 236                 results = mapresults.get(timeout=window_size*300)
 237                 for i in range( NP ):
 238                     for result in results[ i ]:
 239                         peak_variants.add_variant( result[0], result[1] )
 240
 241                 t_call_variants += time() - t_call_variants_0
 242
 243             # Next, check if we should do local assembly
 244             if ( fermi == "auto" and ( peak_variants.has_indel() or peak_variants.has_refer_biased_01() ) ) or fermi == "on":
 245                 #print( peak_variants.has_indel() )
 246                 #print( peak_variants.has_refer_biased_01() )
 247
 248                 # invoke fermi to assemble local sequence and filter out those can not be mapped to unitigs.
 249                 info ( " Try to call variants w/ fermi-lite assembly")
 250                 unitig_collection = ra_collection.build_unitig_collection( fermiMinOverlap )
 251                 if unitig_collection == -1:
 252                     info(" Too many mismatches found while assembling the sequence, we will skip this region entirely!")
 253                     continue
 254                 elif unitig_collection == 0:
 255                     info ( "  Failed to assemble unitigs, fall back to previous results" )
 256                     if peak_variants.n_variants() > 0:
 257                         peak_variants.fix_indels()
 258                         ovcf.write( peak_variants.toVCF() )
 259                         continue
 260                 # uncomment the following to print those assembled unitigs and their alignments to reference genome
 261                 #for u in unitig_collection["URAs_list"]:
 262                 #    print( u["seq"].decode(), u["lpos"], u["rpos"], u["count"] )
 263                 #    print( "a",u["unitig_aln"].decode() )
 264                 #    print( "r",u["reference_aln"].decode() )
 265             else:
 266                 # if we do not assemble, write results now
 267                 if peak_variants.n_variants() > 0:
 268                     peak_variants.fix_indels()
 269                     ovcf.write( peak_variants.toVCF() )
 270                     continue
 271
 272             # reach here only if we need l assembly and the assembly returns result
 273
 274             # If a peak has no indel but with refer_biased_01, we
 275             # revisit all refer_biased_01 now. We do not use
 276             # multiprocessing here for simplicity since there won't be
 277             # too many in a peak region.
 278             if ( fermi == "auto" and ( not peak_variants.has_indel() ) and peak_variants.has_refer_biased_01()  ):
 279                 pos_tobe_revisit = peak_variants.get_refer_biased_01s()
 280                 for i in pos_tobe_revisit:
 281                     ref_nt = chr(s[ i-ra_collection["left"] ] ).encode()
 282                     if ref_nt == b'N':
 283                         peak_variants.remove_variant( i )
 284                         continue
 285                     PRI = unitig_collection.get_PosReadsInfo_ref_pos ( i, ref_nt, Q=minQ )
 286                     if PRI.raw_read_depth( opt="T" ) == 0: # skip if coverage is 0
 287                         peak_variants.remove_variant( i )
 288                         continue
 289                     PRI.update_top_alleles( top2allelesminr, min_altallele_count, max_allowed_ar )
 290                     PRI.call_GT( max_allowed_ar )
 291                     PRI.apply_GQ_cutoff(min_homo_GQ, min_heter_GQ)
 292                     if not PRI.filterflag():
 293                         peak_variants.replace_variant( i, PRI.toVariant() )
 294                     else:
 295                         peak_variants.remove_variant( i )
 296                 if peak_variants.n_variants() > 0:
 297                     peak_variants.fix_indels()
 298                     ovcf.write( peak_variants.toVCF() )
 299                     continue
 300
 301             # in this case, we call variants at every locations in the peak based on local assembly.
 302             if ( fermi == "on" or ( fermi == "auto" and peak_variants.has_indel() ) ):
 303                 peak_variants = PeakVariants( chrom.decode(), peak["start"], peak["end"], s ) #reset
 304
 305                 # --- make multi processes
 306                 # divide right-left into NP parts
 307                 window_size = ceil( ( ra_collection["right"] - ra_collection["left"] ) / NP )
 308                 P = mp.Pool( NP )
 309                 # this partial function will only be used in multiprocessing
 310                 p_call_variants_at_range =  partial(call_variants_at_range, s=s, collection=unitig_collection, top2allelesminr=top2allelesminr, max_allowed_ar = max_allowed_ar, min_altallele_count = min_altallele_count, min_homo_GQ = min_homo_GQ, min_heter_GQ = min_heter_GQ, minQ=minQ)
 311                 ranges = []
 312                 for i in range( NP ):
 313                     l = i * window_size + ra_collection["left"]
 314                     r = min( (i + 1) * window_size + ra_collection["left"], ra_collection["right"] )
 315                     ranges.append( (l, r) )
 316
 317                 mapresults = P.map_async( p_call_variants_at_range, ranges )
 318                 P.close()
 319                 P.join()
 320                 results = mapresults.get(timeout=window_size*300)
 321                 for i in range( NP ):
 322                     for result in results[ i ]:
 323                         peak_variants.add_variant( result[0], result[1] )
 324
 325             if peak_variants.n_variants() > 0:
 326                 peak_variants.fix_indels()
 327                 ovcf.write( peak_variants.toVCF() )
 328
 329     #print ("time to retrieve read alignment information from BAM:",t_prepare_ra,"(",round( 100 * t_prepare_ra/t_total, 2),"% )")
 330     return
 331
 332 def call_variants_at_range ( lr, s, collection, top2allelesminr, max_allowed_ar, min_altallele_count, min_homo_GQ, min_heter_GQ, minQ ):
 333 #def call_variants_at_range ( lr, chrom, s, collection, top2allelesminr, max_allowed_ar, min_homo_GQ, min_heter_GQ ):
 334     result = []
 335     for i in range( lr[ 0 ], lr[ 1 ] ):
 336         ref_nt = chr(s[ i-collection["left"] ] ).encode()
 337         if ref_nt == b'N':
 338             continue
 339
 340         PRI = collection.get_PosReadsInfo_ref_pos ( i, ref_nt, Q=minQ )
 341         if PRI.raw_read_depth( opt="T" ) == 0: # skip if coverage is 0
 342             continue
 343
 344         PRI.update_top_alleles( top2allelesminr, min_altallele_count, max_allowed_ar )
 345         if not PRI.filterflag():
 346             #PRI.update_top_alleles( top2allelesminr )
 347             PRI.call_GT( max_allowed_ar )
 348             PRI.apply_GQ_cutoff(min_homo_GQ, min_heter_GQ)
 349         if not PRI.filterflag():
 350             result.append( ( i, PRI.toVariant() ) )
 351     return result
 352
 353