MACS3/Commands/callvar_cmd.py

   1 # Time-stamp: <2024-10-11 10:28:07 Tao Liu>
   2
   3 """Description: Call variants directly
   4
   5 Copyright (c) 2017-2023 Tao Liu <vladimir.liu@gmail.com>
   6
   7 This code is free software; you can redistribute it and/or modify it
   8 under the terms of the BSD License (see the file COPYING included
   9 with the distribution).
  10
  11 @status: release candidate
  12 @version: $Id$
  13 @author:  Tao Liu
  14 @contact: vladimir.liu@gmail.com
  15 """
  16
  17 # ------------------------------------
  18 # python modules
  19 # ------------------------------------
  20
  21 import datetime
  22 import sys
  23 from functools import partial
  24 import multiprocessing as mp
  25
  26 # from time import time
  27 from math import ceil
  28
  29 # ------------------------------------
  30 # own python modules
  31 # ------------------------------------
  32 from MACS3.Utilities.Constants import MACS_VERSION
  33 from MACS3.Utilities.OptValidator import opt_validate_callvar
  34 from MACS3.IO.PeakIO import PeakIO
  35 from MACS3.IO.BAM import BAMaccessor
  36 from MACS3.Signal.RACollection import RACollection
  37 from MACS3.Signal.PeakVariants import PeakVariants
  38
  39
  40 VCFHEADER_0 = """##fileformat=VCFv4.1
  41 ##fileDate=%s
  42 ##source=MACS_V%s
  43 ##Program_Args=%s
  44 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
  45 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
  46 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
  47 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
  48 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
  49 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
  50 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
  51 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
  52 ##INFO=<ID=lnLHOMOMAJOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with major allele model">
  53 ##INFO=<ID=lnLHOMOMINOR,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of homozygous with minor allele model">
  54 ##INFO=<ID=lnLHETERNOAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with no allele-specific model">
  55 ##INFO=<ID=lnLHETERAS,Number=1,Type=Float,Description="Log(e) scaled genotype likelihoods of heterozygous with allele-specific model">
  56 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Float,Description="BIC value of homozygous with major allele model">
  57 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Float,Description="BIC value of homozygous with minor allele model">
  58 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Float,Description="BIC value of heterozygous with no allele-specific model">
  59 ##INFO=<ID=BICHETERAS,Number=1,Type=Float,Description="BIC value of heterozygous with allele-specific model">
  60 ##INFO=<ID=GQHOMO,Number=1,Type=Integer,Description="Genotype quality of homozygous with major allele model">
  61 ##INFO=<ID=GQHETERNOAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with no allele-specific model">
  62 ##INFO=<ID=GQHETERAS,Number=1,Type=Integer,Description="Genotype quality of heterozygous with allele-specific model">
  63 ##INFO=<ID=GQHETERASsig,Number=1,Type=Integer,Description="Genotype quality of allele-specific significance compared with no allele-specific model">
  64 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
  65 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  66 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="read depth after filtering">
  67 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality score">"""
  68
  69 VCFHEADER = """##fileformat=VCFv4.1
  70 ##fileDate=%s
  71 ##source=MACS_V%s
  72 ##Program_Args=%s
  73 ##INFO=<ID=M,Number=.,Type=String,Description="MACS Model with minimum BIC value">
  74 ##INFO=<ID=MT,Number=.,Type=String,Description="Mutation type: SNV/Insertion/Deletion">
  75 ##INFO=<ID=DPT,Number=1,Type=Integer,Description="Depth Treatment: Read depth in ChIP-seq data">
  76 ##INFO=<ID=DPC,Number=1,Type=Integer,Description="Depth Control: Read depth in control data">
  77 ##INFO=<ID=DP1T,Number=.,Type=String,Description="Read depth of top1 allele in ChIP-seq data">
  78 ##INFO=<ID=DP2T,Number=.,Type=String,Description="Read depth of top2 allele in ChIP-seq data">
  79 ##INFO=<ID=DP1C,Number=.,Type=String,Description="Read depth of top1 allele in control data">
  80 ##INFO=<ID=DP2C,Number=.,Type=String,Description="Read depth of top2 allele in control data">
  81 ##INFO=<ID=DBIC,Number=.,Type=Float,Description="Difference of BIC of selected model vs second best alternative model">
  82 ##INFO=<ID=BICHOMOMAJOR,Number=1,Type=Integer,Description="BIC of homozygous with major allele model">
  83 ##INFO=<ID=BICHOMOMINOR,Number=1,Type=Integer,Description="BIC of homozygous with minor allele model">
  84 ##INFO=<ID=BICHETERNOAS,Number=1,Type=Integer,Description="BIC of heterozygous with no allele-specific model">
  85 ##INFO=<ID=BICHETERAS,Number=1,Type=Integer,Description="BIC of heterozygous with allele-specific model">
  86 ##INFO=<ID=AR,Number=1,Type=Float,Description="Estimated allele ratio of heterozygous with allele-specific model">
  87 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  88 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth after filtering bad reads">
  89 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality score">
  90 ##FORMAT=<ID=PL,Number=3,Type=Integer,Description="Normalized, Phred-scaled genotype likelihoods for 00, 01, 11 genotype">"""
  91
  92 # ------------------------------------
  93 # Main function
  94 # ------------------------------------
  95
  96
  97 def check_names(treat, control, error_stream):
  98     """check common chromosome names"""
  99     tchrnames = set(treat.get_chr_names())
 100     cchrnames = set(control.get_chr_names())
 101     commonnames = tchrnames.intersection(cchrnames)
 102     if len(commonnames) == 0:
 103         error_stream("No common chromosome names can be found from treatment and control! Check your input files! MACS will quit...")
 104         error_stream("Chromosome names in treatment: %s" % ",".join(sorted(tchrnames)))
 105         error_stream("Chromosome names in control: %s" % ",".join(sorted(cchrnames)))
 106         sys.exit()
 107
 108
 109 def run(args):
 110     """The Main function/pipeline for MACS
 111
 112     """
 113     options = opt_validate_callvar(args)
 114
 115     info = options.info
 116     # warn = options.warn
 117     # debug = options.debug
 118     # error = options.error
 119
 120     peakbedfile = options.peakbed
 121     tfile = options.tfile
 122     cfile = options.cfile
 123     top2allelesminr = options.top2allelesMinRatio
 124     min_altallele_count = options.altalleleMinCount
 125     max_allowed_ar = options.maxAR
 126     NP = options.np
 127     if NP <= 0:
 128         NP = 1
 129     min_homo_GQ = options.GQCutoffHomo
 130     min_heter_GQ = options.GQCutoffHetero
 131     minQ = options.Q
 132     maxDuplicate = options.maxDuplicate
 133
 134     # parameter for assembly
 135     fermiMinOverlap = options.fermiMinOverlap
 136     fermi = options.fermi
 137
 138     peakio = open(peakbedfile)
 139     peaks = PeakIO()
 140     #i = 0
 141     for t_peak in peakio:
 142         fs = t_peak.rstrip().split()
 143         # i += 1
 144         peaks.add(fs[0].encode(), int(fs[1]), int(fs[2]))  # , name=b"%d" % i)
 145     peaks.sort()
 146
 147     # chrs = peaks.get_chr_names()
 148
 149     tbam = BAMaccessor(tfile)
 150     if cfile:
 151         cbam = BAMaccessor(cfile)
 152         assert tbam.get_chromosomes()[0] in cbam.get_chromosomes() or cbam.get_chromosomes()[0] in tbam.get_chromosomes(), Exception("It seems Treatment and Control BAM use different naming for chromosomes! Check headers of both files.")
 153         # assert tbam.get_chromosomes() == cbam.get_chromosomes(), Exception("Treatment and Control BAM files have different orders of sorted chromosomes! Please check BAM Headers and re-sort BAM files.")
 154     else:
 155         cbam = None
 156
 157     # ra_collections = []
 158
 159     # prepare and write header of output file (.vcf)
 160     ovcf = open(options.ofile, "w")
 161     tmpcmdstr = " --fermi " + fermi + " --fermi-overlap "+str(fermiMinOverlap)
 162     ovcf.write(VCFHEADER % (datetime.date.today().strftime("%Y%m%d"), MACS_VERSION, " ".join(sys.argv[1:] + ["-Q", str(minQ), "-D", str(maxDuplicate), "--max-ar", str(max_allowed_ar), "--top2alleles-mratio", str(top2allelesminr), "--top2allele-count", str(min_altallele_count), "-g", str(min_heter_GQ), "-G", str(min_homo_GQ), tmpcmdstr])) + "\n")
 163     for (chrom, chrlength) in tbam.get_rlengths().items():
 164         ovcf.write("##contig=<ID=%s,length=%d,assembly=NA>\n" % (chrom.decode(), chrlength))
 165     ovcf.write("\t".join(("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE")) + "\n")
 166
 167     # to get time
 168     # t_total = 0
 169     # t_prepare_ra = 0
 170     # t_assemble = 0
 171     # t_call_top2alleles = 0
 172     # t_call_lnL = 0
 173     # t_call_variants = 0
 174     # t_call_GT = 0
 175     #t_call_to_vcf = 0
 176     # t_total_0 = time()
 177
 178     for chrom in sorted(tbam.get_chromosomes()):
 179         peaks_chr = peaks.get_data_from_chrom(chrom)
 180         for peak in peaks_chr:
 181             # note, when we extract reads from BAM within a peak
 182             # region, we assume BAM should be sorted and the BAM
 183             # should be generated from "samtools view -L" process.
 184
 185             # print ( "---begin of peak---")
 186             info(f"Peak: {chrom.decode()} {peak['start']} {peak['end']}")
 187
 188             # flag_todo_lassembly = False
 189
 190             # t0 = time()
 191             try:
 192                 if cbam:
 193                     ra_collection = RACollection(chrom, peak,
 194                                                  tbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate),
 195                                                  cbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate))
 196                 else:
 197                     ra_collection = RACollection(chrom, peak,
 198                                                  tbam.get_reads_in_region(chrom, peak["start"], peak["end"], maxDuplicate=maxDuplicate))
 199             except Exception:
 200                 info("No reads found in this peak. Skipped")
 201                 # while there is no reads in peak region, simply skip it.
 202                 continue
 203
 204             ra_collection.remove_outliers(percent=5)
 205             # t_prepare_ra += time() - t0
 206
 207             # print ( "Reads in Peak:")
 208             # print ( ra_collection.get_FASTQ().decode() )
 209
 210             s = ra_collection["peak_refseq"]
 211
 212             peak_variants = PeakVariants(chrom.decode(), peak["start"], peak["end"], s)
 213
 214             if fermi == "auto" or fermi == "off":
 215                 # first pass to call variant w/o assembly
 216                 # multiprocessing the following part
 217                 # t_call_variants_0 = time()
 218                 info(" Call variants w/o assembly")
 219
 220                 # -- now make multi processes
 221                 # divide right-left into NP parts
 222                 window_size = ceil((ra_collection["right"] - ra_collection["left"]) / NP)
 223
 224                 P = mp.Pool(NP)
 225                 # this partial function will only be used in multiprocessing
 226                 p_call_variants_at_range = partial(call_variants_at_range, s=s, collection=ra_collection, top2allelesminr=top2allelesminr, max_allowed_ar=max_allowed_ar, min_altallele_count=min_altallele_count, min_homo_GQ=min_homo_GQ, min_heter_GQ=min_heter_GQ, minQ=minQ)
 227
 228                 ranges = []
 229                 for i in range(NP):
 230                     l_d = i * window_size + ra_collection["left"]
 231                     r_d = min((i + 1) * window_size + ra_collection["left"], ra_collection["right"])
 232                     ranges.append((l_d, r_d))
 233
 234                 mapresults = P.map_async(p_call_variants_at_range, ranges)
 235                 P.close()
 236                 P.join()
 237                 results = mapresults.get(timeout=window_size*300)
 238                 for i in range(NP):
 239                     for result in results[i]:
 240                         peak_variants.add_variant(result[0], result[1])
 241
 242                 # t_call_variants += time() - t_call_variants_0
 243
 244             # Next, check if we should do local assembly
 245             if (fermi == "auto" and (peak_variants.has_indel() or peak_variants.has_refer_biased_01())) or fermi == "on":
 246                 #print( peak_variants.has_indel() )
 247                 #print( peak_variants.has_refer_biased_01() )
 248
 249                 # invoke fermi to assemble local sequence and filter out those can not be mapped to unitigs.
 250                 info(" Try to call variants w/ fermi-lite assembly")
 251                 unitig_collection = ra_collection.build_unitig_collection(fermiMinOverlap)
 252                 if unitig_collection == -1:
 253                     info(" Too many mismatches found while assembling the sequence, we will skip this region entirely!")
 254                     continue
 255                 elif unitig_collection == 0:
 256                     info("  Failed to assemble unitigs, fall back to previous results")
 257                     if peak_variants.n_variants() > 0:
 258                         peak_variants.fix_indels()
 259                         ovcf.write(peak_variants.toVCF())
 260                         continue
 261                 # uncomment the following to print those assembled unitigs and their alignments to reference genome
 262                 # for u in unitig_collection["URAs_list"]:
 263                 #    print(u["seq"].decode(), u["lpos"], u["rpos"], u["count"])
 264                 #    print("a",u["unitig_aln"].decode())
 265                 #    print("r",u["reference_aln"].decode())
 266             else:
 267                 # if we do not assemble, write results now
 268                 if peak_variants.n_variants() > 0:
 269                     peak_variants.fix_indels()
 270                     ovcf.write(peak_variants.toVCF())
 271                     continue
 272
 273             # reach here only if we need l assembly and the assembly returns result
 274
 275             # If a peak has no indel but with refer_biased_01, we
 276             # revisit all refer_biased_01 now. We do not use
 277             # multiprocessing here for simplicity since there won't be
 278             # too many in a peak region.
 279             if (fermi == "auto" and (not peak_variants.has_indel()) and peak_variants.has_refer_biased_01()):
 280                 pos_tobe_revisit = peak_variants.get_refer_biased_01s()
 281                 for i in pos_tobe_revisit:
 282                     ref_nt = chr(s[i-ra_collection["left"]]).encode()
 283                     if ref_nt == b'N':
 284                         peak_variants.remove_variant(i)
 285                         continue
 286                     PRI = unitig_collection.get_PosReadsInfo_ref_pos(i, ref_nt, Q=minQ)
 287                     if PRI.raw_read_depth(opt="T") == 0:  # skip if coverage is 0
 288                         peak_variants.remove_variant(i)
 289                         continue
 290                     PRI.update_top_alleles(top2allelesminr, min_altallele_count, max_allowed_ar)
 291                     PRI.call_GT(max_allowed_ar)
 292                     PRI.apply_GQ_cutoff(min_homo_GQ, min_heter_GQ)
 293                     if not PRI.filterflag():
 294                         peak_variants.replace_variant(i, PRI.toVariant())
 295                     else:
 296                         peak_variants.remove_variant(i)
 297                 if peak_variants.n_variants() > 0:
 298                     peak_variants.fix_indels()
 299                     ovcf.write(peak_variants.toVCF())
 300                     continue
 301
 302             # in this case, we call variants at every locations in the peak based on local assembly.
 303             if (fermi == "on" or (fermi == "auto" and peak_variants.has_indel())):
 304                 peak_variants = PeakVariants(chrom.decode(), peak["start"], peak["end"], s)  # reset
 305
 306                 # --- make multi processes
 307                 # divide right-left into NP parts
 308                 window_size = ceil((ra_collection["right"] - ra_collection["left"]) / NP)
 309                 P = mp.Pool(NP)
 310                 # this partial function will only be used in multiprocessing
 311                 p_call_variants_at_range = partial(call_variants_at_range, s=s, collection=unitig_collection, top2allelesminr=top2allelesminr, max_allowed_ar=max_allowed_ar, min_altallele_count=min_altallele_count, min_homo_GQ=min_homo_GQ, min_heter_GQ=min_heter_GQ, minQ=minQ)
 312                 ranges = []
 313                 for i in range(NP):
 314                     l_d = i * window_size + ra_collection["left"]
 315                     r_d = min((i + 1) * window_size + ra_collection["left"], ra_collection["right"])
 316                     ranges.append((l_d, r_d))
 317
 318                 mapresults = P.map_async(p_call_variants_at_range, ranges)
 319                 P.close()
 320                 P.join()
 321                 results = mapresults.get(timeout=window_size*300)
 322                 for i in range(NP):
 323                     for result in results[i]:
 324                         peak_variants.add_variant(result[0], result[1])
 325             if peak_variants.n_variants() > 0:
 326                 peak_variants.fix_indels()
 327                 ovcf.write(peak_variants.toVCF())
 328
 329     # print ("time to retrieve read alignment information from BAM:",t_prepare_ra,"(",round( 100 * t_prepare_ra/t_total, 2),"% )")
 330     return
 331
 332
 333 def call_variants_at_range(lr, s, collection, top2allelesminr, max_allowed_ar, min_altallele_count, min_homo_GQ, min_heter_GQ, minQ):
 334     result = []
 335     for i in range(lr[0], lr[1]):
 336         ref_nt = chr(s[i-collection["left"]]).encode()
 337         if ref_nt == b'N':
 338             continue
 339
 340         PRI = collection.get_PosReadsInfo_ref_pos(i, ref_nt, Q=minQ)
 341         if PRI.raw_read_depth(opt="T") == 0:  # skip if coverage is 0
 342             continue
 343
 344         PRI.update_top_alleles(top2allelesminr, min_altallele_count, max_allowed_ar)
 345         if not PRI.filterflag():
 346             #PRI.update_top_alleles( top2allelesminr )
 347             PRI.call_GT(max_allowed_ar)
 348             PRI.apply_GQ_cutoff(min_homo_GQ, min_heter_GQ)
 349         if not PRI.filterflag():
 350             result.append((i, PRI.toVariant()))
 351     return result