python/salus/gtfGeneRange.py

   1 #!/usr/bin/env python3
   2
   3 import sys
   4 import io
   5 import os
   6 import tqdm
   7 import functools
   8 import gtfparse
   9 from collections import defaultdict
  10 import logging
  11 #logging.getLogger("gtfparse").setLevel(logging.WARNING)
  12 #gtfparse.logger.setLevel(logging.WARNING)
  13 for handler in logging.root.handlers:
  14     handler.setLevel(logging.WARNING)
  15
  16 SelectedAttribute = 'gene_name'
  17 SelectedAttribute = 'gene_id'
  18
  19 def eprint(*args, **kwargs):
  20     print(*args, file=sys.stderr, **kwargs)
  21
  22 def parse_with_polars_lazy(
  23         filepath_or_buffer,
  24         split_attributes=True,
  25         features=None,
  26         fix_quotes_columns=["attribute"]):
  27     # use a global string cache so that all strings get intern'd into
  28     # a single numbering system
  29     polars.toggle_string_cache(True)
  30     kwargs = dict(
  31         has_header=False,
  32         separator="\t",
  33         comment_char="#",
  34         null_values=".",
  35         dtypes={
  36             "seqname": polars.Categorical,
  37             "source": polars.Categorical,
  38             "start": polars.Int64,
  39             "end": polars.Int64,
  40             "score": polars.Float32,
  41             "feature": polars.Categorical,
  42             "strand": polars.Categorical,
  43             "frame": polars.UInt32,
  44         })
  45     try:
  46         if type(filepath_or_buffer) is StringIO:
  47             df = polars.read_csv(
  48                 filepath_or_buffer,
  49                 new_columns=REQUIRED_COLUMNS,
  50                 **kwargs).lazy()
  51         elif filepath_or_buffer.endswith(".gz") or filepath_or_buffer.endswith(".gzip"):
  52             with gzip.open(filepath_or_buffer) as f:
  53                 df = polars.read_csv(
  54                     f,
  55                     new_columns=REQUIRED_COLUMNS,
  56                     **kwargs).lazy()
  57         else:
  58             df = polars.scan_csv(
  59                 filepath_or_buffer,
  60                 with_column_names=lambda cols: REQUIRED_COLUMNS,
  61                 **kwargs).lazy()
  62     except polars.ShapeError:
  63         raise ParsingError("Wrong number of columns")
  64     df = df.with_columns([
  65         polars.col("frame").fill_null(0),
  66         polars.col("attribute").str.replace_all('"', "'")
  67     ])
  68     for fix_quotes_column in fix_quotes_columns:
  69         # Catch mistaken semicolons by replacing "xyz;" with "xyz"
  70         # Required to do this since the Ensembl GTF for Ensembl
  71         # release 78 has mistakes such as:
  72         #   gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201"
  73         df = df.with_columns([
  74             polars.col(fix_quotes_column).str.replace(';\"', '\"').str.replace(";-", "-")
  75         ])
  76     if features is not None:
  77         features = sorted(set(features))
  78         df = df.filter(polars.col("feature").is_in(features))
  79     if split_attributes:
  80         df = df.with_columns([
  81             polars.col("attribute").str.split(";").alias("attribute_split")
  82         ])
  83     return df
# https://github.com/openvax/gtfparse/pull/35
# Rebind the attribute on the gtfparse package to the local implementation.
# NOTE(review): this replaces only the name on the ``gtfparse`` package
# object; whether gtfparse's own parsing functions pick it up depends on how
# they look the name up internally -- verify the override takes effect.
gtfparse.parse_with_polars_lazy = parse_with_polars_lazy
  86
  87 def main():
  88     if len(sys.argv) < 2 :
  89         print('Usage:',sys.argv[0],'<gtf file> >geneRange.out',file=sys.stderr,flush=True);
  90         exit(0);
  91     gtfFile = sys.argv[1]   # 'GCF_000001405.40_GRCh38.p14_genomic.gtf', 'h200.p14_genomic.gtf'
  92     Genes = defaultdict(functools.partial(defaultdict, list))
  93     with tqdm.tqdm(total=os.path.getsize(gtfFile)) as pbar:
  94         i = 0
  95         with open(gtfFile,'r') as gtfileh:
  96             #for line in gtfileh:
  97             while line:= gtfileh.readline():
  98             #for i, line in enumerate(gtfileh):
  99                 #i += len(line)
 100                 i += 1
 101                 if not i % 1000:
 102                     pbar.update(gtfileh.tell() - pbar.n)
 103                     #pbar.update(i - pbar.n)
 104                     #eprint(i)
 105                 gtfline = gtfparse.parse_gtf_and_expand_attributes(io.StringIO(line), restrict_attribute_columns=[SelectedAttribute])
 106                 if SelectedAttribute in gtfline:
 107                     for k in ('start','end'):
 108                         Genes['\t'.join((gtfline[SelectedAttribute][0],gtfline['strand'][0],gtfline['seqname'][0]))][k].append(gtfline[k][0])
 109                 else:
 110                     print(str(gtfline))
 111                 pbar.update(len(line))
 112         for k in sorted(Genes.keys()):
 113             Genes[k]['MinStart'] = min(Genes[k]['start'])
 114             Genes[k]['MaxEnd'] = max(Genes[k]['end'])
 115             #print(Genes[k])
 116             print('\t'.join((k,str(Genes[k]['MinStart']),str(Genes[k]['MaxEnd']))))
 117             print('\t'.join(('#',str(Genes[k]['start']),str(Genes[k]['end']))))
 118             sys.stdout.flush()
 119         #print(Genes)
 120
if __name__ == "__main__":
    # Example (timed) invocation: time ./gtfGeneRange.py h200.p14_genomic.gtf
    main()