modified: openstmerge.py
[GalaxyCodeBases.git] / python / salus / gtfGeneRange.py
blob7cf2f72b2113025283918646bab3746d59c26f3a
1 #!/usr/bin/env python3
3 import sys
4 import io
5 import os
6 import tqdm
7 import functools
8 import gtfparse
9 from collections import defaultdict
10 import logging
11 #logging.getLogger("gtfparse").setLevel(logging.WARNING)
12 #gtfparse.logger.setLevel(logging.WARNING)
13 for handler in logging.root.handlers:
14 handler.setLevel(logging.WARNING)
16 SelectedAttribute = 'gene_name'
17 SelectedAttribute = 'gene_id'
19 def eprint(*args, **kwargs):
20 print(*args, file=sys.stderr, **kwargs)
22 def parse_with_polars_lazy(
23 filepath_or_buffer,
24 split_attributes=True,
25 features=None,
26 fix_quotes_columns=["attribute"]):
27 # use a global string cache so that all strings get intern'd into
28 # a single numbering system
29 polars.toggle_string_cache(True)
30 kwargs = dict(
31 has_header=False,
32 separator="\t",
33 comment_char="#",
34 null_values=".",
35 dtypes={
36 "seqname": polars.Categorical,
37 "source": polars.Categorical,
38 "start": polars.Int64,
39 "end": polars.Int64,
40 "score": polars.Float32,
41 "feature": polars.Categorical,
42 "strand": polars.Categorical,
43 "frame": polars.UInt32,
45 try:
46 if type(filepath_or_buffer) is StringIO:
47 df = polars.read_csv(
48 filepath_or_buffer,
49 new_columns=REQUIRED_COLUMNS,
50 **kwargs).lazy()
51 elif filepath_or_buffer.endswith(".gz") or filepath_or_buffer.endswith(".gzip"):
52 with gzip.open(filepath_or_buffer) as f:
53 df = polars.read_csv(
55 new_columns=REQUIRED_COLUMNS,
56 **kwargs).lazy()
57 else:
58 df = polars.scan_csv(
59 filepath_or_buffer,
60 with_column_names=lambda cols: REQUIRED_COLUMNS,
61 **kwargs).lazy()
62 except polars.ShapeError:
63 raise ParsingError("Wrong number of columns")
64 df = df.with_columns([
65 polars.col("frame").fill_null(0),
66 polars.col("attribute").str.replace_all('"', "'")
68 for fix_quotes_column in fix_quotes_columns:
69 # Catch mistaken semicolons by replacing "xyz;" with "xyz"
70 # Required to do this since the Ensembl GTF for Ensembl
71 # release 78 has mistakes such as:
72 # gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201"
73 df = df.with_columns([
74 polars.col(fix_quotes_column).str.replace(';\"', '\"').str.replace(";-", "-")
76 if features is not None:
77 features = sorted(set(features))
78 df = df.filter(polars.col("feature").is_in(features))
79 if split_attributes:
80 df = df.with_columns([
81 polars.col("attribute").str.split(";").alias("attribute_split")
83 return df
84 # https://github.com/openvax/gtfparse/pull/35
85 gtfparse.parse_with_polars_lazy = parse_with_polars_lazy
87 def main():
88 if len(sys.argv) < 2 :
89 print('Usage:',sys.argv[0],'<gtf file> >geneRange.out',file=sys.stderr,flush=True);
90 exit(0);
91 gtfFile = sys.argv[1] # 'GCF_000001405.40_GRCh38.p14_genomic.gtf', 'h200.p14_genomic.gtf'
92 Genes = defaultdict(functools.partial(defaultdict, list))
93 with tqdm.tqdm(total=os.path.getsize(gtfFile)) as pbar:
94 i = 0
95 with open(gtfFile,'r') as gtfileh:
96 #for line in gtfileh:
97 while line:= gtfileh.readline():
98 #for i, line in enumerate(gtfileh):
99 #i += len(line)
100 i += 1
101 if not i % 1000:
102 pbar.update(gtfileh.tell() - pbar.n)
103 #pbar.update(i - pbar.n)
104 #eprint(i)
105 gtfline = gtfparse.parse_gtf_and_expand_attributes(io.StringIO(line), restrict_attribute_columns=[SelectedAttribute])
106 if SelectedAttribute in gtfline:
107 for k in ('start','end'):
108 Genes['\t'.join((gtfline[SelectedAttribute][0],gtfline['strand'][0],gtfline['seqname'][0]))][k].append(gtfline[k][0])
109 else:
110 print(str(gtfline))
111 pbar.update(len(line))
112 for k in sorted(Genes.keys()):
113 Genes[k]['MinStart'] = min(Genes[k]['start'])
114 Genes[k]['MaxEnd'] = max(Genes[k]['end'])
115 #print(Genes[k])
116 print('\t'.join((k,str(Genes[k]['MinStart']),str(Genes[k]['MaxEnd']))))
117 print('\t'.join(('#',str(Genes[k]['start']),str(Genes[k]['end']))))
118 sys.stdout.flush()
119 #print(Genes)
121 if __name__ == "__main__":
122 main() # time ./gtfGeneRange.py h200.p14_genomic.gtf