9 from collections
import defaultdict
11 #logging.getLogger("gtfparse").setLevel(logging.WARNING)
12 #gtfparse.logger.setLevel(logging.WARNING)
13 for handler
in logging
.root
.handlers
:
14 handler
.setLevel(logging
.WARNING
)
16 SelectedAttribute
= 'gene_name'
17 SelectedAttribute
= 'gene_id'
19 def eprint(*args
, **kwargs
):
20 print(*args
, file=sys
.stderr
, **kwargs
)
22 def parse_with_polars_lazy(
24 split_attributes
=True,
26 fix_quotes_columns
=["attribute"]):
27 # use a global string cache so that all strings get intern'd into
28 # a single numbering system
29 polars
.toggle_string_cache(True)
36 "seqname": polars
.Categorical
,
37 "source": polars
.Categorical
,
38 "start": polars
.Int64
,
40 "score": polars
.Float32
,
41 "feature": polars
.Categorical
,
42 "strand": polars
.Categorical
,
43 "frame": polars
.UInt32
,
46 if type(filepath_or_buffer
) is StringIO
:
49 new_columns
=REQUIRED_COLUMNS
,
51 elif filepath_or_buffer
.endswith(".gz") or filepath_or_buffer
.endswith(".gzip"):
52 with gzip
.open(filepath_or_buffer
) as f
:
55 new_columns
=REQUIRED_COLUMNS
,
60 with_column_names
=lambda cols
: REQUIRED_COLUMNS
,
62 except polars
.ShapeError
:
63 raise ParsingError("Wrong number of columns")
64 df
= df
.with_columns([
65 polars
.col("frame").fill_null(0),
66 polars
.col("attribute").str.replace_all('"', "'")
68 for fix_quotes_column
in fix_quotes_columns
:
69 # Catch mistaken semicolons by replacing "xyz;" with "xyz"
70 # Required to do this since the Ensembl GTF for Ensembl
71 # release 78 has mistakes such as:
72 # gene_name = "PRAMEF6;" transcript_name = "PRAMEF6;-201"
73 df
= df
.with_columns([
74 polars
.col(fix_quotes_column
).str.replace(';\"', '\"').str.replace(";-", "-")
76 if features
is not None:
77 features
= sorted(set(features
))
78 df
= df
.filter(polars
.col("feature").is_in(features
))
80 df
= df
.with_columns([
81 polars
.col("attribute").str.split(";").alias("attribute_split")
84 # https://github.com/openvax/gtfparse/pull/35
85 gtfparse
.parse_with_polars_lazy
= parse_with_polars_lazy
88 if len(sys
.argv
) < 2 :
89 print('Usage:',sys
.argv
[0],'<gtf file> >geneRange.out',file=sys
.stderr
,flush
=True);
91 gtfFile
= sys
.argv
[1] # 'GCF_000001405.40_GRCh38.p14_genomic.gtf', 'h200.p14_genomic.gtf'
92 Genes
= defaultdict(functools
.partial(defaultdict
, list))
93 with tqdm
.tqdm(total
=os
.path
.getsize(gtfFile
)) as pbar
:
95 with
open(gtfFile
,'r') as gtfileh
:
97 while line
:= gtfileh
.readline():
98 #for i, line in enumerate(gtfileh):
102 pbar
.update(gtfileh
.tell() - pbar
.n
)
103 #pbar.update(i - pbar.n)
105 gtfline
= gtfparse
.parse_gtf_and_expand_attributes(io
.StringIO(line
), restrict_attribute_columns
=[SelectedAttribute
])
106 if SelectedAttribute
in gtfline
:
107 for k
in ('start','end'):
108 Genes
['\t'.join((gtfline
[SelectedAttribute
][0],gtfline
['strand'][0],gtfline
['seqname'][0]))][k
].append(gtfline
[k
][0])
111 pbar
.update(len(line
))
112 for k
in sorted(Genes
.keys()):
113 Genes
[k
]['MinStart'] = min(Genes
[k
]['start'])
114 Genes
[k
]['MaxEnd'] = max(Genes
[k
]['end'])
116 print('\t'.join((k
,str(Genes
[k
]['MinStart']),str(Genes
[k
]['MaxEnd']))))
117 print('\t'.join(('#',str(Genes
[k
]['start']),str(Genes
[k
]['end']))))
121 if __name__
== "__main__":
122 main() # time ./gtfGeneRange.py h200.p14_genomic.gtf