modified: Makefile
[GalaxyCodeBases.git] / python / salus / imgalign / read10x.py
blob5d55f08466cce29f94905fe552d5856cfbb7fa80
1 #!/usr/bin/env python3
3 import concurrent.futures
4 import sys
5 import os
6 import io
7 import functools
8 import re
9 import argparse
10 import pathlib
11 import gzip
12 import graphblas as gb
13 import fast_matrix_market
14 import tqdm
15 import time
16 #import recordclass
18 import dahuffman
19 # rANS4x16 as used in CRAM should be the best option. See <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8896640/> and <https://github.com/samtools/htscodecs> or *<https://github.com/jkbonfield/rans_static>.
20 # Others like Arithmetic Coding <https://github.com/fab-jul/torchac>, <https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html>. Or <https://github.com/ahmedfgad/ArithmeticEncodingPython>. And <https://michaeldipperstein.github.io/arithmetic.html#download>.
21 # Range coding <https://en.wikipedia.org/wiki/Range_coding>. *<https://github.com/richgel999/sserangecoding> <https://github.com/powturbo/Turbo-Range-Coder>.
23 spatialDB = None
24 mgBoolMtx = None
25 #spPosition = recordclass.make_dataclass("Point", [("x",int), ("y",int)], readonly=True)
26 baseCodec = dahuffman.HuffmanCodec.from_frequencies({'A':812342385,'T':764495360,'C':778170645,'G':622248280,'N':100})
27 coordinatesCodec = dahuffman.HuffmanCodec.from_frequencies({
28 ' ':49041072,'.':98082144,'0':50072308,'1':87291011,'2':85393505,'3':84616786,
29 '4':65118253,'5':58134804,'6':58360232,'7':58371094,'8':59921116,'9':59890253,
31 '''
32 >>> baseCodec.print_code_table()
33 Bits Code Value Symbol
34 2 00 0 'C'
35 4 0100 4 _EOF
36 4 0101 5 'N'
37 3 011 3 'G'
38 2 10 2 'T'
39 2 11 3 'A'
40 >>> coordinatesCodec.print_code_table()
41 Bits Code Value Symbol
42 3 000 0 '3'
43 3 001 1 '2'
44 3 010 2 '1'
45 3 011 3 '.'
46 5 10000 16 _EOF
47 5 10001 17 ' '
48 4 1001 9 '0'
49 4 1010 10 '5'
50 4 1011 11 '6'
51 4 1100 12 '7'
52 4 1101 13 '9'
53 4 1110 14 '8'
54 4 1111 15 '4'
55 '''
57 def eprint(*args, **kwargs) -> None:
58 print(*args, **kwargs, file=sys.stderr, flush=True)
60 def init_argparse() -> argparse.ArgumentParser:
61 parser = argparse.ArgumentParser(
62 description='merge scSeq data with spBarcode coordinates that gridded by given binning size ',
63 epilog='Contact: <huxs@salus-bio.com>')
64 #parser.add_argument('-b', '--bin', type=int, required = True, help='grid binning pixels')
65 parser.add_argument('-s', '--spatial', type=pathlib.Path, default='spatial.txt', metavar='txt', dest='scSpatial', help='For spatial.txt[.gz]')
66 parser.add_argument('-f', '--scseq-files', type=pathlib.Path, nargs=3, action='extend', metavar='<f>', dest='scSeqFiles', help='matrix.mtx[.gz] barcodes.tsv[.gz] (features|genes).tsv[.gz]')
67 #parser.add_argument("files", nargs="*")
68 parser.add_argument('-o', '--output-path', type=pathlib.Path, default='./gridded/', dest='outpath')
69 parser.add_argument('-z', '--gzip', action=argparse.BooleanOptionalAction, default=True, help='Output gzipped files, default on', dest='gzip')
70 parser.add_argument('-n', '--dryrun', '--dry-run', action='store_true', dest='dryrun')
71 parser.add_argument('-d', '--debug', action='store_true', dest='dodebug')
72 parser.add_argument(
73 "-v", "--version", action="version",
74 version=f"{parser.prog} version 1.0.0"
76 return parser
78 def fileOpener(filename):
79 f = open(filename,'rb')
80 fh = f
81 if (f.read(2) == b'\x1f\x8b'):
82 f.seek(0)
83 fh = gzip.GzipFile(fileobj=f, mode='rb')
84 else:
85 f.seek(0)
86 fht = io.TextIOWrapper(fh, encoding='utf-8', line_buffering=True)
87 return fht
89 maxBarcodeLen = 0
90 SpatialBarcodeRange_xXyY = [0,0,0,0]
91 gridRangeCnt = ()
92 def readSpatial(infile, db):
93 global maxBarcodeLen
94 global SpatialBarcodeRange_xXyY
95 global GenesCnt
96 global BarcodesCnt
97 pbar = tqdm.tqdm(desc='Spatial', total=BarcodesCnt, ncols=70, mininterval=0.5, maxinterval=10, unit='', unit_scale=True, dynamic_ncols=True)
98 with fileOpener(infile) as f:
99 for index,line in enumerate(f, start=1):
100 [ seq, Xpos, Ypos, *_ ] = line.split()
101 seqLen = len(seq)
102 if seqLen > maxBarcodeLen:
103 maxBarcodeLen = seqLen
104 theXpos = int(float(Xpos))
105 theYpos = int(float(Ypos))
106 if (SpatialBarcodeRange_xXyY[0]==0) or (SpatialBarcodeRange_xXyY[0] > theXpos):
107 SpatialBarcodeRange_xXyY[0] = theXpos
108 if (SpatialBarcodeRange_xXyY[1]==0) or (SpatialBarcodeRange_xXyY[1] < theXpos):
109 SpatialBarcodeRange_xXyY[1] = theXpos
110 if (SpatialBarcodeRange_xXyY[2]==0) or (SpatialBarcodeRange_xXyY[2] > theYpos):
111 SpatialBarcodeRange_xXyY[2] = theYpos
112 if (SpatialBarcodeRange_xXyY[3]==0) or (SpatialBarcodeRange_xXyY[3] < theYpos):
113 SpatialBarcodeRange_xXyY[3] = theYpos
114 binSeq = baseCodec.encode(seq)
115 #strSeq = baseCodec.decode(binSeq)
116 binCoordStr = coordinatesCodec.encode(f'{Xpos} {Ypos}')
117 #db[binSeq] = spPosition(theXpos,theYpos)
118 db[binSeq] = binCoordStr
119 if not index % 1000:
120 pbar.update(index - pbar.n)
121 pbar.update(index - pbar.n)
122 return index
124 GenesCnt = 0
125 BarcodesCnt = 0
126 mtxNNZ = 0
127 def checkmtx(mtxfile) -> None:
128 mheader = fast_matrix_market.read_header(mtxfile)
129 global GenesCnt, BarcodesCnt, mtxNNZ
130 GenesCnt = mheader.nrows
131 BarcodesCnt = mheader.ncols
132 mtxNNZ = mheader.nnz
134 def write2gzip(outfile):
135 fh = gzip.open(outfile, mode='wb', compresslevel=1)
136 return fh
138 def main() -> None:
139 parser = init_argparse()
140 if len(sys.argv) == 1:
141 parser.print_help()
142 exit(0);
143 args = parser.parse_args()
144 scSeqFiles = tuple( args.scSeqFiles )
145 eprint('[!]Spatial=[',args.scSpatial,'], 10Xmatrix:[',','.join(map(str,scSeqFiles)),']. OutPath:[',args.outpath,']',sep='');
146 #scFileNameTuple = ('matrix.mtx', 'barcodes.tsv', 'features.tsv', 'genes.tsv')
147 #spFileNameList = ['spatial.txt']; spFileNameList.extend(scFileNameTuple[0:3])
148 checkmtx(scSeqFiles[0].as_posix())
149 eprint('[!]Matrix Size: Gene count(nrows)=',GenesCnt,', Barcode count(ncols)=',BarcodesCnt,', Values(nnz)=',mtxNNZ,'.',sep='')
150 if args.dryrun: exit(0);
151 global spatialDB, SpatialBarcodeRange_xXyY, gridRangeCnt, mgBoolMtx
153 start = time.perf_counter()
154 eprint('[!]Reading spatial file ...')
155 spatialDB = {}
156 lineCnt = readSpatial(args.scSpatial.as_posix(), spatialDB)
158 if __name__ == "__main__":
159 gb.init("suitesparse", blocking=False)
160 main() # time ./read10x.py -s HY0726A1.spatial.txt.gz -f HY0726A1.matrix.mtx.gz HY0726A1.barcodes.tsv.gz HY0726A1.features.tsv.gz