3 import concurrent
.futures
12 import graphblas
as gb
13 import fast_matrix_market
19 # rANS4x16 as used in CRAM should be the best option. See <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8896640/> and <https://github.com/samtools/htscodecs> or *<https://github.com/jkbonfield/rans_static>.
20 # Others like Arithmetic Coding <https://github.com/fab-jul/torchac>, <https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html>. Or <https://github.com/ahmedfgad/ArithmeticEncodingPython>. And <https://michaeldipperstein.github.io/arithmetic.html#download>.
21 # Range coding <https://en.wikipedia.org/wiki/Range_coding>. *<https://github.com/richgel999/sserangecoding> <https://github.com/powturbo/Turbo-Range-Coder>.
25 #spPosition = recordclass.make_dataclass("Point", [("x",int), ("y",int)], readonly=True)
26 baseCodec
= dahuffman
.HuffmanCodec
.from_frequencies({'A':812342385,'T':764495360,'C':778170645,'G':622248280,'N':100})
27 coordinatesCodec
= dahuffman
.HuffmanCodec
.from_frequencies({
28 ' ':49041072,'.':98082144,'0':50072308,'1':87291011,'2':85393505,'3':84616786,
29 '4':65118253,'5':58134804,'6':58360232,'7':58371094,'8':59921116,'9':59890253,
32 >>> baseCodec.print_code_table()
33 Bits Code Value Symbol
40 >>> coordinatesCodec.print_code_table()
41 Bits Code Value Symbol
57 def eprint(*args
, **kwargs
) -> None:
58 print(*args
, **kwargs
, file=sys
.stderr
, flush
=True)
60 def init_argparse() -> argparse
.ArgumentParser
:
61 parser
= argparse
.ArgumentParser(
62 description
='merge scSeq data with spBarcode coordinates that gridded by given binning size ',
63 epilog
='Contact: <huxs@salus-bio.com>')
64 #parser.add_argument('-b', '--bin', type=int, required = True, help='grid binning pixels')
65 parser
.add_argument('-s', '--spatial', type=pathlib
.Path
, default
='spatial.txt', metavar
='txt', dest
='scSpatial', help='For spatial.txt[.gz]')
66 parser
.add_argument('-f', '--scseq-files', type=pathlib
.Path
, nargs
=3, action
='extend', metavar
='<f>', dest
='scSeqFiles', help='matrix.mtx[.gz] barcodes.tsv[.gz] (features|genes).tsv[.gz]')
67 #parser.add_argument("files", nargs="*")
68 parser
.add_argument('-o', '--output-path', type=pathlib
.Path
, default
='./gridded/', dest
='outpath')
69 parser
.add_argument('-z', '--gzip', action
=argparse
.BooleanOptionalAction
, default
=True, help='Output gzipped files, default on', dest
='gzip')
70 parser
.add_argument('-n', '--dryrun', '--dry-run', action
='store_true', dest
='dryrun')
71 parser
.add_argument('-d', '--debug', action
='store_true', dest
='dodebug')
73 "-v", "--version", action
="version",
74 version
=f
"{parser.prog} version 1.0.0"
78 def fileOpener(filename
):
79 f
= open(filename
,'rb')
81 if (f
.read(2) == b
'\x1f\x8b'):
83 fh
= gzip
.GzipFile(fileobj
=f
, mode
='rb')
86 fht
= io
.TextIOWrapper(fh
, encoding
='utf-8', line_buffering
=True)
90 SpatialBarcodeRange_xXyY
= [0,0,0,0]
92 def readSpatial(infile
, db
):
94 global SpatialBarcodeRange_xXyY
97 pbar
= tqdm
.tqdm(desc
='Spatial', total
=BarcodesCnt
, ncols
=70, mininterval
=0.5, maxinterval
=10, unit
='', unit_scale
=True, dynamic_ncols
=True)
98 with
fileOpener(infile
) as f
:
99 for index
,line
in enumerate(f
, start
=1):
100 [ seq
, Xpos
, Ypos
, *_
] = line
.split()
102 if seqLen
> maxBarcodeLen
:
103 maxBarcodeLen
= seqLen
104 theXpos
= int(float(Xpos
))
105 theYpos
= int(float(Ypos
))
106 if (SpatialBarcodeRange_xXyY
[0]==0) or (SpatialBarcodeRange_xXyY
[0] > theXpos
):
107 SpatialBarcodeRange_xXyY
[0] = theXpos
108 if (SpatialBarcodeRange_xXyY
[1]==0) or (SpatialBarcodeRange_xXyY
[1] < theXpos
):
109 SpatialBarcodeRange_xXyY
[1] = theXpos
110 if (SpatialBarcodeRange_xXyY
[2]==0) or (SpatialBarcodeRange_xXyY
[2] > theYpos
):
111 SpatialBarcodeRange_xXyY
[2] = theYpos
112 if (SpatialBarcodeRange_xXyY
[3]==0) or (SpatialBarcodeRange_xXyY
[3] < theYpos
):
113 SpatialBarcodeRange_xXyY
[3] = theYpos
114 binSeq
= baseCodec
.encode(seq
)
115 #strSeq = baseCodec.decode(binSeq)
116 binCoordStr
= coordinatesCodec
.encode(f
'{Xpos} {Ypos}')
117 #db[binSeq] = spPosition(theXpos,theYpos)
118 db
[binSeq
] = binCoordStr
120 pbar
.update(index
- pbar
.n
)
121 pbar
.update(index
- pbar
.n
)
127 def checkmtx(mtxfile
) -> None:
128 mheader
= fast_matrix_market
.read_header(mtxfile
)
129 global GenesCnt
, BarcodesCnt
, mtxNNZ
130 GenesCnt
= mheader
.nrows
131 BarcodesCnt
= mheader
.ncols
134 def write2gzip(outfile
):
135 fh
= gzip
.open(outfile
, mode
='wb', compresslevel
=1)
139 parser
= init_argparse()
140 if len(sys
.argv
) == 1:
143 args
= parser
.parse_args()
144 scSeqFiles
= tuple( args
.scSeqFiles
)
145 eprint('[!]Spatial=[',args
.scSpatial
,'], 10Xmatrix:[',','.join(map(str,scSeqFiles
)),']. OutPath:[',args
.outpath
,']',sep
='');
146 #scFileNameTuple = ('matrix.mtx', 'barcodes.tsv', 'features.tsv', 'genes.tsv')
147 #spFileNameList = ['spatial.txt']; spFileNameList.extend(scFileNameTuple[0:3])
148 checkmtx(scSeqFiles
[0].as_posix())
149 eprint('[!]Matrix Size: Gene count(nrows)=',GenesCnt
,', Barcode count(ncols)=',BarcodesCnt
,', Values(nnz)=',mtxNNZ
,'.',sep
='')
150 if args
.dryrun
: exit(0);
151 global spatialDB
, SpatialBarcodeRange_xXyY
, gridRangeCnt
, mgBoolMtx
153 start
= time
.perf_counter()
154 eprint('[!]Reading spatial file ...')
156 lineCnt
= readSpatial(args
.scSpatial
.as_posix(), spatialDB
)
158 if __name__
== "__main__":
159 gb
.init("suitesparse", blocking
=False)
160 main() # time ./read10x.py -s HY0726A1.spatial.txt.gz -f HY0726A1.matrix.mtx.gz HY0726A1.barcodes.tsv.gz HY0726A1.features.tsv.gz