2 ###!/scratch/src/mambaforge/envs/py311/bin/python3
5 #sys.path.append('/scratch/src/mambaforge/envs/py311/lib/python3.11/site-packages')
16 pp
= pprint
.PrettyPrinter(indent
=4)
18 def eprint(*args
, **kwargs
) -> None:
19 print(*args
, **kwargs
, file=sys
.stderr
, flush
=True)
21 def init_argparse() -> argparse
.ArgumentParser
:
22 parser
= argparse
.ArgumentParser(
23 description
='intersection of FastQ Read 2 with spatial barcodes, and dump matching Read 1 with spatial coordinates',
24 epilog
='Contact: <huxs@salus-bio.com>')
25 #parser.add_argument('-s', '--spatial', type=pathlib.Path, default='spatial.txt', metavar='file', help='For spatial.txt[.gz]')
26 parser
.add_argument('-1', '--read1', type=pathlib
.Path
, default
='Unmapped.out.mate1', metavar
='file', help='For Unmapped.out.mate1[.gz]')
27 #parser.add_argument('-2', '--read2', type=pathlib.Path, default='Unmapped.out.mate2', metavar='file', help='For Unmapped.out.mate2[.gz]')
28 parser
.add_argument('-p', '--read2-paf', type=pathlib
.Path
, default
='Unmapped.mate2.paf', metavar
='file', help='For Unmapped.mate2.paf')
29 #parser.add_argument('-m', '--max-mismatch', dest='mismatch', type=int, default=1, help='max allowed mismatch, default=1')
30 parser
.add_argument('-o', '--output', type=pathlib
.Path
, default
='Unmapped.fq.gz', dest
='outfile')
31 parser
.add_argument('-d', '--dropped-output', type=pathlib
.Path
, dest
='droppedfile')
32 #parser.add_argument('-z', '--gzip', action=argparse.BooleanOptionalAction, default=True, help='Output gzipped files, default on', dest='gzip')
33 #parser.add_argument('-n', '--dryrun', '--dry-run', action='store_true', dest='dryrun')
35 "-v", "--version", action
="version",
36 version
=f
"{parser.prog} version 1.0.0"
40 def fileOpener(filename
):
41 f
= open(filename
,'rb')
43 if (f
.read(2) == b
'\x1f\x8b'):
45 fh
= gzip
.GzipFile(fileobj
=f
, mode
='rb')
48 #fht = io.TextIOWrapper(fh, encoding='utf-8', line_buffering=True)
52 parser
= init_argparse()
53 if len(sys
.argv
) == 1:
57 (01) perl -lane 'print ">",join("_",@F),"\\n$F[0]"' spatial.txt | minimap2 -k 15 -d spatial.miniref - 2>spatial.miniref.log
58 (2a) seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 -o Unmapped.mate2.paf 2>Unmapped.mate2.paf.log
59 (2b) seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 2>Unmapped.mate2.paf.log | gzip -1c >Unmapped.mate2.paf.gz
62 args
= parser
.parse_args()
64 eprint('[!]Read1:[',args
.read1
,'], Read2.PAF:[',args
.read2_paf
,']. OutFile:[',args
.outfile
,']',sep
='',end
='');
65 if args
.droppedfile
!= None:
66 eprint(', Dropped:[',args
.droppedfile
,'].',sep
='');
72 pbar
= tqdm
.tqdm(desc
='FastQ', ncols
=70, mininterval
=0.5, maxinterval
=10, unit
='', unit_scale
=True, dynamic_ncols
=True)
73 #with open(args.outfile, mode='wt') as fh:
74 IndelPatten
= re
.compile(r
"[ID]")
75 fqitem
= pyfastx
.Fastq(args
.read1
.as_posix(), build_index
=False)
77 name
,seq
,qual
= fqitem
.__next
__()
78 if args
.droppedfile
!= None:
79 dfh
= gzip
.open(args
.droppedfile
, mode
='wt', compresslevel
=1)
80 with gzip
.open(args
.outfile
, mode
='wt', compresslevel
=1) as fh
:
81 with
fileOpener(args
.read2_paf
) as fp2
:
82 with pafpy
.PafFile(fp2
) as paf
:
85 if record
.is_primary():
86 if record
.qstart
==0 and record
.qend
==30 and record
.tstart
==0 and record
.tend
==30:
87 (barcode
, xpos
, ypos
) = record
.tname
.split('_')
88 search
= IndelPatten
.search(record
.tags
['cg'].value
)
91 if args
.droppedfile
!= None:
92 print('@{}'.format(name
), xpos
, ypos
, record
.tags
['cg'], record
.tags
['cs'], 'zz:Z:withIndel', file=dfh
)
93 print(seq
,'+',qual
,sep
="\n", file=dfh
)
95 while(record
.qname
!= name
):
96 name
,seq
,qual
= next(fqitem
)
100 print('@{}'.format(name
), xpos
, ypos
, record
.tags
['cg'], record
.tags
['cs'], file=fh
)
101 print(seq
,'+',qual
,sep
="\n", file=fh
)
105 if args
.droppedfile
!= None:
106 tmpstr
= '_'.join( map(str,(record
.qstart
,record
.qend
,record
.tstart
,record
.tend
)) )
107 print('@{}'.format(name
), xpos
, ypos
, record
.tags
['cg'], record
.tags
['cs'], ':'.join(('zz:Z:Shifted',tmpstr
)), file=dfh
)
108 print(seq
,'+',qual
,sep
="\n", file=dfh
)
111 while(fqitem
.__next
__()):
114 except StopIteration as e
:
116 if args
.droppedfile
!= None:
119 eprint('[!]FastQ:[{}]-notFound:[{}] <=> Matched:[{}]=Accepted:[{}]+Skipped[{}].'.format(totalReads
,totalReads
-accepted
-skipped
, skipped
+accepted
, accepted
, skipped
))
121 if __name__
== "__main__":
122 main() # time ./spffq.py -1 n4457360.Unmapped.out.mate1.gz -p n175410.Unmapped.mate2.paf.gz -d Unmapped.dropped.gz
125 [1]+ Running perl -lane 'print ">",join("_",@F),"\n$F[0]"' spatial.txt | minimap2 -k 15 -d spatial.miniref - 2> spatial.miniref.log &
126 [2]+ Running seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 -o Unmapped2.paf 2> Unmapped2.err &
128 zgrep -v 'cg:Z:30=' n175410.Unmapped.mate2.paf.gz|grep 'cg:Z' |less -S
130 For 230605Evo_mouse s03C2, [!]FastQ:[18057073]-notFound:[14454975] <=> Matched:[3602098]=Accepted:[3600463]+Skipped[1635].