modified: Makefile
[GalaxyCodeBases.git] / python / salus / spffq.py
blob7dc3e55adb2b48be583e4ea1843cfb51411514db
1 #!/usr/bin/env python3
2 ###!/scratch/src/mambaforge/envs/py311/bin/python3
4 import sys
5 #sys.path.append('/scratch/src/mambaforge/envs/py311/lib/python3.11/site-packages')
6 import io
7 import argparse
8 import pathlib
9 import gzip
10 import re
11 import pyfastx
12 import pafpy
13 import tqdm
15 import pprint
16 pp = pprint.PrettyPrinter(indent=4)
18 def eprint(*args, **kwargs) -> None:
19 print(*args, **kwargs, file=sys.stderr, flush=True)
21 def init_argparse() -> argparse.ArgumentParser:
22 parser = argparse.ArgumentParser(
23 description='intersection of FastQ Read 2 with spatial barcodes, and dump matching Read 1 with spatial coordinates',
24 epilog='Contact: <huxs@salus-bio.com>')
25 #parser.add_argument('-s', '--spatial', type=pathlib.Path, default='spatial.txt', metavar='file', help='For spatial.txt[.gz]')
26 parser.add_argument('-1', '--read1', type=pathlib.Path, default='Unmapped.out.mate1', metavar='file', help='For Unmapped.out.mate1[.gz]')
27 #parser.add_argument('-2', '--read2', type=pathlib.Path, default='Unmapped.out.mate2', metavar='file', help='For Unmapped.out.mate2[.gz]')
28 parser.add_argument('-p', '--read2-paf', type=pathlib.Path, default='Unmapped.mate2.paf', metavar='file', help='For Unmapped.mate2.paf')
29 #parser.add_argument('-m', '--max-mismatch', dest='mismatch', type=int, default=1, help='max allowed mismatch, default=1')
30 parser.add_argument('-o', '--output', type=pathlib.Path, default='Unmapped.fq.gz', dest='outfile')
31 parser.add_argument('-d', '--dropped-output', type=pathlib.Path, dest='droppedfile')
32 #parser.add_argument('-z', '--gzip', action=argparse.BooleanOptionalAction, default=True, help='Output gzipped files, default on', dest='gzip')
33 #parser.add_argument('-n', '--dryrun', '--dry-run', action='store_true', dest='dryrun')
34 parser.add_argument(
35 "-v", "--version", action="version",
36 version=f"{parser.prog} version 1.0.0"
38 return parser
40 def fileOpener(filename):
41 f = open(filename,'rb')
42 fh = f
43 if (f.read(2) == b'\x1f\x8b'):
44 f.seek(0)
45 fh = gzip.GzipFile(fileobj=f, mode='rb')
46 else:
47 f.seek(0)
48 #fht = io.TextIOWrapper(fh, encoding='utf-8', line_buffering=True)
49 return fh
51 def main() -> None:
52 parser = init_argparse()
53 if len(sys.argv) == 1:
54 parser.print_help()
55 eprint('''
56 Requirements:
57 (01) perl -lane 'print ">",join("_",@F),"\\n$F[0]"' spatial.txt | minimap2 -k 15 -d spatial.miniref - 2>spatial.miniref.log
58 (2a) seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 -o Unmapped.mate2.paf 2>Unmapped.mate2.paf.log
59 (2b) seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 2>Unmapped.mate2.paf.log | gzip -1c >Unmapped.mate2.paf.gz
60 ''')
61 exit(0);
62 args = parser.parse_args()
63 #pp.pprint(args)
64 eprint('[!]Read1:[',args.read1,'], Read2.PAF:[',args.read2_paf,']. OutFile:[',args.outfile,']',sep='',end='');
65 if args.droppedfile != None:
66 eprint(', Dropped:[',args.droppedfile,'].',sep='');
67 else:
68 eprint()
69 skipped = 0
70 accepted = 0
71 totalReads = 1
72 pbar = tqdm.tqdm(desc='FastQ', ncols=70, mininterval=0.5, maxinterval=10, unit='', unit_scale=True, dynamic_ncols=True)
73 #with open(args.outfile, mode='wt') as fh:
74 IndelPatten = re.compile(r"[ID]")
75 fqitem = pyfastx.Fastq(args.read1.as_posix(), build_index=False)
76 iter(fqitem)
77 name,seq,qual = fqitem.__next__()
78 if args.droppedfile != None:
79 dfh = gzip.open(args.droppedfile, mode='wt', compresslevel=1)
80 with gzip.open(args.outfile, mode='wt', compresslevel=1) as fh:
81 with fileOpener(args.read2_paf) as fp2:
82 with pafpy.PafFile(fp2) as paf:
83 for record in paf:
84 #pbar.update(1)
85 if record.is_primary():
86 if record.qstart==0 and record.qend==30 and record.tstart==0 and record.tend==30:
87 (barcode, xpos, ypos) = record.tname.split('_')
88 search = IndelPatten.search(record.tags['cg'].value)
89 if search:
90 skipped +=1
91 if args.droppedfile != None:
92 print('@{}'.format(name), xpos, ypos, record.tags['cg'], record.tags['cs'], 'zz:Z:withIndel', file=dfh)
93 print(seq,'+',qual,sep="\n", file=dfh)
94 continue
95 while(record.qname != name):
96 name,seq,qual = next(fqitem)
97 totalReads +=1
98 pbar.update(1)
99 else:
100 print('@{}'.format(name), xpos, ypos, record.tags['cg'], record.tags['cs'], file=fh)
101 print(seq,'+',qual,sep="\n", file=fh)
102 accepted +=1
103 else:
104 skipped +=1
105 if args.droppedfile != None:
106 tmpstr = '_'.join( map(str,(record.qstart,record.qend,record.tstart,record.tend)) )
107 print('@{}'.format(name), xpos, ypos, record.tags['cg'], record.tags['cs'], ':'.join(('zz:Z:Shifted',tmpstr)), file=dfh)
108 print(seq,'+',qual,sep="\n", file=dfh)
109 #continue
110 try:
111 while(fqitem.__next__()):
112 totalReads +=1
113 pbar.update(1)
114 except StopIteration as e:
115 None
116 if args.droppedfile != None:
117 dfh.close()
118 pbar.close()
119 eprint('[!]FastQ:[{}]-notFound:[{}] <=> Matched:[{}]=Accepted:[{}]+Skipped[{}].'.format(totalReads,totalReads-accepted-skipped, skipped+accepted, accepted, skipped))
121 if __name__ == "__main__":
122 main() # time ./spffq.py -1 n4457360.Unmapped.out.mate1.gz -p n175410.Unmapped.mate2.paf.gz -d Unmapped.dropped.gz
125 [1]+ Running perl -lane 'print ">",join("_",@F),"\n$F[0]"' spatial.txt | minimap2 -k 15 -d spatial.miniref - 2> spatial.miniref.log &
126 [2]+ Running seqtk trimfq -L 30 Unmapped.out.mate2 | minimap2 -x sr spatial.miniref - -k15 -w10 -N1 -t8 -QL2c --eqx --cs --sr --end-bonus 200 --for-only -A4 -B0 -o Unmapped2.paf 2> Unmapped2.err &
128 zgrep -v 'cg:Z:30=' n175410.Unmapped.mate2.paf.gz|grep 'cg:Z' |less -S
130 For 230605Evo_mouse s03C2, [!]FastQ:[18057073]-notFound:[14454975] <=> Matched:[3602098]=Accepted:[3600463]+Skipped[1635].