import io
from collections import defaultdict

import zstandard as zstd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
# Group the sequences of a zstd-compressed GenBank file by STR locus.
# seqDat maps the 'STR locus name' of each record to a list of
# [label, sequence] pairs, where label is
# "<record.id>|<description minus the 'Homo sapiens microsatellite ' prefix,
# with spaces replaced by underscores>".
seqDat = defaultdict(list)

path = 'PRJNA380127.gbk.zst'
with open(path, 'rb') as fh:
    # Stream-decompress the file instead of decompressing it up front.
    dctx = zstd.ZstdDecompressor()
    stream_reader = dctx.stream_reader(fh)
    # The decompressor yields bytes; wrap it so the parser receives text.
    text_stream = io.TextIOWrapper(stream_reader, encoding='utf-8')
    for record in SeqIO.parse(text_stream, "genbank"):
        print(record.description)
        # Look the 'HumanSTR' structured comment up once and reuse it.
        datHumanSTR = record.annotations['structured_comment']['HumanSTR']
        print(datHumanSTR['STR locus name'])
        print(datHumanSTR['Length-based allele'])
        print(datHumanSTR['Bracketed repeat'])
        print(datHumanSTR['RefSeq Accession'])
        print(datHumanSTR['Chrom. Location'])

        short_description = (
            record.description
            .replace('Homo sapiens microsatellite ', '')
            .replace(' ', '_')
        )
        label = '|'.join([record.id, short_description])
        seqDat[datHumanSTR['STR locus name']].append([label, record.seq])
# https://www.ncbi.nlm.nih.gov/nuccore?term=380127%5BBioProject%5D

# NOTE(review): the loop headers and some print statements of this snippet
# were not visible in the source; they are reconstructed from the comments
# below -- confirm against the original script.

# get all sequence records for the specified genbank file
recs = list(SeqIO.parse("PRJNA380127.gbk", "genbank"))

# print the number of sequence records that were extracted
print(len(recs))

# print annotations for each sequence record
for rec in recs:
    print(rec.annotations)

# print the CDS sequence feature summary information for each feature in each
# sequence record
for rec in recs:
    feats = [feat for feat in rec.features if feat.type == "CDS"]
    for feat in feats:
        print(feat)
56 for record in SeqIO.parse("t.gb","genbank"):
61 >>> print(record.annotations['structured_comment']['HumanSTR'])
62 OrderedDict([('STR locus name', 'TPOX'), ('Length-based allele', '6'), ('Bracketed repeat', '[AATG]6'), ('Sequencing technology', 'ForenSeq, MiSeq FGx; PowerSeq Auto, MiSeq'), ('Coverage', '>30X'), ('Length-based tech.', 'PowerPlex Fusion, ABI3500xl'), ('Assembly', 'GRCh38 (GCF_000001405)'), ('Chromosome', '2'), ('RefSeq Accession', 'NC_000002.12'), ('Chrom. Location', '1489532..1489698'), ('Repeat Location', '1489653..1489684'), ('Cytogenetic Location', '2p25.3')])
67 Description: Homo sapiens microsatellite TPOX 6 [AATG]6 sequence
68 Database cross-references: BioProject:PRJNA380554
72 /data_file_division=PRI
74 /accessions=['MF044246']
76 /keywords=['STRSeq, STR, TPOX']
77 /source=Homo sapiens (human)
78 /organism=Homo sapiens
79 /taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
80 /references=[Reference(title='STRSeq: A catalog of sequence diversity at human identification Short Tandem Repeat loci', ...), Reference(title='Direct Submission', ...)]
81 /comment=Annotation ('bracketing') of the repeat region is consistent with
82 the guidance of the ISFG (International Society of Forensic
83 Genetics), PMID: 26844919. Lower case letters in the 'Bracketed
84 repeat' region below denote uncounted bases. The given
85 length-based allele value was determined using the designated
86 length-based technology. Variation in the length-based allele
87 between individuals or assays can result from indels in flanking
88 regions. The length of reported sequence is dependent on the assay
89 and the quality of the flanking sequence. This information is
90 provided as part of the STR Sequencing Project (STRseq), a
91 collaborative effort of the international forensic DNA community.
92 The purpose of this project is to facilitate the description of
93 sequence-based STR alleles. Additional resources can be found at
94 strseq.nist.gov. For questions or feedback, please contact
95 strseq@nist.gov. Allele frequency data can be accessed in the
96 strider.online database.
97 /structured_comment=OrderedDict([('HumanSTR', OrderedDict([('STR locus name', 'TPOX'), ('Length-based allele', '6'), ('Bracketed repeat', '[AATG]6'), ('Sequencing technology', 'ForenSeq, MiSeq FGx; PowerSeq Auto, MiSeq'), ('Coverage', '>30X'), ('Length-based tech.', 'PowerPlex Fusion, ABI3500xl'), ('Assembly', 'GRCh38 (GCF_000001405)'), ('Chromosome', '2'), ('RefSeq Accession', 'NC_000002.12'), ('Chrom. Location', '1489532..1489698'), ('Repeat Location', '1489653..1489684'), ('Cytogenetic Location', '2p25.3')]))])
98 Seq('TGGCCTGTGGGTCCCCCCATAGATCGTAAGCCCAGGAGGAAGGGCTGTGTTTCA...AAA', IUPACAmbiguousDNA())
100 LOCUS MF044246 159 bp DNA linear PRI 04-SEP-2018
101 DEFINITION Homo sapiens microsatellite TPOX 6 [AATG]6 sequence.
104 DBLINK BioProject: PRJNA380554
105 KEYWORDS STRSeq, STR, TPOX.
106 SOURCE Homo sapiens (human)
107 ORGANISM Homo sapiens
108 Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
109 Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
110 Catarrhini; Hominidae; Homo.
111 REFERENCE 1 (bases 1 to 159)
112 AUTHORS Gettings,K.B., Borsuk,L.A., Ballard,D., Bodner,M., Budowle,B.,
113 Devesse,L., King,J., Parson,W., Phillips,C. and Vallone,P.M.
114 TITLE STRSeq: A catalog of sequence diversity at human identification
115 Short Tandem Repeat loci
116 JOURNAL Forensic Sci Int Genet 31, 111-117 (2017)
118 REFERENCE 2 (bases 1 to 159)
120 TITLE Direct Submission
121 JOURNAL Submitted (04-MAY-2017) Applied Genetics Group, National Institute
122 of Standards and Technology, 100 Bureau Drive, MS-8314,
123 Gaithersburg, MD 20899, USA
124 COMMENT Annotation ('bracketing') of the repeat region is consistent with
125 the guidance of the ISFG (International Society of Forensic
126 Genetics), PMID: 26844919. Lower case letters in the 'Bracketed
127 repeat' region below denote uncounted bases. The given
128 length-based allele value was determined using the designated
129 length-based technology. Variation in the length-based allele
130 between individuals or assays can result from indels in flanking
131 regions. The length of reported sequence is dependent on the assay
132 and the quality of the flanking sequence. This information is
133 provided as part of the STR Sequencing Project (STRseq), a
134 collaborative effort of the international forensic DNA community.
135 The purpose of this project is to facilitate the description of
136 sequence-based STR alleles. Additional resources can be found at
137 strseq.nist.gov. For questions or feedback, please contact
138 strseq@nist.gov. Allele frequency data can be accessed in the
139 strider.online database.
142 STR locus name :: TPOX
143 Length-based allele :: 6
144 Bracketed repeat :: [AATG]6
145 Sequencing technology :: ForenSeq, MiSeq FGx; PowerSeq Auto, MiSeq
147 Length-based tech. :: PowerPlex Fusion, ABI3500xl
148 Assembly :: GRCh38 (GCF_000001405)
150 RefSeq Accession :: NC_000002.12
151 Chrom. Location :: 1489532..1489698
152 Repeat Location :: 1489653..1489684
153 Cytogenetic Location :: 2p25.3
155 FEATURES Location/Qualifiers
157 /organism="Homo sapiens"
158 /mol_type="genomic DNA"
159 /db_xref="taxon:9606"
161 /note="Promega PowerSeq Sequence"
162 misc_feature 120..150
163 /note="Illumina ForenSeq Sequence"
164 repeat_region 122..145
166 /satellite="microsatellite:TPOX"
168 1 tggcctgtgg gtccccccat agatcgtaag cccaggagga agggctgtgt ttcagggctg
169 61 tgatcactag cacccagaac cgtcgactgg cacagaacag gcacttaggg aaccctcact
170 121 gaatgaatga atgaatgaat gaatgtttgg gcaaataaa