2 from testlib
import testutil
, PygrTestProgram
4 import ConfigParser
, sys
, os
, string
5 from pygr
.mapping
import Collection
13 config
= ConfigParser
.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
14 config
.read([ os
.path
.join(os
.path
.expanduser('~'), '.pygrrc'), os
.path
.join(os
.path
.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
15 msaDir
= config
.get('megatests_hg18', 'msaDir')
16 seqDir
= config
.get('megatests_hg18', 'seqDir')
17 smallSampleKey
= config
.get('megatests_hg18', 'smallSampleKey')
18 testInputDB
= config
.get('megatests', 'testInputDB')
19 testInputDir
= config
.get('megatests', 'testInputDir')
20 testOutputBaseDir
= config
.get('megatests', 'testOutputBaseDir')
23 smallSamplePostfix
= '_' + smallSampleKey
25 smallSamplePostfix
= ''
27 ## msaDir CONTAINS PRE-BUILT NLMSA
28 ## seqDir CONTAINS GENOME ASSEMBLIES AND THEIR SEQDB FILES
29 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
30 ## exonAnnotFileName = 'Annotation_ConservedElement_Exons_hg18.txt'
31 ## intronAnnotFileName = 'Annotation_ConservedElement_Introns_hg18.txt'
32 ## stopAnnotFileName = 'Annotation_ConservedElement_Stop_hg18.txt'
33 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
35 # DIRECTIONARY FOR DOC STRING OF SEQDB
37 'anoCar1':' Lizard Genome (January 2007)',
38 'bosTau3':'Cow Genome (August 2006)',
39 'canFam2':'Dog Genome (May 2005)',
40 'cavPor2':'Guinea Pig (October 2005)',
41 'danRer4':'Zebrafish Genome (March 2006)',
42 'dasNov1':'Armadillo Genome (May 2005)',
43 'echTel1':'Tenrec Genome (July 2005)',
44 'eriEur1':'European Hedgehog (Junuary 2006)',
45 'equCab1':'Horse Genome (January 2007)',
46 'felCat3':'Cat Genome (March 2006)',
47 'fr2':'Fugu Genome (October 2004)',
48 'galGal3':'Chicken Genome (May 2006)',
49 'gasAcu1':'Stickleback Genome (February 2006)',
50 'hg18':'Human Genome (May 2006)',
51 'loxAfr1':'Elephant Genome (May 2005)',
52 'mm8':'Mouse Genome (March 2006)',
53 'monDom4':'Opossum Genome (January 2006)',
54 'ornAna1':'Platypus Genome (March 2007)',
55 'oryCun1':'Rabbit Genome (May 2005)',
56 'oryLat1':'Medaka Genome (April 2006)',
57 'otoGar1':'Bushbaby Genome (December 2006)',
58 'panTro2':'Chimpanzee Genome (March 2006)',
59 'rheMac2':'Rhesus Genome (January 2006)',
60 'rn4':'Rat Genome (November 2004)',
61 'sorAra1':'Shrew (Junuary 2006)',
62 'tetNig1':'Tetraodon Genome (February 2004)',
63 'tupBel1':'Tree Shrew (December 2006)',
64 'xenTro2':'X. tropicalis Genome (August 2005)'
67 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
68 msaSpeciesList
= ['anoCar1', 'bosTau3', 'canFam2', 'cavPor2', 'danRer4', 'dasNov1', 'echTel1', \
69 'equCab1', 'eriEur1', 'felCat3', 'fr2', 'galGal3', 'gasAcu1', 'hg18', 'loxAfr1', \
70 'mm8', 'monDom4', 'ornAna1', 'oryCun1', 'oryLat1', 'otoGar1', 'panTro2', 'rheMac2', \
71 'rn4', 'sorAra1', 'tetNig1', 'tupBel1', 'xenTro2']
73 class PygrBuildNLMSAMegabase(unittest
.TestCase
):
74 'restrict megatest to an initially empty directory, need large space to perform'
75 def setUp(self
, testDir
= None):
77 tmpList
= [c
for c
in 'PygrBuildNLMSAMegabase']
78 random
.shuffle(tmpList
)
79 testDir
= os
.path
.join(testOutputBaseDir
, 'TEST_' + ''.join(tmpList
)) # FOR TEST, SHOULD BE DELETED
80 if testDir
is None: testDir
= 'TEST_' + ''.join(tmpList
) # NOT SPECIFIED, USE CURRENT DIRECTORY
83 testDir
= os
.path
.realpath(testDir
)
88 tmpFileName
= os
.path
.join(testDir
, 'DELETE_THIS_TEMP_FILE')
89 open(tmpFileName
, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
92 pygr
.Data
.update(self
.path
)
93 from pygr
import seqdb
94 for orgstr
in msaSpeciesList
:
95 genome
= seqdb
.BlastDB(os
.path
.join(seqDir
, orgstr
))
96 genome
.__doc
__ = docStringDict
[orgstr
]
97 pygr
.Data
.addResource('TEST.Seq.Genome.' + orgstr
, genome
)
99 def copyFile(self
, filename
): # COPY A FILE INTO TEST DIRECTORY
100 newname
= os
.path
.join(self
.path
, os
.path
.basename(filename
))
101 open(newname
, 'w').write(open(filename
, 'r').read())
104 'delete the temporary directory and files, restore pygr.Data path'
105 for dirpath
, subdirs
, files
in os
.walk(self
.path
, topdown
= False): # SHOULD BE DELETED BOTTOM-UP FASHION
106 # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
107 # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
108 for filename
in files
:
109 os
.remove(os
.path
.join(dirpath
, filename
))
111 # Restore original pygr.Data path to remedy lack of isolation
112 # between tests from the same run
113 pygr
.Data
.update(None)
116 class Build_Test(PygrBuildNLMSAMegabase
):
117 def test_seqdb(self
):
118 'Check pygr.Data contents'
119 l
= pygr
.Data
.dir('TEST')
120 preList
= ['TEST.Seq.Genome.' + orgstr
for orgstr
in msaSpeciesList
]
122 def test_collectionannot(self
):
123 'Test building an AnnotationDB from file'
124 from pygr
import seqdb
, cnestedlist
, sqlgraph
125 hg18
= pygr
.Data
.getResource('TEST.Seq.Genome.hg18')
126 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
127 exon_slices
= Collection(filename
= os
.path
.join(self
.path
, 'refGene_exonAnnot_hg18.cdb'), \
128 intKeys
= True, mode
= 'cr', writeback
= False)
129 exon_db
= seqdb
.AnnotationDB(exon_slices
, hg18
,
130 sliceAttrDict
= dict(id = 0, exon_id
= 1, orientation
= 2,
131 gene_id
= 3, start
= 4, stop
= 5))
132 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_exonAnnot_hg18'), 'w', \
133 pairwiseMode
= True, bidirectional
= False)
134 for lines
in open(os
.path
.join(testInputDir
, 'refGene_exonAnnot%s_hg18.txt' % smallSamplePostfix
), 'r').xreadlines():
135 row
= [x
for x
in lines
.split('\t')] # CONVERT TO LIST SO MUTABLE
136 row
[1] = int(row
[1]) # CONVERT FROM STRING TO INTEGER
137 exon_slices
[row
[1]] = row
138 exon
= exon_db
[row
[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
139 msa
.addAnnotation(exon
) # SAVE IT TO GENOME MAPPING
140 exon_db
.clear_cache() # not really necessary; cache should autoGC
141 exon_slices
.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
142 msa
.build() # FINALIZE GENOME ALIGNMENT INDEXES
143 exon_db
.__doc
__ = 'Exon Annotation Database for hg18'
144 pygr
.Data
.addResource('TEST.Annotation.hg18.exons', exon_db
)
145 msa
.__doc
__ = 'NLMSA Exon for hg18'
146 pygr
.Data
.addResource('TEST.Annotation.NLMSA.hg18.exons', msa
)
147 exon_schema
= pygr
.Data
.ManyToManyRelation(hg18
, exon_db
, bindAttrs
= ('exon1',))
148 exon_schema
.__doc
__ = 'Exon Schema for hg18'
149 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.hg18.exons', exon_schema
)
150 # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
151 splice_slices
= Collection(filename
= os
.path
.join(self
.path
, 'refGene_spliceAnnot_hg18.cdb'), \
152 intKeys
= True, mode
= 'cr', writeback
= False)
153 splice_db
= seqdb
.AnnotationDB(splice_slices
, hg18
,
154 sliceAttrDict
= dict(id = 0, splice_id
= 1, orientation
= 2,
155 gene_id
= 3, start
= 4, stop
= 5))
156 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_spliceAnnot_hg18'), 'w', \
157 pairwiseMode
= True, bidirectional
= False)
158 for lines
in open(os
.path
.join(testInputDir
, 'refGene_spliceAnnot%s_hg18.txt' % smallSamplePostfix
), 'r').xreadlines():
159 row
= [x
for x
in lines
.split('\t')] # CONVERT TO LIST SO MUTABLE
160 row
[1] = int(row
[1]) # CONVERT FROM STRING TO INTEGER
161 splice_slices
[row
[1]] = row
162 splice
= splice_db
[row
[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
163 msa
.addAnnotation(splice
) # SAVE IT TO GENOME MAPPING
164 splice_db
.clear_cache() # not really necessary; cache should autoGC
165 splice_slices
.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
166 msa
.build() # FINALIZE GENOME ALIGNMENT INDEXES
167 splice_db
.__doc
__ = 'Splice Annotation Database for hg18'
168 pygr
.Data
.addResource('TEST.Annotation.hg18.splices', splice_db
)
169 msa
.__doc
__ = 'NLMSA Splice for hg18'
170 pygr
.Data
.addResource('TEST.Annotation.NLMSA.hg18.splices', msa
)
171 splice_schema
= pygr
.Data
.ManyToManyRelation(hg18
, splice_db
, bindAttrs
= ('splice1',))
172 splice_schema
.__doc
__ = 'Splice Schema for hg18'
173 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.hg18.splices', splice_schema
)
174 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
175 cds_slices
= Collection(filename
= os
.path
.join(self
.path
, 'refGene_cdsAnnot_hg18.cdb'), \
176 intKeys
= True, mode
= 'cr', writeback
= False)
177 cds_db
= seqdb
.AnnotationDB(cds_slices
, hg18
,
178 sliceAttrDict
= dict(id = 0, cds_id
= 1, orientation
= 2,
179 gene_id
= 3, start
= 4, stop
= 5))
180 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_cdsAnnot_hg18'), 'w', \
181 pairwiseMode
= True, bidirectional
= False)
182 for lines
in open(os
.path
.join(testInputDir
, 'refGene_cdsAnnot%s_hg18.txt' % smallSamplePostfix
), 'r').xreadlines():
183 row
= [x
for x
in lines
.split('\t')] # CONVERT TO LIST SO MUTABLE
184 row
[1] = int(row
[1]) # CONVERT FROM STRING TO INTEGER
185 cds_slices
[row
[1]] = row
186 cds
= cds_db
[row
[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
187 msa
.addAnnotation(cds
) # SAVE IT TO GENOME MAPPING
188 cds_db
.clear_cache() # not really necessary; cache should autoGC
189 cds_slices
.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
190 msa
.build() # FINALIZE GENOME ALIGNMENT INDEXES
191 cds_db
.__doc
__ = 'CDS Annotation Database for hg18'
192 pygr
.Data
.addResource('TEST.Annotation.hg18.cdss', cds_db
)
193 msa
.__doc
__ = 'NLMSA CDS for hg18'
194 pygr
.Data
.addResource('TEST.Annotation.NLMSA.hg18.cdss', msa
)
195 cds_schema
= pygr
.Data
.ManyToManyRelation(hg18
, cds_db
, bindAttrs
= ('cds1',))
196 cds_schema
.__doc
__ = 'CDS Schema for hg18'
197 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.hg18.cdss', cds_schema
)
198 # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
199 ucsc_slices
= Collection(filename
= os
.path
.join(self
.path
, 'phastConsElements28way_hg18.cdb'), \
200 intKeys
= True, mode
= 'cr', writeback
= False)
201 ucsc_db
= seqdb
.AnnotationDB(ucsc_slices
, hg18
,
202 sliceAttrDict
= dict(id = 0, ucsc_id
= 1, orientation
= 2,
203 gene_id
= 3, start
= 4, stop
= 5))
204 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'phastConsElements28way_hg18'), 'w', \
205 pairwiseMode
= True, bidirectional
= False)
206 for lines
in open(os
.path
.join(testInputDir
, 'phastConsElements28way%s_hg18.txt' % smallSamplePostfix
), 'r').xreadlines():
207 row
= [x
for x
in lines
.split('\t')] # CONVERT TO LIST SO MUTABLE
208 row
[1] = int(row
[1]) # CONVERT FROM STRING TO INTEGER
209 ucsc_slices
[row
[1]] = row
210 ucsc
= ucsc_db
[row
[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
211 msa
.addAnnotation(ucsc
) # SAVE IT TO GENOME MAPPING
212 ucsc_db
.clear_cache() # not really necessary; cache should autoGC
213 ucsc_slices
.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
214 msa
.build() # FINALIZE GENOME ALIGNMENT INDEXES
215 ucsc_db
.__doc
__ = 'Most Conserved Elements for hg18'
216 pygr
.Data
.addResource('TEST.Annotation.UCSC.hg18.mostconserved', ucsc_db
)
217 msa
.__doc
__ = 'NLMSA for Most Conserved Elements for hg18'
218 pygr
.Data
.addResource('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved', msa
)
219 ucsc_schema
= pygr
.Data
.ManyToManyRelation(hg18
, ucsc_db
, bindAttrs
= ('element1',))
220 ucsc_schema
.__doc
__ = 'Schema for UCSC Most Conserved Elements for hg18'
221 pygr
.Data
.addSchema('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved', ucsc_schema
)
222 # BUILD ANNOTATION DATABASE FOR SNP126 FROM UCSC
223 snp_slices
= Collection(filename
= os
.path
.join(self
.path
, 'snp126_hg18.cdb'), \
224 intKeys
= True, protocol
= 2, mode
= 'cr', writeback
= False)
225 snp_db
= seqdb
.AnnotationDB(snp_slices
, hg18
,
226 sliceAttrDict
= dict(id = 0, snp_id
= 1, orientation
= 2, gene_id
= 3, start
= 4,
227 stop
= 5, score
= 6, ref_NCBI
= 7, ref_UCSC
= 8, observed
= 9,
228 molType
= 10, myClass
= 11, myValid
= 12, avHet
= 13, avHetSE
= 14,
229 myFunc
= 15, locType
= 16, myWeight
= 17))
230 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'snp126_hg18'), 'w', \
231 pairwiseMode
= True, bidirectional
= False)
232 for lines
in open(os
.path
.join(testInputDir
, 'snp126%s_hg18.txt' % smallSamplePostfix
), 'r').xreadlines():
233 row
= [x
for x
in lines
.split('\t')] # CONVERT TO LIST SO MUTABLE
234 row
[1] = int(row
[1]) # CONVERT FROM STRING TO INTEGER
235 snp_slices
[row
[1]] = row
236 snp
= snp_db
[row
[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
237 msa
.addAnnotation(snp
) # SAVE IT TO GENOME MAPPING
238 snp_db
.clear_cache() # not really necessary; cache should autoGC
239 snp_slices
.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
240 msa
.build() # FINALIZE GENOME ALIGNMENT INDEXES
241 snp_db
.__doc
__ = 'SNP126 for hg18'
242 pygr
.Data
.addResource('TEST.Annotation.UCSC.hg18.snp126', snp_db
)
243 msa
.__doc
__ = 'NLMSA for SNP126 for hg18'
244 pygr
.Data
.addResource('TEST.Annotation.UCSC.NLMSA.hg18.snp126', msa
)
245 snp_schema
= pygr
.Data
.ManyToManyRelation(hg18
, snp_db
, bindAttrs
= ('snp1',))
246 snp_schema
.__doc
__ = 'Schema for UCSC SNP126 for hg18'
247 pygr
.Data
.addSchema('TEST.Annotation.UCSC.NLMSA.hg18.snp126', snp_schema
)
249 pygr
.Data
.clear_cache()
251 # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
252 hg18
= pygr
.Data
.getResource('TEST.Seq.Genome.hg18')
253 exonmsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.hg18.exons')
254 splicemsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.hg18.splices')
255 conservedmsa
= pygr
.Data
.getResource('TEST.Annotation.UCSC.NLMSA.hg18.mostconserved')
256 snpmsa
= pygr
.Data
.getResource('TEST.Annotation.UCSC.NLMSA.hg18.snp126')
257 cdsmsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.hg18.cdss')
258 exons
= pygr
.Data
.getResource('TEST.Annotation.hg18.exons')
259 splices
= pygr
.Data
.getResource('TEST.Annotation.hg18.splices')
260 mostconserved
= pygr
.Data
.getResource('TEST.Annotation.UCSC.hg18.mostconserved')
261 snp126
= pygr
.Data
.getResource('TEST.Annotation.UCSC.hg18.snp126')
262 cdss
= pygr
.Data
.getResource('TEST.Annotation.hg18.cdss')
264 # OPEN hg18_MULTIZ28WAY NLMSA
265 msa
= cnestedlist
.NLMSA(os
.path
.join(msaDir
, 'hg18_multiz28way'), 'r', trypath
= [seqDir
])
267 exonAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Exons%s_hg18.txt' % smallSamplePostfix
)
268 intronAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Introns%s_hg18.txt' % smallSamplePostfix
)
269 stopAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Stop%s_hg18.txt' % smallSamplePostfix
)
270 newexonAnnotFileName
= os
.path
.join(self
.path
, 'new_Exons_hg18.txt')
271 newintronAnnotFileName
= os
.path
.join(self
.path
, 'new_Introns_hg18.txt')
272 newstopAnnotFileName
= os
.path
.join(self
.path
, 'new_stop_hg18.txt')
273 tmpexonAnnotFileName
= self
.copyFile(exonAnnotFileName
)
274 tmpintronAnnotFileName
= self
.copyFile(intronAnnotFileName
)
275 tmpstopAnnotFileName
= self
.copyFile(stopAnnotFileName
)
278 chrList
= [ smallSampleKey
]
280 chrList
= hg18
.seqLenDict
.keys()
283 outfile
= open(newexonAnnotFileName
, 'w')
284 for chrid
in chrList
:
286 # EXON ANNOTATION DATABASE
292 exlist1
= [(ix
.exon_id
, ix
) for ix
in ex1
.keys()]
294 for ixx
, exon
in exlist1
:
297 tmpexon
= exons
[exon
.exon_id
]
298 tmpslice
= tmpexon
.sequence
# FOR REAL EXON COORDINATE
299 wlist1
= 'EXON', chrid
, tmpexon
.exon_id
, tmpexon
.gene_id
, tmpslice
.start
, tmpslice
.stop
301 out1
= conservedmsa
[tmp
]
305 elementlist
= [(ix
.ucsc_id
, ix
) for ix
in out1
.keys()]
307 for iyy
, element
in elementlist
:
308 if element
.stop
- element
.start
< 100: continue
309 score
= int(string
.split(element
.gene_id
, '=')[1])
310 if score
< 100: continue
311 tmp2
= element
.sequence
312 tmpelement
= mostconserved
[element
.ucsc_id
]
313 tmpslice2
= tmpelement
.sequence
# FOR REAL ELEMENT COORDINATE
314 wlist2
= wlist1
+ (tmpelement
.ucsc_id
, tmpelement
.gene_id
, tmpslice2
.start
, tmpslice2
.stop
)
315 slicestart
, sliceend
= max(tmp
.start
, tmp2
.start
), min(tmp
.stop
, tmp2
.stop
)
316 if slicestart
< 0 or sliceend
< 0: sys
.exit('wrong query')
317 tmp1
= msa
.seqDict
['hg18.' + chrid
][slicestart
:sliceend
]
318 edges
= msa
[tmp1
].edges()
319 for src
, dest
, e
in edges
:
320 if src
.stop
- src
.start
< 100: continue
321 palign
, pident
= e
.pAligned(), e
.pIdentity()
322 if palign
< 0.8 or pident
< 0.8: continue
323 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
324 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
325 (~msa
.seqDict
)[dest
], \
326 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
327 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
329 for saveline
in saveList
:
330 outfile
.write(saveline
)
332 md5old
= hashlib
.md5()
333 md5old
.update(open(tmpexonAnnotFileName
, 'r').read())
334 md5new
= hashlib
.md5()
335 md5new
.update(open(newexonAnnotFileName
, 'r').read())
336 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
338 outfile
= open(newintronAnnotFileName
, 'w')
339 for chrid
in chrList
:
341 # SPLICE ANNOTATION DATABASE
343 sp1
= splicemsa
[slice]
347 splist1
= [(ix
.splice_id
, ix
) for ix
in sp1
.keys()]
349 for ixx
, splice
in splist1
:
351 tmp
= splice
.sequence
352 tmpsplice
= splices
[splice
.splice_id
]
353 tmpslice
= tmpsplice
.sequence
# FOR REAL EXON COORDINATE
354 wlist1
= 'INTRON', chrid
, tmpsplice
.splice_id
, tmpsplice
.gene_id
, tmpslice
.start
, tmpslice
.stop
356 out1
= conservedmsa
[tmp
]
360 elementlist
= [(ix
.ucsc_id
, ix
) for ix
in out1
.keys()]
362 for iyy
, element
in elementlist
:
363 if element
.stop
- element
.start
< 100: continue
364 score
= int(string
.split(element
.gene_id
, '=')[1])
365 if score
< 100: continue
366 tmp2
= element
.sequence
367 tmpelement
= mostconserved
[element
.ucsc_id
]
368 tmpslice2
= tmpelement
.sequence
# FOR REAL ELEMENT COORDINATE
369 wlist2
= wlist1
+ (tmpelement
.ucsc_id
, tmpelement
.gene_id
, tmpslice2
.start
, tmpslice2
.stop
)
370 slicestart
, sliceend
= max(tmp
.start
, tmp2
.start
), min(tmp
.stop
, tmp2
.stop
)
371 if slicestart
< 0 or sliceend
< 0: sys
.exit('wrong query')
372 tmp1
= msa
.seqDict
['hg18.' + chrid
][slicestart
:sliceend
]
373 edges
= msa
[tmp1
].edges()
374 for src
, dest
, e
in edges
:
375 if src
.stop
- src
.start
< 100: continue
376 palign
, pident
= e
.pAligned(), e
.pIdentity()
377 if palign
< 0.8 or pident
< 0.8: continue
378 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
379 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
380 (~msa
.seqDict
)[dest
], \
381 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
382 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
384 for saveline
in saveList
:
385 outfile
.write(saveline
)
386 # SNP IN SPLICE SITES
396 gtlist
= gtout
.keys()
397 aglist
= agout
.keys()
399 tmpsnp
= snp
.sequence
400 annsnp
= snp126
[snp
.snp_id
]
401 wlist2
= ('SNP5', chrid
, tmpsplice
.gene_id
, gt
.start
, gt
.stop
, str(gt
)) \
402 + (annsnp
.snp_id
, tmpsnp
.start
, tmpsnp
.stop
, \
403 str(tmpsnp
), annsnp
.gene_id
, annsnp
.ref_NCBI
, annsnp
.ref_UCSC
, \
404 annsnp
.observed
, annsnp
.molType
, \
405 annsnp
.myClass
, annsnp
.myValid
)
406 tmp1
= msa
.seqDict
['hg18.' + chrid
][abs(gt
.start
):abs(gt
.stop
)]
407 edges
= msa
[tmp1
].edges()
408 for src
, dest
, e
in edges
:
409 if src
.stop
- src
.start
!= 2 or dest
.stop
- dest
.start
!= 2: continue
410 palign
, pident
= e
.pAligned(), e
.pIdentity()
411 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
412 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
413 (~msa
.seqDict
)[dest
], \
414 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
415 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
417 tmpsnp
= snp
.sequence
418 annsnp
= snp126
[snp
.snp_id
]
419 wlist2
= ('SNP3', chrid
, tmpsplice
.gene_id
, ag
.start
, ag
.stop
, str(ag
)) \
420 + (annsnp
.snp_id
, tmpsnp
.start
, tmpsnp
.stop
, \
421 str(tmpsnp
), annsnp
.gene_id
, annsnp
.ref_NCBI
, annsnp
.ref_UCSC
, \
422 annsnp
.observed
, annsnp
.molType
, \
423 annsnp
.myClass
, annsnp
.myValid
)
424 tmp1
= msa
.seqDict
['hg18.' + chrid
][abs(ag
.start
):abs(ag
.stop
)]
425 edges
= msa
[tmp1
].edges()
426 for src
, dest
, e
in edges
:
427 if src
.stop
- src
.start
!= 2 or dest
.stop
- dest
.start
!= 2: continue
428 palign
, pident
= e
.pAligned(), e
.pIdentity()
429 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
430 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
431 (~msa
.seqDict
)[dest
], \
432 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
433 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
435 for saveline
in saveList
:
436 outfile
.write(saveline
)
438 md5old
= hashlib
.md5()
439 md5old
.update(open(tmpintronAnnotFileName
, 'r').read())
440 md5new
= hashlib
.md5()
441 md5new
.update(open(newintronAnnotFileName
, 'r').read())
442 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
444 outfile
= open(newstopAnnotFileName
, 'w')
445 for chrid
in chrList
:
447 # STOP ANNOTATION DATABASE
453 cdslist1
= [(ix
.cds_id
, ix
) for ix
in cds1
.keys()]
455 for ixx
, cds
in cdslist1
:
458 tmpcds
= cdss
[cds
.cds_id
]
459 tmpslice
= tmpcds
.sequence
# FOR REAL EXON COORDINATE
460 wlist1
= 'STOP', chrid
, tmpcds
.cds_id
, tmpcds
.gene_id
, tmpslice
.start
, tmpslice
.stop
461 if tmpslice
.start
< 0:
462 stopstart
, stopend
= -tmpslice
.stop
, -tmpslice
.start
463 stop
= -hg18
[chrid
][stopstart
:stopstart
+3]
465 stopstart
, stopend
= tmpslice
.start
, tmpslice
.stop
466 stop
= hg18
[chrid
][stopend
-3:stopend
]
467 if str(stop
).upper() not in ('TAA', 'TAG', 'TGA'): continue
473 snplist
= [(ix
.snp_id
, ix
) for ix
in snp1
.keys()]
475 for iyy
, snp
in snplist
:
476 tmpsnp
= snp
.sequence
477 annsnp
= snp126
[snp
.snp_id
]
478 wlist2
= wlist1
+ (str(stop
), stop
.start
, stop
.stop
) \
479 + (annsnp
.snp_id
, tmpsnp
.start
, tmpsnp
.stop
, \
480 str(tmpsnp
), annsnp
.gene_id
, annsnp
.ref_NCBI
, annsnp
.ref_UCSC
, \
481 annsnp
.observed
, annsnp
.molType
, \
482 annsnp
.myClass
, annsnp
.myValid
)
483 if tmpslice
.start
< 0:
484 tmp1
= -msa
.seqDict
['hg18.' + chrid
][stopstart
:stopstart
+3]
486 tmp1
= msa
.seqDict
['hg18.' + chrid
][stopend
-3:stopend
]
487 edges
= msa
[tmp1
].edges()
488 for src
, dest
, e
in edges
:
489 if src
.stop
- src
.start
!= 3 or dest
.stop
- dest
.start
!= 3: continue
490 palign
, pident
= e
.pAligned(), e
.pIdentity()
491 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
492 if str(dest
).upper() not in ('TAA', 'TAG', 'TGA'): nonstr
= 'NONSENSE'
493 else: nonstr
= 'STOP'
494 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
495 (~msa
.seqDict
)[dest
], \
496 str(dest
), dest
.start
, dest
.stop
, palign
, pident
, nonstr
)
497 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
499 for saveline
in saveList
:
500 outfile
.write(saveline
)
502 md5old
= hashlib
.md5()
503 md5old
.update(open(tmpstopAnnotFileName
, 'r').read())
504 md5new
= hashlib
.md5()
505 md5new
.update(open(newstopAnnotFileName
, 'r').read())
506 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
508 def test_mysqlannot(self
):
509 'Test building an AnnotationDB from MySQL'
510 from pygr
import seqdb
, cnestedlist
, sqlgraph
511 hg18
= pygr
.Data
.getResource('TEST.Seq.Genome.hg18')
512 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
513 exon_slices
= sqlgraph
.SQLTableClustered('%s.pygr_refGene_exonAnnot%s_hg18' % ( testInputDB
, smallSamplePostfix
),
514 clusterKey
= 'chromosome', maxCache
= 0)
515 exon_db
= seqdb
.AnnotationDB(exon_slices
, hg18
, sliceAttrDict
= dict(id = 'chromosome', \
516 gene_id
= 'name', exon_id
= 'exon_id'))
517 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_exonAnnot_SQL_hg18'), 'w', \
518 pairwiseMode
= True, bidirectional
= False)
520 msa
.addAnnotation(exon_db
[id])
521 exon_db
.clear_cache() # not really necessary; cache should autoGC
522 exon_slices
.clear_cache()
524 exon_db
.__doc
__ = 'SQL Exon Annotation Database for hg18'
525 pygr
.Data
.addResource('TEST.Annotation.SQL.hg18.exons', exon_db
)
526 msa
.__doc
__ = 'SQL NLMSA Exon for hg18'
527 pygr
.Data
.addResource('TEST.Annotation.NLMSA.SQL.hg18.exons', msa
)
528 exon_schema
= pygr
.Data
.ManyToManyRelation(hg18
, exon_db
, bindAttrs
= ('exon2',))
529 exon_schema
.__doc
__ = 'SQL Exon Schema for hg18'
530 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.SQL.hg18.exons', exon_schema
)
531 # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
532 splice_slices
= sqlgraph
.SQLTableClustered('%s.pygr_refGene_spliceAnnot%s_hg18' % ( testInputDB
, smallSamplePostfix
),
533 clusterKey
= 'chromosome', maxCache
= 0)
534 splice_db
= seqdb
.AnnotationDB(splice_slices
, hg18
, sliceAttrDict
= dict(id = 'chromosome', \
535 gene_id
= 'name', splice_id
= 'splice_id'))
536 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_spliceAnnot_SQL_hg18'), 'w', \
537 pairwiseMode
= True, bidirectional
= False)
539 msa
.addAnnotation(splice_db
[id])
540 splice_db
.clear_cache() # not really necessary; cache should autoGC
541 splice_slices
.clear_cache()
543 splice_db
.__doc
__ = 'SQL Splice Annotation Database for hg18'
544 pygr
.Data
.addResource('TEST.Annotation.SQL.hg18.splices', splice_db
)
545 msa
.__doc
__ = 'SQL NLMSA Splice for hg18'
546 pygr
.Data
.addResource('TEST.Annotation.NLMSA.SQL.hg18.splices', msa
)
547 splice_schema
= pygr
.Data
.ManyToManyRelation(hg18
, splice_db
, bindAttrs
= ('splice2',))
548 splice_schema
.__doc
__ = 'SQL Splice Schema for hg18'
549 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.SQL.hg18.splices', splice_schema
)
550 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
551 cds_slices
= sqlgraph
.SQLTableClustered('%s.pygr_refGene_cdsAnnot%s_hg18' % ( testInputDB
, smallSamplePostfix
),
552 clusterKey
= 'chromosome', maxCache
= 0)
553 cds_db
= seqdb
.AnnotationDB(cds_slices
, hg18
, sliceAttrDict
= dict(id = 'chromosome', \
554 gene_id
= 'name', cds_id
= 'cds_id'))
555 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_cdsAnnot_SQL_hg18'), 'w', \
556 pairwiseMode
= True, bidirectional
= False)
558 msa
.addAnnotation(cds_db
[id])
559 cds_db
.clear_cache() # not really necessary; cache should autoGC
560 cds_slices
.clear_cache()
562 cds_db
.__doc
__ = 'SQL CDS Annotation Database for hg18'
563 pygr
.Data
.addResource('TEST.Annotation.SQL.hg18.cdss', cds_db
)
564 msa
.__doc
__ = 'SQL NLMSA CDS for hg18'
565 pygr
.Data
.addResource('TEST.Annotation.NLMSA.SQL.hg18.cdss', msa
)
566 cds_schema
= pygr
.Data
.ManyToManyRelation(hg18
, cds_db
, bindAttrs
= ('cds2',))
567 cds_schema
.__doc
__ = 'SQL CDS Schema for hg18'
568 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.SQL.hg18.cdss', cds_schema
)
569 # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC: MYSQL VERSION
570 ucsc_slices
= sqlgraph
.SQLTableClustered('%s.pygr_phastConsElements28way%s_hg18' % ( testInputDB
, smallSamplePostfix
),
571 clusterKey
= 'chromosome', maxCache
= 0)
572 ucsc_db
= seqdb
.AnnotationDB(ucsc_slices
, hg18
, sliceAttrDict
= dict(id = 'chromosome', \
573 gene_id
= 'name', ucsc_id
= 'ucsc_id'))
574 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'phastConsElements28way_SQL_hg18'), 'w', \
575 pairwiseMode
= True, bidirectional
= False)
577 msa
.addAnnotation(ucsc_db
[id])
578 ucsc_db
.clear_cache() # not really necessary; cache should autoGC
579 ucsc_slices
.clear_cache()
581 ucsc_db
.__doc
__ = 'SQL Most Conserved Elements for hg18'
582 pygr
.Data
.addResource('TEST.Annotation.UCSC.SQL.hg18.mostconserved', ucsc_db
)
583 msa
.__doc
__ = 'SQL NLMSA for Most Conserved Elements for hg18'
584 pygr
.Data
.addResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved', msa
)
585 ucsc_schema
= pygr
.Data
.ManyToManyRelation(hg18
, ucsc_db
, bindAttrs
= ('element2',))
586 ucsc_schema
.__doc
__ = 'SQL Schema for UCSC Most Conserved Elements for hg18'
587 pygr
.Data
.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved', ucsc_schema
)
588 # BUILD ANNOTATION DATABASE FOR SNP126 FROM UCSC: MYSQL VERSION
589 snp_slices
= sqlgraph
.SQLTableClustered('%s.pygr_snp126%s_hg18' % ( testInputDB
, smallSamplePostfix
),
590 clusterKey
= 'clusterKey', maxCache
= 0)
591 snp_db
= seqdb
.AnnotationDB(snp_slices
, hg18
, sliceAttrDict
= dict(id = 'chromosome', gene_id
= 'name',
592 snp_id
= 'snp_id', score
= 'score', ref_NCBI
= 'ref_NCBI', ref_UCSC
= 'ref_UCSC',
593 observed
= 'observed', molType
= 'molType', myClass
= 'myClass', myValid
= 'myValid',
594 avHet
= 'avHet', avHetSE
= 'avHetSE', myFunc
= 'myFunc', locType
= 'locType',
595 myWeight
= 'myWeight'))
596 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'snp126_SQL_hg18'), 'w', \
597 pairwiseMode
= True, bidirectional
= False)
599 msa
.addAnnotation(snp_db
[id])
600 snp_db
.clear_cache() # not really necessary; cache should autoGC
601 snp_slices
.clear_cache()
603 snp_db
.__doc
__ = 'SQL SNP126 for hg18'
604 pygr
.Data
.addResource('TEST.Annotation.UCSC.SQL.hg18.snp126', snp_db
)
605 msa
.__doc
__ = 'SQL NLMSA for SNP126 for hg18'
606 pygr
.Data
.addResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126', msa
)
607 snp_schema
= pygr
.Data
.ManyToManyRelation(hg18
, snp_db
, bindAttrs
= ('snp2',))
608 snp_schema
.__doc
__ = 'SQL Schema for UCSC SNP126 for hg18'
609 pygr
.Data
.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126', snp_schema
)
611 pygr
.Data
.clear_cache()
613 # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
614 hg18
= pygr
.Data
.getResource('TEST.Seq.Genome.hg18')
615 exonmsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.SQL.hg18.exons')
616 splicemsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.SQL.hg18.splices')
617 conservedmsa
= pygr
.Data
.getResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.mostconserved')
618 snpmsa
= pygr
.Data
.getResource('TEST.Annotation.UCSC.NLMSA.SQL.hg18.snp126')
619 cdsmsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.SQL.hg18.cdss')
620 exons
= pygr
.Data
.getResource('TEST.Annotation.SQL.hg18.exons')
621 splices
= pygr
.Data
.getResource('TEST.Annotation.SQL.hg18.splices')
622 mostconserved
= pygr
.Data
.getResource('TEST.Annotation.UCSC.SQL.hg18.mostconserved')
623 snp126
= pygr
.Data
.getResource('TEST.Annotation.UCSC.SQL.hg18.snp126')
624 cdss
= pygr
.Data
.getResource('TEST.Annotation.SQL.hg18.cdss')
626 # OPEN hg18_MULTIZ28WAY NLMSA
627 msa
= cnestedlist
.NLMSA(os
.path
.join(msaDir
, 'hg18_multiz28way'), 'r', trypath
= [seqDir
])
629 exonAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Exons%s_hg18.txt' % smallSamplePostfix
)
630 intronAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Introns%s_hg18.txt' % smallSamplePostfix
)
631 stopAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Stop%s_hg18.txt' % smallSamplePostfix
)
632 newexonAnnotFileName
= os
.path
.join(self
.path
, 'new_Exons_hg18.txt')
633 newintronAnnotFileName
= os
.path
.join(self
.path
, 'new_Introns_hg18.txt')
634 newstopAnnotFileName
= os
.path
.join(self
.path
, 'new_stop_hg18.txt')
635 tmpexonAnnotFileName
= self
.copyFile(exonAnnotFileName
)
636 tmpintronAnnotFileName
= self
.copyFile(intronAnnotFileName
)
637 tmpstopAnnotFileName
= self
.copyFile(stopAnnotFileName
)
640 chrList
= [ smallSampleKey
]
642 chrList
= hg18
.seqLenDict
.keys()
645 outfile
= open(newexonAnnotFileName
, 'w')
646 for chrid
in chrList
:
648 # EXON ANNOTATION DATABASE
654 exlist1
= [(ix
.exon_id
, ix
) for ix
in ex1
.keys()]
656 for ixx
, exon
in exlist1
:
659 tmpexon
= exons
[exon
.exon_id
]
660 tmpslice
= tmpexon
.sequence
# FOR REAL EXON COORDINATE
661 wlist1
= 'EXON', chrid
, tmpexon
.exon_id
, tmpexon
.gene_id
, tmpslice
.start
, tmpslice
.stop
663 out1
= conservedmsa
[tmp
]
667 elementlist
= [(ix
.ucsc_id
, ix
) for ix
in out1
.keys()]
669 for iyy
, element
in elementlist
:
670 if element
.stop
- element
.start
< 100: continue
671 score
= int(string
.split(element
.gene_id
, '=')[1])
672 if score
< 100: continue
673 tmp2
= element
.sequence
674 tmpelement
= mostconserved
[element
.ucsc_id
]
675 tmpslice2
= tmpelement
.sequence
# FOR REAL ELEMENT COORDINATE
676 wlist2
= wlist1
+ (tmpelement
.ucsc_id
, tmpelement
.gene_id
, tmpslice2
.start
, tmpslice2
.stop
)
677 slicestart
, sliceend
= max(tmp
.start
, tmp2
.start
), min(tmp
.stop
, tmp2
.stop
)
678 if slicestart
< 0 or sliceend
< 0: sys
.exit('wrong query')
679 tmp1
= msa
.seqDict
['hg18.' + chrid
][slicestart
:sliceend
]
680 edges
= msa
[tmp1
].edges()
681 for src
, dest
, e
in edges
:
682 if src
.stop
- src
.start
< 100: continue
683 palign
, pident
= e
.pAligned(), e
.pIdentity()
684 if palign
< 0.8 or pident
< 0.8: continue
685 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
686 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
687 (~msa
.seqDict
)[dest
], \
688 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
689 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
691 for saveline
in saveList
:
692 outfile
.write(saveline
)
694 md5old
= hashlib
.md5()
695 md5old
.update(open(tmpexonAnnotFileName
, 'r').read())
696 md5new
= hashlib
.md5()
697 md5new
.update(open(newexonAnnotFileName
, 'r').read())
698 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
700 outfile
= open(newintronAnnotFileName
, 'w')
701 for chrid
in chrList
:
703 # SPLICE ANNOTATION DATABASE
705 sp1
= splicemsa
[slice]
709 splist1
= [(ix
.splice_id
, ix
) for ix
in sp1
.keys()]
711 for ixx
, splice
in splist1
:
713 tmp
= splice
.sequence
714 tmpsplice
= splices
[splice
.splice_id
]
715 tmpslice
= tmpsplice
.sequence
# FOR REAL EXON COORDINATE
716 wlist1
= 'INTRON', chrid
, tmpsplice
.splice_id
, tmpsplice
.gene_id
, tmpslice
.start
, tmpslice
.stop
718 out1
= conservedmsa
[tmp
]
722 elementlist
= [(ix
.ucsc_id
, ix
) for ix
in out1
.keys()]
724 for iyy
, element
in elementlist
:
725 if element
.stop
- element
.start
< 100: continue
726 score
= int(string
.split(element
.gene_id
, '=')[1])
727 if score
< 100: continue
728 tmp2
= element
.sequence
729 tmpelement
= mostconserved
[element
.ucsc_id
]
730 tmpslice2
= tmpelement
.sequence
# FOR REAL ELEMENT COORDINATE
731 wlist2
= wlist1
+ (tmpelement
.ucsc_id
, tmpelement
.gene_id
, tmpslice2
.start
, tmpslice2
.stop
)
732 slicestart
, sliceend
= max(tmp
.start
, tmp2
.start
), min(tmp
.stop
, tmp2
.stop
)
733 if slicestart
< 0 or sliceend
< 0: sys
.exit('wrong query')
734 tmp1
= msa
.seqDict
['hg18.' + chrid
][slicestart
:sliceend
]
735 edges
= msa
[tmp1
].edges()
736 for src
, dest
, e
in edges
:
737 if src
.stop
- src
.start
< 100: continue
738 palign
, pident
= e
.pAligned(), e
.pIdentity()
739 if palign
< 0.8 or pident
< 0.8: continue
740 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
741 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
742 (~msa
.seqDict
)[dest
], \
743 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
744 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
746 for saveline
in saveList
:
747 outfile
.write(saveline
)
748 # SNP IN SPLICE SITES
758 gtlist
= gtout
.keys()
759 aglist
= agout
.keys()
761 tmpsnp
= snp
.sequence
762 annsnp
= snp126
[snp
.snp_id
]
763 wlist2
= ('SNP5', chrid
, tmpsplice
.gene_id
, gt
.start
, gt
.stop
, str(gt
)) \
764 + (annsnp
.snp_id
, tmpsnp
.start
, tmpsnp
.stop
, \
765 str(tmpsnp
), annsnp
.gene_id
, annsnp
.ref_NCBI
, annsnp
.ref_UCSC
, \
766 annsnp
.observed
, annsnp
.molType
, \
767 annsnp
.myClass
, annsnp
.myValid
)
768 tmp1
= msa
.seqDict
['hg18.' + chrid
][abs(gt
.start
):abs(gt
.stop
)]
769 edges
= msa
[tmp1
].edges()
770 for src
, dest
, e
in edges
:
771 if src
.stop
- src
.start
!= 2 or dest
.stop
- dest
.start
!= 2: continue
772 palign
, pident
= e
.pAligned(), e
.pIdentity()
773 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
774 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
775 (~msa
.seqDict
)[dest
], \
776 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
777 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
779 tmpsnp
= snp
.sequence
780 annsnp
= snp126
[snp
.snp_id
]
781 wlist2
= ('SNP3', chrid
, tmpsplice
.gene_id
, ag
.start
, ag
.stop
, str(ag
)) \
782 + (annsnp
.snp_id
, tmpsnp
.start
, tmpsnp
.stop
, \
783 str(tmpsnp
), annsnp
.gene_id
, annsnp
.ref_NCBI
, annsnp
.ref_UCSC
, \
784 annsnp
.observed
, annsnp
.molType
, \
785 annsnp
.myClass
, annsnp
.myValid
)
786 tmp1
= msa
.seqDict
['hg18.' + chrid
][abs(ag
.start
):abs(ag
.stop
)]
787 edges
= msa
[tmp1
].edges()
788 for src
, dest
, e
in edges
:
789 if src
.stop
- src
.start
!= 2 or dest
.stop
- dest
.start
!= 2: continue
790 palign
, pident
= e
.pAligned(), e
.pIdentity()
791 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
792 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
793 (~msa
.seqDict
)[dest
], \
794 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
795 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
797 for saveline
in saveList
:
798 outfile
.write(saveline
)
800 md5old
= hashlib
.md5()
801 md5old
.update(open(tmpintronAnnotFileName
, 'r').read())
802 md5new
= hashlib
.md5()
803 md5new
.update(open(newintronAnnotFileName
, 'r').read())
804 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
806 outfile
= open(newstopAnnotFileName
, 'w')
807 for chrid
in chrList
:
809 # STOP ANNOTATION DATABASE
815 cdslist1
= [(ix
.cds_id
, ix
) for ix
in cds1
.keys()]
817 for ixx
, cds
in cdslist1
:
820 tmpcds
= cdss
[cds
.cds_id
]
821 tmpslice
= tmpcds
.sequence
# FOR REAL EXON COORDINATE
822 wlist1
= 'STOP', chrid
, tmpcds
.cds_id
, tmpcds
.gene_id
, tmpslice
.start
, tmpslice
.stop
823 if tmpslice
.start
< 0:
824 stopstart
, stopend
= -tmpslice
.stop
, -tmpslice
.start
825 stop
= -hg18
[chrid
][stopstart
:stopstart
+3]
827 stopstart
, stopend
= tmpslice
.start
, tmpslice
.stop
828 stop
= hg18
[chrid
][stopend
-3:stopend
]
829 if str(stop
).upper() not in ('TAA', 'TAG', 'TGA'): continue
835 snplist
= [(ix
.snp_id
, ix
) for ix
in snp1
.keys()]
837 for iyy
, snp
in snplist
:
838 tmpsnp
= snp
.sequence
839 annsnp
= snp126
[snp
.snp_id
]
840 wlist2
= wlist1
+ (str(stop
), stop
.start
, stop
.stop
) \
841 + (annsnp
.snp_id
, tmpsnp
.start
, tmpsnp
.stop
, \
842 str(tmpsnp
), annsnp
.gene_id
, annsnp
.ref_NCBI
, annsnp
.ref_UCSC
, \
843 annsnp
.observed
, annsnp
.molType
, \
844 annsnp
.myClass
, annsnp
.myValid
)
845 if tmpslice
.start
< 0:
846 tmp1
= -msa
.seqDict
['hg18.' + chrid
][stopstart
:stopstart
+3]
848 tmp1
= msa
.seqDict
['hg18.' + chrid
][stopend
-3:stopend
]
849 edges
= msa
[tmp1
].edges()
850 for src
, dest
, e
in edges
:
851 if src
.stop
- src
.start
!= 3 or dest
.stop
- dest
.start
!= 3: continue
852 palign
, pident
= e
.pAligned(), e
.pIdentity()
853 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
854 if str(dest
).upper() not in ('TAA', 'TAG', 'TGA'): nonstr
= 'NONSENSE'
855 else: nonstr
= 'STOP'
856 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
857 (~msa
.seqDict
)[dest
], \
858 str(dest
), dest
.start
, dest
.stop
, palign
, pident
, nonstr
)
859 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
861 for saveline
in saveList
:
862 outfile
.write(saveline
)
864 md5old
= hashlib
.md5()
865 md5old
.update(open(tmpstopAnnotFileName
, 'r').read())
866 md5new
= hashlib
.md5()
867 md5new
.update(open(newstopAnnotFileName
, 'r').read())
868 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
871 if __name__
== '__main__':
872 PygrTestProgram(verbosity
=2)