2 from testlib
import testutil
, PygrTestProgram
4 import ConfigParser
, sys
, os
, string
5 from pygr
.mapping
import Collection
13 config
= ConfigParser
.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
14 config
.read([ os
.path
.join(os
.path
.expanduser('~'), '.pygrrc'), os
.path
.join(os
.path
.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
15 msaDir
= config
.get('megatests_dm2', 'msaDir')
16 seqDir
= config
.get('megatests_dm2', 'seqDir')
17 smallSampleKey
= config
.get('megatests_dm2', 'smallSampleKey')
18 testInputDB
= config
.get('megatests', 'testInputDB')
19 testInputDir
= config
.get('megatests', 'testInputDir')
20 testOutputBaseDir
= config
.get('megatests', 'testOutputBaseDir')
23 smallSamplePostfix
= '_' + smallSampleKey
25 smallSamplePostfix
= ''
27 ## msaDir CONTAINS PRE-BUILT NLMSA
28 ## seqDir CONTAINS GENOME ASSEMBLIES AND THEIR SEQDB FILES
29 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
30 ## exonAnnotFileName = 'Annotation_ConservedElement_Exons_dm2.txt'
31 ## intronAnnotFileName = 'Annotation_ConservedElement_Introns_dm2.txt'
32 ## stopAnnotFileName = 'Annotation_ConservedElement_Stop_dm2.txt'
33 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
35 # DIRECTIONARY FOR DOC STRING OF SEQDB
37 'anoGam1':'A. gambiae Genome (February 2003)',
38 'apiMel2':'A. mellifera Genome (January 2005)',
39 'dm2':'D. melanogaster Genome (April 2004)',
40 'dp4':'D. pseudoobscura Genome (February 2006)',
41 'droAna3':'D. ananassae Genome (February 2006)',
42 'droEre2':'D. erecta Genome (February 2006)',
43 'droGri2':'D. grimshawi Genome (February 2006)',
44 'droMoj3':'D. mojavensis Genome (February 2006)',
45 'droPer1':'D. persimilis Genome (October 2005)',
46 'droSec1':'D. sechellia Genome (October 2005)',
47 'droSim1':'D. simulans Genome (April 2005)',
48 'droVir3':'D. virilis Genome (February 2006)',
49 'droWil1':'D. willistoni Genome (February 2006)',
50 'droYak2':'D. yakuba Genome (November 2005)',
51 'triCas2':'T. castaneum Genome (September 2005)',
54 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
55 msaSpeciesList
= ['anoGam1', 'apiMel2', 'dm2', 'dp4', 'droAna3', 'droEre2', 'droGri2', 'droMoj3', \
56 'droPer1', 'droSec1', 'droSim1', 'droVir3', 'droWil1', 'droYak2', 'triCas2']
58 class PygrBuildNLMSAMegabase(unittest
.TestCase
):
59 'restrict megatest to an initially empty directory, need large space to perform'
60 def setUp(self
, testDir
= None):
62 tmpList
= [c
for c
in 'PygrBuildNLMSAMegabase']
63 random
.shuffle(tmpList
)
64 testDir
= os
.path
.join(testOutputBaseDir
, 'TEST_' + ''.join(tmpList
)) # FOR TEST, SHOULD BE DELETED
65 if testDir
is None: testDir
= 'TEST_' + ''.join(tmpList
) # NOT SPECIFIED, USE CURRENT DIRECTORY
68 testDir
= os
.path
.realpath(testDir
)
73 tmpFileName
= os
.path
.join(testDir
, 'DELETE_THIS_TEMP_FILE')
74 open(tmpFileName
, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
77 pygr
.Data
.update(self
.path
)
78 from pygr
import seqdb
79 for orgstr
in msaSpeciesList
:
80 genome
= seqdb
.BlastDB(os
.path
.join(seqDir
, orgstr
))
81 genome
.__doc
__ = docStringDict
[orgstr
]
82 pygr
.Data
.addResource('TEST.Seq.Genome.' + orgstr
, genome
)
84 def copyFile(self
, filename
): # COPY A FILE INTO TEST DIRECTORY
85 newname
= os
.path
.join(self
.path
, os
.path
.basename(filename
))
86 open(newname
, 'w').write(open(filename
, 'r').read())
89 'delete the temporary directory and files, restore pygr.Data path'
90 for dirpath
, subdirs
, files
in os
.walk(self
.path
, topdown
= False): # SHOULD BE DELETED BOTTOM-UP FASHION
91 # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
92 # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
93 for filename
in files
:
94 os
.remove(os
.path
.join(dirpath
, filename
))
96 # Restore original pygr.Data path to remedy lack of isolation
97 # between tests from the same run
98 pygr
.Data
.update(None)
101 class Build_Test(PygrBuildNLMSAMegabase
):
102 def test_seqdb(self
):
103 'Check pygr.Data contents'
104 l
= pygr
.Data
.dir('TEST')
105 preList
= ['TEST.Seq.Genome.' + orgstr
for orgstr
in msaSpeciesList
]
107 def test_collectionannot(self
):
108 'Test building an AnnotationDB from file'
109 from pygr
import seqdb
, cnestedlist
, sqlgraph
110 dm2
= pygr
.Data
.getResource('TEST.Seq.Genome.dm2')
111 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
112 exon_slices
= Collection(filename
= os
.path
.join(self
.path
, 'refGene_exonAnnot_dm2.cdb'), \
113 intKeys
= True, mode
= 'cr', writeback
= False)
114 exon_db
= seqdb
.AnnotationDB(exon_slices
, dm2
,
115 sliceAttrDict
= dict(id = 0, exon_id
= 1, orientation
= 2,
116 gene_id
= 3, start
= 4, stop
= 5))
117 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_exonAnnot_dm2'), 'w', \
118 pairwiseMode
= True, bidirectional
= False)
119 for lines
in open(os
.path
.join(testInputDir
, 'refGene_exonAnnot%s_dm2.txt' % smallSamplePostfix
), 'r').xreadlines():
120 row
= [x
for x
in lines
.split('\t')] # CONVERT TO LIST SO MUTABLE
121 row
[1] = int(row
[1]) # CONVERT FROM STRING TO INTEGER
122 exon_slices
[row
[1]] = row
123 exon
= exon_db
[row
[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
124 msa
.addAnnotation(exon
) # SAVE IT TO GENOME MAPPING
125 exon_db
.clear_cache() # not really necessary; cache should autoGC
126 exon_slices
.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
127 msa
.build() # FINALIZE GENOME ALIGNMENT INDEXES
128 exon_db
.__doc
__ = 'Exon Annotation Database for dm2'
129 pygr
.Data
.addResource('TEST.Annotation.dm2.exons', exon_db
)
130 msa
.__doc
__ = 'NLMSA Exon for dm2'
131 pygr
.Data
.addResource('TEST.Annotation.NLMSA.dm2.exons', msa
)
132 exon_schema
= pygr
.Data
.ManyToManyRelation(dm2
, exon_db
, bindAttrs
= ('exon1',))
133 exon_schema
.__doc
__ = 'Exon Schema for dm2'
134 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.dm2.exons', exon_schema
)
135 # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
136 splice_slices
= Collection(filename
= os
.path
.join(self
.path
, 'refGene_spliceAnnot_dm2.cdb'), \
137 intKeys
= True, mode
= 'cr', writeback
= False)
138 splice_db
= seqdb
.AnnotationDB(splice_slices
, dm2
,
139 sliceAttrDict
= dict(id = 0, splice_id
= 1, orientation
= 2,
140 gene_id
= 3, start
= 4, stop
= 5))
141 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_spliceAnnot_dm2'), 'w', \
142 pairwiseMode
= True, bidirectional
= False)
143 for lines
in open(os
.path
.join(testInputDir
, 'refGene_spliceAnnot%s_dm2.txt' % smallSamplePostfix
), 'r').xreadlines():
144 row
= [x
for x
in lines
.split('\t')] # CONVERT TO LIST SO MUTABLE
145 row
[1] = int(row
[1]) # CONVERT FROM STRING TO INTEGER
146 splice_slices
[row
[1]] = row
147 splice
= splice_db
[row
[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
148 msa
.addAnnotation(splice
) # SAVE IT TO GENOME MAPPING
149 splice_db
.clear_cache() # not really necessary; cache should autoGC
150 splice_slices
.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
151 msa
.build() # FINALIZE GENOME ALIGNMENT INDEXES
152 splice_db
.__doc
__ = 'Splice Annotation Database for dm2'
153 pygr
.Data
.addResource('TEST.Annotation.dm2.splices', splice_db
)
154 msa
.__doc
__ = 'NLMSA Splice for dm2'
155 pygr
.Data
.addResource('TEST.Annotation.NLMSA.dm2.splices', msa
)
156 splice_schema
= pygr
.Data
.ManyToManyRelation(dm2
, splice_db
, bindAttrs
= ('splice1',))
157 splice_schema
.__doc
__ = 'Splice Schema for dm2'
158 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.dm2.splices', splice_schema
)
159 # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
160 ucsc_slices
= Collection(filename
= os
.path
.join(self
.path
, 'phastConsElements15way_dm2.cdb'), \
161 intKeys
= True, mode
= 'cr', writeback
= False)
162 ucsc_db
= seqdb
.AnnotationDB(ucsc_slices
, dm2
,
163 sliceAttrDict
= dict(id = 0, ucsc_id
= 1, orientation
= 2,
164 gene_id
= 3, start
= 4, stop
= 5))
165 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'phastConsElements15way_dm2'), 'w', \
166 pairwiseMode
= True, bidirectional
= False)
167 for lines
in open(os
.path
.join(testInputDir
, 'phastConsElements15way%s_dm2.txt' % smallSamplePostfix
), 'r').xreadlines():
168 row
= [x
for x
in lines
.split('\t')] # CONVERT TO LIST SO MUTABLE
169 row
[1] = int(row
[1]) # CONVERT FROM STRING TO INTEGER
170 ucsc_slices
[row
[1]] = row
171 ucsc
= ucsc_db
[row
[1]] # GET THE ANNOTATION OBJECT FOR THIS EXON
172 msa
.addAnnotation(ucsc
) # SAVE IT TO GENOME MAPPING
173 ucsc_db
.clear_cache() # not really necessary; cache should autoGC
174 ucsc_slices
.close() # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
175 msa
.build() # FINALIZE GENOME ALIGNMENT INDEXES
176 ucsc_db
.__doc
__ = 'Most Conserved Elements for dm2'
177 pygr
.Data
.addResource('TEST.Annotation.UCSC.dm2.mostconserved', ucsc_db
)
178 msa
.__doc
__ = 'NLMSA for Most Conserved Elements for dm2'
179 pygr
.Data
.addResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved', msa
)
180 ucsc_schema
= pygr
.Data
.ManyToManyRelation(dm2
, ucsc_db
, bindAttrs
= ('element1',))
181 ucsc_schema
.__doc
__ = 'Schema for UCSC Most Conserved Elements for dm2'
182 pygr
.Data
.addSchema('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved', ucsc_schema
)
184 pygr
.Data
.clear_cache() # force resources to reload when requested
186 # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
187 dm2
= pygr
.Data
.getResource('TEST.Seq.Genome.dm2')
188 exonmsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.dm2.exons')
189 splicemsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.dm2.splices')
190 conservedmsa
= pygr
.Data
.getResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved')
191 exons
= pygr
.Data
.getResource('TEST.Annotation.dm2.exons')
192 splices
= pygr
.Data
.getResource('TEST.Annotation.dm2.splices')
193 mostconserved
= pygr
.Data
.getResource('TEST.Annotation.UCSC.dm2.mostconserved')
195 # OPEN DM2_MULTIZ15WAY NLMSA
196 msa
= cnestedlist
.NLMSA(os
.path
.join(msaDir
, 'dm2_multiz15way'), 'r', trypath
= [seqDir
])
198 exonAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix
)
199 intronAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Introns%s_dm2.txt' % smallSamplePostfix
)
200 newexonAnnotFileName
= os
.path
.join(self
.path
, 'new_Exons_dm2.txt')
201 newintronAnnotFileName
= os
.path
.join(self
.path
, 'new_Introns_dm2.txt')
202 tmpexonAnnotFileName
= self
.copyFile(exonAnnotFileName
)
203 tmpintronAnnotFileName
= self
.copyFile(intronAnnotFileName
)
206 chrList
= [ smallSampleKey
]
208 chrList
= dm2
.seqLenDict
.keys()
211 outfile
= open(newexonAnnotFileName
, 'w')
212 for chrid
in chrList
:
219 exlist1
= [(ix
.exon_id
, ix
) for ix
in ex1
.keys()]
221 for ixx
, exon
in exlist1
:
224 tmpexon
= exons
[exon
.exon_id
]
225 tmpslice
= tmpexon
.sequence
# FOR REAL EXON COORDINATE
226 wlist1
= 'EXON', chrid
, tmpexon
.exon_id
, tmpexon
.gene_id
, tmpslice
.start
, tmpslice
.stop
228 out1
= conservedmsa
[tmp
]
232 elementlist
= [(ix
.ucsc_id
, ix
) for ix
in out1
.keys()]
234 for iyy
, element
in elementlist
:
235 if element
.stop
- element
.start
< 100: continue
236 score
= int(string
.split(element
.gene_id
, '=')[1])
237 if score
< 100: continue
238 tmp2
= element
.sequence
239 tmpelement
= mostconserved
[element
.ucsc_id
]
240 tmpslice2
= tmpelement
.sequence
# FOR REAL ELEMENT COORDINATE
241 wlist2
= wlist1
+ (tmpelement
.ucsc_id
, tmpelement
.gene_id
, tmpslice2
.start
, tmpslice2
.stop
)
242 slicestart
, sliceend
= max(tmp
.start
, tmp2
.start
), min(tmp
.stop
, tmp2
.stop
)
243 tmp1
= msa
.seqDict
['dm2.' + chrid
][slicestart
:sliceend
]
244 edges
= msa
[tmp1
].edges()
245 for src
, dest
, e
in edges
:
246 if src
.stop
- src
.start
< 100: continue
247 palign
, pident
= e
.pAligned(), e
.pIdentity()
248 if palign
< 0.8 or pident
< 0.8: continue
249 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
250 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
251 (~msa
.seqDict
)[dest
], \
252 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
253 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
255 for saveline
in saveList
:
256 outfile
.write(saveline
)
258 md5old
= hashlib
.md5()
259 md5old
.update(open(tmpexonAnnotFileName
, 'r').read())
260 md5new
= hashlib
.md5()
261 md5new
.update(open(newexonAnnotFileName
, 'r').read())
262 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
264 outfile
= open(newintronAnnotFileName
, 'w')
265 for chrid
in chrList
:
268 sp1
= splicemsa
[slice]
272 splist1
= [(ix
.splice_id
, ix
) for ix
in sp1
.keys()]
274 for ixx
, splice
in splist1
:
276 tmp
= splice
.sequence
277 tmpsplice
= splices
[splice
.splice_id
]
278 tmpslice
= tmpsplice
.sequence
# FOR REAL EXON COORDINATE
279 wlist1
= 'INTRON', chrid
, tmpsplice
.splice_id
, tmpsplice
.gene_id
, tmpslice
.start
, tmpslice
.stop
281 out1
= conservedmsa
[tmp
]
285 elementlist
= [(ix
.ucsc_id
, ix
) for ix
in out1
.keys()]
287 for iyy
, element
in elementlist
:
288 if element
.stop
- element
.start
< 100: continue
289 score
= int(string
.split(element
.gene_id
, '=')[1])
290 if score
< 100: continue
291 tmp2
= element
.sequence
292 tmpelement
= mostconserved
[element
.ucsc_id
]
293 tmpslice2
= tmpelement
.sequence
# FOR REAL ELEMENT COORDINATE
294 wlist2
= wlist1
+ (tmpelement
.ucsc_id
, tmpelement
.gene_id
, tmpslice2
.start
, tmpslice2
.stop
)
295 slicestart
, sliceend
= max(tmp
.start
, tmp2
.start
), min(tmp
.stop
, tmp2
.stop
)
296 tmp1
= msa
.seqDict
['dm2.' + chrid
][slicestart
:sliceend
]
297 edges
= msa
[tmp1
].edges()
298 for src
, dest
, e
in edges
:
299 if src
.stop
- src
.start
< 100: continue
300 palign
, pident
= e
.pAligned(), e
.pIdentity()
301 if palign
< 0.8 or pident
< 0.8: continue
302 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
303 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
304 (~msa
.seqDict
)[dest
], \
305 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
306 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
308 for saveline
in saveList
:
309 outfile
.write(saveline
)
311 md5old
= hashlib
.md5()
312 md5old
.update(open(tmpintronAnnotFileName
, 'r').read())
313 md5new
= hashlib
.md5()
314 md5new
.update(open(newintronAnnotFileName
, 'r').read())
315 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
317 def test_mysqlannot(self
):
318 'Test building an AnnotationDB from MySQL'
319 from pygr
import seqdb
, cnestedlist
, sqlgraph
320 dm2
= pygr
.Data
.getResource('TEST.Seq.Genome.dm2')
321 # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
322 exon_slices
= sqlgraph
.SQLTableClustered('%s.pygr_refGene_exonAnnot%s_dm2' % ( testInputDB
, smallSamplePostfix
),
323 clusterKey
= 'chromosome', maxCache
= 0)
324 exon_db
= seqdb
.AnnotationDB(exon_slices
, dm2
, sliceAttrDict
= dict(id = 'chromosome', \
325 gene_id
= 'name', exon_id
= 'exon_id'))
326 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_exonAnnot_SQL_dm2'), 'w', \
327 pairwiseMode
= True, bidirectional
= False)
329 msa
.addAnnotation(exon_db
[id])
330 exon_db
.clear_cache() # not really necessary; cache should autoGC
331 exon_slices
.clear_cache()
333 exon_db
.__doc
__ = 'SQL Exon Annotation Database for dm2'
334 pygr
.Data
.addResource('TEST.Annotation.SQL.dm2.exons', exon_db
)
335 msa
.__doc
__ = 'SQL NLMSA Exon for dm2'
336 pygr
.Data
.addResource('TEST.Annotation.NLMSA.SQL.dm2.exons', msa
)
337 exon_schema
= pygr
.Data
.ManyToManyRelation(dm2
, exon_db
, bindAttrs
= ('exon2',))
338 exon_schema
.__doc
__ = 'SQL Exon Schema for dm2'
339 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.SQL.dm2.exons', exon_schema
)
340 # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
341 splice_slices
= sqlgraph
.SQLTableClustered('%s.pygr_refGene_spliceAnnot%s_dm2' % ( testInputDB
, smallSamplePostfix
),
342 clusterKey
= 'chromosome', maxCache
= 0)
343 splice_db
= seqdb
.AnnotationDB(splice_slices
, dm2
, sliceAttrDict
= dict(id = 'chromosome', \
344 gene_id
= 'name', splice_id
= 'splice_id'))
345 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'refGene_spliceAnnot_SQL_dm2'), 'w', \
346 pairwiseMode
= True, bidirectional
= False)
348 msa
.addAnnotation(splice_db
[id])
349 splice_db
.clear_cache() # not really necessary; cache should autoGC
350 splice_slices
.clear_cache()
352 splice_db
.__doc
__ = 'SQL Splice Annotation Database for dm2'
353 pygr
.Data
.addResource('TEST.Annotation.SQL.dm2.splices', splice_db
)
354 msa
.__doc
__ = 'SQL NLMSA Splice for dm2'
355 pygr
.Data
.addResource('TEST.Annotation.NLMSA.SQL.dm2.splices', msa
)
356 splice_schema
= pygr
.Data
.ManyToManyRelation(dm2
, splice_db
, bindAttrs
= ('splice2',))
357 splice_schema
.__doc
__ = 'SQL Splice Schema for dm2'
358 pygr
.Data
.addSchema('TEST.Annotation.NLMSA.SQL.dm2.splices', splice_schema
)
359 # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC: MYSQL VERSION
360 ucsc_slices
= sqlgraph
.SQLTableClustered('%s.pygr_phastConsElements15way%s_dm2' % ( testInputDB
, smallSamplePostfix
),
361 clusterKey
= 'chromosome', maxCache
= 0)
362 ucsc_db
= seqdb
.AnnotationDB(ucsc_slices
, dm2
, sliceAttrDict
= dict(id = 'chromosome', \
363 gene_id
= 'name', ucsc_id
= 'ucsc_id'))
364 msa
= cnestedlist
.NLMSA(os
.path
.join(self
.path
, 'phastConsElements15way_SQL_dm2'), 'w', \
365 pairwiseMode
= True, bidirectional
= False)
367 msa
.addAnnotation(ucsc_db
[id])
368 ucsc_db
.clear_cache() # not really necessary; cache should autoGC
369 ucsc_slices
.clear_cache()
371 ucsc_db
.__doc
__ = 'SQL Most Conserved Elements for dm2'
372 pygr
.Data
.addResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved', ucsc_db
)
373 msa
.__doc
__ = 'SQL NLMSA for Most Conserved Elements for dm2'
374 pygr
.Data
.addResource('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', msa
)
375 ucsc_schema
= pygr
.Data
.ManyToManyRelation(dm2
, ucsc_db
, bindAttrs
= ('element2',))
376 ucsc_schema
.__doc
__ = 'SQL Schema for UCSC Most Conserved Elements for dm2'
377 pygr
.Data
.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', ucsc_schema
)
379 pygr
.Data
.clear_cache()
381 # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
382 dm2
= pygr
.Data
.getResource('TEST.Seq.Genome.dm2')
383 exonmsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.SQL.dm2.exons')
384 splicemsa
= pygr
.Data
.getResource('TEST.Annotation.NLMSA.SQL.dm2.splices')
385 conservedmsa
= pygr
.Data
.getResource('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved')
386 exons
= pygr
.Data
.getResource('TEST.Annotation.SQL.dm2.exons')
387 splices
= pygr
.Data
.getResource('TEST.Annotation.SQL.dm2.splices')
388 mostconserved
= pygr
.Data
.getResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved')
390 # OPEN DM2_MULTIZ15WAY NLMSA
391 msa
= cnestedlist
.NLMSA(os
.path
.join(msaDir
, 'dm2_multiz15way'), 'r', trypath
= [seqDir
])
393 exonAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix
)
394 intronAnnotFileName
= os
.path
.join(testInputDir
, 'Annotation_ConservedElement_Introns%s_dm2.txt' % smallSamplePostfix
)
395 newexonAnnotFileName
= os
.path
.join(self
.path
, 'new_Exons_dm2.txt')
396 newintronAnnotFileName
= os
.path
.join(self
.path
, 'new_Introns_dm2.txt')
397 tmpexonAnnotFileName
= self
.copyFile(exonAnnotFileName
)
398 tmpintronAnnotFileName
= self
.copyFile(intronAnnotFileName
)
401 chrList
= [ smallSampleKey
]
403 chrList
= dm2
.seqLenDict
.keys()
406 outfile
= open(newexonAnnotFileName
, 'w')
407 for chrid
in chrList
:
414 exlist1
= [(ix
.exon_id
, ix
) for ix
in ex1
.keys()]
416 for ixx
, exon
in exlist1
:
419 tmpexon
= exons
[exon
.exon_id
]
420 tmpslice
= tmpexon
.sequence
# FOR REAL EXON COORDINATE
421 wlist1
= 'EXON', chrid
, tmpexon
.exon_id
, tmpexon
.gene_id
, tmpslice
.start
, tmpslice
.stop
423 out1
= conservedmsa
[tmp
]
427 elementlist
= [(ix
.ucsc_id
, ix
) for ix
in out1
.keys()]
429 for iyy
, element
in elementlist
:
430 if element
.stop
- element
.start
< 100: continue
431 score
= int(string
.split(element
.gene_id
, '=')[1])
432 if score
< 100: continue
433 tmp2
= element
.sequence
434 tmpelement
= mostconserved
[element
.ucsc_id
]
435 tmpslice2
= tmpelement
.sequence
# FOR REAL ELEMENT COORDINATE
436 wlist2
= wlist1
+ (tmpelement
.ucsc_id
, tmpelement
.gene_id
, tmpslice2
.start
, tmpslice2
.stop
)
437 slicestart
, sliceend
= max(tmp
.start
, tmp2
.start
), min(tmp
.stop
, tmp2
.stop
)
438 tmp1
= msa
.seqDict
['dm2.' + chrid
][slicestart
:sliceend
]
439 edges
= msa
[tmp1
].edges()
440 for src
, dest
, e
in edges
:
441 if src
.stop
- src
.start
< 100: continue
442 palign
, pident
= e
.pAligned(), e
.pIdentity()
443 if palign
< 0.8 or pident
< 0.8: continue
444 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
445 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
446 (~msa
.seqDict
)[dest
], \
447 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
448 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
450 for saveline
in saveList
:
451 outfile
.write(saveline
)
453 md5old
= hashlib
.md5()
454 md5old
.update(open(tmpexonAnnotFileName
, 'r').read())
455 md5new
= hashlib
.md5()
456 md5new
.update(open(newexonAnnotFileName
, 'r').read())
457 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
459 outfile
= open(newintronAnnotFileName
, 'w')
460 for chrid
in chrList
:
463 sp1
= splicemsa
[slice]
467 splist1
= [(ix
.splice_id
, ix
) for ix
in sp1
.keys()]
469 for ixx
, splice
in splist1
:
471 tmp
= splice
.sequence
472 tmpsplice
= splices
[splice
.splice_id
]
473 tmpslice
= tmpsplice
.sequence
# FOR REAL EXON COORDINATE
474 wlist1
= 'INTRON', chrid
, tmpsplice
.splice_id
, tmpsplice
.gene_id
, tmpslice
.start
, tmpslice
.stop
476 out1
= conservedmsa
[tmp
]
480 elementlist
= [(ix
.ucsc_id
, ix
) for ix
in out1
.keys()]
482 for iyy
, element
in elementlist
:
483 if element
.stop
- element
.start
< 100: continue
484 score
= int(string
.split(element
.gene_id
, '=')[1])
485 if score
< 100: continue
486 tmp2
= element
.sequence
487 tmpelement
= mostconserved
[element
.ucsc_id
]
488 tmpslice2
= tmpelement
.sequence
# FOR REAL ELEMENT COORDINATE
489 wlist2
= wlist1
+ (tmpelement
.ucsc_id
, tmpelement
.gene_id
, tmpslice2
.start
, tmpslice2
.stop
)
490 slicestart
, sliceend
= max(tmp
.start
, tmp2
.start
), min(tmp
.stop
, tmp2
.stop
)
491 tmp1
= msa
.seqDict
['dm2.' + chrid
][slicestart
:sliceend
]
492 edges
= msa
[tmp1
].edges()
493 for src
, dest
, e
in edges
:
494 if src
.stop
- src
.start
< 100: continue
495 palign
, pident
= e
.pAligned(), e
.pIdentity()
496 if palign
< 0.8 or pident
< 0.8: continue
497 palign
, pident
= '%.2f' % palign
, '%.2f' % pident
498 wlist3
= wlist2
+ ((~msa
.seqDict
)[src
], str(src
), src
.start
, src
.stop
, \
499 (~msa
.seqDict
)[dest
], \
500 str(dest
), dest
.start
, dest
.stop
, palign
, pident
)
501 saveList
.append('\t'.join(map(str, wlist3
)) + '\n')
503 for saveline
in saveList
:
504 outfile
.write(saveline
)
506 md5old
= hashlib
.md5()
507 md5old
.update(open(tmpintronAnnotFileName
, 'r').read())
508 md5new
= hashlib
.md5()
509 md5new
.update(open(newintronAnnotFileName
, 'r').read())
510 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
513 if __name__
== '__main__':
514 PygrTestProgram(verbosity
=2)