2 from testlib
import testutil
, PygrTestProgram
4 import ConfigParser
, sys
, os
, string
, glob
12 config
= ConfigParser
.ConfigParser({'testOutputBaseDir' : '.', 'smallSampleKey': ''})
13 config
.read([ os
.path
.join(os
.path
.expanduser('~'), '.pygrrc'), os
.path
.join(os
.path
.expanduser('~'), 'pygr.cfg'), '.pygrrc', 'pygr.cfg' ])
14 mafDir
= config
.get('megatests_hg18', 'mafDir')
15 seqDir
= config
.get('megatests_hg18', 'seqDir')
16 smallSampleKey
= config
.get('megatests_hg18', 'smallSampleKey')
17 testInputDir
= config
.get('megatests', 'testInputDir')
18 testOutputBaseDir
= config
.get('megatests', 'testOutputBaseDir')
21 smallSamplePostfix
= '_' + smallSampleKey
23 smallSamplePostfix
= ''
25 ## mafDir CONTAINS FOLLOWING DM2 MULTIZ15WAY MAF ALIGNMENTS
26 ## seqDir CONTAINS FOLLOWING 15 GENOME ASSEMBLIES AND THEIR SEQDB FILES
27 ## TEST INPUT/OUPTUT FOR COMPARISON, THESE FILES SHOULD BE IN THIS DIRECTORY
28 ## outfileName = 'splicesite_hg18.txt' # CHR4H TESTING
29 ## outputName = 'splicesite_hg18_multiz28way.txt' # CHR4H TESTING
30 ## testDir = os.path.join(testOutputBaseDir, 'TEST_' + ''.join(tmpList)) SHOULD BE DELETED IF YOU WANT TO RUN IN '.'
32 # DIRECTIONARY FOR DOC STRING OF SEQDB
34 'anoCar1':' Lizard Genome (January 2007)',
35 'bosTau3':'Cow Genome (August 2006)',
36 'canFam2':'Dog Genome (May 2005)',
37 'cavPor2':'Guinea Pig (October 2005)',
38 'danRer4':'Zebrafish Genome (March 2006)',
39 'dasNov1':'Armadillo Genome (May 2005)',
40 'echTel1':'Tenrec Genome (July 2005)',
41 'eriEur1':'European Hedgehog (Junuary 2006)',
42 'equCab1':'Horse Genome (January 2007)',
43 'felCat3':'Cat Genome (March 2006)',
44 'fr2':'Fugu Genome (October 2004)',
45 'galGal3':'Chicken Genome (May 2006)',
46 'gasAcu1':'Stickleback Genome (February 2006)',
47 'hg18':'Human Genome (May 2006)',
48 'loxAfr1':'Elephant Genome (May 2005)',
49 'mm8':'Mouse Genome (March 2006)',
50 'monDom4':'Opossum Genome (January 2006)',
51 'ornAna1':'Platypus Genome (March 2007)',
52 'oryCun1':'Rabbit Genome (May 2005)',
53 'oryLat1':'Medaka Genome (April 2006)',
54 'otoGar1':'Bushbaby Genome (December 2006)',
55 'panTro2':'Chimpanzee Genome (March 2006)',
56 'rheMac2':'Rhesus Genome (January 2006)',
57 'rn4':'Rat Genome (November 2004)',
58 'sorAra1':'Shrew (Junuary 2006)',
59 'tetNig1':'Tetraodon Genome (February 2004)',
60 'tupBel1':'Tree Shrew (December 2006)',
61 'xenTro2':'X. tropicalis Genome (August 2005)'
64 # GENOME ASSEMBLY LIST FOR DM2 MULTIZ15WAY
65 msaSpeciesList
= ['anoCar1', 'bosTau3', 'canFam2', 'cavPor2', 'danRer4', 'dasNov1', 'echTel1', \
66 'equCab1', 'eriEur1', 'felCat3', 'fr2', 'galGal3', 'gasAcu1', 'hg18', 'loxAfr1', \
67 'mm8', 'monDom4', 'ornAna1', 'oryCun1', 'oryLat1', 'otoGar1', 'panTro2', 'rheMac2', \
68 'rn4', 'sorAra1', 'tetNig1', 'tupBel1', 'xenTro2']
70 class PygrBuildNLMSAMegabase(unittest
.TestCase
):
71 'restrict megatest to an initially empty directory, need large space to perform'
72 def setUp(self
, testDir
= None):
74 tmpList
= [c
for c
in 'PygrBuildNLMSAMegabase']
75 random
.shuffle(tmpList
)
76 testDir
= os
.path
.join(testOutputBaseDir
, 'TEST_' + ''.join(tmpList
)) # FOR TEST, SHOULD BE DELETED
77 if testDir
is None: testDir
= 'TEST_' + ''.join(tmpList
) # NOT SPECIFIED, USE CURRENT DIRECTORY
80 testDir
= os
.path
.realpath(testDir
)
85 tmpFileName
= os
.path
.join(testDir
, 'DELETE_THIS_TEMP_FILE')
86 open(tmpFileName
, 'w').write('A'*1024*1024) # WRITE 1MB FILE FOR TESTING
89 pygr
.Data
.update(self
.path
)
90 from pygr
import seqdb
91 for orgstr
in msaSpeciesList
:
92 genome
= seqdb
.BlastDB(os
.path
.join(seqDir
, orgstr
))
93 genome
.__doc
__ = docStringDict
[orgstr
]
94 pygr
.Data
.addResource('TEST.Seq.Genome.' + orgstr
, genome
)
96 def copyFile(self
, filename
): # COPY A FILE INTO TEST DIRECTORY
97 newname
= os
.path
.join(self
.path
, os
.path
.basename(filename
))
98 open(newname
, 'w').write(open(filename
, 'r').read())
101 'delete the temporary directory and files, restore pygr.Data path'
102 for dirpath
, subdirs
, files
in os
.walk(self
.path
, topdown
= False): # SHOULD BE DELETED BOTTOM-UP FASHION
103 # THIS PART MAY NOT WORK IN NFS MOUNTED DIRECTORY DUE TO .nfsXXXXXXXXX CREATION
104 # IN NFS MOUNTED DIRECTORY, IT CANNOT BE DELETED UNTIL CLOSING PYGRDATA
105 for filename
in files
:
106 os
.remove(os
.path
.join(dirpath
, filename
))
108 # Restore original pygr.Data path to remedy lack of isolation
109 # between tests from the same run
110 pygr
.Data
.update(None)
113 class Build_Test(PygrBuildNLMSAMegabase
):
114 def test_seqdb(self
):
115 'Check pygr.Data contents'
116 l
= pygr
.Data
.dir('TEST')
117 preList
= ['TEST.Seq.Genome.' + orgstr
for orgstr
in msaSpeciesList
]
119 def test_build(self
):
120 'Test building an NLMSA and querying results'
121 from pygr
import seqdb
, cnestedlist
123 for orgstr
in msaSpeciesList
:
124 genomedict
[orgstr
] = pygr
.Data
.getResource('TEST.Seq.Genome.' + orgstr
)
125 uniondict
= seqdb
.PrefixUnionDict(genomedict
)
127 maflist
= ( os
.path
.join(mafDir
, smallSampleKey
+ '.maf'), )
129 maflist
= glob
.glob(os
.path
.join(mafDir
, '*.maf'))
131 msaname
= os
.path
.join(self
.path
, 'hg18_multiz28way')
132 msa1
= cnestedlist
.NLMSA(msaname
, 'w', uniondict
, maflist
, maxlen
= 536870912, maxint
= 22369620) # 500MB VERSION
133 msa1
.__doc__
= 'TEST NLMSA for hg18 multiz28way'
134 pygr
.Data
.addResource('TEST.MSA.UCSC.hg18_multiz28way', msa1
)
136 msa
= pygr
.Data
.getResource('TEST.MSA.UCSC.hg18_multiz28way')
137 outfileName
= os
.path
.join(testInputDir
, 'splicesite_hg18%s.txt' % smallSamplePostfix
)
138 outputName
= os
.path
.join(testInputDir
, 'splicesite_hg18%s_multiz28way.txt' % smallSamplePostfix
)
139 newOutputName
= os
.path
.join(self
.path
, 'splicesite_new1.txt')
140 tmpInputName
= self
.copyFile(outfileName
)
141 tmpOutputName
= self
.copyFile(outputName
)
142 outfile
= open(newOutputName
, 'w')
143 for lines
in open(tmpInputName
, 'r').xreadlines():
144 chrid
, intstart
, intend
, nobs
= string
.split(lines
.strip(), '\t')
145 intstart
, intend
, nobs
= int(intstart
), int(intend
), int(nobs
)
146 site1
= msa
.seqDict
['hg18' + '.' + chrid
][intstart
:intstart
+2]
147 site2
= msa
.seqDict
['hg18' + '.' + chrid
][intend
-2:intend
]
148 edges1
= msa
[site1
].edges()
149 edges2
= msa
[site2
].edges()
150 if len(edges1
) == 0: # EMPTY EDGES
151 wlist
= str(site1
), 'hg18', chrid
, intstart
, intstart
+2, '', '', '', '', ''
152 outfile
.write('\t'.join(map(str, wlist
)) + '\n')
153 if len(edges2
) == 0: # EMPTY EDGES
154 wlist
= str(site2
), 'hg18', chrid
, intend
-2, intend
, '', '', '', '', ''
155 outfile
.write('\t'.join(map(str, wlist
)) + '\n')
157 for src
, dest
, e
in edges1
:
158 if len(str(src
)) != 2 or len(str(dest
)) != 2: continue
159 dotindex
= (~msa
.seqDict
)[src
].index('.')
160 srcspecies
, src1
= (~msa
.seqDict
)[src
][:dotindex
], (~msa
.seqDict
)[src
][dotindex
+1:]
161 dotindex
= (~msa
.seqDict
)[dest
].index('.')
162 destspecies
, dest1
= (~msa
.seqDict
)[dest
][:dotindex
], (~msa
.seqDict
)[dest
][dotindex
+1:]
163 wlist
= str(src
), srcspecies
, src1
, src
.start
, src
.stop
, str(dest
), \
164 destspecies
, dest1
, dest
.start
, dest
.stop
165 saveList
.append('\t'.join(map(str, wlist
)) + '\n')
166 for src
, dest
, e
in edges2
:
167 if len(str(src
)) != 2 or len(str(dest
)) != 2: continue
168 dotindex
= (~msa
.seqDict
)[src
].index('.')
169 srcspecies
, src1
= (~msa
.seqDict
)[src
][:dotindex
], (~msa
.seqDict
)[src
][dotindex
+1:]
170 dotindex
= (~msa
.seqDict
)[dest
].index('.')
171 destspecies
, dest1
= (~msa
.seqDict
)[dest
][:dotindex
], (~msa
.seqDict
)[dest
][dotindex
+1:]
172 wlist
= str(src
), srcspecies
, src1
, src
.start
, src
.stop
, str(dest
), \
173 destspecies
, dest1
, dest
.start
, dest
.stop
174 saveList
.append('\t'.join(map(str, wlist
)) + '\n')
175 saveList
.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
176 for saveline
in saveList
:
177 outfile
.write(saveline
)
179 md5old
= hashlib
.md5()
180 md5old
.update(open(newOutputName
, 'r').read())
181 md5new
= hashlib
.md5()
182 md5new
.update(open(tmpOutputName
, 'r').read())
183 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
186 msafilelist
= glob
.glob(msaname
+ '*')
188 cnestedlist
.dump_textfile(msaname
, os
.path
.join(self
.path
, 'hg18_multiz28way.txt'))
189 for filename
in msafilelist
: os
.remove(filename
)
190 runPath
= os
.path
.realpath(os
.curdir
)
192 cnestedlist
.textfile_to_binaries('hg18_multiz28way.txt')
195 msa1
= cnestedlist
.NLMSA(msaname
, 'r')
196 msa1
.__doc__
= 'TEST NLMSA for hg18 multiz28way'
197 pygr
.Data
.addResource('TEST.MSA.UCSC.hg18_multiz28way', msa1
)
199 msa
= pygr
.Data
.getResource('TEST.MSA.UCSC.hg18_multiz28way')
200 newOutputName
= os
.path
.join(self
.path
, 'splicesite_new2.txt')
201 tmpInputName
= self
.copyFile(outfileName
)
202 tmpOutputName
= self
.copyFile(outputName
)
203 outfile
= open(newOutputName
, 'w')
204 for lines
in open(tmpInputName
, 'r').xreadlines():
205 chrid
, intstart
, intend
, nobs
= string
.split(lines
.strip(), '\t')
206 intstart
, intend
, nobs
= int(intstart
), int(intend
), int(nobs
)
207 site1
= msa
.seqDict
['hg18' + '.' + chrid
][intstart
:intstart
+2]
208 site2
= msa
.seqDict
['hg18' + '.' + chrid
][intend
-2:intend
]
209 edges1
= msa
[site1
].edges()
210 edges2
= msa
[site2
].edges()
211 if len(edges1
) == 0: # EMPTY EDGES
212 wlist
= str(site1
), 'hg18', chrid
, intstart
, intstart
+2, '', '', '', '', ''
213 outfile
.write('\t'.join(map(str, wlist
)) + '\n')
214 if len(edges2
) == 0: # EMPTY EDGES
215 wlist
= str(site2
), 'hg18', chrid
, intend
-2, intend
, '', '', '', '', ''
216 outfile
.write('\t'.join(map(str, wlist
)) + '\n')
218 for src
, dest
, e
in edges1
:
219 if len(str(src
)) != 2 or len(str(dest
)) != 2: continue
220 dotindex
= (~msa
.seqDict
)[src
].index('.')
221 srcspecies
, src1
= (~msa
.seqDict
)[src
][:dotindex
], (~msa
.seqDict
)[src
][dotindex
+1:]
222 dotindex
= (~msa
.seqDict
)[dest
].index('.')
223 destspecies
, dest1
= (~msa
.seqDict
)[dest
][:dotindex
], (~msa
.seqDict
)[dest
][dotindex
+1:]
224 wlist
= str(src
), srcspecies
, src1
, src
.start
, src
.stop
, str(dest
), \
225 destspecies
, dest1
, dest
.start
, dest
.stop
226 saveList
.append('\t'.join(map(str, wlist
)) + '\n')
227 for src
, dest
, e
in edges2
:
228 if len(str(src
)) != 2 or len(str(dest
)) != 2: continue
229 dotindex
= (~msa
.seqDict
)[src
].index('.')
230 srcspecies
, src1
= (~msa
.seqDict
)[src
][:dotindex
], (~msa
.seqDict
)[src
][dotindex
+1:]
231 dotindex
= (~msa
.seqDict
)[dest
].index('.')
232 destspecies
, dest1
= (~msa
.seqDict
)[dest
][:dotindex
], (~msa
.seqDict
)[dest
][dotindex
+1:]
233 wlist
= str(src
), srcspecies
, src1
, src
.start
, src
.stop
, str(dest
), \
234 destspecies
, dest1
, dest
.start
, dest
.stop
235 saveList
.append('\t'.join(map(str, wlist
)) + '\n')
236 saveList
.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
237 for saveline
in saveList
:
238 outfile
.write(saveline
)
240 md5old
= hashlib
.md5()
241 md5old
.update(open(newOutputName
, 'r').read())
242 md5new
= hashlib
.md5()
243 md5new
.update(open(tmpOutputName
, 'r').read())
244 assert md5old
.digest() == md5new
.digest() # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
247 if __name__
== '__main__':
248 PygrTestProgram(verbosity
=2)