3 '''Generate a set of sqt files from the specified greylag search result
4 file. The sqt files will be created in the current directory, with names
5 corresponding to the searched ms2 files.
8 from __future__
import with_statement
11 greylag, a collection of programs for MS/MS protein analysis
12 Copyright (C) 2006-2008 Stowers Institute for Medical Research
14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>.
28 Stowers Institute for Medical Research
30 Kansas City, Missouri 64110
35 import cPickle
as pickle
39 from pprint
import pprint
45 # gc kills our performance, so disable it. gc only matters for cycles, which
46 # we (hope we) don't create. See the gc module docs.
47 import gc
; gc
.disable()
51 print >> sys
.stderr
, 'warning:', s
53 sys
.exit('error: ' + s
)
56 greylag
.chase_error
= error
59 def generate_leaves(tree
):
60 """Given a tree made of lists, yield the leaves.
62 >>> list(generate_leaves([(1,2,3)]))
64 >>> list(generate_leaves([[[(1,2,3)]], (4,5)]))
68 if not isinstance(tree
, list):
72 for l
in generate_leaves(b
):
76 # Example legacy headers:
77 # H Database /foo/bar/db.fasta
78 # H PrecursorMasses AVG
79 # H FragmentMasses MONO
80 # H StaticMod C=160.1388
82 # H DiffMod STY#=+80.0
84 def print_legacy_headers(f
, r
):
85 """Output legacy H lines that readers of traditional SQT files may
86 expect. Warns if multiple databases, or multiple or isotopic mass regimes
90 if len(r
['databases']) > 1:
91 warn("multiple databases present")
92 for db
in r
['databases']:
93 print >> f
, "H\tDatabase\t%s" % db
95 precursor_masses
, fragment_masses
= 'MONO', 'MONO'
96 mass_regimes
= r
['parameters']['mass_regimes']
97 if len(mass_regimes
) > 1:
98 warn("multiple mass regimes present--legacy SQT headers will be based"
100 parent_regime
, fragment_regime
= mass_regimes
[0]
101 if parent_regime
[1] or fragment_regime
[1]:
102 warn("isotopic mass regimes present--ignored for legacy SQT headers")
103 assert parent_regime
[0] in ('MONO', 'AVG')
104 assert fragment_regime
[0] in ('MONO', 'AVG')
105 print >> f
, "H\tPrecursorMasses\t%s" % parent_regime
[0]
106 print >> f
, "H\tFragmentMasses\t%s" % fragment_regime
[0]
108 # FIX: merge this with mass calculation code in
109 # greylag.initialize_spectrum_parameters?
110 # FIX: currently assuming MONO only (what would be better?)
111 static
= dict((res
, greylag
.formula_mass(greylag
.RESIDUE_FORMULA
[res
]))
112 for res
in greylag
.RESIDUES
)
113 for m
in r
['parameters']['pervasive_mods']:
114 if isinstance(m
[1], float):
115 static
[m
[3]] += m
[0] * m
[1]
117 static
[m
[3]] += m
[0] * greylag
.formula_mass(m
[1])
118 # all residues get a StaticMod because our figures are more exact than the
121 print >> f
, "H\tStaticMod\t%s=%s" % (res
, static
[res
])
123 diffs
= set(generate_leaves(r
['parameters']['potential_mods']))
127 warn("unmarked potential mod present--omitting")
129 if isinstance(m
[1], float):
132 delta
= m
[0] * greylag
.formula_mass(m
[1])
133 print >> f
, "H\tDiffMod\t%s%s=%+f" % (m
[3], m
[5], delta
)
136 def print_header(f
, r
):
137 """Output H lines."""
138 print >> f
, "H\tSQTGenerator\tgreylag"
139 print >> f
, "H\tSQTGeneratorVersion\t%s" % greylag
.VERSION
141 print_legacy_headers(f
, r
)
143 for pk
, pv
in sorted(r
['parameters'].items()):
144 print >> f
, "H\tParameter\t%s\t%s" % (pk
, pv
)
147 def print_regime_manifest(f
, regime_manifest
):
148 """Output R lines."""
149 for regime_no
, residue
, mass
in regime_manifest
:
150 print >> f
, "R\t%s\t%s\t%s" % (regime_no
, residue
, mass
)
153 def generate_marked_sequence(marks
, peptide_sequence
):
154 """Yield the characters in a marked version of peptide_sequence. If there
155 is both a terminal mark and a mark on a terminal residue, the latter is
158 >>> ''.join(generate_marked_sequence({}, 'ASDF'))
160 >>> ''.join(generate_marked_sequence({ 'N' : '^', 1 : '*' }, 'ASDF'))
162 >>> ''.join(generate_marked_sequence({ 'N' : '^', 0 : '*' }, 'ASDF'))
167 for n
, c
in enumerate(peptide_sequence
):
169 if n
== 0 and 'N' in marks
:
171 elif n
== len(peptide_sequence
)-1 and 'C' in marks
:
174 yield marks
.get(n
, '')
177 def print_spectrum(f
, modification_conjucts
, result
, enhanced
=False):
178 """Print the lines (S/M/L/A*) associated with the given spectrum."""
179 spectrum
, sp_best_matches
= result
181 scan_low
, scan_high
, charge
= spectrum
['name'].rsplit('.', 2)
182 print >> f
, '\t'.join(str(v
) for v
in
183 ["S", scan_low
, scan_high
, charge
, 0, 'honk',
185 round(math
.log(spectrum
['total_ion_current']), 4),
186 0, spectrum
['comparisons']])
188 best_scores
= sorted(list(set(m
['score'] for m
in sp_best_matches
)))
193 rank_map
= dict(zip(best_scores
, range(1,len(best_scores
)+1)))
195 best_score
= best_scores
[0]
196 for match
in sp_best_matches
:
197 score
= match
['score']
199 continue # null entry
200 rank
= rank_map
[score
]
201 score_delta
= (best_score
- score
) / best_score
203 mass_regime_index
= match
.get('mass_regime_index', 0)
204 conjunct_index
= match
.get('conjunct_index', 0)
206 am_lines
= [] # residue mods
208 for mt
in match
.get('mass_trace', []):
209 conjunct_item
= modification_conjucts
[conjunct_index
][2][mt
['conjunct_item_index']]
210 mark
= conjunct_item
[5]
212 assert mt
['position'] not in marks
, "not yet implemented"
213 marks
[mt
['position']] = mark
215 name
= conjunct_item
[4] or ''
216 delta
= conjunct_item
[6][mass_regime_index
][1]
217 am_lines
.append(["AM", mt
['position'], round(delta
, 8), name
])
219 at_lines
= [] # terminal mods
220 N_conjunct_item
= modification_conjucts
[conjunct_index
][0]
222 assert len(N_conjunct_item
) == 1
223 mark
= N_conjunct_item
[0][5]
227 name
= conjunct_item
[4] or ''
228 delta
= conjunct_item
[6][mass_regime_index
][1]
229 at_lines
.append(["AT", 'N', round(delta
, 8), name
])
230 C_conjunct_item
= modification_conjucts
[conjunct_index
][1]
232 assert len(C_conjunct_item
) == 1
233 mark
= C_conjunct_item
[0][5]
237 name
= conjunct_item
[4] or ''
238 delta
= conjunct_item
[6][mass_regime_index
][1]
239 at_lines
.append(["AT", 'C', round(delta
, 8), name
])
241 marked_sequence
= match
['peptide_sequence']
243 marked_sequence
= ''.join(generate_marked_sequence(marks
,
246 # FIX: also need M lines for fixed mods, terminal mods, isotope mods
248 # Note: scores, being log probability, are non-positive, but we
249 # flip the sign in the SQT output
250 print >> f
, '\t'.join(str(v
) for v
in
252 round(match
['predicted_parent_mass'], 5),
253 round(score_delta
, 4), round(-score
, 4), 0,
254 # FIX: 1 of 2 ions found--keep DTASelect happy
256 '.'.join((match
['N_peptide_flank'],
258 match
['C_peptide_flank'])),
262 if mass_regime_index
!= 0:
263 print >> f
, "AR\t%s" % mass_regime_index
264 if match
.get('pca_delta', 0) != 0:
265 print >> f
, "APCA\t%s" % match
['pca_delta']
266 for am_line
in sorted(am_lines
):
267 print >> f
, '\t'.join(str(v
) for v
in am_line
)
268 for at_line
in at_lines
:
269 print >> f
, '\t'.join(str(v
) for v
in at_line
)
271 for sequence_name
, peptide_begin
in match
['loci']:
272 print >> f
, 'L\t%s\t%s' % (sequence_name
, peptide_begin
)
275 def dump_result_file(result_file
):
276 """Pretty-print result file contents."""
280 r
= pickle
.load(result_file
)
284 error("result file truncated or invalid")
287 def generate_results(result_file
):
290 r
= pickle
.load(result_file
)
292 error("result file truncated or invalid")
298 def main(args
=sys
.argv
[1:]):
299 parser
= optparse
.OptionParser(usage
=
300 "usage: %prog [options] <result-file>",
301 description
=__doc__
, version
=greylag
.VERSION
)
302 pa
= parser
.add_option
303 pa("-d", "--output-directory", dest
="output_directory",
304 help="directory where output files are written [default is '.']",
306 pa("-e", "--enhanced-output", action
="store_true", dest
="enhanced_output",
307 help="include extra DTASelect-incompatible information in output")
308 pa("-v", "--verbose", action
="store_true", dest
="verbose",
310 pa("--dump", action
="store_true", dest
="dump",
311 help="just dump the result file to stdout (for debugging)")
312 pa("--copyright", action
="store_true", dest
="copyright",
313 help="print copyright and exit")
314 (options
, args
) = parser
.parse_args(args
=args
)
316 if options
.copyright
:
324 result_file
= open(args
[0], 'rb')
327 dump_result_file(result_file
)
330 header
= pickle
.load(result_file
)
332 spectrum_fns
= header
['spectrum files']
333 assert len(spectrum_fns
) == len(set(spectrum_fns
)) # check uniqueness
334 assert all(os
.path
.dirname(fn
) == '' for fn
in spectrum_fns
)
336 basenames
= [ os
.path
.splitext(spectrum_fn
)[0]
337 for spectrum_fn
in spectrum_fns
]
339 # spectrum file index -> sqt file
340 spectrum_filemap
= dict((n
, open(fn
+'.sqt', 'w'))
341 for n
, fn
in enumerate(basenames
))
343 for sqt_file
in spectrum_filemap
.values():
344 print_header(sqt_file
, header
)
345 print_regime_manifest(sqt_file
, header
['mass regime manifest'])
347 for result
in generate_results(result_file
):
349 print_spectrum(spectrum_filemap
[result
[0]['file_id']],
350 header
['modification conjuncts'], result
,
351 options
.enhanced_output
)
354 if __name__
== '__main__':