1 # common greylag functions and constants
3 ## greylag, a collection of programs for MS/MS protein analysis
4 ## Copyright (C) 2006-2008 Stowers Institute for Medical Research
6 ## This program is free software: you can redistribute it and/or modify
7 ## it under the terms of the GNU General Public License as published by
8 ## the Free Software Foundation, either version 3 of the License, or
9 ## (at your option) any later version.
11 ## This program is distributed in the hope that it will be useful,
12 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ## GNU General Public License for more details.
16 ## You should have received a copy of the GNU General Public License
17 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
19 ## Contact: Mike Coleman
20 ## Stowers Institute for Medical Research
21 ## 1000 East 50th Street
22 ## Kansas City, Missouri 64110
25 from __future__
import with_statement
28 import logging
; from logging
import debug
, info
, warning
39 # prefix added to locus id's for decoy loci
40 DEFAULT_DECOY_PREFIX
= "SHUFFLED_"
43 # handle to the singleton parameter object shared with the C++ module
44 CP
= cgreylag
.cvar
.parameters_the
47 def set_logging(options
):
48 log_level
= logging
.WARNING
50 log_level
= logging
.ERROR
52 log_level
= logging
.INFO
54 log_level
= logging
.DEBUG
57 logfile
= options
.logfile
58 logging
.basicConfig(level
=log_level
, datefmt
='%b %e %H:%M:%S',
59 format
=('%(asctime)s [%(process)d]'
60 ' %(levelname)s: %(message)s'),
64 # FIX: is this mono or avg? (possible error here is ~0.0007 amu)
65 PROTON_MASS
= 1.007276
66 ELECTRON_MASS
= 0.000549 # ?
68 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
69 MONOISOTOPIC_ATOMIC_MASS
= {
73 'O' : 15.994914622115,
78 # most prevalent only (1 in 1000)
79 ISOTOPIC_ATOMIC_MASS
= { # prevalence (in %)
80 'C13' : 13.003354837810, # 1.078
81 'N15' : 15.00010889849, # 0.3687
82 'O18' : 17.99916049, # 0.20514
83 'S33' : 32.9714585012, # 0.762
84 'S34' : 33.9678668311, # 4.2928
87 AVERAGE_ATOMIC_MASS
= {
96 # The xtandem average residue masses are about 0.002 amu higher than those
97 # calculated directly from the above average atomic masses. None of the
98 # chemists consulted knew of any reason why, aside from lack of precision in
99 # the average atomic mass estimates. This shouldn't matter very much, as
100 # fragmentation calculations should all be monoisotopic, and we can always
101 # widen the parent tolerance window a bit.
104 def formula_mass(formula
, atomic_mass
=MONOISOTOPIC_ATOMIC_MASS
):
105 """Return the mass of formula, using the given mass regime (monoisotopic
108 >>> formula_mass('H2O', { 'H':1, 'O':16 })
110 >>> # monoisotopic mass of glycine
111 >>> str(round(formula_mass('C2H3ON'), 4))
115 parts
= [ p
or '1' for p
in re
.split(r
'([A-Z][a-z]*)', formula
)[1:] ]
116 # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
117 return sum(atomic_mass
[parts
[i
]] * int(parts
[i
+1])
118 for i
in range(0, len(parts
), 2))
120 # FIX: selenocysteine (U), etc
145 RESIDUES
= sorted(RESIDUE_FORMULA
.keys())
146 RESIDUES_W_BRACKETS
= RESIDUES
+ ['[', ']']
149 # [0][1] -> 'H' -> fragment mass of H for regime 0
150 MASS_REGIME_ATOMIC_MASSES
= []
152 def dump_mass_regime_atomic_masses():
153 return MASS_REGIME_ATOMIC_MASSES
156 def mass_regime_atomic_masses(spec
):
157 """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
160 name
, isotopes
= spec
161 assert name
in ['MONO', 'AVG'] and len(isotopes
) <= 1
163 r
= MONOISOTOPIC_ATOMIC_MASS
.copy()
165 r
= AVERAGE_ATOMIC_MASS
.copy()
167 iname
, prevalence
= isotopes
[0]
168 assert iname
== 'N15' and 0 <= prevalence
<= 1
169 # this is a simplification, but additional accuracy pointless?
171 r
['N'] = ISOTOPIC_ATOMIC_MASS
['N15']
173 r
['N'] += (ISOTOPIC_ATOMIC_MASS
['N15'] - r
['N']) * prevalence
177 def initialize_spectrum_parameters(options
, GLP
, mass_regimes
, fixed_mod_map
):
178 """Initialize parameters known to the spectrum module.
179 fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
182 # This function can be called multiple times to reinitialize for a new
183 # search, though probably everything else (previous matches) ought to be
184 # "forgotten" at that point, too.
186 # This is the size of vectors that are indexed by residues (A-Z) or
187 # special characters ('[]').
188 RESIDUE_LIMIT
= max(ord(c
) for c
in 'Z[]') + 1
190 # These are currently monoisotopic. (deuterium pointless?)
191 CP
.proton_mass
= PROTON_MASS
192 CP
.hydrogen_mass
= formula_mass("H")
196 global MASS_REGIME_ATOMIC_MASSES
197 # clear previous state
198 MASS_REGIME_ATOMIC_MASSES
= []
199 CP
.parent_mass_regime
.clear()
200 CP
.fragment_mass_regime
.clear()
202 for rn
, regime_pair
in enumerate(mass_regimes
):
203 assert len(regime_pair
) == 2 # parent and fragment
204 info('mass regime: %s', regime_pair
)
205 MASS_REGIME_ATOMIC_MASSES
.append([])
206 for n
, regime
in enumerate(regime_pair
):
207 atmass
= mass_regime_atomic_masses(regime
)
208 MASS_REGIME_ATOMIC_MASSES
[-1].append(atmass
)
209 creg
= cgreylag
.mass_regime_parameters()
211 creg
.hydroxyl_mass
= formula_mass("OH", atmass
)
212 creg
.water_mass
= formula_mass("H2O", atmass
)
213 creg
.ammonia_mass
= formula_mass("NH3", atmass
)
215 creg
.fixed_residue_mass
.resize(RESIDUE_LIMIT
)
217 for r
in RESIDUES_W_BRACKETS
:
220 m
= (formula_mass(RESIDUE_FORMULA
[r
], atmass
)
221 + GLP
["mass_regime_debug_delta"])
223 regime_manifest
.append((rn
, r
, m
))
224 rmod
= fixed_mod_map
.get(r
)
226 if isinstance(rmod
[1], str):
228 m
+= rmod
[0] * formula_mass(rmod
[1])
230 m
+= rmod
[0] * formula_mass(rmod
[1], atmass
)
232 m
+= rmod
[0] * rmod
[1]
233 creg
.fixed_residue_mass
[ord(r
)] = m
234 # assuming these are monoisotopic (not regime)
235 creg
.fixed_residue_mass
[ord('[')] += formula_mass("H")
236 creg
.fixed_residue_mass
[ord(']')] += formula_mass("OH")
238 CP
.parent_mass_regime
.append(creg
)
240 creg
.fixed_residue_mass
[ord('[')] -= CP
.hydrogen_mass
241 creg
.fixed_residue_mass
[ord(']')] -= creg
.hydroxyl_mass
242 CP
.fragment_mass_regime
.append(creg
)
243 for r
in RESIDUES_W_BRACKETS
:
244 info('fixed mass %s: %s', r
,
245 [ ("%.6f" % CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)],
246 "%.6f" % CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)])
247 for rn
in range(len(mass_regimes
)) ])
249 for rn
in range(len(mass_regimes
)):
250 # check for physically impossible/meaningless masses
251 if CP
.parent_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
252 raise ValueError('bogus parent mass specification for %s' % r
)
253 if CP
.fragment_mass_regime
[rn
].fixed_residue_mass
[ord(r
)] < 1.0:
254 raise ValueError('bogus fragment mass specification for %s' % r
)
256 CP
.parent_mass_tolerance_1
= GLP
["parent_mz_tolerance"]
257 CP
.parent_mass_tolerance_max
= (GLP
["parent_mz_tolerance"]
258 * GLP
["charge_limit"])
260 CP
.fragment_mass_tolerance
= GLP
["fragment_mass_tolerance"]
261 CP
.intensity_class_count
= GLP
["intensity_class_count"]
263 CP
.minimum_peptide_length
= GLP
["min_peptide_length"]
265 # CP.ln_factorial[n] == ln(n!)
266 CP
.ln_factorial
.resize(int(GLP
["max_parent_spectrum_mass"]
267 / GLP
["fragment_mass_tolerance"] + 100), 0.0)
268 for n
in range(2, len(CP
.ln_factorial
)):
269 CP
.ln_factorial
[n
] = CP
.ln_factorial
[n
-1] + math
.log(n
)
272 #CP.estimate_only = bool(options.estimate_only)
273 #CP.show_progress = bool(options.show_progress)
274 CP
.estimate_only
= False
275 CP
.show_progress
= False
277 return regime_manifest
280 def enumerate_conjunction(mod_tree
, limit
, conjuncts
=[]):
282 if 0 < len(conjuncts
) <= limit
:
285 first
, rest
= mod_tree
[0], mod_tree
[1:]
286 if isinstance(first
, list):
287 for x
in enumerate_disjunction(first
, limit
):
288 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ x
):
291 for y
in enumerate_conjunction(rest
, limit
, conjuncts
):
293 for y
in enumerate_conjunction(rest
, limit
, conjuncts
+ [first
]):
296 def enumerate_disjunction(mod_tree
, limit
=sys
.maxint
):
297 """Generates the conjuncts for mod_tree that are no longer than limit.
299 >>> list(enumerate_disjunction([['a'],['b'],['c']]))
300 [[], ['a'], ['b'], ['c']]
301 >>> list(enumerate_disjunction([[1,2,3]]))
302 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
303 >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
304 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
305 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
306 [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
307 >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
311 assert isinstance(mod_tree
, list)
314 for s
in enumerate_conjunction(b
, limit
):
317 def get_mod_conjunct_triples(mod_tree
, limit
, mass_regimes
):
318 """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
319 (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
320 ordered by increasing number of conjuncts, and with duplicates within and
321 across removed. A mass table is also appended to each conjunct.
323 # FIX: is there a more elegant way or place for all this?
325 def rmass(regime_index
, par_frag
, sign
, delta
, is_mono
):
326 global MASS_REGIME_ATOMIC_MASSES
327 if isinstance(delta
, str):
330 return (sign
* formula_mass(delta
,
331 MASS_REGIME_ATOMIC_MASSES
[regime_index
][par_frag
]))
335 sign
, delta
, is_mono
= t
[:3]
336 return t
+ (tuple((rmass(r
, 0, sign
, delta
, is_mono
),
337 rmass(r
, 1, sign
, delta
, is_mono
))
338 for r
in range(len(mass_regimes
))),)
341 Ns
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == '['))
342 Cs
= tuple(frozenset(enmass(x
) for x
in c
if x
[3] == ']'))
343 assert len(Ns
) <= 1 and len(Cs
) <= 1
344 rest
= [ enmass(x
) for x
in c
if x
[3] not in '[]' ]
345 rest
= tuple(sorted(list(frozenset(rest
))))
346 return (Ns
, Cs
, rest
)
348 return sorted(list(frozenset(triple(conjunct
)
350 in enumerate_disjunction(mod_tree
, limit
))),
351 key
=lambda x
: (sum(len(y
) for y
in x
), x
))
354 def cleavage_motif_re(motif
):
355 """Return (regexp, pos), where regexp is a regular expression that will
356 match a cleavage motif, and pos is the position of the cleavage with
357 respect to the match (or None). (The RE actually matches one character,
358 the rest matching as lookahead, so that re.finditer will find all
359 overlapping matches.)
361 >>> cleavage_motif_re('[KR]|{P}')
363 >>> cleavage_motif_re('[X]|[X]')
369 motif_re
= re
.compile(r
'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
370 parts
= [ p
[0] for p
in motif_re
.findall(motif
) ]
371 if ''.join(parts
) != motif
:
372 chase_error('invalid cleavage motif pattern')
376 if cleavage_pos
!= None:
377 chase_error("invalid cleavage motif pattern" " (multiple '|'s)")
383 re_parts
.append('[%s]' % part
[1:-1])
385 re_parts
.append('[^%s]' % part
[1:-1])
387 assert False, "unknown cleavage motif syntax"
389 if len(re_parts
) == 0:
391 elif len(re_parts
) == 1:
392 re_pattern
= re_parts
[0]
394 re_pattern
= re_parts
[0] + '(?=' + ''.join(re_parts
[1:]) + ')'
395 return (re_pattern
, cleavage_pos
)
398 def file_sha1(filename
):
399 """Return the (hex) SHA1 digest of the given file."""
403 return "no checksum--libs missing"
405 with
open(filename
) as f
:
410 def read_fasta_file(f
):
411 """Yield (locusname, defline, sequence) tuples as read from the given
412 FASTA file (uppercasing sequence)."""
414 locusname
, defline
= None, None
421 yield (locusname
, defline
, ''.join(seqs
))
423 locusname_rest
= defline
.split(None, 1)
424 locusname
= locusname_rest
[0] if locusname_rest
else ''
428 chase_error("bad format: line precedes initial defline"
429 " in '%s'", (f
.name
if hasattr(f
, 'name')
430 else 'unknown FASTA file'))
431 seqs
.append(line
.upper())
433 yield (locusname
, defline
, ''.join(seqs
))
436 def read_fasta_files(filenames
):
437 """Yield (locusname, defline, sequence, filename) tuples as read from
438 FASTA files (uppercasing sequence). An error is given if locusname is
439 empty or not unique across all sequences."""
443 for filename
in filenames
:
444 with
open(filename
) as f
:
445 for locusname
, defline
, sequence
in read_fasta_file(f
):
447 chase_error("empty locus name not allowed in '%s'",
449 if locusname
in loci_seen
:
450 chase_error("locus name '%s' is not unique in the search"
451 " database(s) in '%s'", locusname
, filename
)
452 loci_seen
.add(locusname
)
454 yield (locusname
, defline
, sequence
, filename
)