greylag.py

   1 # common greylag functions and constants
   2
   3 ##  greylag, a collection of programs for MS/MS protein analysis
   4 ##  Copyright (C) 2006-2008  Stowers Institute for Medical Research
   5 ##
   6 ##  This program is free software: you can redistribute it and/or modify
   7 ##  it under the terms of the GNU General Public License as published by
   8 ##  the Free Software Foundation, either version 3 of the License, or
   9 ##  (at your option) any later version.
  10 ##
  11 ##  This program is distributed in the hope that it will be useful,
  12 ##  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 ##  GNU General Public License for more details.
  15 ##
  16 ##  You should have received a copy of the GNU General Public License
  17 ##  along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 ##
  19 ##  Contact: Mike Coleman
  20 ##           Stowers Institute for Medical Research
  21 ##           1000 East 50th Street
  22 ##           Kansas City, Missouri  64110
  23 ##           USA
  24
  25 from __future__ import with_statement
  26
  27
  28 import logging; from logging import debug, info, warning
  29 import math
  30 import re
  31 import sys
  32
  33 import cgreylag
  34
  35
  36 VERSION = "0.1.2"
  37
  38
  39 # prefix added to locus id's for decoy loci
  40 DEFAULT_DECOY_PREFIX = "SHUFFLED_"
  41
  42
  43 # handle to the singleton parameter object shared with the C++ module
  44 CP = cgreylag.cvar.parameters_the
  45
  46
  47 def set_logging(options):
  48     log_level = logging.WARNING
  49     if options.quiet:
  50         log_level = logging.ERROR
  51     if options.verbose:
  52         log_level = logging.INFO
  53     if options.debug:
  54         log_level = logging.DEBUG
  55     logfile = None
  56     if options.logfile:
  57         logfile = options.logfile
  58     logging.basicConfig(level=log_level, datefmt='%b %e %H:%M:%S',
  59                         format=('%(asctime)s [%(process)d]'
  60                                 ' %(levelname)s: %(message)s'),
  61                         filename=logfile)
  62
  63
  64 # FIX: is this mono or avg?  (possible error here is ~0.0007 amu)
  65 PROTON_MASS =   1.007276
  66 ELECTRON_MASS = 0.000549                # ?
  67
  68 # Reference values from NIST (http://physics.nist.gov/PhysRefData/)
  69 MONOISOTOPIC_ATOMIC_MASS = {
  70     'H' :  1.00782503214,
  71     'C' : 12.00000000,
  72     'N' : 14.00307400529,
  73     'O' : 15.994914622115,
  74     'P' : 30.9737615120,
  75     'S' : 31.9720706912,
  76     }
  77
  78 # most prevalent only (1 in 1000)
  79 ISOTOPIC_ATOMIC_MASS = {                # prevalence (in %)
  80     'C13' : 13.003354837810,            # 1.078
  81     'N15' : 15.00010889849,             # 0.3687
  82     'O18' : 17.99916049,                # 0.20514
  83     'S33' : 32.9714585012,              # 0.762
  84     'S34' : 33.9678668311,              # 4.2928
  85     }
  86
  87 AVERAGE_ATOMIC_MASS = {
  88     'H' :  1.007947,
  89     'C' : 12.01078,
  90     'N' : 14.00672,
  91     'O' : 15.99943,
  92     'P' : 30.9737612,
  93     'S' : 32.0655,
  94     }
  95
  96 # The xtandem average residue masses are about 0.002 amu higher than those
  97 # calculated directly from the above average atomic masses.  None of the
  98 # chemists consulted knew of any reason why, aside from lack of precision in
  99 # the average atomic mass estimates.  This shouldn't matter very much, as
 100 # fragmentation calculations should all be monoisotopic, and we can always
 101 # widen the parent tolerance window a bit.
 102
 103
 104 def formula_mass(formula, atomic_mass=MONOISOTOPIC_ATOMIC_MASS):
 105     """Return the mass of formula, using the given mass regime (monoisotopic
 106     by default).
 107
 108     >>> formula_mass('H2O', { 'H':1, 'O':16 })
 109     18
 110     >>> # monoisotopic mass of glycine
 111     >>> str(round(formula_mass('C2H3ON'), 4))
 112     '57.0215'
 113
 114     """
 115     parts = [ p or '1' for p in re.split(r'([A-Z][a-z]*)', formula)[1:] ]
 116     # parts for glycine = ['C', '2', 'H', '3', 'O', '1', 'N', '1']
 117     return sum(atomic_mass[parts[i]] * int(parts[i+1])
 118                for i in range(0, len(parts), 2))
 119
 120 # FIX: selenocysteine (U), etc
 121 # residue -> formula
 122 RESIDUE_FORMULA = {
 123     'A' : "C3H5ON",
 124     'C' : "C3H5ONS",
 125     'D' : "C4H5O3N",
 126     'E' : "C5H7O3N",
 127     'F' : "C9H9ON",
 128     'G' : "C2H3ON",
 129     'H' : "C6H7ON3",
 130     'I' : "C6H11ON",
 131     'K' : "C6H12ON2",
 132     'L' : "C6H11ON",
 133     'M' : "C5H9ONS",
 134     'N' : "C4H6O2N2",
 135     'P' : "C5H7ON",
 136     'Q' : "C5H8O2N2",
 137     'R' : "C6H12ON4",
 138     'S' : "C3H5O2N",
 139     'T' : "C4H7O2N",
 140     'V' : "C5H9ON",
 141     'W' : "C11H10ON2",
 142     'Y' : "C9H9O2N",
 143     }
 144
 145 RESIDUES = sorted(RESIDUE_FORMULA.keys())
 146 RESIDUES_W_BRACKETS = RESIDUES + ['[', ']']
 147
 148
 149 # [0][1] -> 'H' -> fragment mass of H for regime 0
 150 MASS_REGIME_ATOMIC_MASSES = []
 151
 152 def dump_mass_regime_atomic_masses():
 153     return MASS_REGIME_ATOMIC_MASSES
 154
 155
 156 def mass_regime_atomic_masses(spec):
 157     """Given a regime spec like ('MONO', [('N15', 0.9)]), return a map of atom
 158     names to masses.
 159     """
 160     name, isotopes = spec
 161     assert name in ['MONO', 'AVG'] and len(isotopes) <= 1
 162     if name == 'MONO':
 163         r = MONOISOTOPIC_ATOMIC_MASS.copy()
 164     else:
 165         r = AVERAGE_ATOMIC_MASS.copy()
 166     if isotopes:
 167         iname, prevalence = isotopes[0]
 168         assert iname == 'N15' and 0 <= prevalence <= 1
 169         # this is a simplification, but additional accuracy pointless?
 170         if name == 'MONO':
 171             r['N'] = ISOTOPIC_ATOMIC_MASS['N15']
 172         else:
 173             r['N'] += (ISOTOPIC_ATOMIC_MASS['N15'] - r['N']) * prevalence
 174     return r
 175
 176
 177 def initialize_spectrum_parameters(options, GLP, mass_regimes, fixed_mod_map):
 178     """Initialize parameters known to the spectrum module.
 179     fixed_mod_map maps, for example, 'M' to (1, 'O', False, 'M', 'oxidation').
 180     """
 181
 182     # This function can be called multiple times to reinitialize for a new
 183     # search, though probably everything else (previous matches) ought to be
 184     # "forgotten" at that point, too.
 185
 186     # This is the size of vectors that are indexed by residues (A-Z) or
 187     # special characters ('[]').
 188     RESIDUE_LIMIT = max(ord(c) for c in 'Z[]') + 1
 189
 190     # These are currently monoisotopic.  (deuterium pointless?)
 191     CP.proton_mass = PROTON_MASS
 192     CP.hydrogen_mass = formula_mass("H")
 193
 194     regime_manifest = []
 195
 196     global MASS_REGIME_ATOMIC_MASSES
 197     # clear previous state
 198     MASS_REGIME_ATOMIC_MASSES = []
 199     CP.parent_mass_regime.clear()
 200     CP.fragment_mass_regime.clear()
 201
 202     for rn, regime_pair in enumerate(mass_regimes):
 203         assert len(regime_pair) == 2    # parent and fragment
 204         info('mass regime: %s', regime_pair)
 205         MASS_REGIME_ATOMIC_MASSES.append([])
 206         for n, regime in enumerate(regime_pair):
 207             atmass = mass_regime_atomic_masses(regime)
 208             MASS_REGIME_ATOMIC_MASSES[-1].append(atmass)
 209             creg = cgreylag.mass_regime_parameters()
 210
 211             creg.hydroxyl_mass = formula_mass("OH", atmass)
 212             creg.water_mass = formula_mass("H2O", atmass)
 213             creg.ammonia_mass = formula_mass("NH3", atmass)
 214
 215             creg.fixed_residue_mass.resize(RESIDUE_LIMIT)
 216
 217             for r in RESIDUES_W_BRACKETS:
 218                 m = 0
 219                 if r in RESIDUES:
 220                     m = (formula_mass(RESIDUE_FORMULA[r], atmass)
 221                          + GLP["mass_regime_debug_delta"])
 222                 if n == 1:
 223                     regime_manifest.append((rn, r, m))
 224                 rmod = fixed_mod_map.get(r)
 225                 if rmod:
 226                     if isinstance(rmod[1], str):
 227                         if rmod[2]:
 228                             m += rmod[0] * formula_mass(rmod[1])
 229                         else:
 230                             m += rmod[0] * formula_mass(rmod[1], atmass)
 231                     else:
 232                         m += rmod[0] * rmod[1]
 233                 creg.fixed_residue_mass[ord(r)] = m
 234             # assuming these are monoisotopic (not regime)
 235             creg.fixed_residue_mass[ord('[')] += formula_mass("H")
 236             creg.fixed_residue_mass[ord(']')] += formula_mass("OH")
 237             if not n:
 238                 CP.parent_mass_regime.append(creg)
 239             else:
 240                 creg.fixed_residue_mass[ord('[')] -= CP.hydrogen_mass
 241                 creg.fixed_residue_mass[ord(']')] -= creg.hydroxyl_mass
 242                 CP.fragment_mass_regime.append(creg)
 243     for r in RESIDUES_W_BRACKETS:
 244         info('fixed mass %s: %s', r,
 245              [ ("%.6f" % CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)],
 246                 "%.6f" % CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)])
 247                for rn in range(len(mass_regimes)) ])
 248     for r in RESIDUES:
 249         for rn in range(len(mass_regimes)):
 250             # check for physically impossible/meaningless masses
 251             if CP.parent_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 252                 raise ValueError('bogus parent mass specification for %s' % r)
 253             if CP.fragment_mass_regime[rn].fixed_residue_mass[ord(r)] < 1.0:
 254                 raise ValueError('bogus fragment mass specification for %s' % r)
 255
 256     CP.parent_mass_tolerance_1 = GLP["parent_mz_tolerance"]
 257     CP.parent_mass_tolerance_max = (GLP["parent_mz_tolerance"]
 258                                     * GLP["charge_limit"])
 259
 260     CP.fragment_mass_tolerance = GLP["fragment_mass_tolerance"]
 261     CP.intensity_class_count = GLP["intensity_class_count"]
 262
 263     CP.minimum_peptide_length = GLP["min_peptide_length"]
 264
 265     # CP.ln_factorial[n] == ln(n!)
 266     CP.ln_factorial.resize(int(GLP["max_parent_spectrum_mass"]
 267                                / GLP["fragment_mass_tolerance"] + 100), 0.0)
 268     for n in range(2, len(CP.ln_factorial)):
 269         CP.ln_factorial[n] = CP.ln_factorial[n-1] + math.log(n)
 270
 271     # FIX or eliminate
 272     #CP.estimate_only = bool(options.estimate_only)
 273     #CP.show_progress = bool(options.show_progress)
 274     CP.estimate_only = False
 275     CP.show_progress = False
 276
 277     return regime_manifest
 278
 279
 280 def enumerate_conjunction(mod_tree, limit, conjuncts=[]):
 281     if not mod_tree:
 282         if 0 < len(conjuncts) <= limit:
 283             yield conjuncts
 284         return
 285     first, rest = mod_tree[0], mod_tree[1:]
 286     if isinstance(first, list):
 287         for x in enumerate_disjunction(first, limit):
 288             for y in enumerate_conjunction(rest, limit, conjuncts + x):
 289                 yield y
 290     else:
 291         for y in enumerate_conjunction(rest, limit, conjuncts):
 292             yield y
 293         for y in enumerate_conjunction(rest, limit, conjuncts + [first]):
 294             yield y
 295
 296 def enumerate_disjunction(mod_tree, limit=sys.maxint):
 297     """Generates the conjuncts for mod_tree that are no longer than limit.
 298
 299     >>> list(enumerate_disjunction([['a'],['b'],['c']]))
 300     [[], ['a'], ['b'], ['c']]
 301     >>> list(enumerate_disjunction([[1,2,3]]))
 302     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3]]
 303     >>> list(enumerate_disjunction([[1,2,3],[4,5]]))
 304     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [1, 2, 3], [5], [4], [4, 5]]
 305     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=2))
 306     [[], [3], [2], [2, 3], [1], [1, 3], [1, 2], [5], [4], [4, 5]]
 307     >>> list(enumerate_disjunction([[1,2,3],[4,5]], limit=0))
 308     [[]]
 309
 310     """
 311     assert isinstance(mod_tree, list)
 312     yield []
 313     for b in mod_tree:
 314         for s in enumerate_conjunction(b, limit):
 315             yield s
 316
 317 def get_mod_conjunct_triples(mod_tree, limit, mass_regimes):
 318     """Return a triple (N, C, rest), where N (C) is a tuple of at most one N
 319     (C) conjunct, and rest is a tuple of conjuncts in a canonical order,
 320     ordered by increasing number of conjuncts, and with duplicates within and
 321     across removed.  A mass table is also appended to each conjunct.
 322     """
 323     # FIX: is there a more elegant way or place for all this?
 324     def enmass(t):
 325         def rmass(regime_index, par_frag, sign, delta, is_mono):
 326             global MASS_REGIME_ATOMIC_MASSES
 327             if isinstance(delta, str):
 328                 if is_mono:
 329                     regime_index = 0
 330                 return (sign * formula_mass(delta,
 331                            MASS_REGIME_ATOMIC_MASSES[regime_index][par_frag]))
 332             else:
 333                 return sign * delta
 334
 335         sign, delta, is_mono = t[:3]
 336         return t + (tuple((rmass(r, 0, sign, delta, is_mono),
 337                            rmass(r, 1, sign, delta, is_mono))
 338                           for r in range(len(mass_regimes))),)
 339
 340     def triple(c):
 341         Ns = tuple(frozenset(enmass(x) for x in c if x[3] == '['))
 342         Cs = tuple(frozenset(enmass(x) for x in c if x[3] == ']'))
 343         assert len(Ns) <= 1 and len(Cs) <= 1
 344         rest = [ enmass(x) for x in c if x[3] not in '[]' ]
 345         rest = tuple(sorted(list(frozenset(rest))))
 346         return (Ns, Cs, rest)
 347
 348     return sorted(list(frozenset(triple(conjunct)
 349                                  for conjunct
 350                                  in enumerate_disjunction(mod_tree, limit))),
 351                   key=lambda x: (sum(len(y) for y in x), x))
 352
 353
 354 def cleavage_motif_re(motif):
 355     """Return (regexp, pos), where regexp is a regular expression that will
 356     match a cleavage motif, and pos is the position of the cleavage with
 357     respect to the match (or None).  (The RE actually matches one character,
 358     the rest matching as lookahead, so that re.finditer will find all
 359     overlapping matches.)
 360
 361     >>> cleavage_motif_re('[KR]|{P}')
 362     ('[KR](?=[^P])', 1)
 363     >>> cleavage_motif_re('[X]|[X]')
 364     ('.(?=.)', 1)
 365
 366     """
 367     cleavage_pos = None
 368     re_parts = []
 369     motif_re = re.compile(r'(\||(\[(X|[A-WYZ]+)\])|({([A-WYZ]+)}))')
 370     parts = [ p[0] for p in motif_re.findall(motif) ]
 371     if ''.join(parts) != motif:
 372         chase_error('invalid cleavage motif pattern')
 373     i = 0
 374     for part in parts:
 375         if part == '|':
 376             if cleavage_pos != None:
 377                 chase_error("invalid cleavage motif pattern" " (multiple '|'s)")
 378             cleavage_pos = i
 379             continue
 380         if part == '[X]':
 381             re_parts.append('.')
 382         elif part[0] == '[':
 383             re_parts.append('[%s]' % part[1:-1])
 384         elif part[0] == '{':
 385             re_parts.append('[^%s]' % part[1:-1])
 386         else:
 387             assert False, "unknown cleavage motif syntax"
 388         i += 1
 389     if len(re_parts) == 0:
 390         re_pattern = '.'
 391     elif len(re_parts) == 1:
 392         re_pattern = re_parts[0]
 393     else:
 394         re_pattern = re_parts[0] + '(?=' + ''.join(re_parts[1:]) + ')'
 395     return (re_pattern, cleavage_pos)
 396
 397
 398 def file_sha1(filename):
 399     """Return the (hex) SHA1 digest of the given file."""
 400     try:
 401         import hashlib
 402     except:
 403         return "no checksum--libs missing"
 404     h = hashlib.sha1()
 405     with open(filename) as f:
 406         h.update(f.read())
 407     return h.hexdigest()
 408
 409
 410 def read_fasta_file(f):
 411     """Yield (locusname, defline, sequence) tuples as read from the given
 412     FASTA file (uppercasing sequence)."""
 413
 414     locusname, defline = None, None
 415     seqs = []
 416
 417     for line in f:
 418         line = line.strip()
 419         if line[:1] == '>':
 420             if defline != None:
 421                 yield (locusname, defline, ''.join(seqs))
 422             defline = line[1:]
 423             locusname_rest = defline.split(None, 1)
 424             locusname = locusname_rest[0] if locusname_rest else ''
 425             seqs = []
 426         else:
 427             if defline == None:
 428                 chase_error("bad format: line precedes initial defline"
 429                             " in '%s'", (f.name if hasattr(f, 'name')
 430                                          else 'unknown FASTA file'))
 431             seqs.append(line.upper())
 432     if defline != None:
 433         yield (locusname, defline, ''.join(seqs))
 434
 435
 436 def read_fasta_files(filenames):
 437     """Yield (locusname, defline, sequence, filename) tuples as read from
 438     FASTA files (uppercasing sequence).  An error is given if locusname is
 439     empty or not unique across all sequences."""
 440
 441     loci_seen = set()
 442
 443     for filename in filenames:
 444         with open(filename) as f:
 445             for locusname, defline, sequence in read_fasta_file(f):
 446                 if not locusname:
 447                     chase_error("empty locus name not allowed in '%s'",
 448                                 filename)
 449                 if locusname in loci_seen:
 450                     chase_error("locus name '%s' is not unique in the search"
 451                                 " database(s) in '%s'", locusname, filename)
 452                 loci_seen.add(locusname)
 453
 454                 yield (locusname, defline, sequence, filename)