3 * This source code is part of
7 * GROningen MAchine for Chemical Simulations
9 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
10 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
11 * Copyright (c) 2001-2009, The GROMACS development team,
12 * check out http://www.gromacs.org for more information.
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version 2
17 * of the License, or (at your option) any later version.
19 * If you want to redistribute modifications, please consider that
20 * scientific software is very special. Version control is crucial -
21 * bugs must be traceable. We will be happy to consider code for
22 * inclusion in the official distribution, but derived work must not
23 * be called official GROMACS. Details are found in the README & COPYING
24 * files - if they are missing, get the official version at www.gromacs.org.
26 * To help us fund GROMACS development, we humbly ask that you cite
27 * the papers on the package - you can find them in the top README file.
29 * For more info, check our website at http://www.gromacs.org
31 /*! \page selengine Text-based selections
33 * \section selection_basics Basics
35 * Selections are enabled for an analysis program automatically.
36 * By default, dynamic selections are allowed, and the user can freely
37 * choose whether to analyze atoms or centers of mass/geometry of
39 * These defaults, as well as some others, can be changed by specifying
40 * flags for gmx_ana_traj_create().
42 * Two command-line options are always added:
43 * - \p -select can be used to specify the selection on the command line.
44 * - \p -sf can be used to specify a file name from which the selection is
46 * If both options are specified, \p -select takes precedence.
47 * - If neither of the above is present, the user is prompted to type
49 * This is also done if an empty string is passed to \p -select.
50 * An index file can be provided with \p -n, and the user can also select
51 * groups from it instead of writing a full selection.
52 * If no index file is provided, the default groups are generated.
55 * Improve the user interface for interactive selection input.
57 * If \ref ANA_ONLY_ATOMPOS is not specified, an additional command-line
59 * - \p -seltype can be used to specify the default type of positions to
60 * calculate for each selection.
62 * An additional command-line option is added if dynamic selections are
63 * allowed (i.e., if \ref ANA_NO_DYNSEL is not specified).
64 * This options control how position keywords work within selections:
65 * - By default, each atom can be selected separately based on its
66 * coordinates (unless \ref ANA_REQUIRE_WHOLE is specified), but this
67 * can be changed with \p -selrpos to select all the atoms in a
68 * residue/molecule based on the center of mass/geometry of the
71 * The two options that specify the position types can take any of the
73 * - \p atom uses individual atom positions.
74 * - \p res_com and \p res_cog use the centers of mass/geometry of residues.
75 * - \p mol_com and \p mol_cog use the centers of mass/geometry of molecules.
76 * - The way the COM/COG are calculated can be altered with a one of the
77 * following prefixes (e.g., \p part_res_com):
78 * - \p whole_ prefix calcualtes the centers for the whole
79 * residue/molecule, even if only part of it is selected.
80 * - \p part_ prefix calculates the centers for the selected atoms,
81 * but uses always the same atoms for the same residue/molecule.
82 * The used atoms are determined from the the largest group allowed by
84 * - \p dyn_ prefix calculates the centers strictly only for the
87 * - If no prefix is specified, \p -seltype defaults to \p part_ and
88 * \p -selrpos defaults to \p whole_.
89 * The latter is often desirable to select the same molecules in different
90 * tools, while the first is a compromise between speed (\p dyn_ positions
91 * can be slower to evaluate than \p part_) and intuitive behavior.
93 * The analysis program can then access the selected positions for each frame
94 * through a \p gmx_ana_selection_t array that is passed to the frame
95 * analysis function (see gmx_analysisfunc()).
96 * As long as the analysis program is written such that it does not assume
97 * that the number of positions or the atoms in the groups groups remain
98 * constant, any kind of selection expression can be used.
100 * Some analysis programs may require a special structure for the index groups
101 * (e.g., \c g_angle requires the index group to be made of groups of three or
103 * For such programs, it is up to the user to provide a proper selection
104 * expression that always returns such positions.
105 * The analysis program can define \ref ANA_REQUIRE_WHOLE to make the
106 * default behavior appropriate for the most common uses where the groups
107 * should consist of atoms within a single residue/molecule.
110 * Currently, it is not possible to write selection expressions that would
111 * evaluate to index groups where the same atom occurs more than once.
112 * Such index groups may be useful with tools like \c g_angle to calculate
113 * the averages over several angles.
114 * It is possible to implement such a feature, but currently the best solution is to
115 * write/modify the tool such that it can deal with multiple index groups,
116 * each of which then defines angles that don't have overlapping atoms.
119 * \section selection_syntax Selection syntax
121 * A set of selections consists of one or more selections, separated by
122 * semicolons. Each selection defines a set of positions for the analysis.
123 * Each selection can also be preceded by a string that gives a name for
124 * the selection for use in, e.g., graph legends.
125 * If no name is provided, a the string used for the selection is used
127 * It is also possible to use variables to store selection expressions.
128 * A variable is defined with the following syntax:
132 * After this, you can use VARNAME anywhere where EXPR would be valid.
134 * No semicolons should be used when providing the selections interactively;
135 * in the interactive prompt, each selection should appear on a line of its
136 * own. Lines can be continued with \\ if necessary.
137 * Notice that the above only applies to real interactive input,
138 * not if you provide the selections, e.g., from a pipe.
140 * Positions can be defined in several different ways (ATOM_EXPR stands for any
141 * expression that evaluates to a set of atoms):
142 * - <tt>(X, Y, Z)</tt>: a fixed position (X, Y, Z should be real numbers).
143 * - <tt>cog of ATOM_EXPR [pbc]</tt>: center of geometry of ATOM_EXPR.
144 * - <tt>com of EXPR [pbc]</tt>: center of mass of ATOM_EXPR.
145 * - <tt>res_com of ATOM_EXPR</tt>: a set of positions consisting of centers
146 * of mass of the residues in ATOM_EXPR.
147 * <tt>res_com</tt> can be replaced with any of the position types
148 * acceptable for the \p -seltype and \p -selrpos options.
149 * - <tt>ATOM_EXPR</tt>: the default positions provided with \p -seltype are
150 * evaluated for ATOM_EXPR.
152 * If you specify \p pbc with the \p cog or \p com expressions,
153 * the center is calculated iteratively to try to deal with cases where
154 * ATOM_EXPR wraps around periodic boundary conditions.
156 * To select atoms (ATOM_EXPR expressions above), the following keywords are
157 * currently available:
158 * - <tt>atomnr [INT|INT to INT] ... </tt>: selects atoms by number
160 * - <tt>resnr [INT|INT to INT] ... </tt>: selects residues by number
161 * (first residue = 1)
162 * - <tt>name STR ... </tt>: selects atoms by name
163 * - <tt>type STR ... </tt>: selects atoms by type
164 * - <tt>resname STR ... </tt>: selects residues by name
165 * - <tt>insertcode STR ... </tt>: selects residues by insertion code
166 * - <tt>chain STR ... </tt>: selects residues by chain ID
167 * - <tt>mass</tt>: returns the mass of the atom, use, e.g., like
169 * - <tt>charge</tt>: returns the charge of the atom
170 * - <tt>altloc STR ... </tt>: selects atoms by the alternate location
171 * identifier specified in the PDB file
172 * - <tt>occupancy</tt>: returns the occupancy of the atom from the PDB file
173 * - <tt>betafactor</tt>: returns the B-factor of the atom from the PDB file
174 * - <tt>within RADIUS of POS_EXPR</tt>: Selects atoms that are within
175 * RADIUS of any positions defined by POS_EXPR. POS_EXPR can be defined
176 * as in selections above (\p -selrpos is used instead of \p -seltype to
177 * convert atom expressions to positions).
178 * - <tt>insolidangle span POS_EXPR center POS [cutoff ANGLE]</tt>:
179 * Selects atoms that are within ANGLE degrees (default=5) of any position
180 * in POS_EXPR as seen from POS (a position expression that evaluates to
181 * a single position), i.e., atoms in the solid angle spanned by the
182 * positions in POS_EXPR and centered at POS
183 * (see \subpage sm_insolidangle "detailed explanation").
184 * - <tt>same residue as ATOM_EXPR</tt>:
185 * Selects any atoms that are in a same residue with any atom in ATOM_EXPR.
187 * For string-based selection keywords, it is possible to use wildcards
188 * (e.g., <tt>name "C*"</tt>) or regular expressions
189 * (e.g., <tt>resname "R[AB]"</tt>).
190 * Strings that contain non-alphanumeric characters should be enclosed in
191 * double quotes as in the examples.
192 * The match type is automatically guessed from the string: if it contains any
193 * other characters than letters, numbers, '*', or '?', it is interpreted
194 * as a regular expression.
196 * In addition, the following keywords yield numeric values that can
197 * be compared with each other or constant values to select a subset of
198 * the particles (\p resnr and similar keywords can also be used in this way):
199 * - <tt>distance from POS [cutoff REAL]</tt>: calculates the distance from
201 * - <tt>mindistance from POS_EXPR [cutoff REAL]</tt>: calculates the minimum
202 * distance from the positions in POS_EXPR
203 * - <tt>x</tt>, <tt>y</tt>, <tt>z</tt>: returns the respective coordinate
206 * For the distance selection keywords, it is possible to specify a cutoff
207 * to possibly speed up the evaluation: all distances above the specified
208 * cutoff are returned as equal to the cutoff.
210 * For all selection keywords that select by position, the position specified
211 * by \p -selrpos is used. This can be overridden by prepending a reference
212 * position specifier to the keyword. For example,
213 * <tt>res_com dist from POS</tt>
214 * evaluates the residue center of mass distances irrespective of the value of
217 * Arithmetic expressions are not implemented.
219 * It is also possible to combine expressions through boolean operations:
220 * - <tt>not ATOM_EXPR</tt>
221 * - <tt>ATOM_EXPR and ATOM_EXPR</tt>
222 * - <tt>ATOM_EXPR or ATOM_EXPR</tt>
224 * are all valid expressions.
225 * Parenthesis can also be used to alter the evaluation order.
227 * All selections are by default evaluated such that the atom indices are
228 * returned in ascending order. This can be changed by appending
229 * <tt>permute P1 P2 ... Pn</tt> to an expression.
230 * The \c Pi should form a permutation of the numbers 1 to n.
231 * This keyword permutes each n-position block in the selection such that the
232 * i'th position in the block becomes Pi'th.
233 * Note that it is the positions that are permuted, not individual atoms.
234 * A fatal error occurs if the size of the selection is not a multiple of n.
235 * It is only possible to permute the whole selection expression, not any
236 * subexpressions, i.e., the \c permute keyword should appear last in a
240 * \section selection_eval Selection evaluation and optimization
242 * Boolean evaluation proceeds from left to right and is short-circuiting,
243 * i.e., as soon as it is known whether an atom will be selected, the
244 * remaining expressions are not evaluated at all.
245 * This can be used to optimize the selections: you should write the
246 * most restrictive and/or the most inexpensive expressions first in
247 * boolean expressions.
248 * The relative ordering between dynamic and static expressions does not
249 * matter: all static expressions are evaluated only once, before the first
250 * frame, and the result becomes the leftmost expression.
252 * Another point for optimization is in common subexpressions: they are not
253 * automatically recognized, but can be manually optimized by the use of
254 * variables. This can have a big impact on the performance of complex
255 * selections, in particular if you define several index groups like this:
257 rdist = distance from com of resnr 1 to 5;
258 resname RES and rdist < 2;
259 resname RES and rdist < 4;
260 resname RES and rdist < 6;
262 * Without the variable assignment, the distances would be evaluated three
263 * times, although they are exactly the same within each selection.
264 * Anything assigned into a variable becomes a common subexpression that
265 * is evaluated only once during a frame.
266 * Currently, in some cases the use of variables can actually lead to a small
267 * performance loss because of the checks necessary to determine for which
268 * atoms the expression has already been evaluated, but this should not be
272 * \section selection_methods Implementing new keywords
274 * New selection keywords can be easily implemented, either directly into the
275 * library or as part of analysis programs (the latter may be useful for
276 * testing or methods very specific to some analysis).
277 * For both cases, you should first create a \c gmx_ana_selmethod_t structure
278 * and fill it with the necessary information.
279 * Details can be found on a separate page: \ref selmethods.
283 * Implementation of functions in selection.h.
285 /*! \internal \dir src/gmxlib/selection
287 * Source code for selection-related routines.
294 #include <statutil.h>
299 #include <selection.h>
300 #include <selmethod.h>
302 #include "selcollection.h"
307 * \param[out] scp Pointer to a newly allocated empty selection collection.
308 * \param[in] pcc Position calculation data structure to use for selection
309 * position evaluation.
310 * \returns 0 on success.
313 gmx_ana_selcollection_create(gmx_ana_selcollection_t
**scp
,
314 gmx_ana_poscalc_coll_t
*pcc
)
316 gmx_ana_selcollection_t
*sc
;
326 gmx_ana_index_clear(&sc
->gall
);
328 _gmx_sel_symtab_create(&sc
->symtab
);
334 * \param[in,out] sc Selection collection to free.
336 * The pointer \p sc is invalid after the call.
339 gmx_ana_selcollection_free(gmx_ana_selcollection_t
*sc
)
344 _gmx_selelem_free_chain(sc
->root
);
347 for (i
= 0; i
< sc
->nr
; ++i
)
349 gmx_ana_selection_free(sc
->sel
[i
]);
353 gmx_ana_index_deinit(&sc
->gall
);
354 _gmx_selcollection_clear_symtab(sc
);
359 * \param[in,out] sc Selection collection.
362 _gmx_selcollection_clear_symtab(gmx_ana_selcollection_t
*sc
)
366 _gmx_sel_symtab_free(sc
->symtab
);
372 * \param[in,out] sc Selection collection to modify.
373 * \param[in] type Default selection reference position type
374 * (one of the strings acceptable for gmx_ana_poscalc_type_from_enum()).
376 * Should be called before calling gmx_ana_selcollection_requires_top() or
377 * gmx_ana_selcollection_parse_*().
380 gmx_ana_selcollection_set_refpostype(gmx_ana_selcollection_t
*sc
,
387 * \param[in,out] sc Selection collection to modify.
388 * \param[in] type Default selection output position type
389 * (one of the strings acceptable for gmx_ana_poslcalc_type_from_enum()).
390 * \param[in] bMaskOnly If TRUE, the output positions are initialized
391 * using \ref POS_MASKONLY.
393 * If \p type is NULL, the default type is not modified.
394 * Should be called before calling gmx_ana_selcollection_requires_top() or
395 * gmx_ana_selcollection_parse_*().
398 gmx_ana_selcollection_set_outpostype(gmx_ana_selcollection_t
*sc
,
399 const char *type
, bool bMaskOnly
)
405 sc
->bMaskOnly
= bMaskOnly
;
409 * \param[in,out] sc Selection collection to set the topology for.
410 * \param[in] top Topology data.
411 * \param[in] natoms Number of atoms. If <=0, the number of atoms in the
413 * \returns 0 on success, EINVAL if \p top is NULL and \p natoms <= 0.
415 * The topology is also set for the position calculation collection
416 * associated with \p sc.
418 * \p natoms determines the largest atom index that can be selected by the
419 * selection: even if the topology contains more atoms, they will not be
423 gmx_ana_selcollection_set_topology(gmx_ana_selcollection_t
*sc
, t_topology
*top
,
426 gmx_ana_poscalc_coll_set_topology(sc
->pcc
, top
);
429 /* Get the number of atoms from the topology if it is not given */
434 gmx_incons("selections need either the topology or the number of atoms");
437 natoms
= sc
->top
->atoms
.nr
;
439 gmx_ana_index_init_simple(&sc
->gall
, natoms
, NULL
);
444 * \param[in] sc Selection collection to query.
445 * \returns Number of selections in \p sc.
447 * If gmx_ana_selcollection_parse_*() has not been called, returns 0.
449 * \see gmx_ana_selcollection_get_selection()
452 gmx_ana_selcollection_get_count(gmx_ana_selcollection_t
*sc
)
458 * \param[in] sc Selection collection to query.
459 * \param[in] i Number of the selection.
460 * \returns Pointer to the \p i'th selection in \p sc,
461 * or NULL if there is no such selection.
463 * \p i should be between 0 and the value returned by
464 * gmx_ana_selcollection_get_count().
465 * The returned pointer should not be freed.
466 * If gmx_ana_selcollection_compile() has not been called, the returned
467 * selection is not completely initialized (but the returned pointer will be
468 * valid even after compilation, and will point to the initialized selection).
470 * \see gmx_ana_selcollection_get_count()
472 gmx_ana_selection_t
*
473 gmx_ana_selcollection_get_selection(gmx_ana_selcollection_t
*sc
, int i
)
475 if (i
< 0 || i
>= sc
->nr
|| !sc
->sel
)
481 * \param[in] sc Selection collection to query.
482 * \returns TRUE if any selection in \p sc requires topology information,
485 * Before gmx_ana_selcollection_parse_*(), the return value is based just on
486 * the position types set.
487 * After gmx_ana_selcollection_parse_*(), the return value also takes into account the
488 * selection keywords used.
491 gmx_ana_selcollection_requires_top(gmx_ana_selcollection_t
*sc
)
501 rc
= gmx_ana_poscalc_type_from_enum(sc
->rpost
, &type
, &flags
);
502 if (rc
== 0 && type
!= POS_ATOM
)
510 rc
= gmx_ana_poscalc_type_from_enum(sc
->spost
, &type
, &flags
);
511 if (rc
== 0 && type
!= POS_ATOM
)
520 if (_gmx_selelem_requires_top(sel
))
530 * \param[in] fp File handle to receive the output.
531 * \param[in] sc Selection collection to print.
532 * \param[in] bValues If TRUE, the evaluated values of selection elements
533 * are printed as well.
536 gmx_ana_selcollection_print_tree(FILE *fp
, gmx_ana_selcollection_t
*sc
, bool bValues
)
543 _gmx_selelem_print_tree(fp
, sel
, bValues
, 0);
549 * \param[in] sel Selection to free.
551 * After the call, the pointer \p sel is invalid.
554 gmx_ana_selection_free(gmx_ana_selection_t
*sel
)
558 gmx_ana_pos_deinit(&sel
->p
);
559 if (sel
->m
!= sel
->orgm
)
563 if (sel
->q
!= sel
->orgq
)
573 * \param[in] sel Selection whose name is needed.
574 * \returns Pointer to the name of the selection.
576 * The return value should not be freed by the caller.
579 gmx_ana_selection_name(gmx_ana_selection_t
*sel
)
585 * \param[in] sel Selection for which information should be printed.
588 gmx_ana_selection_print_info(gmx_ana_selection_t
*sel
)
590 fprintf(stderr
, "\"%s\" (%d position%s, %d atom%s%s)", sel
->name
,
591 sel
->p
.nr
, sel
->p
.nr
== 1 ? "" : "s",
592 sel
->g
->isize
, sel
->g
->isize
== 1 ? "" : "s",
593 sel
->bDynamic
? ", dynamic" : "");
594 fprintf(stderr
, "\n");
598 * \param[in] sel Selection to initialize.
599 * \param[in] type Type of covered fraction required.
600 * \returns TRUE if the covered fraction can be calculated for the selection,
604 gmx_ana_selection_init_coverfrac(gmx_ana_selection_t
*sel
, e_coverfrac_t type
)
606 sel
->cfractype
= type
;
607 if (type
== CFRAC_NONE
|| !sel
->selelem
)
609 sel
->bCFracDyn
= FALSE
;
611 else if (!_gmx_selelem_can_estimate_cover(sel
->selelem
))
613 sel
->cfractype
= CFRAC_NONE
;
614 sel
->bCFracDyn
= FALSE
;
618 sel
->bCFracDyn
= TRUE
;
620 sel
->cfrac
= sel
->bCFracDyn
? 0.0 : 1.0;
621 sel
->avecfrac
= sel
->cfrac
;
622 return type
== CFRAC_NONE
|| sel
->cfractype
!= CFRAC_NONE
;
626 * \param[in] out Output file.
627 * \param[in] sc Selection collection which should be written.
628 * \param[in] oenv Output options structure.
630 void xvgr_selcollection(FILE *out
, gmx_ana_selcollection_t
*sc
,
631 const output_env_t oenv
)
636 if (get_print_xvgr_codes(oenv
) && sc
&& sc
->selstr
)
638 fprintf(out
, "# Selections:\n");
639 buf
= strdup(sc
->selstr
);
641 while (p
&& p
[0] != 0)
643 nl
= strchr(p
, '\n');
648 fprintf(out
, "# %s\n", p
);