Move computeSlowForces into stepWork
[gromacs.git] / src / gromacs / gmxpreprocess / pdb2gmx.cpp
blobd4b8301b5411abc3bcdde68dbc3ebd6a1ecd382e
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
38 #include "gmxpre.h"
40 #include "pdb2gmx.h"
42 #include <cctype>
43 #include <cstdlib>
44 #include <cstring>
45 #include <ctime>
47 #include <algorithm>
48 #include <memory>
49 #include <string>
50 #include <vector>
52 #include "gromacs/commandline/cmdlineoptionsmodule.h"
53 #include "gromacs/fileio/confio.h"
54 #include "gromacs/fileio/filetypes.h"
55 #include "gromacs/fileio/gmxfio.h"
56 #include "gromacs/fileio/pdbio.h"
57 #include "gromacs/gmxlib/conformation_utilities.h"
58 #include "gromacs/gmxpreprocess/fflibutil.h"
59 #include "gromacs/gmxpreprocess/genhydro.h"
60 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
61 #include "gromacs/gmxpreprocess/grompp_impl.h"
62 #include "gromacs/gmxpreprocess/h_db.h"
63 #include "gromacs/gmxpreprocess/hizzie.h"
64 #include "gromacs/gmxpreprocess/pdb2top.h"
65 #include "gromacs/gmxpreprocess/pgutil.h"
66 #include "gromacs/gmxpreprocess/specbond.h"
67 #include "gromacs/gmxpreprocess/ter_db.h"
68 #include "gromacs/gmxpreprocess/toputil.h"
69 #include "gromacs/gmxpreprocess/xlate.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/options/basicoptions.h"
72 #include "gromacs/options/filenameoption.h"
73 #include "gromacs/options/ioptionscontainer.h"
74 #include "gromacs/topology/atomprop.h"
75 #include "gromacs/topology/block.h"
76 #include "gromacs/topology/index.h"
77 #include "gromacs/topology/residuetypes.h"
78 #include "gromacs/topology/symtab.h"
79 #include "gromacs/topology/topology.h"
80 #include "gromacs/utility/dir_separator.h"
81 #include "gromacs/utility/exceptions.h"
82 #include "gromacs/utility/fatalerror.h"
83 #include "gromacs/utility/filestream.h"
84 #include "gromacs/utility/loggerbuilder.h"
85 #include "gromacs/utility/path.h"
86 #include "gromacs/utility/smalloc.h"
87 #include "gromacs/utility/strdb.h"
88 #include "gromacs/utility/stringutil.h"
90 #include "hackblock.h"
91 #include "resall.h"
93 struct RtpRename
95 RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
96 gmx(newGmx),
97 main(newMain),
98 nter(newNter),
99 cter(newCter),
100 bter(newBter)
103 std::string gmx;
104 std::string main;
105 std::string nter;
106 std::string cter;
107 std::string bter;
110 static const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
112 /* NOTE: This function returns the main building block name,
113 * it does not take terminal renaming into account.
115 auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
116 return gmx::equalCaseInsensitive(name, rename.gmx);
118 return found != rr.end() ? found->main.c_str() : name.c_str();
121 static const char* select_res(int nr,
122 int resnr,
123 const char* name[],
124 const char* expl[],
125 const char* title,
126 gmx::ArrayRef<const RtpRename> rr)
128 printf("Which %s type do you want for residue %d\n", title, resnr + 1);
129 for (int sel = 0; (sel < nr); sel++)
131 printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
133 printf("\nType a number:");
134 fflush(stdout);
136 int userSelection;
137 if (scanf("%d", &userSelection) != 1)
139 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
142 return name[userSelection];
145 static const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
147 enum
149 easp,
150 easpH,
151 easpNR
153 const char* lh[easpNR] = { "ASP", "ASPH" };
154 const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
156 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
159 static const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
161 enum
163 eglu,
164 egluH,
165 egluNR
167 const char* lh[egluNR] = { "GLU", "GLUH" };
168 const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
170 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
173 static const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
175 enum
177 egln,
178 eglnH,
179 eglnNR
181 const char* lh[eglnNR] = { "GLN", "QLN" };
182 const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
184 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
187 static const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
189 enum
191 elys,
192 elysH,
193 elysNR
195 const char* lh[elysNR] = { "LYSN", "LYS" };
196 const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
198 return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
201 static const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
203 enum
205 earg,
206 eargH,
207 eargNR
209 const char* lh[eargNR] = { "ARGN", "ARG" };
210 const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
212 return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
215 static const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
217 const char* expl[ehisNR] = { "H on ND1 only", "H on NE2 only", "H on ND1 and NE2",
218 "Coupled to Heme" };
220 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
223 static void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
225 char line[STRLEN], buf[STRLEN];
227 int ncol = 0;
228 while (get_a_line(fp, line, STRLEN))
230 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
231 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
232 * Note that the buffer length has been increased to 7 to allow this,
233 * so we just need to make sure the strings have zero-length initially.
235 char gmx[STRLEN];
236 char main[STRLEN];
237 char nter[STRLEN];
238 char cter[STRLEN];
239 char bter[STRLEN];
240 gmx[0] = '\0';
241 main[0] = '\0';
242 nter[0] = '\0';
243 cter[0] = '\0';
244 bter[0] = '\0';
245 int nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
246 RtpRename newEntry(gmx, main, nter, cter, bter);
247 if (ncol == 0)
249 if (nc != 2 && nc != 5)
251 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d",
252 fname, ncol, 2, 5);
254 ncol = nc;
256 else if (nc != ncol)
258 gmx_fatal(FARGS,
259 "A line in residue renaming database '%s' has %d columns, while previous "
260 "lines have %d columns",
261 fname, nc, ncol);
264 if (nc == 2)
266 /* This file does not have special termini names, copy them from main */
267 newEntry.nter = newEntry.main;
268 newEntry.cter = newEntry.main;
269 newEntry.bter = newEntry.main;
271 rtprename->push_back(newEntry);
275 static std::string search_resrename(gmx::ArrayRef<const RtpRename> rr,
276 const char* name,
277 bool bStart,
278 bool bEnd,
279 bool bCompareFFRTPname)
281 auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
282 return ((!bCompareFFRTPname && (name == rename.gmx))
283 || (bCompareFFRTPname && (name == rename.main)));
286 std::string newName;
287 /* If found in the database, rename this residue's rtp building block,
288 * otherwise keep the old name.
290 if (found != rr.end())
292 if (bStart && bEnd)
294 newName = found->bter;
296 else if (bStart)
298 newName = found->nter;
300 else if (bEnd)
302 newName = found->cter;
304 else
306 newName = found->main;
309 if (newName[0] == '-')
311 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name,
312 bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
313 : (bEnd ? " as an ending terminus" : ""));
317 return newName;
320 static void rename_resrtp(t_atoms* pdba,
321 int nterpairs,
322 gmx::ArrayRef<const int> r_start,
323 gmx::ArrayRef<const int> r_end,
324 gmx::ArrayRef<const RtpRename> rr,
325 t_symtab* symtab,
326 bool bVerbose,
327 const gmx::MDLogger& logger)
329 bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
331 for (int r = 0; r < pdba->nres; r++)
333 bool bStart = false;
334 bool bEnd = false;
335 for (int j = 0; j < nterpairs; j++)
337 if (r == r_start[j])
339 bStart = true;
342 for (int j = 0; j < nterpairs; j++)
344 if (r == r_end[j])
346 bEnd = true;
350 std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
352 if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
354 /* This is a terminal residue, but the residue name,
355 * currently stored in .rtp, is not a standard residue name,
356 * but probably a force field specific rtp name.
357 * Check if we need to rename it because it is terminal.
359 newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
362 if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
364 if (bVerbose)
366 GMX_LOG(logger.info)
367 .asParagraph()
368 .appendTextFormatted("Changing rtp entry of residue %d %s to '%s'",
369 pdba->resinfo[r].nr, *pdba->resinfo[r].name,
370 newName.c_str());
372 pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
377 static void pdbres_to_gmxrtp(t_atoms* pdba)
379 int i;
381 for (i = 0; (i < pdba->nres); i++)
383 if (pdba->resinfo[i].rtp == nullptr)
385 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
390 static void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
392 char* resnm;
393 int i;
395 for (i = 0; (i < pdba->nres); i++)
397 resnm = *pdba->resinfo[i].name;
398 if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
399 || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
401 /* Rename the residue name (not the rtp name) */
402 pdba->resinfo[i].name = put_symtab(symtab, newnm);
407 static void rename_bb(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
409 char* bbnm;
410 int i;
412 for (i = 0; (i < pdba->nres); i++)
414 /* We have not set the rtp name yes, use the residue name */
415 bbnm = *pdba->resinfo[i].name;
416 if ((bFullCompare && (gmx::equalCaseInsensitive(bbnm, oldnm)))
417 || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
419 /* Change the rtp builing block name */
420 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
425 static void rename_bbint(t_atoms* pdba,
426 const char* oldnm,
427 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
428 bool bFullCompare,
429 t_symtab* symtab,
430 gmx::ArrayRef<const RtpRename> rr)
432 int i;
433 const char* ptr;
434 char* bbnm;
436 for (i = 0; i < pdba->nres; i++)
438 /* We have not set the rtp name yet, use the residue name */
439 bbnm = *pdba->resinfo[i].name;
440 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
442 ptr = gettp(i, rr);
443 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
448 static void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose, const gmx::MDLogger& logger)
450 int i, ftp;
451 int nzero = 0;
452 int nnotone = 0;
454 ftp = fn2ftp(filename);
455 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
457 GMX_LOG(logger.warning).asParagraph().appendTextFormatted("No occupancies in %s", filename);
459 else
461 for (i = 0; (i < atoms->nr); i++)
463 if (atoms->pdbinfo[i].occup != 1)
465 if (bVerbose)
467 GMX_LOG(logger.warning)
468 .asParagraph()
469 .appendTextFormatted("Occupancy for atom %s%d-%s is %f rather than 1",
470 *atoms->resinfo[atoms->atom[i].resind].name,
471 atoms->resinfo[atoms->atom[i].resind].nr,
472 *atoms->atomname[i], atoms->pdbinfo[i].occup);
474 if (atoms->pdbinfo[i].occup == 0)
476 nzero++;
478 else
480 nnotone++;
484 if (nzero == atoms->nr)
486 GMX_LOG(logger.warning)
487 .asParagraph()
488 .appendTextFormatted(
489 "All occupancy fields zero. This is probably not an X-Ray structure");
491 else if ((nzero > 0) || (nnotone > 0))
493 GMX_LOG(logger.warning)
494 .asParagraph()
495 .appendTextFormatted(
496 "there were %d atoms with zero occupancy and %d atoms with "
497 " occupancy unequal to one (out of %d atoms). Check your pdb "
498 "file.",
499 nzero, nnotone, atoms->nr);
501 else
503 GMX_LOG(logger.warning).asParagraph().appendTextFormatted("All occupancies are one");
508 static void write_posres(const char* fn, t_atoms* pdba, real fc)
510 FILE* fp;
511 int i;
513 fp = gmx_fio_fopen(fn, "w");
514 fprintf(fp,
515 "; In this topology include file, you will find position restraint\n"
516 "; entries for all the heavy atoms in your original pdb file.\n"
517 "; This means that all the protons which were added by pdb2gmx are\n"
518 "; not restrained.\n"
519 "\n"
520 "[ position_restraints ]\n"
521 "; %4s%6s%8s%8s%8s\n",
522 "atom", "type", "fx", "fy", "fz");
523 for (i = 0; (i < pdba->nr); i++)
525 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
527 fprintf(fp, "%6d%6d %g %g %g\n", i + 1, 1, fc, fc, fc);
530 gmx_fio_fclose(fp);
533 static int read_pdball(const char* inf,
534 bool bOutput,
535 const char* outf,
536 char** title,
537 t_atoms* atoms,
538 rvec** x,
539 PbcType* pbcType,
540 matrix box,
541 bool bRemoveH,
542 t_symtab* symtab,
543 ResidueType* rt,
544 const char* watres,
545 AtomProperties* aps,
546 bool bVerbose)
547 /* Read a pdb file. (containing proteins) */
549 int natom, new_natom, i;
551 /* READ IT */
552 printf("Reading %s...\n", inf);
553 readConfAndAtoms(inf, symtab, title, atoms, pbcType, x, nullptr, box);
554 natom = atoms->nr;
555 if (atoms->pdbinfo == nullptr)
557 snew(atoms->pdbinfo, atoms->nr);
559 if (fn2ftp(inf) == efPDB)
561 get_pdb_atomnumber(atoms, aps);
563 if (bRemoveH)
565 new_natom = 0;
566 for (i = 0; i < atoms->nr; i++)
568 if (!is_hydrogen(*atoms->atomname[i]))
570 atoms->atom[new_natom] = atoms->atom[i];
571 atoms->atomname[new_natom] = atoms->atomname[i];
572 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
573 copy_rvec((*x)[i], (*x)[new_natom]);
574 new_natom++;
577 atoms->nr = new_natom;
578 natom = new_natom;
581 printf("Read");
582 if (title[0])
584 printf(" '%s',", *title);
586 printf(" %d atoms\n", natom);
588 /* Rename residues */
589 rename_pdbres(atoms, "HOH", watres, false, symtab);
590 rename_pdbres(atoms, "SOL", watres, false, symtab);
591 rename_pdbres(atoms, "WAT", watres, false, symtab);
593 rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
595 if (natom == 0)
597 return 0;
599 if (bOutput)
601 write_sto_conf(outf, *title, atoms, *x, nullptr, *pbcType, box);
604 return natom;
607 static void process_chain(t_atoms* pdba,
608 gmx::ArrayRef<gmx::RVec> x,
609 bool bTrpU,
610 bool bPheU,
611 bool bTyrU,
612 bool bLysMan,
613 bool bAspMan,
614 bool bGluMan,
615 bool bHisMan,
616 bool bArgMan,
617 bool bGlnMan,
618 real angle,
619 real distance,
620 t_symtab* symtab,
621 gmx::ArrayRef<const RtpRename> rr)
623 /* Rename aromatics, lys, asp and histidine */
624 if (bTyrU)
626 rename_bb(pdba, "TYR", "TYRU", false, symtab);
628 if (bTrpU)
630 rename_bb(pdba, "TRP", "TRPU", false, symtab);
632 if (bPheU)
634 rename_bb(pdba, "PHE", "PHEU", false, symtab);
636 if (bLysMan)
638 rename_bbint(pdba, "LYS", get_lystp, false, symtab, rr);
640 if (bArgMan)
642 rename_bbint(pdba, "ARG", get_argtp, false, symtab, rr);
644 if (bGlnMan)
646 rename_bbint(pdba, "GLN", get_glntp, false, symtab, rr);
648 if (bAspMan)
650 rename_bbint(pdba, "ASP", get_asptp, false, symtab, rr);
652 else
654 rename_bb(pdba, "ASPH", "ASP", false, symtab);
656 if (bGluMan)
658 rename_bbint(pdba, "GLU", get_glutp, false, symtab, rr);
660 else
662 rename_bb(pdba, "GLUH", "GLU", false, symtab);
665 if (!bHisMan)
667 set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
669 else
671 rename_bbint(pdba, "HIS", get_histp, true, symtab, rr);
674 /* Initialize the rtp builing block names with the residue names
675 * for the residues that have not been processed above.
677 pdbres_to_gmxrtp(pdba);
679 /* Now we have all rtp names set.
680 * The rtp names will conform to Gromacs naming,
681 * unless the input pdb file contained one or more force field specific
682 * rtp names as residue names.
686 /* struct for sorting the atoms from the pdb file */
687 typedef struct
689 int resnr; /* residue number */
690 int j; /* database order index */
691 int index; /* original atom number */
692 char anm1; /* second letter of atom name */
693 char altloc; /* alternate location indicator */
694 } t_pdbindex;
696 static bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
698 int d = (a.resnr - b.resnr);
699 if (d == 0)
701 d = (a.j - b.j);
702 if (d == 0)
704 d = (a.anm1 - b.anm1);
705 if (d == 0)
707 d = (a.altloc - b.altloc);
711 return d < 0;
714 static void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
715 int natoms,
716 t_atoms** pdbaptr,
717 t_atoms** newPdbAtoms,
718 std::vector<gmx::RVec>* x,
719 t_blocka* block,
720 char*** gnames)
722 t_atoms* pdba = *pdbaptr;
723 std::vector<gmx::RVec> xnew;
724 t_pdbindex* pdbi;
725 char* atomnm;
727 natoms = pdba->nr;
728 snew(pdbi, natoms);
730 for (int i = 0; i < natoms; i++)
732 atomnm = *pdba->atomname[i];
733 const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
734 auto found =
735 std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
736 [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
737 if (found == localPpResidue->atomname.end())
739 std::string buf = gmx::formatString(
740 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
741 "while sorting atoms.\n%s",
742 atomnm, *pdba->resinfo[pdba->atom[i].resind].name,
743 pdba->resinfo[pdba->atom[i].resind].nr, localPpResidue->resname.c_str(),
744 localPpResidue->natom(),
745 is_hydrogen(atomnm)
746 ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
747 "might have had a different number in the PDB file and was rebuilt\n"
748 "(it might for instance have been H3, and we only expected H1 & "
749 "H2).\n"
750 "Note that hydrogens might have been added to the entry for the "
751 "N-terminus.\n"
752 "Remove this hydrogen or choose a different protonation state to "
753 "solve it.\n"
754 "Option -ignh will ignore all hydrogens in the input."
755 : ".");
756 gmx_fatal(FARGS, "%s", buf.c_str());
758 /* make shadow array to be sorted into indexgroup */
759 pdbi[i].resnr = pdba->atom[i].resind;
760 pdbi[i].j = std::distance(localPpResidue->atomname.begin(), found);
761 pdbi[i].index = i;
762 pdbi[i].anm1 = atomnm[1];
763 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
765 std::sort(pdbi, pdbi + natoms, pdbicomp);
767 /* pdba is sorted in pdbnew using the pdbi index */
768 std::vector<int> a(natoms);
769 srenew(*newPdbAtoms, 1);
770 init_t_atoms((*newPdbAtoms), natoms, true);
771 (*newPdbAtoms)->nr = pdba->nr;
772 (*newPdbAtoms)->nres = pdba->nres;
773 srenew((*newPdbAtoms)->resinfo, pdba->nres);
774 std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
775 for (int i = 0; i < natoms; i++)
777 (*newPdbAtoms)->atom[i] = pdba->atom[pdbi[i].index];
778 (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
779 (*newPdbAtoms)->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
780 xnew.push_back(x->at(pdbi[i].index));
781 /* make indexgroup in block */
782 a[i] = pdbi[i].index;
784 /* clean up */
785 done_atom(pdba);
786 sfree(pdba);
787 /* copy the sorted pdbnew back to pdba */
788 *pdbaptr = *newPdbAtoms;
789 *x = xnew;
790 add_grp(block, gnames, a, "prot_sort");
791 sfree(pdbi);
794 static int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose, const gmx::MDLogger& logger)
796 int i, j, oldnatoms, ndel;
797 t_resinfo* ri;
799 GMX_LOG(logger.info).asParagraph().appendTextFormatted("Checking for duplicate atoms....");
800 oldnatoms = pdba->nr;
801 ndel = 0;
802 /* NOTE: pdba->nr is modified inside the loop */
803 for (i = 1; (i < pdba->nr); i++)
805 /* compare 'i' and 'i-1', throw away 'i' if they are identical
806 this is a 'while' because multiple alternate locations can be present */
807 while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
808 && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
810 ndel++;
811 if (bVerbose)
813 ri = &pdba->resinfo[pdba->atom[i].resind];
814 GMX_LOG(logger.info)
815 .asParagraph()
816 .appendTextFormatted("deleting duplicate atom %4s %s%4d%c",
817 *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
818 if (ri->chainid && (ri->chainid != ' '))
820 printf(" ch %c", ri->chainid);
822 if (pdba->pdbinfo)
824 if (pdba->pdbinfo[i].atomnr)
826 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
828 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
830 printf(" altloc %c", pdba->pdbinfo[i].altloc);
833 printf("\n");
835 pdba->nr--;
836 /* We can not free, since it might be in the symtab */
837 /* sfree(pdba->atomname[i]); */
838 for (j = i; j < pdba->nr; j++)
840 pdba->atom[j] = pdba->atom[j + 1];
841 pdba->atomname[j] = pdba->atomname[j + 1];
842 if (pdba->pdbinfo)
844 pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
846 copy_rvec(x[j + 1], x[j]);
848 srenew(pdba->atom, pdba->nr);
849 /* srenew(pdba->atomname, pdba->nr); */
850 srenew(pdba->pdbinfo, pdba->nr);
853 if (pdba->nr != oldnatoms)
855 GMX_LOG(logger.info)
856 .asParagraph()
857 .appendTextFormatted("Now there are %d atoms. Deleted %d duplicates.", pdba->nr, ndel);
860 return pdba->nr;
863 static void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
865 std::string startResidueString =
866 gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
867 std::string endResidueString =
868 gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
870 // Check whether all residues in chain have the same chain ID.
871 bool allResiduesHaveSameChainID = true;
872 char chainID0 = pdba->resinfo[r0].chainid;
873 char chainID;
874 std::string residueString;
876 for (int i = r0 + 1; i < r1; i++)
878 chainID = pdba->resinfo[i].chainid;
879 if (chainID != chainID0)
881 allResiduesHaveSameChainID = false;
882 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
883 break;
887 if (!allResiduesHaveSameChainID)
889 gmx_fatal(FARGS,
890 "The chain covering the range %s--%s does not have a consistent chain ID. "
891 "The first residue has ID '%c', while residue %s has ID '%c'.",
892 startResidueString.c_str(), endResidueString.c_str(), chainID0,
893 residueString.c_str(), chainID);
896 // At this point all residues have the same ID. If they are also non-blank
897 // we can be a bit more aggressive and require the types match too.
898 if (chainID0 != ' ')
900 bool allResiduesHaveSameType = true;
901 std::string restype;
902 std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
904 for (int i = r0 + 1; i < r1; i++)
906 restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
907 if (!gmx::equalCaseInsensitive(restype, restype0))
909 allResiduesHaveSameType = false;
910 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
911 break;
915 if (!allResiduesHaveSameType)
917 gmx_fatal(FARGS,
918 "The residues in the chain %s--%s do not have a consistent type. "
919 "The first residue has type '%s', while residue %s is of type '%s'. "
920 "Either there is a mistake in your chain, or it includes nonstandard "
921 "residue names that have not yet been added to the residuetypes.dat "
922 "file in the GROMACS library directory. If there are other molecules "
923 "such as ligands, they should not have the same chain ID as the "
924 "adjacent protein chain since it's a separate molecule.",
925 startResidueString.c_str(), endResidueString.c_str(), restype0.c_str(),
926 residueString.c_str(), restype.c_str());
931 static void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt, const gmx::MDLogger& logger)
933 int i;
934 std::optional<std::string> startrestype;
936 *r_start = -1;
937 *r_end = -1;
939 int startWarnings = 0;
940 int endWarnings = 0;
941 int ionNotes = 0;
943 // Check that all residues have the same chain identifier, and if it is
944 // non-blank we also require the residue types to match.
945 checkResidueTypeSanity(pdba, r0, r1, rt);
947 // If we return correctly from checkResidueTypeSanity(), the only
948 // remaining cases where we can have non-matching residue types is if
949 // the chain ID was blank, which could be the case e.g. for a structure
950 // read from a GRO file or other file types without chain information.
951 // In that case we need to be a bit more liberal and detect chains based
952 // on the residue type.
954 // If we get here, the chain ID must be identical for all residues
955 char chainID = pdba->resinfo[r0].chainid;
957 /* Find the starting terminus (typially N or 5') */
958 for (i = r0; i < r1 && *r_start == -1; i++)
960 startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
961 if (!startrestype)
963 continue;
965 if (gmx::equalCaseInsensitive(*startrestype, "Protein")
966 || gmx::equalCaseInsensitive(*startrestype, "DNA")
967 || gmx::equalCaseInsensitive(*startrestype, "RNA"))
969 GMX_LOG(logger.info)
970 .asParagraph()
971 .appendTextFormatted("Identified residue %s%d as a starting terminus.",
972 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
973 *r_start = i;
975 else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
977 if (ionNotes < 5)
979 GMX_LOG(logger.info)
980 .asParagraph()
981 .appendTextFormatted(
982 "Residue %s%d has type 'Ion', assuming it is not linked into a "
983 "chain.",
984 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
986 if (ionNotes == 4)
988 GMX_LOG(logger.info)
989 .asParagraph()
990 .appendTextFormatted("Disabling further notes about ions.");
992 ionNotes++;
994 else
996 // Either no known residue type, or one not needing special handling
997 if (startWarnings < 5)
999 if (chainID == ' ')
1001 GMX_LOG(logger.warning)
1002 .asParagraph()
1003 .appendTextFormatted(
1004 "Starting residue %s%d in chain not identified as "
1005 "Protein/RNA/DNA. "
1006 "This chain lacks identifiers, which makes it impossible to do "
1007 "strict "
1008 "classification of the start/end residues. Here we need to "
1009 "guess this residue "
1010 "should not be part of the chain and instead introduce a "
1011 "break, but that will "
1012 "be catastrophic if they should in fact be linked. Please "
1013 "check your structure, "
1014 "and add %s to residuetypes.dat if this was not correct.",
1015 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
1017 else
1019 GMX_LOG(logger.warning)
1020 .asParagraph()
1021 .appendTextFormatted(
1022 "No residues in chain starting at %s%d identified as "
1023 "Protein/RNA/DNA. "
1024 "This makes it impossible to link them into a molecule, which "
1025 "could either be "
1026 "correct or a catastrophic error. Please check your structure, "
1027 "and add all "
1028 "necessary residue names to residuetypes.dat if this was not "
1029 "correct.",
1030 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1033 if (startWarnings == 4)
1035 GMX_LOG(logger.warning)
1036 .asParagraph()
1037 .appendTextFormatted(
1038 "Disabling further warnings about unidentified residues at start "
1039 "of chain.");
1041 startWarnings++;
1045 if (*r_start >= 0)
1047 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1048 for (int i = *r_start; i < r1; i++)
1050 std::optional<std::string> restype =
1051 rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1052 if (!restype)
1054 continue;
1056 if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1058 *r_end = i;
1060 else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1062 if (ionNotes < 5)
1064 GMX_LOG(logger.info)
1065 .asParagraph()
1066 .appendTextFormatted(
1067 "Residue %s%d has type 'Ion', assuming it is not linked into a "
1068 "chain.",
1069 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1071 if (ionNotes == 4)
1073 GMX_LOG(logger.info)
1074 .asParagraph()
1075 .appendTextFormatted("Disabling further notes about ions.");
1077 ionNotes++;
1079 else
1081 // Either no known residue type, or one not needing special handling.
1082 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1083 // Otherwise the call to checkResidueTypeSanity() will
1084 // have caught the problem.
1085 if (endWarnings < 5)
1087 GMX_LOG(logger.warning)
1088 .asParagraph()
1089 .appendTextFormatted(
1090 "Residue %s%d in chain has different type ('%s') from "
1091 "residue %s%d ('%s'). This chain lacks identifiers, which "
1092 "makes "
1093 "it impossible to do strict classification of the start/end "
1094 "residues. Here we "
1095 "need to guess this residue should not be part of the chain "
1096 "and instead "
1097 "introduce a break, but that will be catastrophic if they "
1098 "should in fact be "
1099 "linked. Please check your structure, and add %s to "
1100 "residuetypes.dat "
1101 "if this was not correct.",
1102 *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1103 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr,
1104 startrestype->c_str(), *pdba->resinfo[i].name);
1106 if (endWarnings == 4)
1108 GMX_LOG(logger.warning)
1109 .asParagraph()
1110 .appendTextFormatted(
1111 "Disabling further warnings about unidentified residues at end "
1112 "of chain.");
1114 endWarnings++;
1119 if (*r_end >= 0)
1121 GMX_LOG(logger.info)
1122 .asParagraph()
1123 .appendTextFormatted("Identified residue %s%d as a ending terminus.",
1124 *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1128 enum class ChainSeparationType : int
1130 IdOrTer,
1131 IdAndTer,
1132 Ter,
1134 Interactive,
1135 Count
1137 static const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNames = {
1138 { "id_or_ter", "id_and_ter", "ter", "id", "interactive" }
1140 static const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNotificationMessages = {
1141 { "Splitting chemical chains based on TER records or chain id changing.\n",
1142 "Splitting chemical chains based on TER records and chain id changing.\n",
1143 "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1144 "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1145 "Splitting chemical chains interactively.\n" }
1148 static void modify_chain_numbers(t_atoms* pdba, ChainSeparationType chainSeparation, const gmx::MDLogger& logger)
1150 int i;
1151 char old_prev_chainid;
1152 char old_this_chainid;
1153 int old_prev_chainnum;
1154 int old_this_chainnum;
1155 t_resinfo* ri;
1156 char select[STRLEN];
1157 int new_chainnum;
1158 int this_atomnum;
1159 int prev_atomnum;
1160 const char* prev_atomname;
1161 const char* this_atomname;
1162 const char* prev_resname;
1163 const char* this_resname;
1164 int prev_resnum;
1165 int this_resnum;
1166 char prev_chainid;
1167 char this_chainid;
1169 /* The default chain enumeration is based on TER records only */
1170 printf("%s", c_chainSeparationTypeNotificationMessages[chainSeparation]);
1172 old_prev_chainid = '?';
1173 old_prev_chainnum = -1;
1174 new_chainnum = -1;
1176 this_atomname = nullptr;
1177 this_atomnum = -1;
1178 this_resname = nullptr;
1179 this_resnum = -1;
1180 this_chainid = '?';
1182 for (i = 0; i < pdba->nres; i++)
1184 ri = &pdba->resinfo[i];
1185 old_this_chainid = ri->chainid;
1186 old_this_chainnum = ri->chainnum;
1188 prev_atomname = this_atomname;
1189 prev_atomnum = this_atomnum;
1190 prev_resname = this_resname;
1191 prev_resnum = this_resnum;
1192 prev_chainid = this_chainid;
1194 this_atomname = *(pdba->atomname[i]);
1195 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1196 this_resname = *ri->name;
1197 this_resnum = ri->nr;
1198 this_chainid = ri->chainid;
1200 switch (chainSeparation)
1202 case ChainSeparationType::IdOrTer:
1203 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1205 new_chainnum++;
1207 break;
1209 case ChainSeparationType::IdAndTer:
1210 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1212 new_chainnum++;
1214 break;
1216 case ChainSeparationType::Id:
1217 if (old_this_chainid != old_prev_chainid)
1219 new_chainnum++;
1221 break;
1223 case ChainSeparationType::Ter:
1224 if (old_this_chainnum != old_prev_chainnum)
1226 new_chainnum++;
1228 break;
1229 case ChainSeparationType::Interactive:
1230 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1232 if (i > 0)
1234 GMX_LOG(logger.info)
1235 .asParagraph()
1236 .appendTextFormatted(
1237 "Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1239 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]",
1240 prev_resname, prev_resnum, prev_chainid, prev_atomnum,
1241 prev_atomname, this_resname, this_resnum, this_chainid,
1242 this_atomnum, this_atomname);
1244 if (nullptr == fgets(select, STRLEN - 1, stdin))
1246 gmx_fatal(FARGS, "Error reading from stdin");
1249 if (i == 0 || select[0] == 'y')
1251 new_chainnum++;
1254 break;
1255 case ChainSeparationType::Count:
1256 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1258 old_prev_chainid = old_this_chainid;
1259 old_prev_chainnum = old_this_chainnum;
1261 ri->chainnum = new_chainnum;
1265 struct t_pdbchain
1267 char chainid = ' ';
1268 char chainnum = ' ';
1269 int start = -1;
1270 int natom = -1;
1271 bool bAllWat = false;
1272 int nterpairs = -1;
1273 std::vector<int> chainstart;
1276 struct t_chain
1278 char chainid = ' ';
1279 int chainnum = ' ';
1280 bool bAllWat = false;
1281 int nterpairs = -1;
1282 std::vector<int> chainstart;
1283 std::vector<MoleculePatchDatabase*> ntdb;
1284 std::vector<MoleculePatchDatabase*> ctdb;
1285 std::vector<int> r_start;
1286 std::vector<int> r_end;
1287 t_atoms* pdba;
1288 std::vector<gmx::RVec> x;
1291 enum class VSitesType : int
1293 None,
1294 Hydrogens,
1295 Aromatics,
1296 Count
1298 static const gmx::EnumerationArray<VSitesType, const char*> c_vsitesTypeNames = {
1299 { "none", "hydrogens", "aromatics" }
1302 enum class WaterType : int
1304 Select,
1305 None,
1306 Spc,
1307 SpcE,
1308 Tip3p,
1309 Tip4p,
1310 Tip5p,
1311 Tips3p,
1312 Count
1314 static const gmx::EnumerationArray<WaterType, const char*> c_waterTypeNames = {
1315 { "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p" }
1318 enum class MergeType : int
1321 All,
1322 Interactive,
1323 Count
1325 static const gmx::EnumerationArray<MergeType, const char*> c_mergeTypeNames = { { "no", "all",
1326 "interactive" } };
1328 namespace gmx
1331 namespace
1334 class pdb2gmx : public ICommandLineOptionsModule
1336 public:
1337 pdb2gmx() :
1338 bVsites_(FALSE),
1339 bPrevWat_(FALSE),
1340 bVsiteAromatics_(FALSE),
1341 chainSeparation_(ChainSeparationType::IdOrTer),
1342 vsitesType_(VSitesType::None),
1343 waterType_(WaterType::Select),
1344 mergeType_(MergeType::No),
1345 itp_file_(nullptr),
1346 mHmult_(0)
1348 gmx::LoggerBuilder builder;
1349 builder.addTargetStream(gmx::MDLogger::LogLevel::Info, &gmx::TextOutputFile::standardOutput());
1350 builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
1351 loggerOwner_ = std::make_unique<LoggerOwner>(builder.build());
1354 // From ICommandLineOptionsModule
1355 void init(CommandLineModuleSettings* /*settings*/) override {}
1357 void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1359 void optionsFinished() override;
1361 int run() override;
1363 private:
1364 bool bNewRTP_;
1365 bool bInter_;
1366 bool bCysMan_;
1367 bool bLysMan_;
1368 bool bAspMan_;
1369 bool bGluMan_;
1370 bool bHisMan_;
1371 bool bGlnMan_;
1372 bool bArgMan_;
1373 bool bTerMan_;
1374 bool bUnA_;
1375 bool bHeavyH_;
1376 bool bSort_;
1377 bool bAllowMissing_;
1378 bool bRemoveH_;
1379 bool bDeuterate_;
1380 bool bVerbose_;
1381 bool bChargeGroups_;
1382 bool bCmap_;
1383 bool bRenumRes_;
1384 bool bRTPresname_;
1385 bool bIndexSet_;
1386 bool bOutputSet_;
1387 bool bVsites_;
1388 bool bWat_;
1389 bool bPrevWat_;
1390 bool bITP_;
1391 bool bVsiteAromatics_;
1392 real angle_;
1393 real distance_;
1394 real posre_fc_;
1395 real long_bond_dist_;
1396 real short_bond_dist_;
1398 std::string indexOutputFile_;
1399 std::string outputFile_;
1400 std::string topologyFile_;
1401 std::string includeTopologyFile_;
1402 std::string outputConfFile_;
1403 std::string inputConfFile_;
1404 std::string outFile_;
1405 std::string ff_;
1407 ChainSeparationType chainSeparation_;
1408 VSitesType vsitesType_;
1409 WaterType waterType_;
1410 MergeType mergeType_;
1412 FILE* itp_file_;
1413 char forcefield_[STRLEN];
1414 char ffdir_[STRLEN];
1415 char* ffname_;
1416 char* watermodel_;
1417 std::vector<std::string> incls_;
1418 std::vector<t_mols> mols_;
1419 real mHmult_;
1420 std::unique_ptr<LoggerOwner> loggerOwner_;
1423 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1425 const char* desc[] = {
1426 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1427 "some database files, adds hydrogens to the molecules and generates",
1428 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1429 "and a topology in GROMACS format.",
1430 "These files can subsequently be processed to generate a run input file.",
1431 "[PAR]",
1432 "[THISMODULE] will search for force fields by looking for",
1433 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1434 "of the current working directory and of the GROMACS library directory",
1435 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1436 "variable.",
1437 "By default the forcefield selection is interactive,",
1438 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1439 "in the list on the command line instead. In that case [THISMODULE] just looks",
1440 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1441 "[PAR]",
1442 "After choosing a force field, all files will be read only from",
1443 "the corresponding force field directory.",
1444 "If you want to modify or add a residue types, you can copy the force",
1445 "field directory from the GROMACS library directory to your current",
1446 "working directory. If you want to add new protein residue types,",
1447 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1448 "or copy the whole library directory to a local directory and set",
1449 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1450 "Check Chapter 5 of the manual for more information about file formats.",
1451 "[PAR]",
1453 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1454 "need not necessarily contain a protein structure. Every kind of",
1455 "molecule for which there is support in the database can be converted.",
1456 "If there is no support in the database, you can add it yourself.[PAR]",
1458 "The program has limited intelligence, it reads a number of database",
1459 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1460 "if necessary this can be done manually. The program can prompt the",
1461 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1462 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1463 "protonated (three protons, default), for Asp and Glu unprotonated",
1464 "(default) or protonated, for His the proton can be either on ND1,",
1465 "on NE2 or on both. By default these selections are done automatically.",
1466 "For His, this is based on an optimal hydrogen bonding",
1467 "conformation. Hydrogen bonds are defined based on a simple geometric",
1468 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1469 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1470 "[TT]-dist[tt] respectively.[PAR]",
1472 "The protonation state of N- and C-termini can be chosen interactively",
1473 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1474 "respectively. Some force fields support zwitterionic forms for chains of",
1475 "one residue, but for polypeptides these options should NOT be selected.",
1476 "The AMBER force fields have unique forms for the terminal residues,",
1477 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1478 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1479 "respectively to use these forms, making sure you preserve the format",
1480 "of the coordinate file. Alternatively, use named terminating residues",
1481 "(e.g. ACE, NME).[PAR]",
1483 "The separation of chains is not entirely trivial since the markup",
1484 "in user-generated PDB files frequently varies and sometimes it",
1485 "is desirable to merge entries across a TER record, for instance",
1486 "if you want a disulfide bridge or distance restraints between",
1487 "two protein chains or if you have a HEME group bound to a protein.",
1488 "In such cases multiple chains should be contained in a single",
1489 "[TT]moleculetype[tt] definition.",
1490 "To handle this, [THISMODULE] uses two separate options.",
1491 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1492 "start, and termini added when applicable. This can be done based on the",
1493 "existence of TER records, when the chain id changes, or combinations of either",
1494 "or both of these. You can also do the selection fully interactively.",
1495 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1496 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1497 "This can be turned off (no merging), all non-water chains can be merged into a",
1498 "single molecule, or the selection can be done interactively.[PAR]",
1500 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1501 "If any of the occupancies are not one, indicating that the atom is",
1502 "not resolved well in the structure, a warning message is issued.",
1503 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1504 "all occupancy fields may be zero. Either way, it is up to the user",
1505 "to verify the correctness of the input data (read the article!).[PAR]",
1507 "During processing the atoms will be reordered according to GROMACS",
1508 "conventions. With [TT]-n[tt] an index file can be generated that",
1509 "contains one group reordered in the same way. This allows you to",
1510 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1511 "one limitation: reordering is done after the hydrogens are stripped",
1512 "from the input and before new hydrogens are added. This means that",
1513 "you should not use [TT]-ignh[tt].[PAR]",
1515 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1516 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1517 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1518 "[PAR]",
1520 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1521 "motions. Angular and out-of-plane motions can be removed by changing",
1522 "hydrogens into virtual sites and fixing angles, which fixes their",
1523 "position relative to neighboring atoms. Additionally, all atoms in the",
1524 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1525 "can be converted into virtual sites, eliminating the fast improper dihedral",
1526 "fluctuations in these rings (but this feature is deprecated).",
1527 "[BB]Note[bb] that in this case all other hydrogen",
1528 "atoms are also converted to virtual sites. The mass of all atoms that are",
1529 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1530 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1531 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1532 "done for water hydrogens to slow down the rotational motion of water.",
1533 "The increase in mass of the hydrogens is subtracted from the bonded",
1534 "(heavy) atom so that the total mass of the system remains the same."
1537 settings->setHelpText(desc);
1539 options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1540 "Write the residue database in new format to [TT]new.rtp[tt]"));
1541 options->addOption(
1542 RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1543 options->addOption(
1544 RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1545 options->addOption(EnumOption<ChainSeparationType>("chainsep")
1546 .enumValue(c_chainSeparationTypeNames)
1547 .store(&chainSeparation_)
1548 .description("Condition in PDB files when a new chain should be "
1549 "started (adding termini)"));
1550 options->addOption(EnumOption<MergeType>("merge")
1551 .enumValue(c_mergeTypeNames)
1552 .store(&mergeType_)
1553 .description("Merge multiple chains into a single [moleculetype]"));
1554 options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1555 "Force field, interactive by default. Use [TT]-h[tt] for information."));
1556 options->addOption(
1557 EnumOption<WaterType>("water").store(&waterType_).enumValue(c_waterTypeNames).description("Water model to use"));
1558 options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1559 "Set the next 8 options to interactive"));
1560 options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1561 "Interactive SS bridge selection"));
1562 options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1563 "Interactive termini selection, instead of charged (default)"));
1564 options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1565 "Interactive lysine selection, instead of charged"));
1566 options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1567 "Interactive arginine selection, instead of charged"));
1568 options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1569 "Interactive aspartic acid selection, instead of charged"));
1570 options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1571 "Interactive glutamic acid selection, instead of charged"));
1572 options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1573 "Interactive glutamine selection, instead of charged"));
1574 options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1575 "Interactive histidine selection, instead of checking H-bonds"));
1576 options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1577 "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1578 options->addOption(
1579 RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1580 options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1581 "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1582 "tyrosine"));
1583 options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1584 "Sort the residues according to database, turning this off is dangerous as charge "
1585 "groups might be broken in parts"));
1586 options->addOption(
1587 BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1588 options->addOption(
1589 BooleanOption("missing")
1590 .store(&bAllowMissing_)
1591 .defaultValue(false)
1592 .description(
1593 "Continue when atoms are missing and bonds cannot be made, dangerous"));
1594 options->addOption(
1595 BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1596 options->addOption(
1597 RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1598 options->addOption(EnumOption<VSitesType>("vsite")
1599 .store(&vsitesType_)
1600 .enumValue(c_vsitesTypeNames)
1601 .description("Convert atoms to virtual sites"));
1602 options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1603 "Make hydrogen atoms heavy"));
1604 options->addOption(
1605 BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1606 options->addOption(BooleanOption("chargegrp")
1607 .store(&bChargeGroups_)
1608 .defaultValue(true)
1609 .description("Use charge groups in the [REF].rtp[ref] file"));
1610 options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1611 "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1612 options->addOption(BooleanOption("renum")
1613 .store(&bRenumRes_)
1614 .defaultValue(false)
1615 .description("Renumber the residues consecutively in the output"));
1616 options->addOption(BooleanOption("rtpres")
1617 .store(&bRTPresname_)
1618 .defaultValue(false)
1619 .description("Use [REF].rtp[ref] entry names as residue names"));
1620 options->addOption(FileNameOption("f")
1621 .legacyType(efSTX)
1622 .inputFile()
1623 .store(&inputConfFile_)
1624 .required()
1625 .defaultBasename("protein")
1626 .defaultType(efPDB)
1627 .description("Structure file"));
1628 options->addOption(FileNameOption("o")
1629 .legacyType(efSTO)
1630 .outputFile()
1631 .store(&outputConfFile_)
1632 .required()
1633 .defaultBasename("conf")
1634 .description("Structure file"));
1635 options->addOption(FileNameOption("p")
1636 .legacyType(efTOP)
1637 .outputFile()
1638 .store(&topologyFile_)
1639 .required()
1640 .defaultBasename("topol")
1641 .description("Topology file"));
1642 options->addOption(FileNameOption("i")
1643 .legacyType(efITP)
1644 .outputFile()
1645 .store(&includeTopologyFile_)
1646 .required()
1647 .defaultBasename("posre")
1648 .description("Include file for topology"));
1649 options->addOption(FileNameOption("n")
1650 .legacyType(efNDX)
1651 .outputFile()
1652 .store(&indexOutputFile_)
1653 .storeIsSet(&bIndexSet_)
1654 .defaultBasename("index")
1655 .description("Index file"));
1656 options->addOption(FileNameOption("q")
1657 .legacyType(efSTO)
1658 .outputFile()
1659 .store(&outFile_)
1660 .storeIsSet(&bOutputSet_)
1661 .defaultBasename("clean")
1662 .defaultType(efPDB)
1663 .description("Structure file"));
1666 void pdb2gmx::optionsFinished()
1668 if (inputConfFile_.empty())
1670 GMX_THROW(InconsistentInputError("You must supply an input file"));
1672 if (bInter_)
1674 /* if anything changes here, also change description of -inter */
1675 bCysMan_ = true;
1676 bTerMan_ = true;
1677 bLysMan_ = true;
1678 bArgMan_ = true;
1679 bAspMan_ = true;
1680 bGluMan_ = true;
1681 bGlnMan_ = true;
1682 bHisMan_ = true;
1685 if (bHeavyH_)
1687 mHmult_ = 4.0;
1689 else if (bDeuterate_)
1691 mHmult_ = 2.0;
1693 else
1695 mHmult_ = 1.0;
1698 /* Force field selection, interactive or direct */
1699 choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(), forcefield_,
1700 sizeof(forcefield_), ffdir_, sizeof(ffdir_), loggerOwner_->logger());
1702 if (strlen(forcefield_) > 0)
1704 ffname_ = forcefield_;
1705 ffname_[0] = std::toupper(ffname_[0]);
1707 else
1709 gmx_fatal(FARGS, "Empty forcefield string");
1713 int pdb2gmx::run()
1715 char select[STRLEN];
1716 std::vector<DisulfideBond> ssbonds;
1718 int this_atomnum;
1719 int prev_atomnum;
1720 const char* prev_atomname;
1721 const char* this_atomname;
1722 const char* prev_resname;
1723 const char* this_resname;
1724 int prev_resnum;
1725 int this_resnum;
1726 char prev_chainid;
1727 char this_chainid;
1728 int prev_chainnumber;
1729 int this_chainnumber;
1730 int this_chainstart;
1731 int prev_chainstart;
1733 const gmx::MDLogger& logger = loggerOwner_->logger();
1735 GMX_LOG(logger.info)
1736 .asParagraph()
1737 .appendTextFormatted("Using the %s force field in directory %s", ffname_, ffdir_);
1739 choose_watermodel(c_waterTypeNames[waterType_], ffdir_, &watermodel_, logger);
1741 switch (vsitesType_)
1743 case VSitesType::None:
1744 bVsites_ = false;
1745 bVsiteAromatics_ = false;
1746 break;
1747 case VSitesType::Hydrogens:
1748 bVsites_ = true;
1749 bVsiteAromatics_ = false;
1750 break;
1751 case VSitesType::Aromatics:
1752 bVsites_ = true;
1753 bVsiteAromatics_ = true;
1754 break;
1755 case VSitesType::Count:
1756 gmx_fatal(FARGS, "Internal inconsistency: VSitesType is out of range.");
1757 } /* end switch */
1759 /* Open the symbol table */
1760 t_symtab symtab;
1761 open_symtab(&symtab);
1763 /* Residue type database */
1764 ResidueType rt;
1766 /* Read residue renaming database(s), if present */
1767 std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1769 std::vector<RtpRename> rtprename;
1770 for (const auto& filename : rrn)
1772 GMX_LOG(logger.info).asParagraph().appendTextFormatted("going to rename %s", filename.c_str());
1773 FILE* fp = fflib_open(filename);
1774 read_rtprename(filename.c_str(), fp, &rtprename);
1775 gmx_ffclose(fp);
1778 /* Add all alternative names from the residue renaming database to the list
1779 of recognized amino/nucleic acids. */
1780 for (const auto& rename : rtprename)
1782 /* Only add names if the 'standard' gromacs/iupac base name was found */
1783 if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1785 rt.addResidue(rename.main, *restype);
1786 rt.addResidue(rename.nter, *restype);
1787 rt.addResidue(rename.cter, *restype);
1788 rt.addResidue(rename.bter, *restype);
1792 matrix box;
1793 const char* watres;
1794 clear_mat(box);
1795 if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1797 watres = "HO4";
1799 else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1801 watres = "HO5";
1803 else
1805 watres = "HOH";
1808 AtomProperties aps;
1809 char* title = nullptr;
1810 PbcType pbcType;
1811 t_atoms pdba_all;
1812 rvec* pdbx;
1813 int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(), &title, &pdba_all,
1814 &pdbx, &pbcType, box, bRemoveH_, &symtab, &rt, watres, &aps, bVerbose_);
1816 if (natom == 0)
1818 std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1819 GMX_THROW(InconsistentInputError(message));
1822 GMX_LOG(logger.info).asParagraph().appendTextFormatted("Analyzing pdb file");
1823 int nwaterchain = 0;
1825 modify_chain_numbers(&pdba_all, chainSeparation_, logger);
1827 int nchainmerges = 0;
1829 this_atomname = nullptr;
1830 this_atomnum = -1;
1831 this_resname = nullptr;
1832 this_resnum = -1;
1833 this_chainid = '?';
1834 this_chainnumber = -1;
1835 this_chainstart = 0;
1836 /* Keep the compiler happy */
1837 prev_chainstart = 0;
1839 int numChains = 0;
1840 std::vector<t_pdbchain> pdb_ch;
1842 t_resinfo* ri;
1843 bool bMerged = false;
1844 for (int i = 0; (i < natom); i++)
1846 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1848 /* TODO this should live in a helper object, and consolidate
1849 that with code in modify_chain_numbers */
1850 prev_atomname = this_atomname;
1851 prev_atomnum = this_atomnum;
1852 prev_resname = this_resname;
1853 prev_resnum = this_resnum;
1854 prev_chainid = this_chainid;
1855 prev_chainnumber = this_chainnumber;
1856 if (!bMerged)
1858 prev_chainstart = this_chainstart;
1861 this_atomname = *pdba_all.atomname[i];
1862 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
1863 this_resname = *ri->name;
1864 this_resnum = ri->nr;
1865 this_chainid = ri->chainid;
1866 this_chainnumber = ri->chainnum;
1868 bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1870 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1872 GMX_RELEASE_ASSERT(
1873 pdba_all.pdbinfo,
1874 "Must have pdbinfo from reading a PDB file if chain number is changing");
1875 this_chainstart = pdba_all.atom[i].resind;
1876 bMerged = false;
1877 if (i > 0 && !bWat_)
1879 if (!strncmp(c_mergeTypeNames[mergeType_], "int", 3))
1881 GMX_LOG(logger.info)
1882 .asParagraph()
1883 .appendTextFormatted(
1884 "Merge chain ending with residue %s%d (chain id '%c', atom %d "
1885 "%s) and chain starting with "
1886 "residue %s%d (chain id '%c', atom %d %s) into a single "
1887 "moleculetype (keeping termini)? [n/y]",
1888 prev_resname, prev_resnum, prev_chainid, prev_atomnum,
1889 prev_atomname, this_resname, this_resnum, this_chainid,
1890 this_atomnum, this_atomname);
1892 if (nullptr == fgets(select, STRLEN - 1, stdin))
1894 gmx_fatal(FARGS, "Error reading from stdin");
1896 bMerged = (select[0] == 'y');
1898 else if (!strncmp(c_mergeTypeNames[mergeType_], "all", 3))
1900 bMerged = true;
1904 if (bMerged)
1906 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
1907 pdba_all.atom[i].resind - prev_chainstart;
1908 pdb_ch[numChains - 1].nterpairs++;
1909 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
1910 nchainmerges++;
1912 else
1914 /* set natom for previous chain */
1915 if (numChains > 0)
1917 pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
1919 if (bWat_)
1921 nwaterchain++;
1922 ri->chainid = ' ';
1924 /* check if chain identifier was used before */
1925 for (int j = 0; (j < numChains); j++)
1927 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1929 GMX_LOG(logger.warning)
1930 .asParagraph()
1931 .appendTextFormatted(
1932 "Chain identifier '%c' is used in two non-sequential "
1933 "blocks. "
1934 "They will be treated as separate chains unless you "
1935 "reorder your file.",
1936 ri->chainid);
1939 t_pdbchain newChain;
1940 newChain.chainid = ri->chainid;
1941 newChain.chainnum = ri->chainnum;
1942 newChain.start = i;
1943 newChain.bAllWat = bWat_;
1944 if (bWat_)
1946 newChain.nterpairs = 0;
1948 else
1950 newChain.nterpairs = 1;
1952 newChain.chainstart.resize(newChain.nterpairs + 1);
1953 /* modified [numChains] to [0] below */
1954 newChain.chainstart[0] = 0;
1955 pdb_ch.push_back(newChain);
1956 numChains++;
1959 bPrevWat_ = bWat_;
1961 pdb_ch.back().natom = natom - pdb_ch.back().start;
1963 /* set all the water blocks at the end of the chain */
1964 std::vector<int> swap_index(numChains);
1965 int j = 0;
1966 for (int i = 0; i < numChains; i++)
1968 if (!pdb_ch[i].bAllWat)
1970 swap_index[j] = i;
1971 j++;
1974 for (int i = 0; i < numChains; i++)
1976 if (pdb_ch[i].bAllWat)
1978 swap_index[j] = i;
1979 j++;
1982 if (nwaterchain > 1)
1984 GMX_LOG(logger.info)
1985 .asParagraph()
1986 .appendTextFormatted("Moved all the water blocks to the end");
1989 t_atoms* pdba;
1990 std::vector<t_chain> chains(numChains);
1991 /* copy pdb data and x for all chains */
1992 for (int i = 0; (i < numChains); i++)
1994 int si = swap_index[i];
1995 chains[i].chainid = pdb_ch[si].chainid;
1996 chains[i].chainnum = pdb_ch[si].chainnum;
1997 chains[i].bAllWat = pdb_ch[si].bAllWat;
1998 chains[i].nterpairs = pdb_ch[si].nterpairs;
1999 chains[i].chainstart = pdb_ch[si].chainstart;
2000 chains[i].ntdb.clear();
2001 chains[i].ctdb.clear();
2002 chains[i].r_start.resize(pdb_ch[si].nterpairs);
2003 chains[i].r_end.resize(pdb_ch[si].nterpairs);
2005 snew(chains[i].pdba, 1);
2006 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
2007 for (j = 0; j < chains[i].pdba->nr; j++)
2009 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start + j];
2010 chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
2011 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
2012 chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
2014 /* Re-index the residues assuming that the indices are continuous */
2015 int k = chains[i].pdba->atom[0].resind;
2016 int nres = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
2017 chains[i].pdba->nres = nres;
2018 for (int j = 0; j < chains[i].pdba->nr; j++)
2020 chains[i].pdba->atom[j].resind -= k;
2022 srenew(chains[i].pdba->resinfo, nres);
2023 for (int j = 0; j < nres; j++)
2025 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k + j];
2026 chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
2027 /* make all chain identifiers equal to that of the chain */
2028 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
2032 if (nchainmerges > 0)
2034 GMX_LOG(logger.info)
2035 .asParagraph()
2036 .appendTextFormatted("Merged chains into joint molecule definitions at %d places.",
2037 nchainmerges);
2040 GMX_LOG(logger.info)
2041 .asParagraph()
2042 .appendTextFormatted(
2043 "There are %d chains and %d blocks of water and "
2044 "%d residues with %d atoms",
2045 numChains - nwaterchain, nwaterchain, pdba_all.nres, natom);
2047 GMX_LOG(logger.info)
2048 .asParagraph()
2049 .appendTextFormatted(" %5s %4s %6s", "chain", "#res", "#atoms");
2050 for (int i = 0; (i < numChains); i++)
2052 GMX_LOG(logger.info)
2053 .asParagraph()
2054 .appendTextFormatted(" %d '%c' %5d %6d %s\n", i + 1,
2055 chains[i].chainid ? chains[i].chainid : '-', chains[i].pdba->nres,
2056 chains[i].pdba->nr, chains[i].bAllWat ? "(only water)" : "");
2059 check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_, logger);
2061 /* Read atomtypes... */
2062 PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
2064 /* read residue database */
2065 GMX_LOG(logger.info).asParagraph().appendTextFormatted("Reading residue database... (%s)", forcefield_);
2066 std::vector<std::string> rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
2067 std::vector<PreprocessResidue> rtpFFDB;
2068 for (const auto& filename : rtpf)
2070 readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, logger, false);
2072 if (bNewRTP_)
2074 /* Not correct with multiple rtp input files with different bonded types */
2075 FILE* fp = gmx_fio_fopen("new.rtp", "w");
2076 print_resall(fp, rtpFFDB, atype);
2077 gmx_fio_fclose(fp);
2080 /* read hydrogen database */
2081 std::vector<MoleculePatchDatabase> ah;
2082 read_h_db(ffdir_, &ah);
2084 /* Read Termini database... */
2085 std::vector<MoleculePatchDatabase> ntdb;
2086 std::vector<MoleculePatchDatabase> ctdb;
2087 std::vector<MoleculePatchDatabase*> tdblist;
2088 int nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
2089 int nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2091 FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2093 print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2095 t_chain* cc;
2096 std::vector<gmx::RVec> x;
2097 /* new pdb datastructure for sorting. */
2098 t_atoms** sortAtoms = nullptr;
2099 t_atoms** localAtoms = nullptr;
2100 snew(sortAtoms, numChains);
2101 snew(localAtoms, numChains);
2102 for (int chain = 0; (chain < numChains); chain++)
2104 cc = &(chains[chain]);
2106 /* set pdba, natom and nres to the current chain */
2107 pdba = cc->pdba;
2108 x = cc->x;
2109 natom = cc->pdba->nr;
2110 int nres = cc->pdba->nres;
2112 if (cc->chainid && (cc->chainid != ' '))
2114 GMX_LOG(logger.info)
2115 .asParagraph()
2116 .appendTextFormatted("Processing chain %d '%c' (%d atoms, %d residues)",
2117 chain + 1, cc->chainid, natom, nres);
2119 else
2121 GMX_LOG(logger.info)
2122 .asParagraph()
2123 .appendTextFormatted("Processing chain %d (%d atoms, %d residues)", chain + 1,
2124 natom, nres);
2127 process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_, bHisMan_,
2128 bArgMan_, bGlnMan_, angle_, distance_, &symtab, rtprename);
2130 cc->chainstart[cc->nterpairs] = pdba->nres;
2131 j = 0;
2132 for (int i = 0; i < cc->nterpairs; i++)
2134 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]),
2135 &(cc->r_end[j]), &rt, logger);
2137 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2139 j++;
2142 cc->nterpairs = j;
2143 if (cc->nterpairs == 0)
2145 GMX_LOG(logger.info)
2146 .asParagraph()
2147 .appendTextFormatted(
2148 "Problem with chain definition, or missing terminal residues. "
2149 "This chain does not appear to contain a recognized chain molecule. "
2150 "If this is incorrect, you can edit residuetypes.dat to modify the "
2151 "behavior.");
2154 /* Check for disulfides and other special bonds */
2155 ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2157 if (!rtprename.empty())
2159 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_, logger);
2162 for (int i = 0; i < cc->nterpairs; i++)
2164 /* Set termini.
2165 * We first apply a filter so we only have the
2166 * termini that can be applied to the residue in question
2167 * (or a generic terminus if no-residue specific is available).
2169 /* First the N terminus */
2170 if (nNtdb > 0)
2172 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2173 if (tdblist.empty())
2175 GMX_LOG(logger.info)
2176 .asParagraph()
2177 .appendTextFormatted(
2178 "No suitable end (N or 5') terminus found in database - "
2179 "assuming this residue "
2180 "is already in a terminus-specific form and skipping terminus "
2181 "selection.");
2182 cc->ntdb.push_back(nullptr);
2184 else
2186 if (bTerMan_ && !tdblist.empty())
2188 sprintf(select, "Select start terminus type for %s-%d",
2189 *pdba->resinfo[cc->r_start[i]].name, pdba->resinfo[cc->r_start[i]].nr);
2190 cc->ntdb.push_back(choose_ter(tdblist, select));
2192 else
2194 cc->ntdb.push_back(tdblist[0]);
2197 printf("Start terminus %s-%d: %s\n", *pdba->resinfo[cc->r_start[i]].name,
2198 pdba->resinfo[cc->r_start[i]].nr, (cc->ntdb[i])->name.c_str());
2199 tdblist.clear();
2202 else
2204 cc->ntdb.push_back(nullptr);
2207 /* And the C terminus */
2208 if (nCtdb > 0)
2210 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2211 if (tdblist.empty())
2213 GMX_LOG(logger.info)
2214 .asParagraph()
2215 .appendTextFormatted(
2216 "No suitable end (C or 3') terminus found in database - "
2217 "assuming this residue"
2218 "is already in a terminus-specific form and skipping terminus "
2219 "selection.");
2220 cc->ctdb.push_back(nullptr);
2222 else
2224 if (bTerMan_ && !tdblist.empty())
2226 sprintf(select, "Select end terminus type for %s-%d",
2227 *pdba->resinfo[cc->r_end[i]].name, pdba->resinfo[cc->r_end[i]].nr);
2228 cc->ctdb.push_back(choose_ter(tdblist, select));
2230 else
2232 cc->ctdb.push_back(tdblist[0]);
2234 printf("End terminus %s-%d: %s\n", *pdba->resinfo[cc->r_end[i]].name,
2235 pdba->resinfo[cc->r_end[i]].nr, (cc->ctdb[i])->name.c_str());
2236 tdblist.clear();
2239 else
2241 cc->ctdb.push_back(nullptr);
2244 std::vector<MoleculePatchDatabase> hb_chain;
2245 /* lookup hackblocks and rtp for all residues */
2246 std::vector<PreprocessResidue> restp_chain;
2247 get_hackblocks_rtp(&hb_chain, &restp_chain, rtpFFDB, pdba->nres, pdba->resinfo, cc->nterpairs,
2248 &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_, logger);
2249 /* ideally, now we would not need the rtp itself anymore, but do
2250 everything using the hb and restp arrays. Unfortunately, that
2251 requires some re-thinking of code in gen_vsite.c, which I won't
2252 do now :( AF 26-7-99 */
2254 rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2256 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_, logger);
2258 if (bSort_)
2260 char** gnames;
2261 t_blocka* block = new_blocka();
2262 snew(gnames, 1);
2263 sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2264 remove_duplicate_atoms(pdba, x, bVerbose_, logger);
2265 if (bIndexSet_)
2267 if (bRemoveH_)
2269 GMX_LOG(logger.warning)
2270 .asParagraph()
2271 .appendTextFormatted(
2272 "With the -remh option the generated "
2273 "index file (%s) might be useless "
2274 "(the index file is generated before hydrogens are added)",
2275 indexOutputFile_.c_str());
2277 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2279 for (int i = 0; i < block->nr; i++)
2281 sfree(gnames[i]);
2283 sfree(gnames);
2284 done_blocka(block);
2285 sfree(block);
2287 else
2289 GMX_LOG(logger.warning)
2290 .asParagraph()
2291 .appendTextFormatted(
2292 "Without sorting no check for duplicate atoms can be done");
2295 /* Generate Hydrogen atoms (and termini) in the sequence */
2296 GMX_LOG(logger.info)
2297 .asParagraph()
2298 .appendTextFormatted(
2299 "Generating any missing hydrogen atoms and/or adding termini.");
2300 add_h(&pdba, &localAtoms[chain], &x, ah, &symtab, cc->nterpairs, cc->ntdb, cc->ctdb,
2301 cc->r_start, cc->r_end, bAllowMissing_);
2302 GMX_LOG(logger.info)
2303 .asParagraph()
2304 .appendTextFormatted("Now there are %d residues with %d atoms", pdba->nres, pdba->nr);
2306 /* make up molecule name(s) */
2308 int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2310 std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2312 std::string molname;
2313 std::string suffix;
2314 if (cc->bAllWat)
2316 molname = "Water";
2318 else
2320 this_chainid = cc->chainid;
2322 /* Add the chain id if we have one */
2323 if (this_chainid != ' ')
2325 suffix.append(formatString("_chain_%c", this_chainid));
2328 /* Check if there have been previous chains with the same id */
2329 int nid_used = 0;
2330 for (int k = 0; k < chain; k++)
2332 if (cc->chainid == chains[k].chainid)
2334 nid_used++;
2337 /* Add the number for this chain identifier if there are multiple copies */
2338 if (nid_used > 0)
2340 suffix.append(formatString("%d", nid_used + 1));
2343 if (suffix.length() > 0)
2345 molname.append(restype);
2346 molname.append(suffix);
2348 else
2350 molname = restype;
2353 std::string itp_fn = topologyFile_;
2355 std::string posre_fn = includeTopologyFile_;
2356 if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2358 bITP_ = true;
2359 printf("Chain time...\n");
2360 // construct the itp file name
2361 itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2362 itp_fn.append("_");
2363 itp_fn.append(molname);
2364 itp_fn.append(".itp");
2365 // now do the same for posre
2366 posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2367 posre_fn.append("_");
2368 posre_fn.append(molname);
2369 posre_fn.append(".itp");
2370 if (posre_fn == itp_fn)
2372 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2374 incls_.emplace_back();
2375 incls_.back() = itp_fn;
2376 itp_file_ = gmx_fio_fopen(itp_fn.c_str(), "w");
2378 else
2380 bITP_ = false;
2383 mols_.emplace_back();
2384 if (cc->bAllWat)
2386 mols_.back().name = "SOL";
2387 mols_.back().nr = pdba->nres;
2389 else
2391 mols_.back().name = molname;
2392 mols_.back().nr = 1;
2395 if (bITP_)
2397 print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2400 FILE* top_file2;
2401 if (cc->bAllWat)
2403 top_file2 = nullptr;
2405 else if (bITP_)
2407 top_file2 = itp_file_;
2409 else
2411 top_file2 = top_file;
2414 pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab, rtpFFDB,
2415 restp_chain, hb_chain, bAllowMissing_, bVsites_, bVsiteAromatics_, ffdir_, mHmult_,
2416 ssbonds, long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2417 bRenumRes_, bRTPresname_, logger);
2419 if (!cc->bAllWat)
2421 write_posres(posre_fn.c_str(), pdba, posre_fc_);
2424 if (bITP_)
2426 gmx_fio_fclose(itp_file_);
2429 /* pdba and natom have been reassigned somewhere so: */
2430 cc->pdba = pdba;
2431 cc->x = x;
2434 if (watermodel_ == nullptr)
2436 for (int chain = 0; chain < numChains; chain++)
2438 if (chains[chain].bAllWat)
2440 auto message = formatString(
2441 "You have chosen not to include a water model, "
2442 "but there is water in the input file. Select a "
2443 "water model or remove the water from your input file.");
2444 GMX_THROW(InconsistentInputError(message));
2448 else
2450 std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2451 if (!fflib_fexist(waterFile))
2453 auto message = formatString(
2454 "The topology file '%s' for the selected water "
2455 "model '%s' can not be found in the force field "
2456 "directory. Select a different water model.",
2457 waterFile.c_str(), watermodel_);
2458 GMX_THROW(InconsistentInputError(message));
2462 print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2463 gmx_fio_fclose(top_file);
2465 /* now merge all chains back together */
2466 natom = 0;
2467 int nres = 0;
2468 for (int i = 0; (i < numChains); i++)
2470 natom += chains[i].pdba->nr;
2471 nres += chains[i].pdba->nres;
2473 t_atoms* atoms;
2474 snew(atoms, 1);
2475 init_t_atoms(atoms, natom, false);
2476 for (int i = 0; i < atoms->nres; i++)
2478 sfree(atoms->resinfo[i].name);
2480 atoms->nres = nres;
2481 srenew(atoms->resinfo, nres);
2482 x.clear();
2483 int k = 0;
2484 int l = 0;
2485 for (int i = 0; (i < numChains); i++)
2487 if (numChains > 1)
2489 GMX_LOG(logger.info)
2490 .asParagraph()
2491 .appendTextFormatted("Including chain %d in system: %d atoms %d residues",
2492 i + 1, chains[i].pdba->nr, chains[i].pdba->nres);
2494 for (int j = 0; (j < chains[i].pdba->nr); j++)
2496 atoms->atom[k] = chains[i].pdba->atom[j];
2497 atoms->atom[k].resind += l; /* l is processed nr of residues */
2498 atoms->atomname[k] = chains[i].pdba->atomname[j];
2499 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2500 x.push_back(chains[i].x[j]);
2501 k++;
2503 for (int j = 0; (j < chains[i].pdba->nres); j++)
2505 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2506 if (bRTPresname_)
2508 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2510 l++;
2512 done_atom(chains[i].pdba);
2515 if (numChains > 1)
2517 GMX_LOG(logger.info)
2518 .asParagraph()
2519 .appendTextFormatted("Now there are %d atoms and %d residues", k, l);
2520 print_sums(atoms, true, logger);
2523 rvec box_space;
2524 GMX_LOG(logger.info).asParagraph().appendTextFormatted("Writing coordinate file...");
2525 clear_rvec(box_space);
2526 if (box[0][0] == 0)
2528 make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2530 write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr,
2531 pbcType, box);
2533 done_symtab(&symtab);
2534 done_atom(&pdba_all);
2535 done_atom(atoms);
2536 for (int chain = 0; chain < numChains; chain++)
2538 sfree(sortAtoms[chain]);
2539 sfree(localAtoms[chain]);
2541 sfree(sortAtoms);
2542 sfree(localAtoms);
2543 sfree(atoms);
2544 sfree(title);
2545 sfree(pdbx);
2547 GMX_LOG(logger.info)
2548 .asParagraph()
2549 .appendTextFormatted("\t\t--------- PLEASE NOTE ------------");
2550 GMX_LOG(logger.info)
2551 .asParagraph()
2552 .appendTextFormatted("You have successfully generated a topology from: %s.",
2553 inputConfFile_.c_str());
2554 if (watermodel_ != nullptr)
2556 GMX_LOG(logger.info)
2557 .asParagraph()
2558 .appendTextFormatted("The %s force field and the %s water model are used.", ffname_,
2559 watermodel_);
2560 sfree(watermodel_);
2562 else
2564 GMX_LOG(logger.info).asParagraph().appendTextFormatted("The %s force field is used.", ffname_);
2566 GMX_LOG(logger.info)
2567 .asParagraph()
2568 .appendTextFormatted("\t\t--------- ETON ESAELP ------------");
2570 return 0;
2573 } // namespace
2575 const char pdb2gmxInfo::name[] = "pdb2gmx";
2576 const char pdb2gmxInfo::shortDescription[] =
2577 "Convert coordinate files to topology and FF-compliant coordinate files";
2578 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2580 return std::make_unique<pdb2gmx>();
2583 } // namespace gmx