2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file declares functions for mdrun to call to manage the
38 * details of doing a restart (ie. reading checkpoints, appending
41 * \todo Clean up the error-prone logic here. Add doxygen.
43 * \author Berk Hess <hess@kth.se>
44 * \author Erik Lindahl <erik@kth.se>
45 * \author Mark Abraham <mark.j.abraham@gmail.com>
47 * \ingroup module_mdrunutility
52 #include "handlerestart.h"
59 #if GMX_NATIVE_WINDOWS
61 #include <sys/locking.h>
69 #include "gromacs/commandline/filenm.h"
70 #include "gromacs/fileio/checkpoint.h"
71 #include "gromacs/fileio/gmxfio.h"
72 #include "gromacs/gmxlib/network.h"
73 #include "gromacs/mdrunutility/multisim.h"
74 #include "gromacs/mdtypes/commrec.h"
75 #include "gromacs/mdtypes/mdrunoptions.h"
76 #include "gromacs/utility/basedefinitions.h"
77 #include "gromacs/utility/exceptions.h"
78 #include "gromacs/utility/fatalerror.h"
79 #include "gromacs/utility/path.h"
80 #include "gromacs/utility/smalloc.h"
81 #include "gromacs/utility/stringstream.h"
82 #include "gromacs/utility/textwriter.h"
89 /*! \brief Search for \p fnm_cp in fnm and return true iff found
91 * \todo This could be implemented sanely with a for loop. */
92 gmx_bool
exist_output_file(const char *fnm_cp
, int nfile
, const t_filenm fnm
[])
96 /* Check if the output file name stored in the checkpoint file
97 * is one of the output file names of mdrun.
101 !(is_output(&fnm
[i
]) && strcmp(fnm_cp
, fnm
[i
].filenames
[0].c_str()) == 0))
106 return (i
< nfile
&& gmx_fexist(fnm_cp
));
109 /*! \brief Throw when mdrun -cpi fails because previous output files are missing.
111 * If we get here, the user requested restarting from a checkpoint file, that checkpoint
112 * file was found (so it is not the first part of a new run), but we are still missing
113 * some or all checkpoint files. In this case we issue a fatal error since there are
114 * so many special cases we cannot keep track of, and better safe than sorry. */
116 throwBecauseOfMissingOutputFiles(const char *checkpointFilename
,
117 ArrayRef
<const gmx_file_position_t
> outputfiles
,
118 int nfile
, const t_filenm fnm
[],
119 size_t numFilesMissing
)
121 StringOutputStream stream
;
122 TextWriter
writer(&stream
);
123 writer
.writeStringFormatted
124 ("Some output files listed in the checkpoint file %s are not present or not named "
125 "as the output files by the current program:)",
127 auto settings
= writer
.wrapperSettings();
128 auto oldIndent
= settings
.indent(), newIndent
= 2;
130 writer
.writeLine("Expected output files that are present:");
131 settings
.setIndent(newIndent
);
132 for (const auto &outputfile
: outputfiles
)
134 if (exist_output_file(outputfile
.filename
, nfile
, fnm
))
136 writer
.writeLine(outputfile
.filename
);
139 settings
.setIndent(oldIndent
);
140 writer
.ensureEmptyLine();
142 writer
.writeLine("Expected output files that are not present or named differently:");
143 settings
.setIndent(newIndent
);
144 for (const auto &outputfile
: outputfiles
)
146 if (!exist_output_file(outputfile
.filename
,
149 writer
.writeLine(outputfile
.filename
);
152 settings
.setIndent(oldIndent
);
154 writer
.writeLineFormatted(
155 R
"(To keep your simulation files safe, this simulation will not restart. Either name your
156 output files exactly the same as the previous simulation part (e.g. with -deffnm), or
157 make sure all the output files are present (e.g. run from the same directory as the
158 previous simulation part), or instruct mdrun to write new output files with mdrun -noappend.
159 In the last case, you will not be able to use appending in future for this simulation.)",
160 numFilesMissing
, outputfiles
.size());
161 GMX_THROW(InconsistentInputError(stream
.toString()));
164 //! Return a string describing the precision of a build of GROMACS.
165 const char *precisionToString(bool isDoublePrecision
)
167 return isDoublePrecision
? "double" : "mixed";
170 /*! \brief Choose the starting behaviour
172 * This routine cannot print tons of data, since it is called before
173 * the log file is opened. */
174 std::tuple
< StartingBehavior
,
175 CheckpointHeaderContents
,
176 std::vector
< gmx_file_position_t
>>
177 chooseStartingBehavior(const AppendingBehavior appendingBehavior
,
181 CheckpointHeaderContents headerContents
;
182 std::vector
<gmx_file_position_t
> outputFiles
;
183 if (!opt2bSet("-cpi", nfile
, fnm
))
185 // No need to tell the user anything
186 return std::make_tuple(StartingBehavior::NewSimulation
, headerContents
, outputFiles
);
189 // A -cpi option was provided, do a restart if there is an input checkpoint file available
190 const char *checkpointFilename
= opt2fn("-cpi", nfile
, fnm
);
191 if (!gmx_fexist(checkpointFilename
))
193 // This is interpreted as the user intending a new
194 // simulation, so that scripts can call "gmx mdrun -cpi"
195 // for all simulation parts. Thus, appending cannot occur.
196 if (appendingBehavior
== AppendingBehavior::Appending
)
198 GMX_THROW(InconsistentInputError
199 ("Could not do a restart with appending because the checkpoint file "
200 "was not found. Either supply the name of the right checkpoint file "
201 "or do not use -append"));
203 // No need to tell the user that mdrun -cpi without a file means a new simulation
204 return std::make_tuple(StartingBehavior::NewSimulation
, headerContents
, outputFiles
);
207 t_fileio
*fp
= gmx_fio_open(checkpointFilename
, "r");
210 GMX_THROW(FileIOError
211 (formatString("Checkpoint file '%s' was found but could not be opened for "
212 "reading. Check the file permissions.", checkpointFilename
)));
216 read_checkpoint_simulation_part_and_filenames(fp
, &outputFiles
);
218 GMX_RELEASE_ASSERT(!outputFiles
.empty(),
219 "The checkpoint file or its reading is broken, as no output "
220 "file information is stored in it");
221 const char *logFilename
= outputFiles
[0].filename
;
222 GMX_RELEASE_ASSERT(Path::extensionMatches(logFilename
, ftp2ext(efLOG
)),
223 formatString("The checkpoint file or its reading is broken, the first "
224 "output file '%s' must be a log file with extension '%s'",
225 logFilename
, ftp2ext(efLOG
)).c_str());
227 if (appendingBehavior
!= AppendingBehavior::NoAppending
)
229 // See whether appending can be done.
231 size_t numFilesMissing
= std::count_if(std::begin(outputFiles
), std::end(outputFiles
),
232 [nfile
, fnm
](const auto &outputFile
)
234 return !exist_output_file(outputFile
.filename
, nfile
, fnm
);
236 if (numFilesMissing
!= 0)
238 // Appending is not possible, because not all previous
239 // output files are present. We don't automatically switch
240 // to numbered output files either, because that prevents
241 // the user from using appending in future. If they want
242 // to restart with missing files, they need to use
244 throwBecauseOfMissingOutputFiles(checkpointFilename
, outputFiles
, nfile
, fnm
, numFilesMissing
);
247 for (const auto &outputFile
: outputFiles
)
249 if (outputFile
.offset
< 0)
251 // Appending of large files is not possible unless
252 // mdrun and the filesystem can do a correct job. We
253 // don't automatically switch to numbered output files
254 // either, because the user can benefit from
255 // understanding that their infrastructure is not very
256 // suitable for running a simulation producing lots of
259 formatString("The original mdrun wrote a file called '%s' which "
260 "is larger than 2 GB, but that mdrun or the filesystem "
261 "it ran on (e.g FAT32) did not support such large files. "
262 "This simulation cannot be restarted with appending. It will "
263 "be easier for you to use mdrun on a 64-bit filesystem, but "
264 "if you choose not to, then you must run mdrun with "
265 "-noappend once your output gets large enough.",
266 outputFile
.filename
);
267 GMX_THROW(InconsistentInputError(message
));
271 const char *logFilename
= outputFiles
[0].filename
;
272 // If the precision does not match, we cannot continue with
273 // appending, and will switch to not appending unless
274 // instructed otherwise.
275 if (headerContents
.file_version
>= 13 && headerContents
.double_prec
!= GMX_DOUBLE
)
277 if (appendingBehavior
== AppendingBehavior::Appending
)
279 GMX_THROW(InconsistentInputError
280 (formatString("Cannot restart with appending because the previous simulation part used "
281 "%s precision which does not match the %s precision used by this build "
282 "of GROMACS. Either use matching precision or use mdrun -noappend.",
283 precisionToString(headerContents
.double_prec
),
284 precisionToString(GMX_DOUBLE
))));
287 // If the previous log filename had a part number, then we
288 // cannot continue with appending, and will continue without
290 else if (hasSuffixFromNoAppend(logFilename
))
292 if (appendingBehavior
== AppendingBehavior::Appending
)
294 GMX_THROW(InconsistentInputError
295 ("Cannot restart with appending because the previous simulation "
296 "part did not use appending. Either do not use mdrun -append, or "
297 "provide the correct checkpoint file."));
302 // Everything is perfect - we can do an appending restart.
303 return std::make_tuple(StartingBehavior::RestartWithAppending
, headerContents
, outputFiles
);
306 // No need to tell the user anything because the previous
307 // simulation part also didn't append and that can only happen
308 // when they ask for it.
311 GMX_RELEASE_ASSERT(appendingBehavior
!= AppendingBehavior::Appending
, "Logic error in appending");
312 return std::make_tuple(StartingBehavior::RestartWithoutAppending
, headerContents
, outputFiles
);
315 //! Check whether chksum_file output file has a checksum that matches \c outputfile from the checkpoint.
316 void checkOutputFile(t_fileio
*fileToCheck
,
317 const gmx_file_position_t
&outputfile
)
319 /* compute md5 chksum */
320 std::array
<unsigned char, 16> digest
;
321 if (outputfile
.checksumSize
!= -1)
323 if (gmx_fio_get_file_md5(fileToCheck
, outputfile
.offset
,
324 &digest
) != outputfile
.checksumSize
) /*at the end of the call the file position is at the end of the file*/
327 formatString("Can't read %d bytes of '%s' to compute checksum. The file "
328 "has been replaced or its contents have been modified. Cannot "
329 "do appending because of this condition.",
330 outputfile
.checksumSize
,
331 outputfile
.filename
);
332 GMX_THROW(InconsistentInputError(message
));
336 /* compare md5 chksum */
337 if (outputfile
.checksumSize
!= -1 &&
338 digest
!= outputfile
.checksum
)
342 fprintf(debug
, "chksum for %s: ", outputfile
.filename
);
343 for (int j
= 0; j
< 16; j
++)
345 fprintf(debug
, "%02x", digest
[j
]);
347 fprintf(debug
, "\n");
350 formatString("Checksum wrong for '%s'. The file has been replaced "
351 "or its contents have been modified. Cannot do appending "
352 "because of this condition.", outputfile
.filename
);
353 GMX_THROW(InconsistentInputError(message
));
357 /*! \brief If supported, obtain a write lock on the log file.
359 * This wil prevent e.g. other mdrun instances from changing it while
360 * we attempt to restart with appending. */
361 void lockLogFile(t_fileio
*logfio
,
362 const char *logFilename
)
364 /* Note that there are systems where the lock operation
365 * will succeed, but a second process can also lock the file.
366 * We should probably try to detect this.
368 #if defined __native_client__
371 #elif GMX_NATIVE_WINDOWS
372 if (_locking(fileno(gmx_fio_getfp(logfio
)), _LK_NBLCK
, LONG_MAX
) == -1)
374 // don't initialize here: the struct order is OS dependent!
377 fl
.l_whence
= SEEK_SET
;
382 if (fcntl(fileno(gmx_fio_getfp(logfio
)), F_SETLK
, &fl
) == -1)
387 std::string message
= "File locking is not supported on this system. "
388 "Use mdrun -noappend to restart.";
389 GMX_THROW(FileIOError(message
));
391 else if (errno
== EACCES
|| errno
== EAGAIN
)
394 formatString("Failed to lock: %s. Already running "
395 "simulation?", logFilename
);
396 GMX_THROW(FileIOError(message
));
401 formatString("Failed to lock: %s. %s.",
402 logFilename
, std::strerror(errno
));
403 GMX_THROW(FileIOError(message
));
408 /*! \brief Prepare to append to output files.
410 * We use the file pointer positions of the output files stored in the
411 * checkpoint file and truncate the files such that any frames written
412 * after the checkpoint time are removed. All files are md5sum
413 * checked such that we can be sure that we do not truncate other
414 * (maybe important) files. The log file is locked so that we can
415 * avoid cases where another mdrun instance might still be writing to
418 prepareForAppending(const ArrayRef
<const gmx_file_position_t
> outputFiles
,
423 // Can't check or truncate output files in general
424 // TODO do we do this elsewhere for GMX_FAHCORE?
428 // Handle the log file separately - it comes first in the list
429 // because we have already opened the log file. This ensures that
430 // we retain a lock on the open file that is never lifted after
431 // the checksum is calculated.
432 const gmx_file_position_t
&logOutputFile
= outputFiles
[0];
433 lockLogFile(logfio
, logOutputFile
.filename
);
434 checkOutputFile(logfio
, logOutputFile
);
436 if (gmx_fio_seek(logfio
, logOutputFile
.offset
) != 0)
439 formatString("Seek error! Failed to truncate log file: %s.",
440 std::strerror(errno
));
441 GMX_THROW(FileIOError(message
));
444 // Now handle the remaining outputFiles
445 for (const auto &outputFile
: outputFiles
.subArray(1, outputFiles
.size()-1))
447 t_fileio
*fileToCheck
= gmx_fio_open(outputFile
.filename
, "r+");
448 checkOutputFile(fileToCheck
, outputFile
);
449 gmx_fio_close(fileToCheck
);
451 if (GMX_NATIVE_WINDOWS
)
453 // Can't truncate output files on this platform
457 if (gmx_truncate(outputFile
.filename
, outputFile
.offset
) != 0)
460 formatString("Truncation of file %s failed. Cannot do appending "
461 "because of this failure.", outputFile
.filename
);
462 GMX_THROW(FileIOError(message
));
469 std::tuple
<StartingBehavior
, LogFilePtr
>
470 handleRestart(t_commrec
*cr
,
471 const gmx_multisim_t
*ms
,
472 const AppendingBehavior appendingBehavior
,
476 StartingBehavior startingBehavior
;
477 LogFilePtr logFileGuard
= nullptr;
479 // Make sure all ranks agree on whether the (multi-)simulation can
481 int numErrorsFound
= 0;
482 std::exception_ptr exceptionPtr
;
484 // Only the master rank of each simulation can do anything with
485 // output files, so it is the only one that needs to consider
486 // whether a restart might take place, and how to implement it.
491 CheckpointHeaderContents headerContents
;
492 std::vector
<gmx_file_position_t
> outputFiles
;
494 std::tie(startingBehavior
, headerContents
, outputFiles
) = chooseStartingBehavior(appendingBehavior
, nfile
, fnm
);
498 // Multi-simulation restarts require that each
499 // checkpoint describes the same simulation part. If
500 // those don't match, then the simulation cannot
501 // proceed, and can only report a diagnostic to
502 // stderr. Only one simulation should do that.
503 FILE *fpmulti
= isMasterSim(ms
) ? stderr
: nullptr;
504 check_multi_int(fpmulti
, ms
, headerContents
.simulation_part
, "simulation part", TRUE
);
507 if (startingBehavior
== StartingBehavior::RestartWithAppending
)
509 logFileGuard
= openLogFile(ftp2fn(efLOG
, nfile
, fnm
),
510 startingBehavior
== StartingBehavior::RestartWithAppending
);
511 prepareForAppending(outputFiles
, logFileGuard
.get());
515 if (startingBehavior
== StartingBehavior::RestartWithoutAppending
)
517 std::string suffix
= formatString(".part%04d", headerContents
.simulation_part
+ 1);
518 add_suffix_to_output_names(fnm
, nfile
, suffix
.c_str());
520 logFileGuard
= openLogFile(ftp2fn(efLOG
, nfile
, fnm
),
521 startingBehavior
== StartingBehavior::RestartWithAppending
);
524 catch (const std::exception
& /*ex*/)
526 exceptionPtr
= std::current_exception();
530 // Since the master rank (perhaps of only one simulation) may have
531 // found an error condition, we now coordinate the behavior across
532 // all ranks. However, only the applicable ranks will throw a
533 // non-default exception.
535 // TODO Evolve some re-usable infrastructure for this, because it
536 // will be needed in many places while setting up simulations.
539 gmx_sumi(1, &numErrorsFound
, cr
);
543 gmx_sumi_sim(1, &numErrorsFound
, ms
);
546 gmx_bcast(sizeof(numErrorsFound
), &numErrorsFound
, cr
);
550 // Throw in a globally coordinated way, if needed
551 if (numErrorsFound
> 0)
555 std::rethrow_exception(exceptionPtr
);
559 GMX_THROW(ParallelConsistencyError("Another MPI rank encountered an exception"));
564 gmx_bcast(sizeof(startingBehavior
), &startingBehavior
, cr
);
567 return std::make_tuple(startingBehavior
, std::move(logFileGuard
));