2 # -*- coding: utf-8 -*-
4 # fsfs-reshard.py REPOS_PATH MAX_FILES_PER_SHARD
# Perform an offline conversion of an FSFS repository between linear (format
# 2, usable by Subversion 1.4+) and sharded (format 3, usable by Subversion
# 1.5+) layouts.
10 # The MAX_FILES_PER_SHARD argument specifies the maximum number of files
11 # that will be stored in each shard (directory), or zero to specify a linear
12 # layout. Subversion 1.5 uses a default value of 1000 files per shard.
14 # As the repository will not be valid while the conversion is in progress,
15 # the repository administrator must ensure that access to the repository is
16 # blocked for the duration of the conversion.
18 # In the event that the conversion is interrupted, the repository will be in
19 # an inconsistent state. The repository administrator should then re-run
20 # this tool to completion.
# Note that, currently, resharding from one sharded layout to another is
# likely to be an extremely slow process. To reshard, we convert from a
# sharded to linear layout and then to the new sharded layout. The problem
# is that the initial conversion to the linear layout triggers exactly the
# same 'large number of files in a directory' problem that sharding is
# intended to solve.
30 # ====================================================================
31 # Copyright (c) 2007 CollabNet. All rights reserved.
33 # This software is licensed as described in the file COPYING, which
34 # you should have received as part of this distribution. The terms
35 # are also available at http://subversion.tigris.org/license-1.html.
36 # If newer versions of this license are posted there, you may use a
37 # newer version instead, at your option.
39 # This software consists of voluntary contributions made by many
40 # individuals. For exact contribution history, see the revision
41 # history and logs, available at http://subversion.tigris.org/.
42 # ====================================================================
47 # $LastChangedRevision$
import os
import stat
import sys
from errno import EEXIST
def usage():
    """Print a usage message to standard output and exit with status 1.

    NOTE(review): the `def` line and the tail of this block were missing from
    the text under review; the name `usage`, the use of sys.argv[0] as the
    program name and the exit status are reconstructed -- confirm.
    """
    sys.stdout.write("""usage: %s REPOS_PATH MAX_FILES_PER_SHARD [START END]

Perform an offline conversion of an FSFS repository between linear
(readable by Subversion 1.4 or later) and sharded (readable by
Subversion 1.5 or later) layouts.

The MAX_FILES_PER_SHARD argument specifies the maximum number of
files that will be stored in each shard (directory), or zero to
specify a linear layout. Subversion 1.5 uses a default value of
1000 files per shard.

Convert revisions START through END inclusive if specified, or all
revisions if unspecified.

""" % sys.argv[0])
    sys.exit(1)
def incompatible_repos_format(repos_path, format):
    """Report an incompatible repository format, then exit.

    Writes an error to standard error saying that REPOS_PATH is a repository
    with the incompatible repository format FORMAT, then exits with status 1.
    This function never returns.
    """
    # Both values are interpolated: the message needs one placeholder per
    # element of the tuple, otherwise the % operator raises TypeError.
    sys.stderr.write("""error: unable to convert repository '%s'.

This repository is not compatible with this tool. Valid
repository formats are '3' or '5'; this repository is
format '%s'.

""" % (repos_path, format))
    sys.exit(1)
def incompatible_fs_format(repos_path, format):
    """Report an incompatible filesystem format, then exit.

    Writes an error to standard error saying that REPOS_PATH is a repository
    whose filesystem has the incompatible format FORMAT, then exits with
    status 1.  This function never returns.
    """
    sys.stderr.write("""error: unable to convert repository '%s'.

This repository contains a filesystem that is not compatible with
this tool. Valid filesystem formats are '1', '2', or '3'; this
repository contains a filesystem with format '%s'.

""" % (repos_path, format))
    sys.exit(1)
def unexpected_fs_format_options(repos_path):
    """Report unexpected filesystem format options, then exit.

    Writes an error to standard error saying that the filesystem format file
    of the repository REPOS_PATH has data after the format number, then exits
    with status 1.  This function never returns.
    """
    sys.stderr.write("""error: unable to convert repository '%s'.

This repository contains a filesystem that appears to be invalid -
there is unexpected data after the filesystem format number.

""" % repos_path)
    sys.exit(1)
def incompatible_fs_format_option(repos_path, option):
    """Report an unsupported filesystem format option, then exit.

    Writes an error to standard error saying that the filesystem of the
    repository REPOS_PATH uses the unrecognised format option OPTION, then
    exits with status 1.  This function never returns.
    """
    sys.stderr.write("""error: unable to convert repository '%s'.

This repository contains a filesystem that is not compatible with
this tool. This tool recognises the 'layout' option but the
filesystem uses the '%s' option.

""" % (repos_path, option))
    sys.exit(1)
def warn_about_fs_format_1(repos_path, format_path):
    """Warn that converting a format 1 filesystem is one-way, then exit.

    Writes a warning to standard error explaining that the repository
    REPOS_PATH cannot be converted back to its original format, and that the
    user must delete the format file FORMAT_PATH to proceed anyway.  Exits
    with status 1; this function never returns.
    """
    sys.stderr.write("""warning: conversion of '%s' will be one-way.

This repository is currently readable by Subversion 1.1 or later.
This tool can convert this repository to one that is readable by
either Subversion 1.4 (or later) or Subversion 1.5 (or later),
but it is not able to convert it back to the original format - a
separate dump/load step would be required.

If you would like to upgrade this repository anyway, delete the
file '%s' and re-run this tool.

""" % (repos_path, format_path))
    sys.exit(1)
def check_repos_format(repos_path):
    """Check that REPOS_PATH contains a repository with a suitable format.

    Reads the first line of REPOS_PATH/format.  Returns None when that line
    is '3' or '5' followed by a newline; otherwise reports the problem via
    incompatible_repos_format() (which prints a message and exits).
    """
    format_path = os.path.join(repos_path, 'format')
    try:
        # The context manager closes the handle on every path.
        with open(format_path) as format_file:
            repos_format = format_file.readline()
    except IOError:
        # In all likelihood, the file doesn't exist.
        incompatible_repos_format(repos_path, '<unreadable>')

    if not repos_format.endswith('\n'):
        incompatible_repos_format(repos_path,
                                  repos_format + ' <missing newline>')
    repos_format = repos_format.rstrip('\n')
    if repos_format not in ('3', '5'):
        incompatible_repos_format(repos_path, repos_format)
def check_fs_format(repos_path):
    """Check the filesystem format of the repository at REPOS_PATH.

    Reads REPOS_PATH/db/format.  If the file cannot be opened the format is
    assumed to be compatible and False is returned.  Otherwise the format
    number and any option lines are validated; on any problem a message is
    printed and the process exits via one of the reporting helpers.

    Returns True if the format file declares a sharded layout, else False.
    """
    db_path = os.path.join(repos_path, 'db')
    format_path = os.path.join(db_path, 'format')

    try:
        # Read everything up front so the handle is closed before validating.
        with open(format_path) as format_file:
            fs_format = format_file.readline()
            option_lines = format_file.readlines()
    except IOError:
        # The format file might not exist if we've previously been
        # interrupted, or if the user is following our advice about upgrading
        # a format 1 repository. In both cases, we'll just assume the format
        # was compatible.
        return False

    if not fs_format.endswith('\n'):
        incompatible_fs_format(repos_path, fs_format + ' <missing newline>')
    fs_format = fs_format.rstrip('\n')

    if fs_format == '1':
        # This is a format 1 (svndiff0 only) filesystem. We can upgrade it,
        # but we can't downgrade again (since we can't uncompress any of the
        # svndiff1 deltas that may have been written). Warn the user and exit.
        warn_about_fs_format_1(repos_path, format_path)
    if fs_format not in ('2', '3'):
        incompatible_fs_format(repos_path, fs_format)

    sharded = False
    for line in option_lines:
        # NOTE(review): the guard on this call was missing from the text under
        # review; only format 3 is assumed to carry option lines -- confirm.
        if fs_format != '3':
            unexpected_fs_format_options(repos_path)

        line = line.rstrip('\n')
        if line.startswith('layout sharded '):
            sharded = True
        elif line != 'layout linear':
            incompatible_fs_format_option(repos_path, line)

    return sharded
def current_file(repos_path):
    """Return the fields of the first line of REPOS_PATH/db/current.

    The line is split on whitespace and returned as a list of strings; the
    original docstring describes it as the triple
    (revision, next_node_id, next_copy_id).
    """
    current_path = os.path.join(repos_path, 'db', 'current')
    # Close the handle deterministically rather than relying on refcounting.
    with open(current_path) as current:
        return current.readline().split()
def remove_fs_format(repos_path):
    """Remove the filesystem format file for repository REPOS_PATH.

    Do not raise an error if the file is already missing.
    """
    format_path = os.path.join(repos_path, 'db', 'format')
    try:
        statinfo = os.stat(format_path)
    except OSError:
        # The file probably doesn't exist.
        return

    # On Windows, we need to ensure the file is writable before we can
    # remove it.
    os.chmod(format_path, statinfo.st_mode | stat.S_IWUSR)
    os.remove(format_path)
def write_fs_format(repos_path, contents):
    """Write a new filesystem format file for repository REPOS_PATH containing
    CONTENTS, then make the file read-only (owner and group read).

    CONTENTS may be a byte string or an ASCII text string.
    """
    format_path = os.path.join(repos_path, 'db', 'format')
    # The file is opened in binary mode; on Python 3 a text string has to be
    # encoded first (on Python 2 `str` is already `bytes`, so this is a no-op).
    if not isinstance(contents, bytes):
        contents = contents.encode('ascii')
    with open(format_path, 'wb') as f:
        f.write(contents)
    os.chmod(format_path, stat.S_IRUSR | stat.S_IRGRP)
def linearise(path):
    """Move all the files in subdirectories of PATH into PATH, and remove the
    subdirectories.

    Handle conflicts between subdirectory names and files contained in
    subdirectories by ensuring subdirectories have a '.shard' suffix prior to
    moving (the files are assumed not to have this suffix).  Abort (message
    on standard error, exit status 1) if a subdirectory is found to contain
    another subdirectory.
    """
    # First enumerate all subdirectories of PATH and rename where necessary
    # to include a .shard suffix.
    for name in os.listdir(path):
        if name.endswith('.shard'):
            continue
        subdir_path = os.path.join(path, name)
        if not os.path.isdir(subdir_path):
            continue
        os.rename(subdir_path, subdir_path + '.shard')

    # Now move all the subdirectory contents into the parent and remove
    # the subdirectories.
    for root_path, dirnames, filenames in os.walk(path):
        if root_path == path:
            continue
        if dirnames:
            sys.stderr.write(
                "error: directory '%s' contains other unexpected directories.\n"
                % root_path)
            sys.exit(1)
        for name in filenames:
            from_path = os.path.join(root_path, name)
            to_path = os.path.join(path, name)
            os.rename(from_path, to_path)
        # The shard directory is empty now.
        os.rmdir(root_path)
def shard(path, max_files_per_shard, start, end):
    """Move the files for revisions START to END inclusive in PATH into
    subdirectories of PATH named such that subdirectory '0' contains at most
    MAX_FILES_PER_SHARD files, those named [0, MAX_FILES_PER_SHARD).

    Files are first collected under the sibling directory PATH + '.reshard'
    in directories named N.shard, which are then renamed into PATH without
    the suffix.  MAX_FILES_PER_SHARD must be positive.
    """
    tmp = path + '.reshard'
    try:
        os.mkdir(tmp)
    except OSError as e:
        # A staging directory left over from an earlier run is fine.
        if e.errno != EEXIST:
            raise

    # Move all entries into shards named N.shard.
    # A while loop avoids xrange (Python 2 only) without building a list.
    rev = start
    while rev <= end:
        # NOTE(review): the assignment of `name` was missing from the text
        # under review; files are assumed to be named by revision -- confirm.
        name = str(rev)
        shard_name = str(rev // max_files_per_shard) + '.shard'

        from_path = os.path.join(path, name)
        to_path = os.path.join(tmp, shard_name, name)
        try:
            os.rename(from_path, to_path)
        except OSError:
            # The most likely explanation is that the shard directory doesn't
            # exist. Let's create it and retry the rename.  If it does exist,
            # the retry surfaces the real rename error instead of a
            # misleading EEXIST from mkdir.
            shard_dir = os.path.join(tmp, shard_name)
            if not os.path.isdir(shard_dir):
                os.mkdir(shard_dir)
            os.rename(from_path, to_path)
        rev += 1

    # Now rename all the shards to remove the suffix.
    skipped = 0
    for name in os.listdir(tmp):
        if not name.endswith('.shard'):
            sys.stderr.write(
                "warning: ignoring unexpected subdirectory '%s'.\n"
                % os.path.join(tmp, name))
            skipped += 1
            continue
        from_path = os.path.join(tmp, name)
        to_path = os.path.join(path, name[:-len('.shard')])
        os.rename(from_path, to_path)

    # Only remove the staging directory when nothing was left behind in it.
    if skipped == 0:
        os.rmdir(tmp)
def main():
    """Command-line entry point: convert the repository named in sys.argv.

    Arguments are REPOS_PATH MAX_FILES_PER_SHARD [START END].  Validates the
    arguments and the repository/filesystem formats, removes the db/format
    file for the duration of the conversion, linearises db/revs and
    db/revprops if the filesystem was sharded, and then either stamps the
    filesystem as format 2 (MAX_FILES_PER_SHARD == 0) or shards both
    directories and stamps it as format 3.

    NOTE(review): the `def` line and several control-flow lines were missing
    from the text under review; the name `main`, the `usage()` call, the
    default START of 0 and the `if sharded:` guards are reconstructed from
    the surrounding messages -- confirm against the upstream script.
    """
    if len(sys.argv) < 3:
        usage()

    repos_path = sys.argv[1]
    max_files_per_shard = sys.argv[2]

    # Validate the command-line arguments.  This happens before db/current
    # is read, so a bad path gives the friendly error and not a traceback.
    db_path = os.path.join(repos_path, 'db')
    current_path = os.path.join(db_path, 'current')
    if not os.path.exists(current_path):
        sys.stderr.write(
            "error: '%s' doesn't appear to be a Subversion FSFS repository.\n"
            % repos_path)
        sys.exit(1)

    try:
        max_files_per_shard = int(max_files_per_shard)
    except (ValueError, OverflowError):
        # The tuple is required: `except ValueError, OverflowError:` would
        # catch only ValueError and bind it to the name OverflowError.
        sys.stderr.write(
            "error: maximum files per shard ('%s') is not a valid number.\n"
            % max_files_per_shard)
        sys.exit(1)

    if max_files_per_shard < 0:
        sys.stderr.write(
            "error: maximum files per shard ('%d') must not be negative.\n"
            % max_files_per_shard)
        sys.exit(1)

    # Convert revisions START through END if given, else all revisions.
    if len(sys.argv) >= 5:
        try:
            start = int(sys.argv[3])
            end = int(sys.argv[4])
        except ValueError:
            sys.stderr.write(
                "error: START ('%s') and END ('%s') must be revision numbers.\n"
                % (sys.argv[3], sys.argv[4]))
            sys.exit(1)
    else:
        start = 0
        end = int(current_file(repos_path)[0])

    # Check the format of the repository.
    check_repos_format(repos_path)
    sharded = check_fs_format(repos_path)

    # Let the user know what's going on.
    if max_files_per_shard > 0:
        sys.stdout.write(
            "Converting '%s' to a sharded structure with %d files per directory\n"
            % (repos_path, max_files_per_shard))
        if sharded:
            sys.stdout.write('(will convert to a linear structure first)\n')
    else:
        sys.stdout.write("Converting '%s' to a linear structure\n" % repos_path)

    # Prevent access to the repository for the duration of the conversion.
    # There's no clean way to do this, but since the format of the repository
    # is indeterminate, let's remove the format file while we're converting.
    sys.stdout.write('- marking the repository as invalid\n')
    remove_fs_format(repos_path)

    # First, convert to a linear scheme (this makes recovery easier because
    # it's easier to reason about the behaviour on restart).
    if sharded:
        sys.stdout.write('- linearising db/revs\n')
        linearise(os.path.join(repos_path, 'db', 'revs'))
        sys.stdout.write('- linearising db/revprops\n')
        linearise(os.path.join(repos_path, 'db', 'revprops'))

    if max_files_per_shard == 0:
        # We're done. Stamp the filesystem with a format 2 db/format file.
        sys.stdout.write('- marking the repository as a valid linear repository\n')
        write_fs_format(repos_path, '2\n')
    else:
        sys.stdout.write('- sharding db/revs\n')
        shard(os.path.join(repos_path, 'db', 'revs'), max_files_per_shard,
              start, end)
        sys.stdout.write('- sharding db/revprops\n')
        shard(os.path.join(repos_path, 'db', 'revprops'), max_files_per_shard,
              start, end)

        # We're done. Stamp the filesystem with a format 3 db/format file.
        sys.stdout.write('- marking the repository as a valid sharded repository\n')
        write_fs_format(repos_path,
                        '3\nlayout sharded %d\n' % max_files_per_shard)
380 if __name__
== '__main__':
382 """This script is unfinished and not ready to be used on live data.