Make a status test pass against old servers.
[svn.git] / tools / server-side / fsfs-reshard.py
blobd8d15d4b2b104f93222e97ef9c8894aef40cc36a
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # fsfs-reshard.py REPOS_PATH MAX_FILES_PER_SHARD
6 # Perform an offline conversion of an FSFS repository between linear (format
7 # 2, usable by Subversion 1.4+) and sharded (format 3, usable by Subversion
8 # 1.5+) layouts.
10 # The MAX_FILES_PER_SHARD argument specifies the maximum number of files
11 # that will be stored in each shard (directory), or zero to specify a linear
12 # layout. Subversion 1.5 uses a default value of 1000 files per shard.
14 # As the repository will not be valid while the conversion is in progress,
15 # the repository administrator must ensure that access to the repository is
16 # blocked for the duration of the conversion.
18 # In the event that the conversion is interrupted, the repository will be in
19 # an inconsistent state. The repository administrator should then re-run
20 # this tool to completion.
23 # Note that, currently, resharding from one sharded layout to another is
24 # likely to be an extremely slow process. To reshard, we convert from a
25 # sharded to linear layout and then to the new sharded layout. The problem
26 # is that the initial conversion to the linear layout triggers exactly the
27 # same 'large number of files in a directory' problem that sharding is
28 # intended to solve.
30 # ====================================================================
31 # Copyright (c) 2007 CollabNet. All rights reserved.
33 # This software is licensed as described in the file COPYING, which
34 # you should have received as part of this distribution. The terms
35 # are also available at http://subversion.tigris.org/license-1.html.
36 # If newer versions of this license are posted there, you may use a
37 # newer version instead, at your option.
39 # This software consists of voluntary contributions made by many
40 # individuals. For exact contribution history, see the revision
41 # history and logs, available at http://subversion.tigris.org/.
42 # ====================================================================
44 # $HeadURL$
45 # $LastChangedDate$
46 # $LastChangedBy$
47 # $LastChangedRevision$
49 import os, stat, sys
51 from errno import EEXIST
def usage():
    """Print a usage message to standard output and exit with status 1.

    The program name shown in the message is taken from sys.argv[0].
    """
    message = """usage: %s REPOS_PATH MAX_FILES_PER_SHARD [START END]

Perform an offline conversion of an FSFS repository between linear
(readable by Subversion 1.4 or later) and sharded (readable by
Subversion 1.5 or later) layouts.

The MAX_FILES_PER_SHARD argument specifies the maximum number of
files that will be stored in each shard (directory), or zero to
specify a linear layout. Subversion 1.5 uses a default value of
1000 files per shard.

Convert revisions START through END inclusive if specified, or all
revisions if unspecified.
""" % sys.argv[0]
    # Written with write() rather than the print statement so that the
    # module parses under both Python 2 and Python 3.  The extra newline
    # is the one the print statement used to append.
    sys.stdout.write(message + "\n")
    sys.exit(1)
def incompatible_repos_format(repos_path, format):
    """Print an error saying that REPOS_PATH is a repository with an
    incompatible repository format FORMAT, then exit.

    The message goes to standard error and the process exits with
    status 1.
    """
    message = """error: unable to convert repository '%s'.

This repository is not compatible with this tool. Valid
repository formats are '3' or '5'; this repository is
format '%s'.
""" % (repos_path, format)
    # write() instead of "print >>" keeps the module parseable on both
    # Python 2 and Python 3; the trailing newline mirrors what print added.
    sys.stderr.write(message + "\n")
    sys.exit(1)
def incompatible_fs_format(repos_path, format):
    """Print an error saying that REPOS_PATH is a repository with an
    incompatible filesystem format FORMAT, then exit.

    The message goes to standard error and the process exits with
    status 1.
    """
    message = """error: unable to convert repository '%s'.

This repository contains a filesystem that is not compatible with
this tool. Valid filesystem formats are '1', '2', or '3'; this
repository contains a filesystem with format '%s'.
""" % (repos_path, format)
    # write() instead of "print >>" keeps the module parseable on both
    # Python 2 and Python 3; the trailing newline mirrors what print added.
    sys.stderr.write(message + "\n")
    sys.exit(1)
def unexpected_fs_format_options(repos_path):
    """Print an error saying that REPOS_PATH is a repository with
    unexpected filesystem format options, then exit.

    The message goes to standard error and the process exits with
    status 1.
    """
    # The 1-tuple keeps the formatting correct even if REPOS_PATH were
    # ever passed as a tuple.
    message = """error: unable to convert repository '%s'.

This repository contains a filesystem that appears to be invalid -
there is unexpected data after the filesystem format number.
""" % (repos_path,)
    # write() instead of "print >>" keeps the module parseable on both
    # Python 2 and Python 3; the trailing newline mirrors what print added.
    sys.stderr.write(message + "\n")
    sys.exit(1)
def incompatible_fs_format_option(repos_path, option):
    """Print an error saying that REPOS_PATH is a repository with an
    incompatible filesystem format option OPTION, then exit.

    The message goes to standard error and the process exits with
    status 1.
    """
    message = """error: unable to convert repository '%s'.

This repository contains a filesystem that is not compatible with
this tool. This tool recognises the 'layout' option but the
filesystem uses the '%s' option.
""" % (repos_path, option)
    # write() instead of "print >>" keeps the module parseable on both
    # Python 2 and Python 3; the trailing newline mirrors what print added.
    sys.stderr.write(message + "\n")
    sys.exit(1)
def warn_about_fs_format_1(repos_path, format_path):
    """Print a warning saying that REPOS_PATH contains a format 1 FSFS
    filesystem that we can't reconstruct, then exit.

    FORMAT_PATH is the filesystem format file the user is told to delete
    in order to proceed.  The message goes to standard error and the
    process exits with status 1.
    """
    message = """warning: conversion of '%s' will be one-way.

This repository is currently readable by Subversion 1.1 or later.
This tool can convert this repository to one that is readable by
either Subversion 1.4 (or later) or Subversion 1.5 (or later),
but it is not able to convert it back to the original format - a
separate dump/load step would be required.

If you would like to upgrade this repository anyway, delete the
file '%s' and re-run this tool.
""" % (repos_path, format_path)
    # write() instead of "print >>" keeps the module parseable on both
    # Python 2 and Python 3; the trailing newline mirrors what print added.
    sys.stderr.write(message + "\n")
    sys.exit(1)
def check_repos_format(repos_path):
    """Check that REPOS_PATH contains a repository with a suitable format;
    print a message and exit if not.

    The first line of REPOS_PATH/format must be exactly '3' or '5'
    followed by a newline.  Returns None when the format is acceptable.
    """
    format_path = os.path.join(repos_path, 'format')
    try:
        # Only the I/O sits inside the try, and the with-statement closes
        # the file on every path.
        with open(format_path) as format_file:
            format = format_file.readline()
    except IOError:
        # In all likelihood, the file doesn't exist.
        incompatible_repos_format(repos_path, '<unreadable>')
        return  # not reached: the helper above exits the process

    if not format.endswith('\n'):
        incompatible_repos_format(repos_path, format + ' <missing newline>')
    format = format.rstrip('\n')
    if format not in ('3', '5'):
        incompatible_repos_format(repos_path, format)
def check_fs_format(repos_path):
    """Check that REPOS_PATH contains a filesystem with a suitable format,
    or that it contains no format file; print a message and exit if neither
    is true.  Return bool whether the filesystem is sharded.

    The first line of REPOS_PATH/db/format is the format number; any
    following lines are format options.  Only format 3 may carry options,
    and only 'layout linear' and 'layout sharded N' are recognised.
    """
    sharded = False
    format_path = os.path.join(repos_path, 'db', 'format')
    try:
        # The with-statement closes the file on every path, including the
        # SystemExit raised by the error helpers.
        with open(format_path) as format_file:
            format = format_file.readline()
            if not format.endswith('\n'):
                incompatible_fs_format(repos_path,
                                       format + ' <missing newline>')
            format = format.rstrip('\n')

            if format == '1':
                # This is a format 1 (svndiff0 only) filesystem.  We can
                # upgrade it, but we can't downgrade again (since we can't
                # uncompress any of the svndiff1 deltas that may have been
                # written).  Warn the user and exit.
                warn_about_fs_format_1(repos_path, format_path)
            elif format not in ('2', '3'):
                incompatible_fs_format(repos_path, format)

            # Everything after the first line is a format option.
            for line in format_file:
                if format == '2':
                    # Format 2 has no options, so any extra line is bogus.
                    unexpected_fs_format_options(repos_path)

                line = line.rstrip('\n')
                if line == 'layout linear':
                    pass
                elif line.startswith('layout sharded '):
                    sharded = True
                else:
                    incompatible_fs_format_option(repos_path, line)
    except IOError:
        # The format file might not exist if we've previously been
        # interrupted, or if the user is following our advice about
        # upgrading a format 1 repository.  In both cases, we'll just
        # assume the format was compatible.
        pass

    return sharded
def current_file(repos_path):
    """Return the whitespace-separated fields of the first line of
    REPOS_PATH/db/current as a list of strings.

    Callers in this file use element 0 as the revision number; the
    remaining fields are presumably the next node id and next copy id.
    """
    # The with-statement closes the file promptly instead of leaving the
    # handle to the garbage collector.
    with open(os.path.join(repos_path, 'db', 'current')) as current:
        return current.readline().split()
def remove_fs_format(repos_path):
    """Delete REPOS_PATH/db/format, the filesystem format file.

    A format file that cannot be stat'ed (most likely because it is
    already gone) is not an error: the function simply returns.
    """
    target = os.path.join(repos_path, 'db', 'format')
    try:
        current_mode = os.stat(target).st_mode
    except OSError:
        # The file probably doesn't exist.
        return
    else:
        # On Windows, a file must be writable before it can be removed,
        # so add the owner-write bit first.
        os.chmod(target, current_mode | stat.S_IWUSR)
        os.remove(target)
def write_fs_format(repos_path, contents):
    """Write a new filesystem format file for repository REPOS_PATH
    containing CONTENTS.

    The file is REPOS_PATH/db/format.  It is written in binary mode and
    then made read-only for owner and group.
    """
    format_path = os.path.join(repos_path, 'db', 'format')
    # The file is opened in binary mode, which on Python 3 only accepts
    # bytes.  On Python 2 'bytes' is 'str', so this is a no-op there.
    if not isinstance(contents, bytes):
        contents = contents.encode('ascii')
    # The with-statement closes the file even if the write fails.
    with open(format_path, 'wb') as f:
        f.write(contents)
    os.chmod(format_path, stat.S_IRUSR | stat.S_IRGRP)
def linearise(path):
    """Move all the files in subdirectories of PATH into PATH, and remove
    the subdirectories.

    Handle conflicts between subdirectory names and files contained in
    subdirectories by ensuring subdirectories have a '.shard' suffix prior
    to moving (the files are assumed not to have this suffix).  Abort with
    exit status 1 if a subdirectory is found to contain another
    subdirectory.
    """
    # First enumerate all subdirectories of PATH and rename where
    # necessary to include a .shard suffix.
    for name in os.listdir(path):
        if name.endswith('.shard'):
            continue
        subdir_path = os.path.join(path, name)
        if not os.path.isdir(subdir_path):
            continue
        os.rename(subdir_path, subdir_path + '.shard')

    # Now move all the subdirectory contents into the parent and remove
    # the subdirectories.
    for root_path, dirnames, filenames in os.walk(path):
        if root_path == path:
            # The top level itself holds nothing to move.
            continue
        if dirnames:
            # write() instead of "print >>" keeps the module parseable on
            # both Python 2 and Python 3.
            sys.stderr.write(
                "error: directory '%s' contains other unexpected directories."
                "\n" % root_path)
            sys.exit(1)
        for name in filenames:
            os.rename(os.path.join(root_path, name),
                      os.path.join(path, name))
        os.rmdir(root_path)
def _mkdir_if_missing(dir_path):
    """Create DIR_PATH, accepting silently that it may already exist."""
    try:
        os.mkdir(dir_path)
    except OSError as e:
        if e.errno != EEXIST:
            raise


def shard(path, max_files_per_shard, start, end):
    """Move the files for revisions START to END inclusive in PATH into
    subdirectories of PATH named such that subdirectory '0' contains at
    most MAX_FILES_PER_SHARD files, those named [0, MAX_FILES_PER_SHARD).

    The files are first collected in 'N.shard' directories under a sibling
    staging directory PATH + '.reshard'; the shard directories are then
    renamed into PATH without the suffix.  The staging directory is removed
    unless it still contains entries that were not recognised as shards.

    Raises ValueError if MAX_FILES_PER_SHARD is not positive, and OSError
    if a revision file cannot be moved (for example because it is missing).
    """
    if max_files_per_shard <= 0:
        # Zero would divide by zero below and a negative value would yield
        # negative shard names; reject both before touching the disk.
        raise ValueError("max_files_per_shard must be a positive integer")

    tmp = path + '.reshard'
    _mkdir_if_missing(tmp)

    # Move all entries into shards named N.shard.  Each shard directory is
    # created when it is first needed, so that a failing rename reports its
    # real cause instead of triggering a blind mkdir-and-retry.
    current_shard = None
    for rev in range(start, end + 1):
        shard_index = rev // max_files_per_shard
        shard_name = str(shard_index) + '.shard'
        if shard_index != current_shard:
            _mkdir_if_missing(os.path.join(tmp, shard_name))
            current_shard = shard_index

        name = str(rev)
        os.rename(os.path.join(path, name),
                  os.path.join(tmp, shard_name, name))

    # Now rename all the shards to remove the suffix.
    skipped = 0
    for name in os.listdir(tmp):
        if not name.endswith('.shard'):
            sys.stderr.write(
                "warning: ignoring unexpected subdirectory '%s'.\n"
                % os.path.join(tmp, name))
            skipped += 1
            continue
        os.rename(os.path.join(tmp, name),
                  os.path.join(path, name[:-len('.shard')]))

    # Only an empty staging directory can be removed.
    if skipped == 0:
        os.rmdir(tmp)
def main():
    """Parse the command line and convert the repository layout.

    Expects sys.argv to hold REPOS_PATH and MAX_FILES_PER_SHARD, optionally
    followed by START and END.  Prints progress to standard output and
    errors to standard error.  Exits with status 0 on success and 1 on any
    usage or validation error.
    """
    argc = len(sys.argv)
    # START and END must be given together; a lone START used to be
    # silently ignored.
    if argc < 3 or argc == 4:
        usage()

    repos_path = sys.argv[1]
    max_files_per_shard = sys.argv[2]

    # Validate the command-line arguments.  The repository check comes
    # first because working out the default END needs db/current.
    db_path = os.path.join(repos_path, 'db')
    current_path = os.path.join(db_path, 'current')
    if not os.path.exists(current_path):
        sys.stderr.write(
            "error: '%s' doesn't appear to be a Subversion FSFS repository.\n"
            % repos_path)
        sys.exit(1)

    try:
        max_files_per_shard = int(max_files_per_shard)
    except (ValueError, OverflowError):
        # The tuple is essential: "except ValueError, OverflowError" would
        # catch only ValueError and bind it to the name OverflowError.
        sys.stderr.write(
            "error: maximum files per shard ('%s') is not a valid number.\n"
            % max_files_per_shard)
        sys.exit(1)

    if max_files_per_shard < 0:
        sys.stderr.write(
            "error: maximum files per shard ('%d') must not be negative.\n"
            % max_files_per_shard)
        sys.exit(1)

    if argc >= 5:
        try:
            start = int(sys.argv[3])
            end = int(sys.argv[4])
        except ValueError:
            sys.stderr.write(
                "error: START ('%s') and END ('%s') must be integers.\n"
                % (sys.argv[3], sys.argv[4]))
            sys.exit(1)
    else:
        # Default to every revision recorded in db/current.
        start = 0
        end = int(current_file(repos_path)[0])

    # Check the format of the repository.
    check_repos_format(repos_path)
    sharded = check_fs_format(repos_path)

    # Let the user know what's going on.
    if max_files_per_shard > 0:
        sys.stdout.write(
            "Converting '%s' to a sharded structure with %d files per "
            "directory\n" % (repos_path, max_files_per_shard))
        if sharded:
            sys.stdout.write('(will convert to a linear structure first)\n')
    else:
        sys.stdout.write("Converting '%s' to a linear structure\n"
                         % repos_path)

    # Prevent access to the repository for the duration of the conversion.
    # There's no clean way to do this, but since the format of the
    # repository is indeterminate, let's remove the format file while
    # we're converting.
    sys.stdout.write('- marking the repository as invalid\n')
    remove_fs_format(repos_path)

    # First, convert to a linear scheme (this makes recovery easier because
    # it's easier to reason about the behaviour on restart).
    if sharded:
        sys.stdout.write('- linearising db/revs\n')
        linearise(os.path.join(repos_path, 'db', 'revs'))
        sys.stdout.write('- linearising db/revprops\n')
        linearise(os.path.join(repos_path, 'db', 'revprops'))

    if max_files_per_shard == 0:
        # We're done.  Stamp the filesystem with a format 2 db/format file.
        sys.stdout.write(
            '- marking the repository as a valid linear repository\n')
        write_fs_format(repos_path, '2\n')
    else:
        sys.stdout.write('- sharding db/revs\n')
        shard(os.path.join(repos_path, 'db', 'revs'), max_files_per_shard,
              start, end)
        sys.stdout.write('- sharding db/revprops\n')
        shard(os.path.join(repos_path, 'db', 'revprops'),
              max_files_per_shard, start, end)

        # We're done.  Stamp the filesystem with a format 3 db/format file.
        sys.stdout.write(
            '- marking the repository as a valid sharded repository\n')
        write_fs_format(repos_path,
                        '3\nlayout sharded %d\n' % max_files_per_shard)

    sys.stdout.write('- done.\n')
    sys.exit(0)
if __name__ == '__main__':
    # Deliberate safety interlock: the script refuses to run at all, so
    # the main() call below is intentionally unreachable until this raise
    # is removed.  The call form of raise parses on both Python 2 and
    # Python 3, unlike "raise Exception, message".
    raise Exception(
        """This script is unfinished and not ready to be used on live data.
Trust us.""")
    main()