1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """Functions to sort large files.
19 The functions in this module were originally downloaded from the
22 http://code.activestate.com/recipes/466302/
24 It was apparently submitted by Nicolas Lehuen on Tue, 17 Jan 2006.
25 According to the terms of service of that website, the code is usable
26 under the MIT license.
38 # The buffer size to use for open files:
42 def get_default_max_merge():
43 """Return the default maximum number of files to merge at once."""
45 # The maximum number of files to merge at once. This number cannot
46 # be unlimited because there are operating system restrictions on
47 # the number of files that a process can have open at once. So...
49 # If this constant is available via sysconf, we use half the
50 # number available to the process as a whole.
51 _SC_OPEN_MAX
= os
.sysconf('SC_OPEN_MAX')
52 if _SC_OPEN_MAX
== -1:
53 # This also indicates an error:
55 return min(_SC_OPEN_MAX
// 2, 100)
57 # Otherwise, simply limit the number to this constant, which will
58 # hopefully be OK on all operating systems:
62 DEFAULT_MAX_MERGE
= get_default_max_merge()
65 def merge(iterables
, key
=None):
66 """Merge (in the sense of mergesort) ITERABLES.
68 Generate the output in order. If KEY is specified, it should be a
69 function that returns the sort key."""
76 for index
, iterable
in enumerate(iterables
):
78 iterator
= iter(iterable
)
79 value
= iterator
.next()
83 values
.append((key(value
), index
, value
, iterator
))
88 k
, index
, value
, iterator
= heapq
.heappop(values
)
91 value
= iterator
.next()
95 heapq
.heappush(values
, (key(value
), index
, value
, iterator
))
98 def merge_files_onepass(input_filenames
, output_filename
, key
=None):
99 """Merge a number of input files into one output file.
101 This is a merge in the sense of mergesort; namely, it is assumed
102 that the input files are each sorted, and (under that assumption)
103 the output file will also be sorted."""
105 input_filenames
= list(input_filenames
)
106 if len(input_filenames
) == 1:
107 shutil
.move(input_filenames
[0], output_filename
)
109 output_file
= file(output_filename
, 'wb', BUFSIZE
)
113 for input_filename
in input_filenames
:
114 chunks
.append(open(input_filename
, 'rb', BUFSIZE
))
115 output_file
.writelines(merge(chunks
, key
))
126 def _try_delete_files(filenames
):
127 """Try to remove the named files. Ignore errors."""
129 for filename
in filenames
:
136 def tempfile_generator(tempdirs
=[]):
137 """Yield filenames of temporary files."""
139 # Create an iterator that will choose directories to hold the
142 tempdirs
= itertools
.cycle(tempdirs
)
144 tempdirs
= itertools
.repeat(tempfile
.gettempdir())
148 (fd
, filename
) = tempfile
.mkstemp(
149 '', 'sort%06i-' % (i
,), tempdirs
.next(), False
156 def _merge_file_generation(
157 input_filenames
, delete_inputs
, key
=None,
158 max_merge
=DEFAULT_MAX_MERGE
, tempfiles
=None,
160 """Merge multiple input files into fewer output files.
162 This is a merge in the sense of mergesort; namely, it is assumed
163 that the input files are each sorted, and (under that assumption)
164 the output file will also be sorted. At most MAX_MERGE input files
165 will be merged at once, to avoid exceeding operating system
166 restrictions on the number of files that can be open at one time.
168 If DELETE_INPUTS is True, then the input files will be deleted when
169 they are no longer needed.
171 If temporary files need to be used, they will be created using the
172 specified TEMPFILES tempfile generator.
174 Generate the names of the output files."""
177 raise ValueError('max_merge must be greater than one')
179 if tempfiles
is None:
180 tempfiles
= tempfile_generator()
182 filenames
= list(input_filenames
)
183 if len(filenames
) <= 1:
184 raise ValueError('It makes no sense to merge a single file')
187 group
= filenames
[:max_merge
]
188 del filenames
[:max_merge
]
189 group_output
= tempfiles
.next()
190 merge_files_onepass(group
, group_output
, key
=key
)
192 _try_delete_files(group
)
197 input_filenames
, output_filename
, key
=None, delete_inputs
=False,
198 max_merge
=DEFAULT_MAX_MERGE
, tempfiles
=None,
200 """Merge a number of input files into one output file.
202 This is a merge in the sense of mergesort; namely, it is assumed
203 that the input files are each sorted, and (under that assumption)
204 the output file will also be sorted. At most MAX_MERGE input files
205 will be merged at once, to avoid exceeding operating system
206 restrictions on the number of files that can be open at one time.
208 If DELETE_INPUTS is True, then the input files will be deleted when
209 they are no longer needed.
211 If temporary files need to be used, they will be created using the
212 specified TEMPFILES tempfile generator."""
214 filenames
= list(input_filenames
)
216 # Create an empty file:
217 open(output_filename
, 'wb').close()
219 if tempfiles
is None:
220 tempfiles
= tempfile_generator()
221 while len(filenames
) > max_merge
:
222 # Reduce the number of files by performing groupwise merges:
224 _merge_file_generation(
225 filenames
, delete_inputs
, key
=key
,
226 max_merge
=max_merge
, tempfiles
=tempfiles
229 # After the first iteration, we are only working with temporary
230 # files so they can definitely be deleted them when we are done
234 # The last merge writes the results directly into the output
236 merge_files_onepass(filenames
, output_filename
, key
=key
)
238 _try_delete_files(filenames
)
242 input, output
, key
=None,
243 buffer_size
=32000, tempdirs
=[], max_merge
=DEFAULT_MAX_MERGE
,
245 tempfiles
= tempfile_generator(tempdirs
)
249 input_file
= file(input, 'rb', BUFSIZE
)
252 input_iterator
= iter(input_file
)
254 current_chunk
= list(itertools
.islice(input_iterator
, buffer_size
))
255 if not current_chunk
:
257 current_chunk
.sort(key
=key
)
258 filename
= tempfiles
.next()
259 filenames
.append(filename
)
260 f
= open(filename
, 'w+b', BUFSIZE
)
262 f
.writelines(current_chunk
)
269 filenames
, output
, key
=key
,
270 delete_inputs
=True, max_merge
=max_merge
, tempfiles
=tempfiles
,
273 _try_delete_files(filenames
)