Merge pull request #42 from solgenomics/topic/duplicate_image_warning
[cxgn-corelibs.git] / lib / CXGN / DB / Copy.pm
blob8fe8318d5d3bc59e510a834259c0910e10255176
1 # A module that adds a copy() function to CXGN::DB::Connection.
3 # The implementation here tries to take maximal advantage of the symmetries
4 # between copying to/from the database, both structurally and syntactically.
5 # To that end we wrap filehandles in a trivial class so we can have
6 # getline/putline methods on that class's instances, and we wrap pg_getline
7 # in a tiny method with a cleaner interface than DBD::Pg's pg_getline.
9 use strict;
11 # Note: this file adds a method to CXGN::DB::Connection.
12 use CXGN::DB::Connection;
13 package CXGN::DB::Connection;
15 sub copy {
16 my $dbh = shift;
17 my @args = CXGN::DB::Copy::helpers::process_args (@_);
19 # @args now contains
20 # ($direction, $csv, $table, $file, $delimiter, $null, $quote, $escape, $munge),
21 # but we only need to deal with $direction and $file in this subroutine.
22 my ($direction, $file, $delimiter, $munge) = @args[0,3,4,8];
24 # Manufacture the command to put the backend into COPY mode.
25 my $copycmd = CXGN::DB::Copy::helpers::copy_command ($dbh, @args);
27 # We want to support copying to/from either named files or
28 # already-open filehandles. So if $file is a reference to
29 # a glob, use it, and otherwise try opening it (and see
30 # below for CXGN::DB::Copy::copy_filehandle).
31 my $fh;
32 if (ref ($file) eq "GLOB") {
33 $fh = CXGN::DB::Copy::copy_filehandle->wrap_filehandle($file);
34 } elsif (ref ($file) eq "") {
35 my $fileh;
36 if ($direction eq "in") {
37 open ($fileh, "<$file") or die ("copy() couldn't open $file for reading: $!");
38 } else {
39 open ($fileh, ">$file") or die ("copy() couldn't open $file for writing: $!");
41 $fh = CXGN::DB::Copy::copy_filehandle->wrap_filehandle($fileh);
42 } else {
43 die ("copy doesn't know what to do with $file.");
46 # At this point all we need to do is to line up source/sink
47 # methods.
48 my ($source, $getline, $sink, $putline);
49 if ($direction eq "in") {
50 $source = $fh;
51 $getline = "fh_getline";
52 $sink = $dbh;
53 $putline = "pg_putline";
54 } else {
55 $source = $dbh;
56 $getline = "non_idiotic_pg_getline";
57 $sink = $fh;
58 $putline = "fh_putline";
61 # And here's the actual work.
62 # print STDERR "!!".$copycmd."\n";
63 $dbh->do ($copycmd); # Put the backend into COPY mode.
64 my $line_count = 0;
65 while (my $line = $source->$getline) { # As long as there's data from the source,
66 if ($munge) { # maybe munge it,
67 chomp ($line);
68 my @a = split ($delimiter, $line);
69 $line = join ($delimiter, $munge->(@a));
70 if ($line) {
71 $line = $line . "\n";
74 if ($line) { # (the munger may return undef, meaning 'skip this one')
75 $sink->$putline ($line); # and sink it.
76 $line_count++;
79 if ($direction eq "in") { # COPY ... TO requires an explicit close operation.
80 $dbh->pg_endcopy;
82 return ($line_count);
84 # That's it for the copy operation. The logic above is simplified by
85 # creating a symmetry between the interface to getting and putting
86 # lines from and to data sources.
88 # pg_getline has the documented interface of fgets (it takes a buffer
89 # and a size, and mutates the buffer), but doesn't seem to *stop*
90 # writing into the string after the specified number of characters.
91 # Awful. Probably it's diddling unallocated memory or somesuch;
92 # consequently it's advisable to make the variable $sz below be at
93 # least as large as any line you're liable to receive from the
94 # database. In any case, if it should happen that pg_getline ever does
95 # honor its second argument, this routine ought to do the memory
96 # management properly.
97 sub non_idiotic_pg_getline {
98 my ($dbh) = @_;
99 my $sz = 4096; # Totally arbitrary size.
100 my $ret = "";
101 while (1) {
102 my $buf = " " x $sz;
103 my $r;
104 $r = $dbh->pg_getline ($buf, $sz);
105 if ($r eq "") {
106 last;
108 $ret .= $buf;
109 if ($ret =~ m|$/$|) {
110 last;
113 if ($ret eq "") {
114 return (undef);
115 } else {
116 return $ret;
119 # my $buf = " " x $sz;
120 # my $r = $dbh->pg_getline ($buf, $sz);
121 # if ($r eq "") {
122 # return undef;
123 # } else {
124 # return $buf;
129 # As Perl's ordinary <> operator and print functions are not
130 # syntactically available as methods of anything, we introduce a tiny
131 # wrapper that makes getting a line and writing a line a method of a
132 # wrapped filehandle object.
134 # DO NOT USE OUTSIDE THIS FILE. It's not very sturdy, but only meant
135 # for this one purpose.
136 package CXGN::DB::Copy::copy_filehandle;
138 # Constructor.
139 sub wrap_filehandle {
140 my ($class, $fh) = @_;
141 my $self = {};
142 $self->{fh} = $fh;
143 bless ($self, $class);
144 return ($self);
146 sub fh_putline {
147 my ($self, $line) = @_;
148 my $fh = $self->{fh};
149 print $fh $line;
151 sub fh_getline {
152 my ($self) = @_;
153 my $fh = $self->{fh};
154 my $ln = <$fh>;
155 return ($ln);
158 package CXGN::DB::Copy::helpers;
160 # Construct
161 sub copy_command {
162 my $dbh = shift;
163 my ($direction, $csv, $table, undef, $delimiter, $null, $quote, $escape) = @_;
165 my ($stream, $fmt, @copyargs);
166 if ($direction eq "in") {
167 $stream = "FROM STDIN";
168 } else {
169 $stream = "TO STDOUT";
171 if ($csv) {
172 $fmt = "COPY %s %s DELIMITER AS %s NULL AS %s CSV QUOTE AS %s ESCAPE AS %s",
173 @copyargs = ($delimiter, $null, $quote, $escape);
174 } else {
175 $fmt = "COPY %s %s WITH DELIMITER AS %s NULL AS %s",
176 @copyargs = ($delimiter, $null);
178 my $command = sprintf $fmt, $table, $stream, map {$dbh->quote($_)} @copyargs;
179 return ($command);
182 sub process_args {
183 my %args = @_;
184 # Source/sink must be either (fromtable AND tofile) OR (fromfile AND totable).
185 unless ((exists ($args{fromtable}) && exists ($args{tofile})) ||
186 (exists ($args{fromfile}) && exists ($args{totable}))) {
187 die ("copy must have (fromtable and tofile) or (fromfile and totable).");
190 my $direction;
191 if (exists ($args{fromtable})) {
192 $direction = "out";
193 } else {
194 $direction = "in";
196 my $csv;
197 if ((exists ($args{quote}))|| exists ($args{escape})) {
198 $csv = 1;
201 # Fill in canonical defaults for everything.
202 unless (exists ($args{delimiter})) {
203 $args{delimiter} = $csv ? "," : "\t";
205 unless (exists ($args{null})) {
206 $args{null} = $csv ? "" : "\\N";
208 unless (exists ($args{quote})) {
209 $args{quote} = "\""; #"
211 unless (exists ($args{escape})) {
212 $args{escape} = "\\";
214 # Strictly, munging is not incompatible with CSV encoding,
215 # but it's a pain to decode CSV (and there are variations in
216 # how CSV is done, too). So we'll just say that munging
217 # is not allowed for CSV encodings.
218 if ($csv && exists ($args{munge})) {
219 die ("can't use a munge in CSV mode. Sorry.");
221 unless (exists ($args{munge})) {
222 $args{munge} = undef;
225 # This is a bit messy because I was having trouble with
226 # hash slices. Preserve the order of the return arguments,
227 # or else change the other methods in this class.
228 my @ret = ($direction, $csv);
229 if ($direction eq "in") {
230 push @ret, @args{"totable", "fromfile"};
231 } else {
232 push @ret, @args{"fromtable", "tofile"};
234 push @ret, @args{"delimiter", "null", "quote", "escape", "munge"};
235 return (@ret);
241 =head1 NAME
243 CXGN::DB::Copy -- a wrapper for the COPY operation in PostgreSQL.
245 =cut
247 =head1 SYNOPSIS
249 # CXGN::DB::Copy adds 1 notable method to the CXGN::DB::Connection
250 # class. Use both CXGN::DB::Connection and CXGN::DB::Copy to get the
251 # fancy copy method.
252 use CXGN::DB::Connection;
253 use CXGN::DB::Copy;
255 # Copy from a table to a file, with default options for everything
256 # else (i.e., delimiter will be tab, null will be
257 # backslash-capital-n). Tablename and filename must be strings.
258 my $dbh = CXGN::DB::Connection->new();
259 $dbh->copy(fromtable => "$tablename", tofile => "$filename");
261 # Copy from /etc/passwd to a table (assumes a suitable table
262 # structure):
263 $dbh->copy (totable => "passwd",
264 fromfile => "/etc/passwd",
265 delimiter => ":",
266 null => "");
268 # Copy into a table from a file, upcasing the second field of each
269 # line. The munge function must return an array.
270 my $munge =
271 $dbh->copy (totable => "sometable", fromfile => "somefile",
272 munge => sub { return (shift, uc(shift), @_); });
275 =cut
277 =head1 DESCRIPTION
279 DBD::Pg offers a low-level interface to using the COPY operation in
280 the Postgres backend, but it's comparatively tedious to use. This
281 module adds a method called copy() to CXGN::DB::Connection that has
282 approximately the same compact expression as psql's \copy builtin,
283 which is somewhat tidier than the explicit loops involved using the
284 DBD::Pg interface.
286 =head1 METHODS
288 =head2 copy
290 Description: copies data to or from a database table from or to a
291 file or filehandle, respectively.
292 Arguments: similar to those taken by the Postgres backend's COPY
293 command, viz:
295 totable => name of a table
296 fromfile => name of a file, or a filehandle
298 fromtable => name of a table
299 tofile => name of a file, or a filehandle
301 delimiter => a string of length 1 that will delimit
302 fields in the file (default "\t")
303 null => a textual representation for NULL (default "\\N")
304 quote => a string of length 1 for quoting fields
305 containing whitespace in the file.
306 escape => a string of length 1 for escaping quotes in
307 fields in the file
309 munge => a subroutine. See below.
310 Returns: nothing
311 Side effects: either populates a database table with stuff from a
312 file, or fills a file with stuff from a database table.
313 Limitations: has whatever limits the Pg backend's COPY command
314 does, e.g., you can't COPY to or from a view.
315 Also has whatever bugs DBD::Pg has: in particular,
316 DBD::Pg's pg_getline routine is likely subject to
317 serious buffer overflow problems, which this module
318 tries to avoid by assuming that no printed representation
319 of a row in a table will be bigger than 4KiB.
320 Unimplemented: COPY lets you select a subset of columns in the
321 database table, and to specify to always quote some
322 columns when doing CSV copies. It'd be a SMOP to
323 add these, however.
324 Notes: Either totable and fromfile must be supplied, or tofile
325 and fromtable must be supplied, and not both.
327 The quote and escape arguments are used only with CSV
328 formatted files. I (Marty) haven't really stress-tested
329 the CSV side of COPY too much, so you should perhaps
330 expect some bugs there.
332 The order of arguments is not significant.
334 For certain simple filters and transforms between
335 database and file, a subroutine may be supplied as the
336 munge argument to the copy method. The subroutine will
337 receive the broken-up fields from the database or file as
338 separate arguments, and should return a list (which will
339 be joined using the copy operation's delimiter string and
340 then passed on to the database or file) or undef (which
341 will not be inserted into the table or written to the
342 file). For example, to copy a tabular file with
343 identifiers prefixed by "SGN-U" into a table, stripping
344 off the "SGN-U", you might say something like this:
346 $dbh->copy(fromfile=>"file.tab", totable=>"sometable",
347 munge=>sub{ my $id=shift; $id =~s /^SGN-U/;
348 return ($id, @_); } );
350 There are a couple of caveats to using such a munge
351 function: first, you can't supply a munge argument if the
352 file is to be encoded/decoded in CSV format. Second, the
353 line of text from the file or database will be split
354 using the delimiter character, and so it's important to
355 ensure that the delimiter character doesn't appear in the
356 fields in the database (you might want to supply a
357 control character such as ^_ (C-_) for the delimiter to
358 avoid this possibility). Finally, while you can, in
359 principle, do arbitrarily complex transformations with
360 the munge function, probably you shouldn't.
362 =cut