Merge pull request #5163 from solgenomics/audit-error-checking
[sgn.git] / lib / CXGN / Bulk.pm
blob0800e8d0c6af0220a7412c55cb5a1622481a2e1a
2 =head1 NAME
4 /CXGN/Bulk.pm
6 =head1 DESCRIPTION
8 The CXGN::Bulk package is a superclass of all the different Bulk packages
9 used by the download.pl script. It is never instantiated itself, but contains
10 many methods its subclasses share.
12 =cut
14 package CXGN::Bulk;
16 use strict;
17 use warnings;
18 use Carp;
19 use File::Temp;
20 use File::Spec::Functions;
21 use Cache::File;
23 use File::Slurp qw/slurp/;
25 =head2 new
27 Desc: sub new
28 Args: hash reference containing parameters from user
29 Ret : $self
31 All subclasses of Bulk share the same constructor. The constructor takes a reference to a hash
32 which contains the parameters as an argument and creates a new variable and value for every
33 key-value pair in the hash reference. It creates several other variables for linking to the
34 database and printing a web page.
36 =cut
38 sub new {
39 my $class = shift;
40 my $self = {};
41 my $params = shift;
42 foreach ( keys %$params ) {
43 $self->{$_} = $params->{$_};
45 $self->{tempdir} or confess "must provide a tempdir argument to bulk constructor";
46 -d $self->{tempdir} or confess "no such directory $self->{tempdir}";
47 $self->{content} = ""; # the content of the page
48 $self->{db} = $self->{dbc}
49 or confess "must provide a dbc argument (database handle) to bulk constructor";
51 my @empty_array = ();
52 $self->{data} = \@empty_array;
53 bless $self, $class;
54 return $self;
57 =head2 create_dumpfile
59 Desc: sub create_dumpfile
60 Args: none
61 Ret : file handle for dumpfile and notfound file
63 Creates a dumpfile and a notfound file and returns file handles for them.
65 =cut
67 sub create_dumpfile {
68 my $self = shift;
69 my $db = $self->{db};
70 my @output_fields = @{ $self->{output_fields} };
71 my @notfound = ();
73 if ( !exists( $self->{dump_fh} )
74 || !exists( $self->{notfound_fh} ) )
77 # generate tmp file names and open files for writing
78 $self->{dumpfile} = File::Temp->new( TEMPLATE => catfile( $self->{tempdir}, "bulk-XXXXXX" ), UNLINK => 0 )->filename;
79 $self->{dumpfile} =~ s!$self->{tempdir}/!!;
81 $self->debug( "FILENAME: " . $self->{dumpfile} );
82 my $filepath = catfile($self->{tempdir}, $self->{dumpfile});
83 open( $self->{dump_fh}, '>', $filepath )
84 || die "Can't open $filepath";
85 $self->{notfoundfile} = $self->{dumpfile} . ".notfound";
86 open( $self->{notfound_fh}, '>', "$self->{tempdir}/$self->{notfoundfile}" )
87 || die "Can't open $self -> {notfoundfile}";
89 # write file header
90 @output_fields = @{ $self->{output_fields} };
91 my $dump_fh = $self->{dump_fh};
92 print $dump_fh join( "\t", @output_fields ) . "\n";
95 # warn "dumpfile: $filepath \n";
96 return $self->{dump_fh}, $self->{notfound_fh};
99 sub result_summary {
100 my $self = shift;
102 my $cache_root = catfile($self->{tempdir}, $self->{dumpfile} . ".summary");
104 my $lines_read = $self->get_file_lines( catfile($self->{tempdir}, $self->{dumpfile}) ) - 1;
106 my $notfoundcount = $self->get_file_lines( catfile($self->{tempdir}, $self->{notfoundfile} ));
107 my $total = $lines_read;
108 my $file = $self->{dumpfile};
109 my $notfoundfile = $self->{notfoundfile};
110 my $idType = $self->{idType};
111 my $query_time = $self->{query_time};
112 my $seq_type = $self->{seq_type};
113 my $est_seq = $self->{est_seq};
114 my $unigene_seq = $self->{unigene_seq};
115 my $filesize =
116 (stat( catfile($self->{tempdir}, $self->{dumpfile})))[7] / 1000;
117 my $missing_ids = @{ $self->{ids} } - $notfoundcount - $lines_read;
118 my $missing_ids_msg = "";
119 my $file_lines = 0; #fix
120 #$self->getFileLines( $self->{tempdir} . "/" . $self->{dumpfile} );
122 my $numlines = scalar( @{ $self->{ids} } );
123 if ( $missing_ids > 0 ) {
124 $missing_ids_msg = "$missing_ids ids were not retrieved from the database because they were not part of the corresponding unigene set or because there were duplicate entries in the submitted id list.";
127 my $fastalink = " Fasta ";
128 my $fastadownload = " Fasta ";
129 my $fastamessage = "Note: Fasta option is not available because you didn't
130 choose a sequence to download.<br />";
131 if ( join( " ", @{ $self->{output_fields} } ) =~ /seq/i ) {
132 $fastalink =
133 "<a href=\"/tools/bulk/display?outputType=Fasta&amp;dumpfile=$file&amp;unigene_seq=$unigene_seq&amp;est_seq=$est_seq\">Fasta</a>";
134 $fastadownload =
135 "<a href=\"/tools/bulk/display?outputType=Fasta&amp;dumpfile=$file&amp;unigene_seq=$unigene_seq&amp;est_seq=$est_seq&amp;download=1\">Fasta</a>";
136 $fastamessage = "";
139 my $cache = Cache::File->new( cache_root => $cache_root );
141 my $summary_data = {
142 fastalink => $fastalink,
143 fastadownload => $fastadownload,
144 fastamessage => $fastamessage,
145 missing_ids_msg => $missing_ids_msg,
146 file => $file,
147 total => $total,
148 query_time => $query_time,
149 filesize => $filesize,
150 numlines => $numlines, # number of lines in query
151 lines_read => $lines_read, # number of lines in file
152 idType => $idType,
153 seq_type => $seq_type,
154 est_seq => $est_seq,
155 unigene_seq => $unigene_seq,
158 foreach my $k (keys(%$summary_data)) {
159 $cache->set($k, $summary_data->{$k});
164 =head2 error_message
166 Desc: sub error_message
167 Args: none
168 Ret : none
170 This method is called by download.pl if the Bulk object has a problem with the
171 input parameters. It prints a page that contains an error message.
173 =cut
175 sub error_message {
176 my $self = shift;
179 my $html = $self ->{content} . "\n";
180 $html .= <<EOH;
181 <h3>Bulk download error</h3>
182 I could not process your input. Possible reasons for this include:
183 <ul>
184 <li>You may not have entered any identifiers for download</li>
185 <li>You may not have entered any identifiers that exist</li>
186 <li>Your identifiers may not be one-per-line, with no quotation marks, etc (see input example)</li>
187 <li>The input set may be larger than the limit of 10,000 identifiers</li>
188 </ul>
189 Please check your input and use your browser\'s "back" button to go back and try again!
192 return $html;
195 =head2 get_file_lines
197 Desc: sub get_file_lines
198 Args: file; example. $self -> get_file_lines($self->{tempdir});
199 Ret : $list[0];
201 Counts file lines (used on temp directories).
203 =cut
205 sub get_file_lines {
206 my $self = shift;
207 my $file = shift;
208 open my $f, $file or die "$! opening $file";
209 my $cnt = 0;
210 $cnt++ while <$f>;
211 close $f;
212 return $cnt;
215 =head2 clean_up
217 Desc: sub clean_up
218 Args: default; example. $bulk -> clean_up();
219 Ret : n/a
221 Drops unigene, blast, and submitted_ids temp tables (i.e. cleans up after
222 queries are done).
224 =cut
226 sub clean_up {
227 my $self = shift;
228 $self->{db}->disconnect();
231 =head2 debug
233 Desc: sub debug
234 Args: string; example. $self -> debug("input_ok: Input is NOT OK!");
235 Ret : n/a
237 Function for printing adds break and new line to messages.
239 =cut
241 sub debug {
242 my $self = shift;
244 #print messages if debug flag is set
245 my $message = shift;
246 if ( $self->{debug} ) {
247 print <<EOHTML;
248 <h4>Bulk download summary</h4>
249 $message <br />
250 EOHTML
254 =head2 check_ids
256 Desc: sub check_ids
257 Args: default
258 Ret : @ids, array of IDs from input
260 A common method between all types of Bulk objects, it checks that there are
261 less than 10,000 IDs from input and puts them all into an array, returning
262 the array of ids if everything went well, otherwise 0.
264 =cut
266 sub check_ids {
267 my $self = shift;
268 my @ids = ();
270 #do some simple parameter checking
272 print STDERR "PROCESSING IDS: $self->{ids}. ($self->{idType})\n";
274 return @ids if ( $self->{idType} eq "" );
275 return @ids if ( $self->{ids} !~ /\w/ );
277 #make sure the input string isn't too big
278 return @ids if length( $self->{ids} ) > 1_000_000;
281 # clean up data retrieved
282 my $ids = $self->{ids};
283 $ids =~ s/^\s+//;
284 $ids =~ s/\n+/ /g;
285 $ids =~ s/\s+/ /g; # compress multiple returns into one
286 $ids =~ s/\r+/ /g; # convert carriage returns to space
287 @ids = split /\s+/, $ids;
288 @ids = () if @ids > 10_000; #limit to 10_000 ids to process
289 return @ids;
292 =head2 process_parameters
294 Desc: sub process_parameters
295 Args: none
296 Ret : 1 if the parameters were OK, 0 if not
298 Modifies some of the parameters received set in get_parameters. Preparing
299 data for the database query.
301 =cut
303 sub process_parameters {
307 =head2 process_ids
309 Desc: sub process_ids
310 Args: default;
311 Ret : data from database printed to a file;
313 Queries database using Persistent (see perldoc Persistent) and
314 object oriented perl to obtain data on Bulk Objects using formatted
315 IDs.
317 =cut
319 sub process_ids {
323 =head2 accessors get_dumpfile, set_dumpfile
325 Usage:
326 Desc:
327 Property
328 Side Effects:
329 Example:
331 =cut
333 sub get_dumpfile {
334 my $self = shift;
335 return $self->{dumpfile};
338 sub set_dumpfile {
339 my $self = shift;
340 $self->{dumpfile} = shift;
343 =head2 accessors get_notfoundfile, set_notfoundfile
345 Usage:
346 Desc:
347 Property
348 Side Effects:
349 Example:
351 =cut
353 sub get_notfoundfile {
354 my $self = shift;
355 return $self->{notfoundfile};
358 sub set_notfoundfile {
359 my $self = shift;
360 $self->{notfoundfile} = shift;
363 =head2 accessors get_tempdir, set_tempdir
365 Usage:
366 Desc:
367 Property
368 Side Effects:
369 Example:
371 =cut
373 sub get_tempdir {
374 my $self = shift;
375 return $self->{tempdir};
378 sub set_tempdir {
379 my $self = shift;
380 $self->{tempdir} = shift;