lib/CXGN/Bulk.pm

   1
   2 =head1 NAME
   3
   4   /CXGN/Bulk.pm
   5
   6 =head1 DESCRIPTION
   7
   8   The CXGN::Bulk package is a superclass of all the different Bulk packages
   9   used by the download.pl script. It is never instantiated itself, but contains
  10   many methods its subclasses share.
  11
  12 =cut
  13
  14 package CXGN::Bulk;
  15
  16 use strict;
  17 use warnings;
  18 use Carp;
  19 use File::Temp;
  20 use File::Spec::Functions;
  21 use Cache::File;
  22
  23 use File::Slurp qw/slurp/;
  24
  25 =head2 new
  26
  27   Desc: sub new
  28   Args: hash reference containing parameters from user
  29   Ret : $self
  30
  31   All subclasses of Bulk share the same constructor. The constructor takes a reference to a hash
  32   which contains the parameters as an argument and creates a new variable and value for every
  33   key-value pair in the hash reference. It creates several other variables for linking to the
  34   database and printing a web page.
  35
  36 =cut
  37
  38 sub new {
  39     my $class  = shift;
  40     my $self   = {};
  41     my $params = shift;
  42     foreach ( keys %$params ) {
  43         $self->{$_} = $params->{$_};
  44     }
  45     $self->{tempdir} or confess "must provide a tempdir argument to bulk constructor";
  46     -d $self->{tempdir} or confess "no such directory $self->{tempdir}";
  47     $self->{content} = "";    # the content of the page
  48     $self->{db} = $self->{dbc}
  49         or confess "must provide a dbc argument (database handle) to bulk constructor";
  50
  51     my @empty_array = ();
  52     $self->{data} = \@empty_array;
  53     bless $self, $class;
  54     return $self;
  55 }
  56
  57 =head2 create_dumpfile
  58
  59   Desc: sub create_dumpfile
  60   Args: none
  61   Ret : file handle for dumpfile and notfound file
  62
  63   Creates a dumpfile and a notfound file and returns file handles for them.
  64
  65 =cut
  66
  67 sub create_dumpfile {
  68     my $self          = shift;
  69     my $db            = $self->{db};
  70     my @output_fields = @{ $self->{output_fields} };
  71     my @notfound      = ();
  72
  73     if (   !exists( $self->{dump_fh} )
  74         || !exists( $self->{notfound_fh} ) )
  75     {
  76
  77         # generate tmp file names and open files for writing
  78         $self->{dumpfile} = File::Temp->new( TEMPLATE => catfile( $self->{tempdir}, "bulk-XXXXXX" ), UNLINK => 0 )->filename;
  79         $self->{dumpfile} =~ s!$self->{tempdir}/!!;
  80
  81         $self->debug( "FILENAME: " . $self->{dumpfile} );
  82         my $filepath = catfile($self->{tempdir}, $self->{dumpfile});
  83         open( $self->{dump_fh}, '>', $filepath )
  84           || die "Can't open $filepath";
  85         $self->{notfoundfile} = $self->{dumpfile} . ".notfound";
  86         open( $self->{notfound_fh}, '>', "$self->{tempdir}/$self->{notfoundfile}" )
  87             || die "Can't open $self -> {notfoundfile}";
  88
  89         # write file header
  90         @output_fields = @{ $self->{output_fields} };
  91         my $dump_fh = $self->{dump_fh};
  92         print $dump_fh join( "\t", @output_fields ) . "\n";
  93     }
  94
  95     # warn "dumpfile: $filepath \n";
  96     return $self->{dump_fh}, $self->{notfound_fh};
  97 }
  98
  99 sub result_summary {
 100     my $self = shift;
 101
 102     my $cache_root = catfile($self->{tempdir}, $self->{dumpfile} . ".summary");
 103
 104     my $lines_read = $self->get_file_lines( catfile($self->{tempdir}, $self->{dumpfile}) ) - 1;
 105
 106     my $notfoundcount = $self->get_file_lines( catfile($self->{tempdir}, $self->{notfoundfile} ));
 107     my $total        = $lines_read;
 108     my $file         = $self->{dumpfile};
 109     my $notfoundfile = $self->{notfoundfile};
 110     my $idType       = $self->{idType};
 111     my $query_time   = $self->{query_time};
 112     my $seq_type     = $self->{seq_type};
 113     my $est_seq      = $self->{est_seq};
 114     my $unigene_seq  = $self->{unigene_seq};
 115     my $filesize =
 116         (stat( catfile($self->{tempdir}, $self->{dumpfile})))[7] / 1000;
 117     my $missing_ids     = @{ $self->{ids} } - $notfoundcount - $lines_read;
 118     my $missing_ids_msg = "";
 119     my $file_lines = 0; #fix
 120         #$self->getFileLines( $self->{tempdir} . "/" . $self->{dumpfile} );
 121
 122     my $numlines = scalar( @{ $self->{ids} } );
 123     if ( $missing_ids > 0 ) {
 124         $missing_ids_msg = "$missing_ids ids were not retrieved from the database because they were not part of the corresponding unigene set or because there were duplicate entries in the submitted id list.";
 125     }
 126
 127     my $fastalink     = " Fasta ";
 128     my $fastadownload = " Fasta ";
 129     my $fastamessage  = "Note: Fasta option is not available because you didn't
 130                         choose a sequence to download.<br />";
 131     if ( join( " ", @{ $self->{output_fields} } ) =~ /seq/i ) {
 132         $fastalink =
 133 "<a href=\"/tools/bulk/display?outputType=Fasta&amp;dumpfile=$file&amp;unigene_seq=$unigene_seq&amp;est_seq=$est_seq\">Fasta</a>";
 134         $fastadownload =
 135 "<a href=\"/tools/bulk/display?outputType=Fasta&amp;dumpfile=$file&amp;unigene_seq=$unigene_seq&amp;est_seq=$est_seq&amp;download=1\">Fasta</a>";
 136         $fastamessage = "";
 137     }
 138
 139     my $cache = Cache::File->new( cache_root => $cache_root );
 140
 141     my $summary_data = {
 142         fastalink => $fastalink,
 143         fastadownload => $fastadownload,
 144         fastamessage => $fastamessage,
 145         missing_ids_msg => $missing_ids_msg,
 146         file => $file,
 147         total => $total,
 148         query_time => $query_time,
 149         filesize => $filesize,
 150         numlines => $numlines, # number of lines in query
 151         lines_read => $lines_read,  # number of lines in file
 152         idType => $idType,
 153         seq_type => $seq_type,
 154         est_seq => $est_seq,
 155         unigene_seq => $unigene_seq,
 156     };
 157
 158     foreach my $k (keys(%$summary_data)) {
 159         $cache->set($k, $summary_data->{$k});
 160     }
 161 }
 162
 163
 164 =head2 error_message
 165
 166   Desc: sub error_message
 167   Args: none
 168   Ret : none
 169
 170   This method is called by download.pl if the Bulk object has a problem with the
 171   input parameters. It prints a page that contains an error message.
 172
 173 =cut
 174
 175 sub error_message {
 176     my $self = shift;
 177
 178
 179     my $html = $self ->{content} . "\n";
 180     $html .= <<EOH;
 181 <h3>Bulk download error</h3>
 182 I could not process your input.  Possible reasons for this include:
 183 <ul>
 184 <li>You may not have entered any identifiers for download</li>
 185 <li>You may not have entered any identifiers that exist</li>
 186 <li>Your identifiers may not be one-per-line, with no quotation marks, etc (see input example)</li>
 187 <li>The input set may be larger than the limit of 10,000 identifiers</li>
 188 </ul>
 189 Please check your input and use your browser\'s "back" button to go back and try again!
 190 EOH
 191
 192 return $html;
 193 }
 194
 195 =head2 get_file_lines
 196
 197   Desc: sub get_file_lines
 198   Args: file; example. $self -> get_file_lines($self->{tempdir});
 199   Ret : $list[0];
 200
 201   Counts file lines (used on temp directories).
 202
 203 =cut
 204
 205 sub get_file_lines {
 206     my $self   = shift;
 207     my $file   = shift;
 208     open my $f, $file or die "$! opening $file";
 209     my $cnt = 0;
 210     $cnt++ while <$f>;
 211     close $f;
 212     return $cnt;
 213 }
 214
 215 =head2 clean_up
 216
 217   Desc: sub clean_up
 218   Args: default; example. $bulk -> clean_up();
 219   Ret : n/a
 220
 221   Drops unigene, blast, and submitted_ids temp tables (i.e. cleans up after
 222   queries are done).
 223
 224 =cut
 225
 226 sub clean_up {
 227     my $self = shift;
 228     $self->{db}->disconnect();
 229 }
 230
 231 =head2 debug
 232
 233   Desc: sub debug
 234   Args: string; example. $self -> debug("input_ok: Input is NOT OK!");
 235   Ret : n/a
 236
 237   Function for printing adds break and new line to messages.
 238
 239 =cut
 240
 241 sub debug {
 242     my $self = shift;
 243
 244     #print messages if debug flag is set
 245     my $message = shift;
 246     if ( $self->{debug} ) {
 247         print <<EOHTML;
 248    <h4>Bulk download summary</h4>
 249     $message <br />
 250 EOHTML
 251     }
 252 }
 253
 254 =head2 check_ids
 255
 256   Desc: sub check_ids
 257   Args: default
 258   Ret : @ids, array of IDs from input
 259
 260   A common method between all types of Bulk objects, it checks that there are
 261   less than 10,000 IDs from input and puts them all into an array, returning
 262   the array of ids if everything went well, otherwise 0.
 263
 264 =cut
 265
 266 sub check_ids {
 267     my $self = shift;
 268     my @ids  = ();
 269
 270     #do some simple parameter checking
 271
 272     print STDERR "PROCESSING IDS: $self->{ids}. ($self->{idType})\n";
 273
 274     return @ids if ( $self->{idType} eq "" );
 275     return @ids if ( $self->{ids} !~ /\w/ );
 276
 277     #make sure the input string isn't too big
 278     return @ids if length( $self->{ids} ) > 1_000_000;
 279
 280
 281     # clean up data retrieved
 282     my $ids = $self->{ids};
 283     $ids =~ s/^\s+//;
 284     $ids =~ s/\n+/ /g;
 285     $ids =~ s/\s+/ /g;    # compress multiple returns into one
 286     $ids =~ s/\r+/ /g;    # convert carriage returns to space
 287     @ids = split /\s+/, $ids;
 288     @ids = () if @ids > 10_000;    #limit to 10_000 ids to process
 289     return @ids;
 290 }
 291
 292 =head2 process_parameters
 293
 294   Desc: sub process_parameters
 295   Args: none
 296   Ret : 1 if the parameters were OK, 0 if not
 297
 298   Modifies some of the parameters received set in get_parameters. Preparing
 299   data for the database query.
 300
 301 =cut
 302
 303 sub process_parameters {
 304
 305 }
 306
 307 =head2 process_ids
 308
 309   Desc: sub process_ids
 310   Args: default;
 311   Ret : data from database printed to a file;
 312
 313   Queries database using Persistent (see perldoc Persistent) and
 314   object oriented perl to obtain data on Bulk Objects using formatted
 315   IDs.
 316
 317 =cut
 318
 319 sub process_ids {
 320
 321 }
 322
 323 =head2 accessors get_dumpfile, set_dumpfile
 324
 325  Usage:
 326  Desc:
 327  Property
 328  Side Effects:
 329  Example:
 330
 331 =cut
 332
 333 sub get_dumpfile {
 334   my $self = shift;
 335   return $self->{dumpfile};
 336 }
 337
 338 sub set_dumpfile {
 339   my $self = shift;
 340   $self->{dumpfile} = shift;
 341 }
 342
 343 =head2 accessors get_notfoundfile, set_notfoundfile
 344
 345  Usage:
 346  Desc:
 347  Property
 348  Side Effects:
 349  Example:
 350
 351 =cut
 352
 353 sub get_notfoundfile {
 354   my $self = shift;
 355   return $self->{notfoundfile};
 356 }
 357
 358 sub set_notfoundfile {
 359   my $self = shift;
 360   $self->{notfoundfile} = shift;
 361 }
 362
 363 =head2 accessors get_tempdir, set_tempdir
 364
 365  Usage:
 366  Desc:
 367  Property
 368  Side Effects:
 369  Example:
 370
 371 =cut
 372
 373 sub get_tempdir {
 374   my $self = shift;
 375   return $self->{tempdir};
 376 }
 377
 378 sub set_tempdir {
 379   my $self = shift;
 380   $self->{tempdir} = shift;
 381 }
 382
 383
 384 1;