clean
[sgn.git] / lib / SGN / Controller / AJAX / Blast.pm
blob798a11f1b19214a2597f343e3e8428f06c009436
2 package SGN::Controller::AJAX::Blast;
4 use Moose;
6 use Bio::SeqIO;
7 use Config::Any;
8 use Data::Dumper;
9 use Storable qw | nstore retrieve |;
10 use Try::Tiny;
11 use Tie::UrlEncoder; our %urlencode;
12 use File::Temp qw | tempfile |;
13 use File::Basename qw | basename |;
14 use File::Copy qw | copy |;
15 use File::Spec qw | catfile |;
16 use File::Slurp qw | read_file write_file |;
17 use File::NFSLock qw | uncache |;
18 use CXGN::Tools::Run;
19 use CXGN::Page::UserPrefs;
20 use CXGN::Tools::List qw/distinct evens/;
21 use CXGN::Blast::Parse;
22 use CXGN::Blast::SeqQuery;
24 use Time::HiRes qw(gettimeofday tv_interval);
26 BEGIN { extends 'Catalyst::Controller::REST'; };
28 __PACKAGE__->config(
29 default => 'application/json',
30 stash_key => 'rest',
31 map => { 'application/json' => 'JSON', 'text/html' => 'JSON' },
34 sub run : Path('/tools/blast/run') Args(0) {
35 my $self = shift;
36 my $c = shift;
38 my $params = $c->req->params();
39 my $input_query = CXGN::Blast::SeqQuery->new();
40 my $valid = $input_query->validate($c, $params->{input_options}, $params->{sequence});
42 if ($valid ne "OK") {
43 $c->stash->{rest} = { error => "Your input contains illegal characters. Please verify your input." };
44 return;
47 $params->{sequence} = $input_query->process($c, $params->{input_options}, $params->{sequence});
49 if ($params->{input_options} eq 'autodetect') {
50 my $detected_type = $input_query->autodetect_seq_type($c, $params->{input_options}, $params->{sequence});
52 # print STDERR "SGN BLAST detected your sequence is: $detected_type\n";
54 # create a hash with the valid options =1 and check and if result 0 return error
55 my %blast_seq_db_program = (
56 nucleotide => {
57 nucleotide => {
58 blastn => 1,
59 tblastx => 1,
61 protein => {
62 blastx => 1,
65 protein => {
66 protein => {
67 blastp => 1,
69 nucleotide => {
70 tblastn => 1,
75 if (!$blast_seq_db_program{$detected_type}{$params->{db_type}}{$params->{program}}) {
76 $c->stash->{rest} = { error => "the program ".$params->{program}." can not be used with a ".$detected_type." sequence (autodetected) and a ".$params->{db_type}." database.\n\nPlease, use different options and disable the autodetection of the query type if it is wrong." };
77 return;
81 my $seq_count = 1;
82 my $blast_tmp_output = $c->config->{cluster_shared_tempdir}."/blast";
83 mkdir $blast_tmp_output if ! -d $blast_tmp_output;
84 if ($params->{sequence} =~ /\>/) {
85 $seq_count= $params->{sequence} =~ tr/\>/\>/;
88 print STDERR "SEQ COUNT = $seq_count\n";
89 my ($seq_fh, $seqfile) = tempfile(
90 "blast_XXXXXX",
91 DIR=> $blast_tmp_output,
94 my $jobid = basename($seqfile);
96 print STDERR "JOB ID CREATED: $jobid\n";
98 my $schema = $c->dbic_schema("SGN::Schema");
100 my %arg_handlers =
103 sequence =>
104 sub {
105 my $sequence = $params->{sequence};
106 if( $sequence ) {
107 $sequence =~ s/^\s+|\s+$|\n\s*\n//g; #< trim out leading and trailing whitespace and blank lines
108 # if ($sequence !~ /^\s*>/) {
109 # $sequence = ">WEB-USER-SEQUENCE (Unknown)\n$sequence";
111 # $sequence .= "\n"; #< add a final newline
113 print STDERR "Opening file for sequence ($seqfile)... ";
114 open(my $FH, ">", $seqfile) || die "Can't open file for query ($seqfile)\n";
115 print $FH $sequence if $sequence;
116 close($FH);
118 # print STDERR "Done.\n";
120 # if(my $file_upload = $page->get_upload) {
121 # if ( my $fh = $file_upload->fh ) {
122 # print $seq_fh $_ while <$fh>;
126 print STDERR "Parsing with bioperl... ";
127 my $i = Bio::SeqIO->new(
128 -format => 'fasta',
129 -file => $seqfile,
132 try {
133 while ( my $s = $i->next_seq ) {
134 $s->length or $c->throw(
135 message => 'Sequence '.encode_entities('"'.$s->id.'"').' is empty, this is not allowed by BLAST.',
136 is_error => 0,
139 } catch {
140 print STDERR $@;
141 die $_ if ref; #< throw it onward if it's an exception
142 my $full_error = $_;
143 if( /MSG:([^\n]+)/ ) {
144 $_ = $1;
146 s/at \/[\-\w \/\.]+ line \d+.+//; # remove anything resembling backtraces
147 $c->throw( message => $_,
148 is_error => 0,
149 developer_message => $full_error,
153 $seq_count >= 1 or $c->throw( message => 'no sequence submitted, cannot run BLAST',
154 is_error => 0,
155 developer_message => Data::Dumper::Dumper({
156 '$seq_count' => $seq_count,
157 '$seq_filename' => $seqfile,
161 return -i => $seqfile;
165 matrix =>
166 sub {
167 my $m = $params->{matrix};
168 $m =~ /^(BLOSUM|PAM)\d+$/
169 or $c->throw( is_error => 0, message => "invalid matrix '$m'" );
170 return -M => $m;
174 expect =>
175 sub {
176 $params->{evalue} =~ s/[^\d\.e\-\+]//gi; #can only be these characters
177 return -e => $params->{evalue} ? $params->{evalue} : 1;
180 maxhits =>
181 sub {
182 my $h = $params->{maxhits} || 20;
183 $h =~ s/\D//g; #only digits allowed
184 return -b => $h;
187 hits_list =>
188 sub {
189 my $h = $params->{maxhits} || 20;
190 $h =~ s/\D//g; #only digits allowed
191 return -v => $h;
194 filterq =>
195 sub {
196 return -F => $params->{filterq} ? 'T' : 'F';
199 # outformat =>
200 # sub {
201 # $params->{outformat} =~ s/\D//g; #only digits allowed
202 # return -m => $params->{outformat};
203 # },
205 database =>
207 sub {
208 my $bdb = $schema->resultset("BlastDb")->find($params->{database} )
209 or die "could not find bdb with file_base '$params->{database}'";
211 my $basename = File::Spec->catfile($c->config->{blast_db_path},$bdb->file_base());
212 #returns '/data/shared/blast/databases/genbank/nr'
213 #remember the ID of the blast db the user just blasted with
215 return -d => $basename;
218 program =>
219 sub {
220 $params->{program} =~ s/[^a-z]//g; #only lower-case letters
221 return -p => $params->{program};
225 print STDERR "BUILDING COMMAND...\n";
228 # build our command with our arg handlers
230 my @command = ('blastall');
231 foreach my $k (keys %arg_handlers) {
233 print STDERR "evaluating $k...";
234 my @x = $arg_handlers{$k}->();
235 print STDERR "component:
236 ", (join ",", @x)."\n";
237 @command = (@command, @x);
240 # To get the proper format for gi sequences (CitrusGreening.org case)
241 push(@command, '-I');
242 push(@command, 'T');
244 print STDERR "COMMAND: ".join(" ", @command);
245 print STDERR "\n";
247 # save our prefs
248 # $prefs->save;
250 # now run the blast
254 my $job;
255 eval {
256 $job = CXGN::Tools::Run->run_cluster(
257 @command,
259 temp_base => $blast_tmp_output,
260 queue => $c->config->{'web_cluster_queue'},
261 working_dir => $blast_tmp_output,
263 # temp_base => $c->config->{'cluster_shared_tempdir'},
264 # queue => $c->config->{'web_cluster_queue'},
265 # working_dir => $c->config->{'cluster_shared_tempdir'},
267 # don't block and wait if the cluster looks full
268 max_cluster_jobs => 1_000_000_000,
273 print STDERR "Saving job state to $seqfile.job for id ".$job->job_id()."\n";
275 $job->do_not_cleanup(1);
277 nstore( $job, $seqfile.".job" ) or die 'could not serialize job object';
283 if ($@) {
284 print STDERR "An error occurred! $@\n";
285 $c->stash->{rest} = { error => $@ };
287 else {
288 # write data in blast.log
289 my $blast_log_path = $c->config->{blast_log};
290 my $blast_log_fh;
291 if (-e $blast_log_path) {
292 open($blast_log_fh, ">>", $blast_log_path) || print STDERR "cannot create $blast_log_path\n";
293 } else {
294 open($blast_log_fh, ">", $blast_log_path) || print STDERR "cannot open $blast_log_path\n";
295 print $blast_log_fh "Seq_num\tDB_id\tProgram\teval\tMaxHits\tMatrix\tDate\n";
297 print $blast_log_fh "$seq_count\t".$params->{database}."\t".$params->{program}."\t".$params->{evalue}."\t".$params->{maxhits}."\t".$params->{matrix}."\t".localtime()."\n";
300 print STDERR "Passing jobid code ".(basename($jobid))."\n";
301 $c->stash->{rest} = { jobid => basename($jobid),
302 seq_count => $seq_count,
308 sub check : Path('/tools/blast/check') Args(1) {
309 my $self = shift;
310 my $c = shift;
311 my $jobid = shift;
313 # my $t0 = [gettimeofday]; #-------------------------- TIME CHECK
315 my $blast_tmp_output = $c->get_conf('cluster_shared_tempdir')."/blast";
317 #my $jobid =~ s/\.\.//g; # prevent hacks
318 my $job = retrieve($blast_tmp_output."/".$jobid.".job");
321 if ( $job->alive ){
322 # my $t1 = [gettimeofday]; #-------------------------- TIME CHECK
324 sleep(1);
325 $c->stash->{rest} = { status => 'running', };
327 # my $t2 = [gettimeofday]; #-------------------------- TIME CHECK
329 # my $t1_t2 = tv_interval $t1, $t2;
330 # print STDERR "Job alive: $t1_t2\n";
332 return;
334 else {
336 # my $t3 = [gettimeofday]; #-------------------------- TIME CHECK
338 # the job has finished
339 # copy the cluster temp file back into "apache space"
341 my $result_file = $self->jobid_to_file($c, $jobid.".out");
343 my $job_out_file = $job->out_file();
344 for( 1..10 ) {
345 uncache($job_out_file);
346 last if -f $job_out_file;
347 sleep 1;
348 # my $t4 = [gettimeofday]; #-------------------------- TIME CHECK
349 # my $t3_t4 = tv_interval $t3, $t4;
350 # print STDERR "Job not alive loop: $t3_t4\n";
353 # my $t5 = [gettimeofday]; #-------------------------- TIME CHECK
355 -f $job_out_file or die "job output file ($job_out_file) doesn't exist";
356 -r $job_out_file or die "job output file ($job_out_file) not readable";
358 # my $t6 = [gettimeofday]; #-------------------------- TIME CHECK
360 # You may wish to provide a different output file to send back
361 # rather than STDOUT from the job. Use the out_file_override
362 # parameter if this is the case.
363 #my $out_file = $out_file_override || $job->out_file();
364 # system("ls $blast_tmp_output 2>&1 >/dev/null");
366 # my $t7 = [gettimeofday]; #-------------------------- TIME CHECK
368 # system("ls $c->{config}->{cluster_shared_tempdir} 2>&1 >/dev/null");
369 copy($job_out_file, $result_file) or die "Can't copy result file '$job_out_file' to $result_file ($!)";
371 # my $t8 = [gettimeofday]; #-------------------------- TIME CHECK
373 #clean up the job tempfiles
374 $job->cleanup();
376 # my $t9 = [gettimeofday]; #-------------------------- TIME CHECK
378 #also delete the job file
380 # my $t10 = [gettimeofday]; #-------------------------- TIME CHECK
382 # my $t5_t6 = tv_interval $t5, $t6;
383 # my $t6_t7 = tv_interval $t6, $t7;
384 # my $t7_t8 = tv_interval $t7, $t8;
385 # my $t8_t9 = tv_interval $t8, $t9;
386 # my $t9_t10 = tv_interval $t9, $t10;
388 # my $t3_t10 = tv_interval $t3, $t10;
389 # my $t0_t10 = tv_interval $t0, $t10;
391 # print STDERR "check 5-6 interval: $t5_t6\n";
392 # print STDERR "check 6-7 interval: $t6_t7\n";
393 # print STDERR "check 7-8 interval: $t7_t8\n";
394 # print STDERR "check 8-9 interval: $t8_t9\n";
395 # print STDERR "check 9-10 interval: $t9_t10\n";
397 # print STDERR "Job not alive (else): $t3_t10\n";
398 # print STDERR "CHECK SUB TIME: $t0_t10\n";
401 $c->stash->{rest} = { status => "complete" };
405 # fetch some html/js required for displaying the parse report
406 # sub get_prereqs : Path('/tools/blast/prereqs') Args(1) {
407 # my $self = shift;
408 # my $c = shift;
409 # my $jobid = shift;
411 # my $format=$c->req->param('format');
412 # my $parser = CXGN::Blast::Parse->new();
413 # my $prereqs = $parser->prereqs($format);
414 # $c->stash->{rest} = { prereqs => $prereqs };
417 sub get_result : Path('/tools/blast/result') Args(1) {
418 my $self = shift;
419 my $c = shift;
420 my $jobid = shift;
422 # my $t0 = [gettimeofday]; #-------------------------- TIME CHECK
424 my $format = $c->req->param('format');
425 my $db_id = $c->req->param('db_id');
427 my $result_file = $self->jobid_to_file($c, $jobid.".out");
428 my $blast_tmp_output = $c->get_conf('cluster_shared_tempdir')."/blast";
430 # system("ls $blast_tmp_output 2>&1 >/dev/null");
431 # system("ls ".($c->config->{cluster_shared_tempdir})." 2>&1 >/dev/null");
433 my $schema = $c->dbic_schema("SGN::Schema");
434 my $db = $schema->resultset("BlastDb")->find($db_id);
435 if (!$db) { die "Can't find database with id $db_id"; }
436 my $parser = CXGN::Blast::Parse->new();
437 my $parsed_data = $parser->parse($c, $format, $result_file, $db);
439 # my $t1 = [gettimeofday]; #-------------------------- TIME CHECK
440 # my $t0_t1 = tv_interval $t0, $t1;
441 # print STDERR "GET RESULT SUB TIME: $t0_t1\n";
443 $c->stash->{rest} = $parsed_data; # { blast_report => '<pre>'.(join("\n", read_file($parsed_file))).'</pre>', };
447 sub jobid_to_file {
448 my $self = shift;
449 my $c = shift;
450 my $jobid = shift;
451 return File::Spec->catfile($c->config->{basepath}, $c->tempfiles_subdir('blast'), "$jobid");
455 sub search_gene_ids {
456 my $ids_array = shift;
457 my $blastdb_path = shift;
458 my @ids = @{$ids_array};
459 my @output_seqs;
461 my $fs = Bio::BLAST::Database->open(full_file_basename => "$blastdb_path",);
463 foreach my $input_string (@ids) {
465 if ($fs->get_sequence($input_string)) {
466 my $seq_obj = $fs->get_sequence($input_string);
467 my $seq = $seq_obj->seq();
468 my $id = $seq_obj->id();
469 my $desc = $seq_obj->desc();
470 my $new_seq = "";
472 for (my $i=0; $i<length($seq); $i=$i+60) {
473 $new_seq = $new_seq.substr($seq,$i,60)."<br>";
476 push(@output_seqs, ">$id $desc<br>$new_seq");
479 return join('', @output_seqs);
482 sub search_desc : Path('/tools/blast/desc_search/') Args(0) {
483 my $self = shift;
484 my $c = shift;
486 my @ids;
487 my $schema = $c->dbic_schema("SGN::Schema");
488 my $params = $c->req->params();
489 my $input_string = $params->{blast_desc};
490 my $db_id = $params->{database};
492 my $bdb = $schema->resultset("BlastDb")->find($db_id) || die "could not find bdb with file_base $db_id";
493 my $blastdb_path = File::Spec->catfile($c->config->{blast_db_path}, $bdb->file_base());
495 my $grepcmd = "grep -i \"$input_string\" $blastdb_path \| sed 's/>//' \| cut -d ' ' -f 1";
496 my $output_seq = `$grepcmd`;
497 my $output_seqs;
499 if ($output_seq) {
500 @ids = split(/\n/, $output_seq);
501 $output_seqs = search_gene_ids(\@ids,$blastdb_path);
503 else {
504 $output_seqs = "There were not results for your search\n";
506 $c->stash->{rest} = {output_seq => "$output_seqs"};