v1.7.3
[bioperl-live.git] / bin / bp_find-blast-matches
blob25855adea3db39b71fb1331bfab1fddb7820f175
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
5 =head1 NAME
7 bp_find-blast-matches.pl - extract DNA sequences based on BLAST hits
9 =head1 SYNOPSIS
11 bp_find-blast-matches.pl [-h -e -p -5 -n -o -3 -header] -blast <BLAST_FILE> -fasta <FASTA_FILE>
13 =head1 OPTIONS
15 =head2 Mandatory:
17 =over
19 =item B<-blast>
21 BLAST output file to read from. The alignment should use the file specified by
22 '-fasta' option ideally
24 =item B<-fasta>
26 FASTA file to read from. This is where the sequence will be extracted from
28 =back
30 =head2 Optional:
32 =over
34 =item B<-h>
36 Displays this help message
38 =item B<-e>
40 Maximum e-value for matches (0.01 by default)
42 =item B<-p>
44 Number of base pairs of 5' region to be included with the sequence
46 =item B<-5>
48 Number of base pairs of 5' region only, excluding the regular sequence
50 =item B<-3>
52 Number of base pairs of 3' region only, excluding the regular sequence
54 =item B<-n>
56 Number of top hits to display, starting with the best hit
58 =item B<-o>
60 Exact match to display (this option can't be used in conjunction with '-n'
62 =item B<-header>
64 The FASTA header to display instead of the default
66 =back
68 =head1 DESCRIPTION
70 This script takes a BLAST output file and a FASTA file as arguments,
71 given after the '-blast' and '-fasta' options respectively. The BLAST output
72 file should have been generated with your sequence of interest and the
73 FASTA file supplied as an argument.
74 Example: find-blast-matches.pl -blast BLAST_FILE -fasta FASTA_FILE
76 It parses through the BLAST file to check for high quality matches,
77 which are then searched for in the FASTA file. The sequence may vary
78 from you candidate sequence, hence the BLAST search prior.
80 The sequence from the FASTA file is then displayed to STDOUT.
81 Optional arguments can be used, such as to extract the 5' or 3' region.
83 =head1 AUTHOR
85 Gabriel Abud - E<lt>gabriel.jabud-at-gmail.comE<gt>
87 =head1 FEEDBACK
89 =head2 Mailing Lists
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to
93 the Bioperl mailing list. Your participation is much appreciated
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
98 =head2 Reporting Bugs
100 Report bugs to the Bioperl bug tracking system to help us keep track
101 of the bugs and their resolution. Bug reports can be submitted via
102 email or the web:
104 https://github.com/bioperl/bioperl-live/issues
106 =head1 EDIT HISTORY
108 2014-08-04 - Gabriel Abud
109 First features added
111 =head1 DEPENDANCIES
113 Getopt::long, Pod::Usage, Bio::SearchIO, Bio::Seq, Bio::SeqIO,
114 File::Basename
116 =cut
119 # Modules
120 use Bio::SearchIO qw(new);
121 use Bio::Seq qw(new);
122 use Bio::SeqIO qw(new);
123 use File::Basename qw(basename);
124 use Getopt::Long qw(GetOptions);
125 use Pod::Usage;
127 # Variables
128 my $baseProg = basename($0); # Program name
129 my $line;
130 my @scaffolds;
131 my $inputFile;
132 my $blastFile;
133 my $scaffoldFind;
134 my $baseList;
135 my @start;
136 my @end;
137 my @strand;
138 my @arrayBases;
139 my $query_name;
140 my @query_names;
141 my @accessions;
142 my $accession;
143 my $baseFile;
144 my $total_size;
145 my $title;
146 my $default_header;
147 my $in;
148 my $out;
150 # Command line options
151 my $exact_match; # Undef by default
152 my $e_value = 0.01; # Default e-value
153 my $matches = 1; # Default number of matches
154 my $promoter; # Undef by default
155 my $three_prime; # Undef by default
156 my $promoter_only; # Undef by default
157 my $opt_help; # Undef by default
158 my $header; # Undef by default
160 # Functions for proper command line usage
161 sub syntax {
162 print STDERR
163 "Usage: $baseProg -blast 'BLAST_file' -fasta 'FASTA_file' [OPTIONS]\n";
164 print STDERR "Try '$baseProg --help' for more information.\n";
165 exit;
168 sub help {
169 print STDERR
170 "\nNAME:\n",
171 "\t$baseProg - extract a DNA sequence based on BLAST hits\n\n",
172 "SYNTAX:\n",
173 "\t$baseProg -blast 'BLAST_file' -fasta 'FASTA__file' [OPTIONS]\n\n",
174 "OPTIONS:\n",
175 "\t(All options require an additional number argument [ie: -e 0.01])\n\n",
176 "\t-e, maximum e-value for matches (0.01 by default)\n\n",
177 "\t-p, number of base pairs of 5' region to be included with the\n",
178 "\t sequence of interest\n\n",
179 "\t-5, number of base pairs of 5' region, excluding the sequence\n",
180 "\t of interest (unlike '-p')\n\n",
181 "\t-n, number of top hits to display, starting with the highest hit\n",
182 "\t(1 by default)\n\n",
183 "\t-o, exact match to display (this option can't be used in conjuction\n",
184 "\t with '-n')\n\n",
185 "\t-3, number of base pairs of 3' region to display\n\n",
186 "\t-header, the fasta header to display instead of the default\n\n";
187 exit;
190 # Get command line options
191 GetOptions(
192 'e=f' => \$e_value,
193 'p=i' => \$promoter,
194 'n=i' => \$matches,
195 'o=i' => \$exact_match,
196 '5=i' => \$promoter_only,
197 '3=i' => \$three_prime,
198 'help|h' => \$opt_help,
199 'header|head=s' => \$header,
200 'blast|b=s' => \$blastFile,
201 'fasta|f=s' => \$inputFile
202 ) or pod2usage(-exitstatus => 0, -verbose => 1);
204 # Help screen
205 pod2usage(-exitstatus => 0, -verbose => 2) if $opt_help;
206 #help() if defined $opt_help;
208 # Checks for required arguments
209 pod2usage(-exitstatus => 0, -verbose => 1,
210 -msg => "You must specify the FASTA and BLAST files to read from!\n")
211 if (!defined $blastFile || !defined $inputFile);
212 #syntax() if ( !defined $blastFile || !defined $inputFile );
214 # Checks for any negative numbers
215 #syntax()
216 pod2usage(-exitstatus => 0, -verbose => 1,
217 -msg => "You must use positive numbers as values to options!")
218 if ( (defined $e_value && $e_value < 0)
219 || (defined $promoter && $promoter < 0)
220 || (defined $matches && $matches < 0)
221 || (defined $exact_match && $exact_match < 0)
222 || (defined $promoter_only && $promoter_only < 0)
223 || (defined $three_prime && $three_prime < 0 ) );
225 if ( $matches > 1 && defined $exact_match ) {
226 print STDERR "Cannot use both options '-n' and '-o' at the same time\n";
227 print STDERR "(Type '$baseProg --help' for more information\n)";
228 exit;
231 if ( defined $promoter && defined $promoter_only ) {
232 print STDERR "Cannot use both options '-p' and '-5' at the same time\n";
233 print STDERR "(Type '$baseProg --help' for more information)\n";
234 exit;
237 if ( defined $three_prime && ( defined $promoter || defined $promoter_only ) ) {
238 print STDERR "Cannot use both '-3' with '-p' or '-5' at the same time\n";
239 print STDERR "(Type $baseProg --help' for more information)\n";
240 exit;
243 # Class used to search through the blast file
244 eval { $in = Bio::SearchIO->new( -file => $blastFile, -format => "blast" ); };
245 if ($@) {
246 die "'$blastFile' does not appear to be a BLAST output file! Exiting...\n";
248 $out = Bio::SeqIO->new( -fh => \*STDOUT, -format => "fasta" );
250 # Creates arrays of all the scaffold names and the coordinates of where those scaffolds are found
251 my $n = 0;
252 OUTERLOOP: while ( my $result = $in->next_result ) {
253 LOOP: while ( my $hit = $result->next_hit ) {
254 while ( my $hsp = $hit->next_hsp ) {
256 # Finds all matches with an evalue <= $e_value (or 0.01 by default)
257 if ( $hsp->evalue <= $e_value ) {
258 ( $start[$n], $end[$n] ) = $hsp->range('hit');
259 $scaffolds[$n] = $hit->name, $strand[$n] = $hsp->strand('hit');
260 $n += 1;
263 # If an exact_match option is given
264 if ( defined($exact_match) && $exact_match == $n ) {
265 ( $start[0], $end[0] ) = $hsp->range('hit');
266 $scaffolds[0] = $hit->name, $strand[0] = $hsp->strand('hit');
267 last OUTERLOOP;
269 elsif ( !defined($exact_match) && $n >= $matches )
270 { # Exits after the correct amount of matches have been found (1 by default)
271 last LOOP;
277 $baseFile = basename $inputFile;
279 open INFILE, "<$inputFile"
280 or die "Couldn't open the input file '$inputFile'! Exiting...\n";
282 $scaffoldFind = 0;
283 my %scaffoldList;
284 my %baseList;
286 # Extracts only the scaffolds of interest, avoiding duplicates
287 if ( defined $exact_match ) {
288 $scaffoldList{ $scaffolds[0] } = 1;
290 else {
291 foreach my $num ( 0 .. $#scaffolds ) {
292 if ( !defined $scaffoldList{ $scaffolds[$num] } ) {
293 $scaffoldList{ $scaffolds[$num] } = 1;
297 my $remaining_scaffolds = my $unique_scaffolds = keys(%scaffoldList);
299 local $/ = "\n>";
301 # Reads the FASTA file here, storing FASTA headers where the sequence is found
302 while ( $line = <INFILE> ) {
303 chomp($line);
304 next if ( $line =~ m/^\s*?$/ );
306 foreach my $scaffold ( sort keys %scaffoldList ) {
307 if ( $line =~ /^[ \t]{0,5}$scaffold/ )
308 { # True if FASTA segment contains the scaffold
309 $line =~ s/^.*?\n//;
310 $line =~ s/\s//g;
311 $baseList{$scaffold} = $line;
312 $scaffoldFind++;
313 $remaining_scaffolds--;
316 last unless ($remaining_scaffolds);
319 if ( $scaffoldFind != $unique_scaffolds ) {
320 print STDERR "The scaffold specified in the BLAST file was not found.\n";
321 print STDERR "Make sure you are using the correct FASTA file.\n";
322 exit;
325 for my $m ( 0 .. $#scaffolds )
326 { # Runs a loop as many times as there are scaffolds
327 $baseList = $baseList{ $scaffolds[$m] };
329 # Print title line for each scaffold
330 $accession = $baseFile;
331 $default_header = "(BLAST hit:$scaffolds[$m]|";
333 my $real_start = $start[$m];
334 my $real_end = $end[$m];
336 # For "normal", positive strands (+/+)
337 if ( $strand[$m] == 1 ) {
339 # If the -p flag was used
340 if ( defined $promoter ) { # 5' region specified with -p flag
341 # If 5' region is too large for sequence, don't include the 5' region
342 if ( $promoter >= $start[$m] ) {
343 print STDERR "ERROR: 5' region is too big!! (max promoter = ",
344 $start[$m] - 1, " )\n",
345 "Showing sequence without 5' region...\n";
346 $baseList = substr $baseList, $start[$m], $end[$m] - $start[$m];
348 else {
349 my $real_start = $start[$m] - $promoter;
350 $baseList = substr $baseList, $start[$m] - $promoter - 1,
351 $promoter + $end[$m] - $start[$m] + 1;
355 # If the -3 flag was used
356 elsif ( defined $three_prime ) {
357 $total_size = length($baseList);
359 if ( ( $three_prime + $end[$m] ) > $total_size ) {
360 die "ERROR: 3' region is too big!! ",
361 "(max 3' region = ", $total_size - $end[$m], ")\n",
364 else {
365 $real_start = $end[$m] + 1;
366 $real_end = $end[$m] + $three_prime;
367 $baseList = substr $baseList, $end[$m], $three_prime;
371 # If the -5 flag was used
372 elsif ( defined $promoter_only )
373 { # 5' region was specified with -5 flag
374 # If 5' region is too large for sequence, don't include the 5' region
375 if ( $promoter_only >= $start[$m] ) {
376 die "ERROR: 5' region is too big!! (max promoter = ",
377 $start[$m] - 1, " )\n",
380 else {
381 $real_start = $start[$m] - $promoter_only;
382 $real_end = $start[$m] - 1;
383 $baseList = substr $baseList, $start[$m] - $promoter_only - 1,
384 $promoter_only;
387 else { # Default: just the BLAST hit (no 5' or 3')
388 $baseList = substr $baseList, $start[$m] - 1,
389 $end[$m] - $start[$m] + 1;
393 # Checks to see if the hit sequence is the reverse compliment
394 elsif ( $strand[$m] == -1 ) {
396 # If -p flag was used:
397 if ( defined $promoter ) {
398 $total_size = length($baseList);
400 if ( ( $promoter + $end[$m] ) > $total_size ) {
401 print STDERR "ERROR: 5' region is too big!! ",
402 "(max 5' region = ", $total_size - $end[$m], ")\n",
403 "Showing sequence without 5' region...\n";
404 $baseList = substr $baseList, $start[$m], $end[$m] - $start[$m];
406 else {
407 $real_start = $start[$m] + 1;
408 $real_end = $end[$m] + $promoter;
409 $baseList = substr $baseList, $start[$m] - 1,
410 $end[$m] - $start[$m] + $promoter + 1;
415 # If -3 flag was used
416 elsif ( defined $three_prime ) {
417 if ( $three_prime >= $start[$m] ) {
418 die "ERROR: 3' region is too big!! (max = ", $start[$m] - 1,
419 " )\n";
421 else {
422 $real_start = $start[$m] - $three_prime;
423 $real_end = $start[$m] - 1;
424 $baseList = substr $baseList, $start[$m] - $three_prime - 1,
425 $three_prime;
429 # If -5 flag was used
430 elsif ( defined $promoter_only ) {
431 $total_size = length($baseList);
433 if ( ( $promoter_only + $end[$m] ) > $total_size ) {
434 die "ERROR: 5' region is too big!! ",
435 "(max promoter = ", $total_size - $end[$m], ")\n",
438 else {
439 $real_start = $end[$m] + 1;
440 $real_end = $end[$m] + $promoter_only;
441 $baseList = substr $baseList, $end[$m], $promoter_only;
445 else { # If 5' region wasn't specified at all
446 $baseList = substr $baseList, $start[$m] - 1,
447 $end[$m] - $start[$m] + 1;
450 $default_header .= "$real_start-$real_end)";
451 $default_header .= " showing 5' region ($promoter_only bp) only"
452 if defined($promoter_only);
453 $default_header .= " showing 3' region ($three_prime bp)"
454 if defined($three_prime);
455 $default_header .= " with 5' region ($promoter bp)" if defined($promoter);
457 my $seq_obj = Bio::Seq->new( -seq => "$baseList", -alphabet => 'dna' );
458 if ($strand[$m] == -1) {
459 $seq_obj = $seq_obj->revcom();
462 if ( defined $header ) {
463 $header =~ s/^\s*//;
464 $header =~ s/\s*$//;
465 $header =~ s/^>//;
466 $header =~ s/^([^\s]+)//;
467 my $default_name = $1;
468 $seq_obj->display_id($default_name);
469 $seq_obj->desc($header);
471 else {
472 $seq_obj->desc($default_header);
473 $seq_obj->display_id($accession);
476 # Prints the sequence to STDOUT
477 $out->write_seq($seq_obj);
478 print "\n";
480 # If 'exact_match' option is specified, exit after first (and only) iteration
481 if ( defined($exact_match) ) {
482 close INFILE;
483 exit;
486 close INFILE;
487 } # End of major for loop
489 __END__