7 bp_find-blast-matches.pl - extract DNA sequences based on BLAST hits
11 bp_find-blast-matches.pl [-h -e -p -5 -n -o -3 -header] -blast <BLAST_FILE> -fasta <FASTA_FILE>
21 BLAST output file to read from. The alignment should use the file specified by
22 '-fasta' option ideally
26 FASTA file to read from. This is where the sequence will be extracted from
36 Displays this help message
40 Maximum e-value for matches (0.01 by default)
44 Number of base pairs of 5' region to be included with the sequence
48 Number of base pairs of 5' region only, excluding the regular sequence
52 Number of base pairs of 3' region only, excluding the regular sequence
56 Number of top hits to display, starting with the best hit
60 Exact match to display (this option can't be used in conjunction with '-n'
64 The FASTA header to display instead of the default
70 This script takes a BLAST output file and a FASTA file as arguments,
71 given after the '-blast' and '-fasta' options respectively. The BLAST output
72 file should have been generated with your sequence of interest and the
73 FASTA file supplied as an argument.
74 Example: find-blast-matches.pl -blast BLAST_FILE -fasta FASTA_FILE
76 It parses through the BLAST file to check for high quality matches,
77 which are then searched for in the FASTA file. The sequence may vary
78 from you candidate sequence, hence the BLAST search prior.
80 The sequence from the FASTA file is then displayed to STDOUT.
81 Optional arguments can be used, such as to extract the 5' or 3' region.
85 Gabriel Abud - E<lt>gabriel.jabud-at-gmail.comE<gt>
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to
93 the Bioperl mailing list. Your participation is much appreciated
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
100 Report bugs to the Bioperl bug tracking system to help us keep track
101 of the bugs and their resolution. Bug reports can be submitted via
104 https://github.com/bioperl/bioperl-live/issues
108 2014-08-04 - Gabriel Abud
113 Getopt::long, Pod::Usage, Bio::SearchIO, Bio::Seq, Bio::SeqIO,
120 use Bio
::SearchIO
qw(new);
121 use Bio
::Seq
qw(new);
122 use Bio
::SeqIO
qw(new);
123 use File
::Basename
qw(basename);
124 use Getopt
::Long
qw(GetOptions);
128 my $baseProg = basename
($0); # Program name
150 # Command line options
151 my $exact_match; # Undef by default
152 my $e_value = 0.01; # Default e-value
153 my $matches = 1; # Default number of matches
154 my $promoter; # Undef by default
155 my $three_prime; # Undef by default
156 my $promoter_only; # Undef by default
157 my $opt_help; # Undef by default
158 my $header; # Undef by default
160 # Functions for proper command line usage
163 "Usage: $baseProg -blast 'BLAST_file' -fasta 'FASTA_file' [OPTIONS]\n";
164 print STDERR
"Try '$baseProg --help' for more information.\n";
171 "\t$baseProg - extract a DNA sequence based on BLAST hits\n\n",
173 "\t$baseProg -blast 'BLAST_file' -fasta 'FASTA__file' [OPTIONS]\n\n",
175 "\t(All options require an additional number argument [ie: -e 0.01])\n\n",
176 "\t-e, maximum e-value for matches (0.01 by default)\n\n",
177 "\t-p, number of base pairs of 5' region to be included with the\n",
178 "\t sequence of interest\n\n",
179 "\t-5, number of base pairs of 5' region, excluding the sequence\n",
180 "\t of interest (unlike '-p')\n\n",
181 "\t-n, number of top hits to display, starting with the highest hit\n",
182 "\t(1 by default)\n\n",
183 "\t-o, exact match to display (this option can't be used in conjuction\n",
185 "\t-3, number of base pairs of 3' region to display\n\n",
186 "\t-header, the fasta header to display instead of the default\n\n";
190 # Get command line options
195 'o=i' => \
$exact_match,
196 '5=i' => \
$promoter_only,
197 '3=i' => \
$three_prime,
198 'help|h' => \
$opt_help,
199 'header|head=s' => \
$header,
200 'blast|b=s' => \
$blastFile,
201 'fasta|f=s' => \
$inputFile
202 ) or pod2usage
(-exitstatus
=> 0, -verbose
=> 1);
205 pod2usage
(-exitstatus
=> 0, -verbose
=> 2) if $opt_help;
206 #help() if defined $opt_help;
208 # Checks for required arguments
209 pod2usage
(-exitstatus
=> 0, -verbose
=> 1,
210 -msg
=> "You must specify the FASTA and BLAST files to read from!\n")
211 if (!defined $blastFile || !defined $inputFile);
212 #syntax() if ( !defined $blastFile || !defined $inputFile );
214 # Checks for any negative numbers
216 pod2usage
(-exitstatus
=> 0, -verbose
=> 1,
217 -msg
=> "You must use positive numbers as values to options!")
218 if ( (defined $e_value && $e_value < 0)
219 || (defined $promoter && $promoter < 0)
220 || (defined $matches && $matches < 0)
221 || (defined $exact_match && $exact_match < 0)
222 || (defined $promoter_only && $promoter_only < 0)
223 || (defined $three_prime && $three_prime < 0 ) );
225 if ( $matches > 1 && defined $exact_match ) {
226 print STDERR
"Cannot use both options '-n' and '-o' at the same time\n";
227 print STDERR
"(Type '$baseProg --help' for more information\n)";
231 if ( defined $promoter && defined $promoter_only ) {
232 print STDERR
"Cannot use both options '-p' and '-5' at the same time\n";
233 print STDERR
"(Type '$baseProg --help' for more information)\n";
237 if ( defined $three_prime && ( defined $promoter || defined $promoter_only ) ) {
238 print STDERR
"Cannot use both '-3' with '-p' or '-5' at the same time\n";
239 print STDERR
"(Type $baseProg --help' for more information)\n";
243 # Class used to search through the blast file
244 eval { $in = Bio
::SearchIO
->new( -file
=> $blastFile, -format
=> "blast" ); };
246 die "'$blastFile' does not appear to be a BLAST output file! Exiting...\n";
248 $out = Bio
::SeqIO
->new( -fh
=> \
*STDOUT
, -format
=> "fasta" );
250 # Creates arrays of all the scaffold names and the coordinates of where those scaffolds are found
252 OUTERLOOP
: while ( my $result = $in->next_result ) {
253 LOOP
: while ( my $hit = $result->next_hit ) {
254 while ( my $hsp = $hit->next_hsp ) {
256 # Finds all matches with an evalue <= $e_value (or 0.01 by default)
257 if ( $hsp->evalue <= $e_value ) {
258 ( $start[$n], $end[$n] ) = $hsp->range('hit');
259 $scaffolds[$n] = $hit->name, $strand[$n] = $hsp->strand('hit');
263 # If an exact_match option is given
264 if ( defined($exact_match) && $exact_match == $n ) {
265 ( $start[0], $end[0] ) = $hsp->range('hit');
266 $scaffolds[0] = $hit->name, $strand[0] = $hsp->strand('hit');
269 elsif ( !defined($exact_match) && $n >= $matches )
270 { # Exits after the correct amount of matches have been found (1 by default)
277 $baseFile = basename
$inputFile;
279 open INFILE
, "<$inputFile"
280 or die "Couldn't open the input file '$inputFile'! Exiting...\n";
286 # Extracts only the scaffolds of interest, avoiding duplicates
287 if ( defined $exact_match ) {
288 $scaffoldList{ $scaffolds[0] } = 1;
291 foreach my $num ( 0 .. $#scaffolds ) {
292 if ( !defined $scaffoldList{ $scaffolds[$num] } ) {
293 $scaffoldList{ $scaffolds[$num] } = 1;
297 my $remaining_scaffolds = my $unique_scaffolds = keys(%scaffoldList);
301 # Reads the FASTA file here, storing FASTA headers where the sequence is found
302 while ( $line = <INFILE
> ) {
304 next if ( $line =~ m/^\s*?$/ );
306 foreach my $scaffold ( sort keys %scaffoldList ) {
307 if ( $line =~ /^[ \t]{0,5}$scaffold/ )
308 { # True if FASTA segment contains the scaffold
311 $baseList{$scaffold} = $line;
313 $remaining_scaffolds--;
316 last unless ($remaining_scaffolds);
319 if ( $scaffoldFind != $unique_scaffolds ) {
320 print STDERR
"The scaffold specified in the BLAST file was not found.\n";
321 print STDERR
"Make sure you are using the correct FASTA file.\n";
325 for my $m ( 0 .. $#scaffolds )
326 { # Runs a loop as many times as there are scaffolds
327 $baseList = $baseList{ $scaffolds[$m] };
329 # Print title line for each scaffold
330 $accession = $baseFile;
331 $default_header = "(BLAST hit:$scaffolds[$m]|";
333 my $real_start = $start[$m];
334 my $real_end = $end[$m];
336 # For "normal", positive strands (+/+)
337 if ( $strand[$m] == 1 ) {
339 # If the -p flag was used
340 if ( defined $promoter ) { # 5' region specified with -p flag
341 # If 5' region is too large for sequence, don't include the 5' region
342 if ( $promoter >= $start[$m] ) {
343 print STDERR
"ERROR: 5' region is too big!! (max promoter = ",
344 $start[$m] - 1, " )\n",
345 "Showing sequence without 5' region...\n";
346 $baseList = substr $baseList, $start[$m], $end[$m] - $start[$m];
349 my $real_start = $start[$m] - $promoter;
350 $baseList = substr $baseList, $start[$m] - $promoter - 1,
351 $promoter + $end[$m] - $start[$m] + 1;
355 # If the -3 flag was used
356 elsif ( defined $three_prime ) {
357 $total_size = length($baseList);
359 if ( ( $three_prime + $end[$m] ) > $total_size ) {
360 die "ERROR: 3' region is too big!! ",
361 "(max 3' region = ", $total_size - $end[$m], ")\n",
365 $real_start = $end[$m] + 1;
366 $real_end = $end[$m] + $three_prime;
367 $baseList = substr $baseList, $end[$m], $three_prime;
371 # If the -5 flag was used
372 elsif ( defined $promoter_only )
373 { # 5' region was specified with -5 flag
374 # If 5' region is too large for sequence, don't include the 5' region
375 if ( $promoter_only >= $start[$m] ) {
376 die "ERROR: 5' region is too big!! (max promoter = ",
377 $start[$m] - 1, " )\n",
381 $real_start = $start[$m] - $promoter_only;
382 $real_end = $start[$m] - 1;
383 $baseList = substr $baseList, $start[$m] - $promoter_only - 1,
387 else { # Default: just the BLAST hit (no 5' or 3')
388 $baseList = substr $baseList, $start[$m] - 1,
389 $end[$m] - $start[$m] + 1;
393 # Checks to see if the hit sequence is the reverse compliment
394 elsif ( $strand[$m] == -1 ) {
396 # If -p flag was used:
397 if ( defined $promoter ) {
398 $total_size = length($baseList);
400 if ( ( $promoter + $end[$m] ) > $total_size ) {
401 print STDERR
"ERROR: 5' region is too big!! ",
402 "(max 5' region = ", $total_size - $end[$m], ")\n",
403 "Showing sequence without 5' region...\n";
404 $baseList = substr $baseList, $start[$m], $end[$m] - $start[$m];
407 $real_start = $start[$m] + 1;
408 $real_end = $end[$m] + $promoter;
409 $baseList = substr $baseList, $start[$m] - 1,
410 $end[$m] - $start[$m] + $promoter + 1;
415 # If -3 flag was used
416 elsif ( defined $three_prime ) {
417 if ( $three_prime >= $start[$m] ) {
418 die "ERROR: 3' region is too big!! (max = ", $start[$m] - 1,
422 $real_start = $start[$m] - $three_prime;
423 $real_end = $start[$m] - 1;
424 $baseList = substr $baseList, $start[$m] - $three_prime - 1,
429 # If -5 flag was used
430 elsif ( defined $promoter_only ) {
431 $total_size = length($baseList);
433 if ( ( $promoter_only + $end[$m] ) > $total_size ) {
434 die "ERROR: 5' region is too big!! ",
435 "(max promoter = ", $total_size - $end[$m], ")\n",
439 $real_start = $end[$m] + 1;
440 $real_end = $end[$m] + $promoter_only;
441 $baseList = substr $baseList, $end[$m], $promoter_only;
445 else { # If 5' region wasn't specified at all
446 $baseList = substr $baseList, $start[$m] - 1,
447 $end[$m] - $start[$m] + 1;
450 $default_header .= "$real_start-$real_end)";
451 $default_header .= " showing 5' region ($promoter_only bp) only"
452 if defined($promoter_only);
453 $default_header .= " showing 3' region ($three_prime bp)"
454 if defined($three_prime);
455 $default_header .= " with 5' region ($promoter bp)" if defined($promoter);
457 my $seq_obj = Bio
::Seq
->new( -seq
=> "$baseList", -alphabet
=> 'dna' );
458 if ($strand[$m] == -1) {
459 $seq_obj = $seq_obj->revcom();
462 if ( defined $header ) {
466 $header =~ s/^([^\s]+)//;
467 my $default_name = $1;
468 $seq_obj->display_id($default_name);
469 $seq_obj->desc($header);
472 $seq_obj->desc($default_header);
473 $seq_obj->display_id($accession);
476 # Prints the sequence to STDOUT
477 $out->write_seq($seq_obj);
480 # If 'exact_match' option is specified, exit after first (and only) iteration
481 if ( defined($exact_match) ) {
487 } # End of major for loop