bin/bp_biofetch_genbank_proxy: move to Bio-DB-NCBIHelper
[bioperl-live.git] / bin / bp_mask_by_search
blobfb550ab4811383655a0224885d8220ebbdcbc676
1 #!/usr/bin/perl
2 # Author: Jason Stajich <jason-at-bioperl-dot-org>
5 =head1 NAME
7 bp_mask_by_search - mask sequence(s) based on its alignment results
9 =head1 SYNOPSIS
11 bp_mask_by_search.pl -f blast genomefile blastfile.bls > maskedgenome.fa
13 =head1 DESCRIPTION
15 Mask sequence based on significant alignments of another sequence.
16 You need to provide the report file and the entire sequence data which
17 you want to mask. By default this will assume you have done a TBLASTN
18 (or TFASTY) and try and mask the hit sequence assuming you've provided
19 the sequence file for the hit database. If you would like to do the
20 reverse and mask the query sequence specify the -t/--type query flag.
22 This is going to read in the whole sequence file into memory so for
23 large genomes this may fall over. I'm using DB_File to prevent
24 keeping everything in memory, one solution is to split the genome into
25 pieces (BEFORE you run the DB search though, you want to use the exact
26 file you BLASTed with as input to this program).
28 Below the double dash (--) options are of the form
29 --format=fasta or --format fasta
30 or you can just say
31 -f fasta
33 By -f/--format I mean either are acceptable options. The =s or =n
34 or =c specify these arguments expect a 'string'
36 Options:
37 -f/--format=s Search report format (fasta,blast,axt,hmmer,etc)
38 -sf/--sformat=s Sequence format (fasta,genbank,embl,swissprot)
39 --hardmask (booelean) Hard mask the sequence
40 with the maskchar [default is lowercase mask]
41 --maskchar=c Character to mask with [default is N], change
42 to 'X' for protein sequences
43 -e/--evalue=n Evalue cutoff for HSPs and Hits, only
44 mask sequence if alignment has specified evalue
45 or better
46 -o/--out/
47 --outfile=file Output file to save the masked sequence to.
48 -t/--type=s Alignment seq type you want to mask, the
49 'hit' or the 'query' sequence. [default is 'hit']
50 --minlen=n Minimum length of an HSP for it to be used
51 in masking [default 0]
52 -h/--help See this help information
54 =head1 AUTHOR - Jason Stajich
56 Jason Stajich, jason-at-bioperl-dot-org.
58 =cut
61 use strict;
62 use warnings;
63 use Bio::SeqIO;
64 use Bio::SearchIO;
65 use Getopt::Long;
66 use Bio::Seq;
67 use DB_File;
68 # assuming tblastn or tfasty type alignment
70 my $format = 'blast';
71 my $sformat= undef;
72 my $evalue = undef;
73 my $type = 'hit';
74 my $minlen = 50;
75 my $hardmask = 0; # mask with $maskchar instead of lowercase
76 my $maskchar = 'N'; # if we hard mask, mask with this cahr
77 my $outfile;
78 GetOptions(
79 'f|format:s' => \$format,
80 'sf|sformat:s'=> \$sformat,
81 'hardmask' => \$hardmask,
82 'maskchar:s' => \$maskchar,
83 'e|evalue:s' => \$evalue,
84 'o|out|outfile:s' => \$outfile,
85 't|type:s' => \$type,
86 'minlen:s' => \$minlen,
87 'h|help' => sub { system('perldoc', $0);
88 exit; },
90 if( $type !~ /^(hit|query)/i ) {
91 die("type must be query or hit[default] not $type") ;
93 $type = lc($type);
95 if(length($maskchar) > 1 ) {
96 die("expected a mask character, not a string (you gave $maskchar)");
98 my $genomefile = shift || die('need a file containing the genome');
99 my $reportfile = shift;
101 # this could be problem for large genomes, figure out a
102 # better way to do this later on
103 # or force people to split it up
104 my $genomeparser = new Bio::SeqIO(-file => $genomefile,
105 -format=> $sformat);
106 my %seqs;
107 unlink('/tmp/genome.idx');
108 tie(%seqs,'DB_File','/tmp/genome.idx');
109 while( my $seq = $genomeparser->next_seq ) {
110 # should we pre-force to upper case?
111 $seqs{$seq->display_id} = $seq->seq();
114 my $parser = new Bio::SearchIO(-file => $reportfile,
115 -format => $format);
117 while( my $r = $parser->next_result ) {
118 while( my $h = $r->next_hit ) {
119 last if( defined $evalue && $h->significance > $evalue );
120 my $hname = $h->name;
121 if( ! $seqs{$hname} ) {
122 die("Cannot find sequence $hname in genome seq");
124 while( my $hsp = $h->next_hsp ) {
125 last if( defined $evalue && $hsp->evalue > $evalue );
126 next if( $hsp->length('total') < $minlen);
127 my ($s,$len) = ( $hsp->$type()->start,
128 $hsp->$type()->length);
130 if( $hardmask ) {
131 substr($seqs{$hname}, $s,$len, $maskchar x $len);
132 } else {
133 substr($seqs{$hname}, $s,$len,
134 lc(substr($seqs{$hname}, $s,$len)));
140 my $out;
141 if( $outfile ) {
142 $out = new Bio::SeqIO(-file => ">$outfile",
143 -format => $sformat);
144 } else {
145 $out = new Bio::SeqIO(-format => $sformat);
148 while( my ($seqname,$seq) = each %seqs ) {
149 $out->write_seq(Bio::Seq->new(-seq => $seq,
150 -display_id => $seqname,
151 -description=> 'MASKED'));
153 END {
154 unlink('/tmp/genome.idx');