Bio::Tools::Analysis::Protein:ELM: move to separate distribution
[bioperl-live.git] / bin / bp_search2gff
blob1f316b086d39f8be3687783c286a2429da2e89a3
1 #!/usr/bin/perl
3 =head1 NAME
5 bp_search2gff - turn a SearchIO report into GFF
7 =head1 SYNOPSIS
9 Usage:
11 bp_search2gff [-o outputfile] [-f reportformat] [-i inputfilename] OR file1 file2 ..
13 =head1 DESCRIPTION
15 This script will turn a SearchIO report (BLAST, FASTP, SSEARCH,
16 AXT, WABA) into GFF.
18 The options are:
20 -i infilename - (optional) inputfilename, will read
21 either ARGV files or from STDIN
22 -o filename - the output filename [default STDOUT]
23 -f format - search result format (blast, fasta,waba,axt)
24 (ssearch is fasta format). default is blast.
25 -t/--type seqtype - if you want to see query or hit information
26 in the GFF report
27 -s/--source - specify the source (will be algorithm name
28 otherwise like BLASTN)
29 --method - the method tag (primary_tag) of the features
30 (default is similarity)
31 --scorefunc - a string or a file that when parsed evaluates
32 to a closure which will be passed a feature
33 object and that returns the score to be printed
34 --locfunc - a string or a file that when parsed evaluates
35 to a closure which will be passed two
36 features, query and hit, and returns the
37 location (Bio::LocationI compliant) for the
38 GFF3 feature created for each HSP; the closure
39 may use the clone_loc() and create_loc()
40 functions for convenience, see their PODs
41 --onehsp - only print the first HSP feature for each hit
42 -p/--parent - the parent to which HSP features should refer
43 if not the name of the hit or query (depending
44 on --type)
45 --target/--notarget - whether to always add the Target tag or not
46 -h - this help menu
47 --version - GFF version to use (put a 3 here to use gff 3)
48 --component - generate GFF component fields (chromosome)
49 -m/--match - generate a 'match' line which is a container
50 of all the similarity HSPs
51 --addid - add ID tag in the absence of --match
52 -c/--cutoff - specify an evalue cutoff
54 Additionally specify the filenames you want to process on the
55 command-line. If no files are specified then STDIN input is assumed.
56 You specify this by doing: bp_search2gff E<lt> file1 file2 file3
58 =head1 AUTHOR
60 Jason Stajich, jason-at-bioperl-dot-org
62 =head1 Contributors
64 Hilmar Lapp, hlapp-at-gmx-dot-net
66 =cut
68 use strict;
69 use warnings;
70 use Bio::Tools::GFF;
71 use Getopt::Long;
72 use Bio::SearchIO;
73 use Bio::Location::Simple; # pre-declare to simplify $locfunc implementations
74 use Bio::Location::Atomic; # pre-declare to simplify $locfunc implementations
75 use Storable qw(dclone); # for cloning location objects
76 use Bio::Factory::FTLocationFactory;
78 my (
79 $output, # output file (if not stdout)
80 $input, # name of the input file
81 $format, # format of the input file, defauly is blast
82 $type, # 'query' or 'hit'
83 $cutoff, # cut-off value for e-value filter
84 $sourcetag, # explicit source tag (will be taken from program
85 # otherwise
86 $methodtag, # primary tag (a.k.a. method), default 'similarity'
87 $gffver, # GFF version (dialect) to write
88 $scorefunc, # closure returning the score for a passed feature
89 $locfunc, # closure returning a location object for a passed
90 # query and hit feature
91 $addid, # flag: whether to always add the ID for $match == 0
92 $parent, # the name of the parent to use; if set and $match == 0
93 # will always add the target
94 $comp, # flag: whether to print a component feature
95 $addtarget, # flag: whether to always add the Target tag, default
96 # is true
97 $match, # flag: whether to print match lines as containers
98 $onehsp, # flag: whether to consider only the first HSP for a hit
99 $quiet, # flag: run quietly
100 $help # flag: show help screen
103 # set defaults:
104 $format = 'blast';
105 $type = 'query';
106 $gffver = 2;
107 $methodtag = "similarity";
108 $addtarget = 1;
110 GetOptions(
111 'i|input:s' => \$input,
112 'component' => \$comp,
113 'm|match' => \$match,
114 'o|output:s' => \$output,
115 'f|format:s' => \$format,
116 's|source:s' => \$sourcetag,
117 'method=s' => \$methodtag,
118 'addid' => \$addid,
119 'scorefunc=s' => \$scorefunc,
120 'locfunc=s' => \$locfunc,
121 'p|parent=s' => \$parent,
122 'target!' => \$addtarget,
123 'onehsp' => \$onehsp,
124 't|type:s' => \$type,
125 'c|cutoff:s' => \$cutoff,
126 'v|version:i' => \$gffver,
127 'q|quiet' => \$quiet,
128 'h|help' => sub {
129 exec( 'perldoc', $0 );
130 exit(0);
133 $type = lc($type);
134 if ( $type =~ /target/ ) { $type = 'hit' }
135 elsif ( $type ne 'query' && $type ne 'hit' ) {
136 die("seqtype must be either 'query' or 'hit'");
139 # custom or default function returning the score
140 $scorefunc =
141 defined($scorefunc) ? parse_code($scorefunc) : sub { shift->score };
143 # custom or default function returning the location
144 $locfunc = defined($locfunc) ? parse_code($locfunc) : sub { shift->location };
146 # if --match is given then $addid needs to be disabled
147 $addid = undef if $addid && $match;
149 # if no input is provided STDIN will be used
150 my $parser = new Bio::SearchIO(
151 -format => $format,
152 -verbose => $quiet ? -1 : 0,
153 -file => $input
156 my $out;
157 if ( defined $output ) {
158 $out = new Bio::Tools::GFF(
159 -gff_version => $gffver,
160 -file => ">$output"
163 else {
164 $out = new Bio::Tools::GFF( -gff_version => $gffver ); # STDOUT
166 my ( %seen_hit, %seen );
167 my $other = $type eq 'query' ? 'hit' : 'query';
169 while ( my $result = $parser->next_result ) {
170 my $qname = $result->query_name;
171 if ( $comp
172 && $type eq 'query'
173 && $result->query_length )
175 $out->write_feature(
176 Bio::SeqFeature::Generic->new(
177 -start => 1,
178 -end => $result->query_length,
179 -seq_id => $qname,
180 -source_tag => 'chromosome',
181 -primary_tag => 'Component',
182 -tag => {
183 'Sequence' => $qname
188 while ( my $hit = $result->next_hit ) {
189 next if ( defined $cutoff && $hit->significance > $cutoff );
190 my $acc = $qname;
191 if ( $seen{ $qname . "-" . $hit->name }++ ) {
192 $acc = $qname . "-" . $seen{ $qname . '-' . $hit->name };
195 if ( $comp
196 && $type eq 'hit'
197 && $hit->length
198 && !$seen_hit{ $hit->name }++ )
200 $out->write_feature(
201 Bio::SeqFeature::Generic->new(
202 -start => 1,
203 -end => $hit->length,
204 -seq_id => $hit->name,
205 -source_tag => 'chromosome',
206 -primary_tag => 'Component',
207 -tag => {
208 'Sequence' => $hit->name
213 my ( %min, %max, $seqid, $name, $st );
214 while ( my $hsp = $hit->next_hsp ) {
215 my $feature = new Bio::SeqFeature::Generic;
216 my ( $proxyfor, $otherf );
217 if ( $type eq 'query' ) {
218 ( $proxyfor, $otherf ) = ( $hsp->query, $hsp->hit );
219 $name ||= $hit->name;
221 else {
222 ( $otherf, $proxyfor ) = ( $hsp->query, $hsp->hit );
223 $name ||= $acc;
225 $proxyfor->score( $hit->bits ) unless ( $proxyfor->score );
226 if ( ( $gffver == 3 ) && ( $match || $parent ) ) {
227 $feature->add_tag_value( 'Parent', $parent || $name );
230 $min{$type} = $proxyfor->start
231 unless defined $min{$type} && $min{$type} < $proxyfor->start;
232 $max{$type} = $proxyfor->end
233 unless defined $max{$type} && $max{$type} > $proxyfor->end;
234 $min{$other} = $otherf->start
235 unless defined $min{$other} && $min{$other} < $otherf->start;
236 $max{$other} = $otherf->end
237 unless defined $max{$other} && $max{$other} > $otherf->end;
238 if ( $addtarget || $match ) {
239 $feature->add_tag_value( 'Target', 'Sequence:' . $name );
240 $feature->add_tag_value( 'Target', $otherf->start );
241 $feature->add_tag_value( 'Target', $otherf->end );
243 if ($addid) {
244 $feature->add_tag_value( 'ID', $name );
247 $feature->location( &$locfunc( $proxyfor, $otherf ) );
249 # strand for feature is always going to be product of
250 # query & hit strands so that target can always be just
251 # '+'
252 $feature->strand( $proxyfor->strand * $otherf->strand );
253 if ($sourcetag) {
254 $feature->source_tag($sourcetag);
256 else {
257 $feature->source_tag( $proxyfor->source_tag );
259 $feature->score( &$scorefunc($proxyfor) );
260 $feature->frame( $proxyfor->frame );
261 $feature->seq_id( $proxyfor->seq_id );
262 $feature->primary_tag($methodtag);
264 # add annotation if encoded in the query description
265 my $desc = $result->query_description;
266 while ( $desc =~ /\/([^=]+)=(\S+)/g ) {
267 $feature->add_tag_value( $1, $2 );
269 $seqid ||= $proxyfor->seq_id;
270 $out->write_feature($feature);
271 $st ||= $sourcetag || $proxyfor->source_tag;
272 last if $onehsp;
274 if ($match) {
276 my $matchf = Bio::SeqFeature::Generic->new(
277 -start => $min{$type},
278 -end => $max{$type},
279 -strand => $hit->strand($type) * $hit->strand($other),
280 -primary_tag => 'match',
281 -source_tag => $st,
282 -score => $hit->bits,
283 -seq_id => $seqid
285 if ( $gffver == 3 ) {
286 $matchf->add_tag_value( 'ID', $name );
288 $matchf->add_tag_value( 'Target', "Sequence:$name" );
289 $out->write_feature($matchf);
294 sub parse_code {
295 my $src = shift;
296 my $code;
298 # file or subroutine?
299 if ( -r $src ) {
300 if ( !( ( $code = do $src ) && ( ref($code) eq "CODE" ) ) ) {
301 die "error in parsing code block $src: $@" if $@;
302 die "unable to read file $src: $!" if $!;
303 die "failed to run $src, or it failed to return a closure";
306 else {
307 $code = eval $src;
308 die "error in parsing code block \"$src\": $@" if $@;
309 die "\"$src\" fails to return a closure"
310 unless ref($code) eq "CODE";
312 return $code;
315 =head2 clone_loc
317 Title : clone_loc
318 Usage : my $l = clone_loc($feature->location);
319 Function: Helper function to simplify the task of cloning locations
320 for --locfunc closures.
322 Presently simply implemented using Storable::dclone().
323 Example :
324 Returns : A L<Bio::LocationI> object of the same type and with the
325 same properties as the argument, but physically different.
326 All structured properties will be cloned as well.
327 Args : A L<Bio::LocationI> compliant object
329 =cut
331 sub clone_loc {
332 return dclone(shift);
335 =head2 create_loc
337 Title : create_loc
338 Usage : my $l = create_loc("10..12");
339 Function: Helper function to simplify the task of creating locations
340 for --locfunc closures. Creates a location from a feature-
341 table formatted string.
343 Example :
344 Returns : A L<Bio::LocationI> object representing the location given
345 as formatted string.
346 Args : A GenBank feature-table formatted string.
348 =cut
350 sub create_loc {
351 return Bio::Factory::FTLocationFactory->from_string(shift);