5 bp_search2gff - turn a SearchIO report into GFF
11 bp_search2gff [-o outputfile] [-f reportformat] [-i inputfilename] OR file1 file2 ..
15 This script will turn a SearchIO report (BLAST, FASTP, SSEARCH,
20 -i infilename - (optional) inputfilename, will read
21 either ARGV files or from STDIN
22 -o filename - the output filename [default STDOUT]
23 -f format - search result format (blast, fasta,waba,axt)
24 (ssearch is fasta format). default is blast.
25 -t/--type seqtype - if you want to see query or hit information
27 -s/--source - specify the source (will be algorithm name
28 otherwise like BLASTN)
29 --method - the method tag (primary_tag) of the features
30 (default is similarity)
31 --scorefunc - a string or a file that when parsed evaluates
32 to a closure which will be passed a feature
33 object and that returns the score to be printed
34 --locfunc - a string or a file that when parsed evaluates
35 to a closure which will be passed two
36 features, query and hit, and returns the
37 location (Bio::LocationI compliant) for the
38 GFF3 feature created for each HSP; the closure
39 may use the clone_loc() and create_loc()
40 functions for convenience, see their PODs
41 --onehsp - only print the first HSP feature for each hit
42 -p/--parent - the parent to which HSP features should refer
43 if not the name of the hit or query (depending
45 --target/--notarget - whether to always add the Target tag or not
47 --version - GFF version to use (put a 3 here to use gff 3)
48 --component - generate GFF component fields (chromosome)
49 -m/--match - generate a 'match' line which is a container
50 of all the similarity HSPs
51 --addid - add ID tag in the absence of --match
52 -c/--cutoff - specify an evalue cutoff
54 Additionally specify the filenames you want to process on the
55 command-line. If no files are specified then STDIN input is assumed.
56 You specify this by doing: bp_search2gff E<lt> file1 file2 file3
60 Jason Stajich, jason-at-bioperl-dot-org
64 Hilmar Lapp, hlapp-at-gmx-dot-net
73 use Bio
::Location
::Simple
; # pre-declare to simplify $locfunc implementations
74 use Bio
::Location
::Atomic
; # pre-declare to simplify $locfunc implementations
75 use Storable
qw(dclone); # for cloning location objects
76 use Bio
::Factory
::FTLocationFactory
;
79 $output, # output file (if not stdout)
80 $input, # name of the input file
81 $format, # format of the input file, defauly is blast
82 $type, # 'query' or 'hit'
83 $cutoff, # cut-off value for e-value filter
84 $sourcetag, # explicit source tag (will be taken from program
86 $methodtag, # primary tag (a.k.a. method), default 'similarity'
87 $gffver, # GFF version (dialect) to write
88 $scorefunc, # closure returning the score for a passed feature
89 $locfunc, # closure returning a location object for a passed
90 # query and hit feature
91 $addid, # flag: whether to always add the ID for $match == 0
92 $parent, # the name of the parent to use; if set and $match == 0
93 # will always add the target
94 $comp, # flag: whether to print a component feature
95 $addtarget, # flag: whether to always add the Target tag, default
97 $match, # flag: whether to print match lines as containers
98 $onehsp, # flag: whether to consider only the first HSP for a hit
99 $quiet, # flag: run quietly
100 $help # flag: show help screen
107 $methodtag = "similarity";
111 'i|input:s' => \
$input,
112 'component' => \
$comp,
113 'm|match' => \
$match,
114 'o|output:s' => \
$output,
115 'f|format:s' => \
$format,
116 's|source:s' => \
$sourcetag,
117 'method=s' => \
$methodtag,
119 'scorefunc=s' => \
$scorefunc,
120 'locfunc=s' => \
$locfunc,
121 'p|parent=s' => \
$parent,
122 'target!' => \
$addtarget,
123 'onehsp' => \
$onehsp,
124 't|type:s' => \
$type,
125 'c|cutoff:s' => \
$cutoff,
126 'v|version:i' => \
$gffver,
127 'q|quiet' => \
$quiet,
129 exec( 'perldoc', $0 );
134 if ( $type =~ /target/ ) { $type = 'hit' }
135 elsif ( $type ne 'query' && $type ne 'hit' ) {
136 die("seqtype must be either 'query' or 'hit'");
139 # custom or default function returning the score
141 defined($scorefunc) ? parse_code
($scorefunc) : sub { shift->score };
143 # custom or default function returning the location
144 $locfunc = defined($locfunc) ? parse_code
($locfunc) : sub { shift->location };
146 # if --match is given then $addid needs to be disabled
147 $addid = undef if $addid && $match;
149 # if no input is provided STDIN will be used
150 my $parser = new Bio
::SearchIO
(
152 -verbose
=> $quiet ?
-1 : 0,
157 if ( defined $output ) {
158 $out = new Bio
::Tools
::GFF
(
159 -gff_version
=> $gffver,
164 $out = new Bio
::Tools
::GFF
( -gff_version
=> $gffver ); # STDOUT
166 my ( %seen_hit, %seen );
167 my $other = $type eq 'query' ?
'hit' : 'query';
169 while ( my $result = $parser->next_result ) {
170 my $qname = $result->query_name;
173 && $result->query_length )
176 Bio
::SeqFeature
::Generic
->new(
178 -end
=> $result->query_length,
180 -source_tag
=> 'chromosome',
181 -primary_tag
=> 'Component',
188 while ( my $hit = $result->next_hit ) {
189 next if ( defined $cutoff && $hit->significance > $cutoff );
191 if ( $seen{ $qname . "-" . $hit->name }++ ) {
192 $acc = $qname . "-" . $seen{ $qname . '-' . $hit->name };
198 && !$seen_hit{ $hit->name }++ )
201 Bio
::SeqFeature
::Generic
->new(
203 -end
=> $hit->length,
204 -seq_id
=> $hit->name,
205 -source_tag
=> 'chromosome',
206 -primary_tag
=> 'Component',
208 'Sequence' => $hit->name
213 my ( %min, %max, $seqid, $name, $st );
214 while ( my $hsp = $hit->next_hsp ) {
215 my $feature = new Bio
::SeqFeature
::Generic
;
216 my ( $proxyfor, $otherf );
217 if ( $type eq 'query' ) {
218 ( $proxyfor, $otherf ) = ( $hsp->query, $hsp->hit );
219 $name ||= $hit->name;
222 ( $otherf, $proxyfor ) = ( $hsp->query, $hsp->hit );
225 $proxyfor->score( $hit->bits ) unless ( $proxyfor->score );
226 if ( ( $gffver == 3 ) && ( $match || $parent ) ) {
227 $feature->add_tag_value( 'Parent', $parent || $name );
230 $min{$type} = $proxyfor->start
231 unless defined $min{$type} && $min{$type} < $proxyfor->start;
232 $max{$type} = $proxyfor->end
233 unless defined $max{$type} && $max{$type} > $proxyfor->end;
234 $min{$other} = $otherf->start
235 unless defined $min{$other} && $min{$other} < $otherf->start;
236 $max{$other} = $otherf->end
237 unless defined $max{$other} && $max{$other} > $otherf->end;
238 if ( $addtarget || $match ) {
239 $feature->add_tag_value( 'Target', 'Sequence:' . $name );
240 $feature->add_tag_value( 'Target', $otherf->start );
241 $feature->add_tag_value( 'Target', $otherf->end );
244 $feature->add_tag_value( 'ID', $name );
247 $feature->location( &$locfunc( $proxyfor, $otherf ) );
249 # strand for feature is always going to be product of
250 # query & hit strands so that target can always be just
252 $feature->strand( $proxyfor->strand * $otherf->strand );
254 $feature->source_tag($sourcetag);
257 $feature->source_tag( $proxyfor->source_tag );
259 $feature->score( &$scorefunc($proxyfor) );
260 $feature->frame( $proxyfor->frame );
261 $feature->seq_id( $proxyfor->seq_id );
262 $feature->primary_tag($methodtag);
264 # add annotation if encoded in the query description
265 my $desc = $result->query_description;
266 while ( $desc =~ /\/([^=]+)=(\S
+)/g
) {
267 $feature->add_tag_value( $1, $2 );
269 $seqid ||= $proxyfor->seq_id;
270 $out->write_feature($feature);
271 $st ||= $sourcetag || $proxyfor->source_tag;
276 my $matchf = Bio
::SeqFeature
::Generic
->new(
277 -start
=> $min{$type},
279 -strand
=> $hit->strand($type) * $hit->strand($other),
280 -primary_tag
=> 'match',
282 -score
=> $hit->bits,
285 if ( $gffver == 3 ) {
286 $matchf->add_tag_value( 'ID', $name );
288 $matchf->add_tag_value( 'Target', "Sequence:$name" );
289 $out->write_feature($matchf);
298 # file or subroutine?
300 if ( !( ( $code = do $src ) && ( ref($code) eq "CODE" ) ) ) {
301 die "error in parsing code block $src: $@" if $@
;
302 die "unable to read file $src: $!" if $!;
303 die "failed to run $src, or it failed to return a closure";
308 die "error in parsing code block \"$src\": $@" if $@
;
309 die "\"$src\" fails to return a closure"
310 unless ref($code) eq "CODE";
318 Usage : my $l = clone_loc($feature->location);
319 Function: Helper function to simplify the task of cloning locations
320 for --locfunc closures.
322 Presently simply implemented using Storable::dclone().
324 Returns : A L<Bio::LocationI> object of the same type and with the
325 same properties as the argument, but physically different.
326 All structured properties will be cloned as well.
327 Args : A L<Bio::LocationI> compliant object
332 return dclone
(shift);
338 Usage : my $l = create_loc("10..12");
339 Function: Helper function to simplify the task of creating locations
340 for --locfunc closures. Creates a location from a feature-
341 table formatted string.
344 Returns : A L<Bio::LocationI> object representing the location given
346 Args : A GenBank feature-table formatted string.
351 return Bio
::Factory
::FTLocationFactory
->from_string(shift);